diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 4a711a2c7..9a812c779 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -2,7 +2,6 @@ version: 2 updates: - - package-ecosystem: "github-actions" directory: "/" schedule: diff --git a/.github/release-draft-template.yml b/.github/release-draft-template.yml new file mode 100644 index 000000000..ba19b9d6e --- /dev/null +++ b/.github/release-draft-template.yml @@ -0,0 +1,30 @@ +name-template: 'Milli v$RESOLVED_VERSION' +tag-template: 'v$RESOLVED_VERSION' +exclude-labels: + - 'skip changelog' +version-resolver: + minor: + labels: + - 'DB breaking' + - 'API breaking' + default: patch +categories: + - title: 'API breaking' + label: 'API breaking' + - title: 'DB breaking' + label: 'DB breaking' + - title: 'Changes' + label: 'no breaking' +template: | + $CHANGES + + Thanks again to $CONTRIBUTORS! 🎉 +no-changes-template: 'Changes are coming soon 😎' +sort-direction: 'ascending' +replacers: + - search: '/(?:and )?@dependabot-preview(?:\[bot\])?,?/g' + replace: '' + - search: '/(?:and )?@bors(?:\[bot\])?,?/g' + replace: '' + - search: '/(?:and )?@meili-bot,?/g' + replace: '' diff --git a/.github/workflows/enforce-label.yml b/.github/workflows/enforce-label.yml new file mode 100644 index 000000000..61038d91d --- /dev/null +++ b/.github/workflows/enforce-label.yml @@ -0,0 +1,14 @@ +name: Enforce PR labels + +on: + pull_request: + types: [labeled, unlabeled, opened, edited, synchronize] + +jobs: + enforce-label: + name: Specify breaking + runs-on: ubuntu-latest + steps: + - uses: yogevbd/enforce-label-action@2.2.2 + with: + REQUIRED_LABELS_ANY: 'no breaking,DB breaking,API breaking,skip changelog' diff --git a/.github/workflows/manual_benchmarks.yml b/.github/workflows/manual_benchmarks.yml new file mode 100644 index 000000000..76c6fe0fe --- /dev/null +++ b/.github/workflows/manual_benchmarks.yml @@ -0,0 +1,77 @@ +name: Benchmarks + +on: + workflow_dispatch: + inputs: + dataset_name: + description: 
'The name of the dataset used to benchmark (search_songs, search_wiki, search_geo or indexing)' + required: false + default: 'search_songs' + +env: + BENCH_NAME: ${{ github.event.inputs.dataset_name }} + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: benchmarks + timeout-minutes: 4320 # 72h + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Set variables + - name: Set current branch name + shell: bash + run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT + id: current_branch + - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 + shell: bash + run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT + id: normalized_current_branch + - name: Set shorter commit SHA + shell: bash + run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT + id: commit_sha + - name: Set file basename with format "dataset_branch_commitSHA" + shell: bash + run: echo "basename=$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> $GITHUB_OUTPUT + id: file + + # Run benchmarks + - name: Run benchmarks - Dataset ${{ env.BENCH_NAME }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + run: | + cd benchmarks + cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} + + # Generate critcmp files + - name: Install critcmp + uses: taiki-e/install-action@v2 + with: + tool: critcmp + - name: Export critcmp file + run: | + critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json + + # Upload benchmarks + - name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3 + uses: BetaHuhn/do-spaces-action@v2 + with: + access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} + secret_key: ${{ 
secrets.DO_SPACES_SECRET_KEY }} + space_name: ${{ secrets.DO_SPACES_SPACE_NAME }} + space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} + source: ${{ steps.file.outputs.basename }}.json + out_dir: critcmp_results + + # Helper + - name: 'README: compare with another benchmark' + run: | + echo "${{ steps.file.outputs.basename }}.json has just been pushed." + echo 'How to compare this benchmark with another one?' + echo ' - Check the available files with: ./benchmarks/scripts/list.sh' + echo " - Run the following command: ./benchmarks/scripts/compare.sh ${{ steps.file.outputs.basename }}.json" diff --git a/.github/workflows/push_benchmarks_indexing.yml b/.github/workflows/push_benchmarks_indexing.yml new file mode 100644 index 000000000..12f9f6eda --- /dev/null +++ b/.github/workflows/push_benchmarks_indexing.yml @@ -0,0 +1,79 @@ +name: Benchmarks indexing (push) + +on: + push: + branches: + - main + +env: + INFLUX_TOKEN: ${{ secrets.INFLUX_TOKEN }} + BENCH_NAME: "indexing" + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: benchmarks + timeout-minutes: 4320 # 72h + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Set variables + - name: Set current branch name + shell: bash + run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT + id: current_branch + - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 + shell: bash + run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT + id: normalized_current_branch + - name: Set shorter commit SHA + shell: bash + run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT + id: commit_sha + - name: Set file basename with format "dataset_branch_commitSHA" + shell: bash + run: echo "basename=$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> 
$GITHUB_OUTPUT + id: file + + # Run benchmarks + - name: Run benchmarks - Dataset ${{ env.BENCH_NAME }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + run: | + cd benchmarks + cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} + + # Generate critcmp files + - name: Install critcmp + uses: taiki-e/install-action@v2 + with: + tool: critcmp + - name: Export critcmp file + run: | + critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json + + # Upload benchmarks + - name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3 + uses: BetaHuhn/do-spaces-action@v2 + with: + access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} + secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }} + space_name: ${{ secrets.DO_SPACES_SPACE_NAME }} + space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} + source: ${{ steps.file.outputs.basename }}.json + out_dir: critcmp_results + + # Upload benchmarks to influxdb + - name: Upload ${{ steps.file.outputs.basename }}.json to influxDB + run: telegraf --config https://eu-central-1-1.aws.cloud2.influxdata.com/api/v2/telegrafs/08b52e34a370b000 --once --debug + + # Helper + - name: 'README: compare with another benchmark' + run: | + echo "${{ steps.file.outputs.basename }}.json has just been pushed." + echo 'How to compare this benchmark with another one?' 
+ echo ' - Check the available files with: ./benchmarks/scripts/list.sh' + echo " - Run the following command: ./benchmarks/scripts/compare.sh ${{ steps.file.outputs.basename }}.json" diff --git a/.github/workflows/push_benchmarks_search_geo.yml b/.github/workflows/push_benchmarks_search_geo.yml new file mode 100644 index 000000000..02661061f --- /dev/null +++ b/.github/workflows/push_benchmarks_search_geo.yml @@ -0,0 +1,78 @@ +name: Benchmarks search geo (push) + +on: + push: + branches: + - main + +env: + BENCH_NAME: "search_geo" + INFLUX_TOKEN: ${{ secrets.INFLUX_TOKEN }} + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: benchmarks + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Set variables + - name: Set current branch name + shell: bash + run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT + id: current_branch + - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 + shell: bash + run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT + id: normalized_current_branch + - name: Set shorter commit SHA + shell: bash + run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT + id: commit_sha + - name: Set file basename with format "dataset_branch_commitSHA" + shell: bash + run: echo "basename=$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> $GITHUB_OUTPUT + id: file + + # Run benchmarks + - name: Run benchmarks - Dataset ${{ env.BENCH_NAME }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + run: | + cd benchmarks + cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} + + # Generate critcmp files + - name: Install critcmp + uses: taiki-e/install-action@v2 + with: + tool: critcmp + - name: Export 
critcmp file + run: | + critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json + + # Upload benchmarks + - name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3 + uses: BetaHuhn/do-spaces-action@v2 + with: + access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} + secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }} + space_name: ${{ secrets.DO_SPACES_SPACE_NAME }} + space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} + source: ${{ steps.file.outputs.basename }}.json + out_dir: critcmp_results + + # Upload benchmarks to influxdb + - name: Upload ${{ steps.file.outputs.basename }}.json to influxDB + run: telegraf --config https://eu-central-1-1.aws.cloud2.influxdata.com/api/v2/telegrafs/08b52e34a370b000 --once --debug + + # Helper + - name: 'README: compare with another benchmark' + run: | + echo "${{ steps.file.outputs.basename }}.json has just been pushed." + echo 'How to compare this benchmark with another one?' + echo ' - Check the available files with: ./benchmarks/scripts/list.sh' + echo " - Run the following command: ./benchmarks/scripts/compare.sh ${{ steps.file.outputs.basename }}.json" diff --git a/.github/workflows/push_benchmarks_search_songs.yml b/.github/workflows/push_benchmarks_search_songs.yml new file mode 100644 index 000000000..92684a907 --- /dev/null +++ b/.github/workflows/push_benchmarks_search_songs.yml @@ -0,0 +1,78 @@ +name: Benchmarks search songs (push) + +on: + push: + branches: + - main + +env: + BENCH_NAME: "search_songs" + INFLUX_TOKEN: ${{ secrets.INFLUX_TOKEN }} + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: benchmarks + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Set variables + - name: Set current branch name + shell: bash + run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT + id: current_branch + - name: Set normalized current branch 
name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 + shell: bash + run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT + id: normalized_current_branch + - name: Set shorter commit SHA + shell: bash + run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT + id: commit_sha + - name: Set file basename with format "dataset_branch_commitSHA" + shell: bash + run: echo "basename=$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> $GITHUB_OUTPUT + id: file + + # Run benchmarks + - name: Run benchmarks - Dataset ${{ env.BENCH_NAME }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + run: | + cd benchmarks + cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} + + # Generate critcmp files + - name: Install critcmp + uses: taiki-e/install-action@v2 + with: + tool: critcmp + - name: Export critcmp file + run: | + critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json + + # Upload benchmarks + - name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3 + uses: BetaHuhn/do-spaces-action@v2 + with: + access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} + secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }} + space_name: ${{ secrets.DO_SPACES_SPACE_NAME }} + space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} + source: ${{ steps.file.outputs.basename }}.json + out_dir: critcmp_results + + # Upload benchmarks to influxdb + - name: Upload ${{ steps.file.outputs.basename }}.json to influxDB + run: telegraf --config https://eu-central-1-1.aws.cloud2.influxdata.com/api/v2/telegrafs/08b52e34a370b000 --once --debug + + # Helper + - name: 'README: compare with another benchmark' + run: | + echo "${{ steps.file.outputs.basename }}.json has just been pushed." + echo 'How to compare this benchmark with another one?' 
+ echo ' - Check the available files with: ./benchmarks/scripts/list.sh' + echo " - Run the following command: ./benchmarks/scripts/compare.sh ${{ steps.file.outputs.basename }}.json" diff --git a/.github/workflows/push_benchmarks_search_wiki.yml b/.github/workflows/push_benchmarks_search_wiki.yml new file mode 100644 index 000000000..0f6511337 --- /dev/null +++ b/.github/workflows/push_benchmarks_search_wiki.yml @@ -0,0 +1,78 @@ +name: Benchmarks search wikipedia articles (push) + +on: + push: + branches: + - main + +env: + BENCH_NAME: "search_wiki" + INFLUX_TOKEN: ${{ secrets.INFLUX_TOKEN }} + +jobs: + benchmarks: + name: Run and upload benchmarks + runs-on: benchmarks + steps: + - uses: actions/checkout@v3 + - uses: actions-rs/toolchain@v1 + with: + profile: minimal + toolchain: stable + override: true + + # Set variables + - name: Set current branch name + shell: bash + run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT + id: current_branch + - name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3 + shell: bash + run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT + id: normalized_current_branch + - name: Set shorter commit SHA + shell: bash + run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT + id: commit_sha + - name: Set file basename with format "dataset_branch_commitSHA" + shell: bash + run: echo "basename=$(echo ${BENCH_NAME}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> $GITHUB_OUTPUT + id: file + + # Run benchmarks + - name: Run benchmarks - Dataset ${{ env.BENCH_NAME }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} + run: | + cd benchmarks + cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} + + # Generate critcmp files + - name: Install critcmp + uses: taiki-e/install-action@v2 + with: + tool: critcmp + - 
name: Export critcmp file + run: | + critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json + + # Upload benchmarks + - name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3 + uses: BetaHuhn/do-spaces-action@v2 + with: + access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }} + secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }} + space_name: ${{ secrets.DO_SPACES_SPACE_NAME }} + space_region: ${{ secrets.DO_SPACES_SPACE_REGION }} + source: ${{ steps.file.outputs.basename }}.json + out_dir: critcmp_results + + # Upload benchmarks to influxdb + - name: Upload ${{ steps.file.outputs.basename }}.json to influxDB + run: telegraf --config https://eu-central-1-1.aws.cloud2.influxdata.com/api/v2/telegrafs/08b52e34a370b000 --once --debug + + # Helper + - name: 'README: compare with another benchmark' + run: | + echo "${{ steps.file.outputs.basename }}.json has just been pushed." + echo 'How to compare this benchmark with another one?' 
+ echo ' - Check the available files with: ./benchmarks/scripts/list.sh' + echo " - Run the following command: ./benchmarks/scripts/compare.sh ${{ steps.file.outputs.basename }}.json" diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml new file mode 100644 index 000000000..9ec8b9d64 --- /dev/null +++ b/.github/workflows/release-drafter.yml @@ -0,0 +1,16 @@ +name: Release Drafter + +on: + push: + branches: + - main + +jobs: + update_release_draft: + runs-on: ubuntu-latest + steps: + - uses: release-drafter/release-drafter@v5 + with: + config-name: release-draft-template.yml + env: + GITHUB_TOKEN: ${{ secrets.RELEASE_DRAFTER_TOKEN }} diff --git a/.github/workflows/update-cargo-toml-version.yml b/.github/workflows/update-cargo-toml-version.yml index d4d36fec4..80961e878 100644 --- a/.github/workflows/update-cargo-toml-version.yml +++ b/.github/workflows/update-cargo-toml-version.yml @@ -13,7 +13,6 @@ env: GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }} jobs: - update-version-cargo-toml: name: Update version in Cargo.toml files runs-on: ubuntu-latest diff --git a/.gitignore b/.gitignore index 6fc47753d..ecf90ef8f 100644 --- a/.gitignore +++ b/.gitignore @@ -8,9 +8,11 @@ /snapshots /dumps - # Snapshots ## ... large *.full.snap ## ... 
unreviewed *.snap.new + +# Fuzzcheck data for the facet indexing fuzz test +milli/fuzz/update::facet::incremental::fuzz::fuzz/ diff --git a/Cargo.lock b/Cargo.lock index 64012c10e..72fd77bab 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -289,6 +289,30 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + [[package]] name = "anyhow" version = "1.0.68" @@ -377,7 +401,7 @@ dependencies = [ "cfg-if", "libc", "miniz_oxide", - "object", + "object 0.30.1", "rustc-demangle", ] @@ -393,6 +417,25 @@ version = "1.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b645a089122eccb6111b4f81cbc1a49f5900ac4666bb93ac027feaecf15607bf" +[[package]] +name = "benchmarks" +version = "0.39.0" +dependencies = [ + "anyhow", + "bytes", + "convert_case 0.6.0", + "criterion", + "csv", + "flate2", + "milli 0.39.0", + "mimalloc", + "rand", + "rand_chacha", + "reqwest", + "roaring", + "serde_json", +] + [[package]] name = "big_s" version = "1.0.2" @@ -592,6 +635,12 @@ dependencies = [ "toml", ] +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "cc" version = "1.0.78" @@ -644,9 +693,51 @@ dependencies = [ "slice-group-by", "unicode-normalization", 
"unicode-segmentation", + "wana_kana", "whatlang", ] +[[package]] +name = "chrono" +version = "0.4.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16b0a3d9ed01224b22057780a37bb8c5dbfe1be8ba48678e7bf57ec4b385411f" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-integer", + "num-traits", + "wasm-bindgen", + "winapi", +] + +[[package]] +name = "ciborium" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c137568cc60b904a7724001b35ce2630fd00d5d84805fbb608ab89509d788f" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "346de753af073cc87b52b2083a506b38ac176a44cfb05497b622e27be899b369" + +[[package]] +name = "ciborium-ll" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "213030a2b5a4e0c0892b6652260cf6ccac84827b83a85a534e178e3906c4cf1b" +dependencies = [ + "ciborium-io", + "half", +] + [[package]] name = "cipher" version = "0.3.0" @@ -656,6 +747,21 @@ dependencies = [ "generic-array", ] +[[package]] +name = "clap" +version = "2.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" +dependencies = [ + "ansi_term", + "atty", + "bitflags", + "strsim 0.8.0", + "textwrap 0.11.0", + "unicode-width", + "vec_map", +] + [[package]] name = "clap" version = "3.2.23" @@ -668,9 +774,9 @@ dependencies = [ "clap_lex 0.2.4", "indexmap", "once_cell", - "strsim", + "strsim 0.10.0", "termcolor", - "textwrap", + "textwrap 0.16.0", ] [[package]] @@ -684,7 +790,7 @@ dependencies = [ "clap_lex 0.3.0", "is-terminal", "once_cell", - "strsim", + "strsim 0.10.0", "termcolor", ] @@ -694,7 +800,7 @@ version = "3.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ea0c8bce528c4be4da13ea6fead8965e95b6073585a2f05204bd8f4119f82a65" dependencies = [ - "heck", + "heck 0.4.0", "proc-macro-error", "proc-macro2 1.0.49", "quote 1.0.23", @@ -707,7 +813,7 @@ version = "4.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0177313f9f02afc995627906bbd8967e2be069f5261954222dac78290c2b9014" dependencies = [ - "heck", + "heck 0.4.0", "proc-macro-error", "proc-macro2 1.0.49", "quote 1.0.23", @@ -732,6 +838,61 @@ dependencies = [ "os_str_bytes", ] +[[package]] +name = "cli" +version = "0.39.0" +dependencies = [ + "bimap", + "byte-unit", + "color-eyre", + "csv", + "eyre", + "indicatif", + "milli 0.39.0", + "mimalloc", + "serde", + "serde_json", + "stderrlog", + "structopt", +] + +[[package]] +name = "codespan-reporting" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" +dependencies = [ + "termcolor", + "unicode-width", +] + +[[package]] +name = "color-eyre" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a667583cca8c4f8436db8de46ea8233c42a7d9ae424a82d338f2e4675229204" +dependencies = [ + "backtrace", + "color-spantrace", + "eyre", + "indenter", + "once_cell", + "owo-colors", + "tracing-error", +] + +[[package]] +name = "color-spantrace" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ba75b3d9449ecdccb27ecbc479fdc0b87fa2dd43d2f8298f9bf0e59aacc8dce" +dependencies = [ + "once_cell", + "owo-colors", + "tracing-core", + "tracing-error", +] + [[package]] name = "concat-arrays" version = "0.1.2" @@ -752,6 +913,7 @@ dependencies = [ "encode_unicode", "lazy_static", "libc", + "unicode-width", "windows-sys", ] @@ -832,6 +994,42 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "criterion" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e7c76e09c1aae2bc52b3d2f29e13c6572553b30c4aa1b8a49fd70de6412654cb" +dependencies = [ + "anes", + "atty", + "cast", + "ciborium", + "clap 3.2.23", + "criterion-plot", + "itertools", + "lazy_static", + "num-traits", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + [[package]] name = "critical-section" version = "1.1.1" @@ -937,6 +1135,50 @@ dependencies = [ "memchr", ] +[[package]] +name = "cxx" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d1075c37807dcf850c379432f0df05ba52cc30f279c5cfc43cc221ce7f8579" +dependencies = [ + "cc", + "cxxbridge-flags", + "cxxbridge-macro", + "link-cplusplus", +] + +[[package]] +name = "cxx-build" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5044281f61b27bc598f2f6647d480aed48d2bf52d6eb0b627d84c0361b17aa70" +dependencies = [ + "cc", + "codespan-reporting", + "once_cell", + "proc-macro2 1.0.49", + "quote 1.0.23", + "scratch", + "syn 1.0.107", +] + +[[package]] +name = "cxxbridge-flags" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61b50bc93ba22c27b0d31128d2d130a0a6b3d267ae27ef7e4fae2167dfe8781c" + +[[package]] +name = "cxxbridge-macro" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39e61fda7e62115119469c7b3591fd913ecca96fb766cfd3f2e2502ab7bc87a5" +dependencies = [ + "proc-macro2 1.0.49", + "quote 1.0.23", + "syn 1.0.107", +] + [[package]] name = "darling" version = "0.14.2" @@ -957,7 +1199,7 @@ dependencies = [ "ident_case", "proc-macro2 1.0.49", "quote 1.0.23", - "strsim", + "strsim 0.10.0", "syn 
1.0.107", ] @@ -1018,9 +1260,9 @@ dependencies = [ [[package]] name = "deserr" -version = "0.1.2" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb73133def0ebeb6f2e911a8ea3495cde53b00a5337dddc49bdb4b6c450ac8c7" +checksum = "86290491a2b5c21a1a5083da8dae831006761258fabd5617309c3eebc5f89468" dependencies = [ "deserr-internal", "serde-cs", @@ -1029,9 +1271,9 @@ dependencies = [ [[package]] name = "deserr-internal" -version = "0.1.2" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f290f1f52fbf7d0afe91e6f71f3c831ae5b223a55cc396e819c5748ba73a7bfa" +checksum = "7131de1c27581bc376a22166c9f570be91b76cb096be2f6aecf224c27bf7c49a" dependencies = [ "convert_case 0.5.0", "proc-macro2 1.0.49", @@ -1253,6 +1495,16 @@ dependencies = [ "libc", ] +[[package]] +name = "eyre" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c2b6b5a29c02cdc822728b7d7b8ae1bab3e3b05d44522770ddd49722eeac7eb" +dependencies = [ + "indenter", + "once_cell", +] + [[package]] name = "fastrand" version = "1.8.0" @@ -1316,6 +1568,15 @@ dependencies = [ "nom_locate", ] +[[package]] +name = "filter-parser" +version = "0.39.0" +dependencies = [ + "insta", + "nom", + "nom_locate", +] + [[package]] name = "flate2" version = "1.0.25" @@ -1323,6 +1584,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841" dependencies = [ "crc32fast", + "libz-sys", "miniz_oxide", ] @@ -1334,6 +1596,14 @@ dependencies = [ "serde_json", ] +[[package]] +name = "flatten-serde-json" +version = "0.39.0" +dependencies = [ + "criterion", + "serde_json", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1444,6 +1714,50 @@ dependencies = [ "slab", ] +[[package]] +name = "fuzzcheck" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ee76e8096c3fcd82ab23177edddcc9b81b72c123caab54bb1e2dc19fd09d2dec" +dependencies = [ + "ahash", + "bit-vec", + "cc", + "cfg-if", + "fastrand", + "flate2", + "fuzzcheck_common", + "fuzzcheck_mutators_derive", + "getopts", + "libc", + "md5", + "nu-ansi-term", + "object 0.27.1", + "regex-syntax", + "serde", + "serde_json", +] + +[[package]] +name = "fuzzcheck_common" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dde06f8d25b14a35d43eb2d3dbace3b9193424460b10ad4ccf1b3d542d48f06f" +dependencies = [ + "getopts", +] + +[[package]] +name = "fuzzcheck_mutators_derive" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30ce01e8bbb3e7e0758dcf907fe799f5998a54368963f766ae94b84624ba60c8" +dependencies = [ + "proc-macro2 1.0.49", + "quote 1.0.23", + "syn 1.0.107", +] + [[package]] name = "fxhash" version = "0.2.1" @@ -1469,6 +1783,15 @@ version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "36d244a08113319b5ebcabad2b8b7925732d15eec46d7e7ac3c11734f3b7a6ad" +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.8" @@ -1547,6 +1870,12 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" + [[package]] name = "hash32" version = "0.2.1" @@ -1578,6 +1907,15 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = 
"heck" version = "0.4.0" @@ -1728,6 +2066,30 @@ dependencies = [ "tokio-rustls", ] +[[package]] +name = "iana-time-zone" +version = "0.1.53" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64c122667b287044802d6ce17ee2ddf13207ed924c712de9a66a5814d5b64765" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "winapi", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +dependencies = [ + "cxx", + "cxx-build", +] + [[package]] name = "ident_case" version = "1.0.1" @@ -1744,6 +2106,12 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "indenter" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce23b50ad8242c51a442f3ff322d56b02f08852c77e4c0b4d3fd684abc89c683" + [[package]] name = "index-scheduler" version = "1.0.0" @@ -1784,6 +2152,18 @@ dependencies = [ "serde", ] +[[package]] +name = "indicatif" +version = "0.17.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cef509aa9bc73864d6756f0d34d35504af3cf0844373afe9b8669a5b8005a729" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", +] + [[package]] name = "insta" version = "1.24.1" @@ -1899,6 +2279,14 @@ dependencies = [ "serde_json", ] +[[package]] +name = "json-depth-checker" +version = "0.39.0" +dependencies = [ + "criterion", + "serde_json", +] + [[package]] name = "jsonwebtoken" version = "8.2.0" @@ -2156,6 +2544,15 @@ dependencies = [ "yada", ] +[[package]] +name = "link-cplusplus" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" +dependencies = [ + "cc", +] + [[package]] name = "linked-hash-map" version 
= "0.5.6" @@ -2402,7 +2799,7 @@ dependencies = [ "insta", "meili-snap", "memmap2", - "milli", + "milli 0.38.0", "proptest", "proptest-derive", "roaring", @@ -2454,15 +2851,15 @@ dependencies = [ "crossbeam-channel", "csv", "either", - "filter-parser", - "flatten-serde-json", + "filter-parser 0.38.0", + "flatten-serde-json 0.38.0", "fst", "fxhash", "geoutils", "grenad", "heed", "itertools", - "json-depth-checker", + "json-depth-checker 0.38.0", "levenshtein_automata", "log", "logging_timer", @@ -2485,6 +2882,57 @@ dependencies = [ "uuid 1.2.2", ] +[[package]] +name = "milli" +version = "0.39.0" +dependencies = [ + "big_s", + "bimap", + "bincode", + "bstr 1.1.0", + "byteorder", + "charabia", + "concat-arrays", + "crossbeam-channel", + "csv", + "deserr", + "either", + "filter-parser 0.39.0", + "flatten-serde-json 0.39.0", + "fst", + "fuzzcheck", + "fxhash", + "geoutils", + "grenad", + "heed", + "insta", + "itertools", + "json-depth-checker 0.39.0", + "levenshtein_automata", + "log", + "logging_timer", + "maplit", + "md5", + "memmap2", + "obkv", + "once_cell", + "ordered-float", + "rand", + "rayon", + "roaring", + "rstar", + "serde", + "serde_json", + "slice-group-by", + "smallstr", + "smallvec", + "smartstring", + "tempfile", + "thiserror", + "time", + "uuid 1.2.2", +] + [[package]] name = "mimalloc" version = "0.1.32" @@ -2572,6 +3020,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "nu-ansi-term" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e62e2187cbceeafee9fb7b5e5e182623e0628ebf430a479df4487beb8f92fd7a" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num-bigint" version = "0.4.3" @@ -2613,6 +3071,21 @@ dependencies = [ "libc", ] +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "object" +version = "0.27.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67ac1d3f9a1d3616fd9a60c8d74296f22406a238b6a72f5cc1e6f314df4ffbf9" +dependencies = [ + "memchr", +] + [[package]] name = "object" version = "0.30.1" @@ -2634,6 +3107,12 @@ version = "1.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f61fba1741ea2b3d6a1e3178721804bb716a68a6aeba1149b5d52e3d464ea66" +[[package]] +name = "oorandom" +version = "11.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" + [[package]] name = "opaque-debug" version = "0.3.0" @@ -2655,6 +3134,18 @@ version = "6.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b7820b9daea5457c9f21c69448905d723fbd21136ccf521748f23fd49e723ee" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + +[[package]] +name = "owo-colors" +version = "3.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1b04fb49957986fdce4d6ee7a65027d55d4b6d2265e5848bbb507b58ccfdb6f" + [[package]] name = "page_size" version = "0.4.2" @@ -2880,6 +3371,40 @@ dependencies = [ "dirs-next", ] +[[package]] +name = "plotters" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2538b639e642295546c50fcd545198c9d64ee2a38620a628724a3b266d5fbf97" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "193228616381fecdc1224c62e96946dfbc73ff4384fba576e052ff8c1bea8142" + +[[package]] +name = "plotters-svg" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f9a81d2759aae1dae668f783c308bc5c8ebd191ff4184aaa1b37f65a6ae5a56f" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "portable-atomic" +version = "0.3.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26f6a7b87c2e435a3241addceeeff740ff8b7e76b74c13bf9acb17fa454ea00b" + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -3311,6 +3836,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" +[[package]] +name = "scratch" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddccb15bcce173023b3fedd9436f882a0739b8dfb45e4f6b6002bee5929f61b2" + [[package]] name = "sct" version = "0.7.0" @@ -3427,6 +3958,15 @@ dependencies = [ "digest", ] +[[package]] +name = "sharded-slab" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31" +dependencies = [ + "lazy_static", +] + [[package]] name = "signal-hook-registry" version = "1.4.0" @@ -3550,12 +4090,55 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "stderrlog" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69a26bbf6de627d389164afa9783739b56746c6c72c4ed16539f4ff54170327b" +dependencies = [ + "atty", + "chrono", + "log", + "termcolor", + "thread_local", +] + +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + [[package]] name = "strsim" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" 
+[[package]] +name = "structopt" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" +dependencies = [ + "clap 2.34.0", + "lazy_static", + "structopt-derive", +] + +[[package]] +name = "structopt-derive" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" +dependencies = [ + "heck 0.3.3", + "proc-macro-error", + "proc-macro2 1.0.49", + "quote 1.0.23", + "syn 1.0.107", +] + [[package]] name = "subtle" version = "2.4.1" @@ -3663,6 +4246,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "textwrap" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] + [[package]] name = "textwrap" version = "0.16.0" @@ -3689,6 +4281,15 @@ dependencies = [ "syn 1.0.107", ] +[[package]] +name = "thread_local" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" +dependencies = [ + "once_cell", +] + [[package]] name = "time" version = "0.3.17" @@ -3716,6 +4317,16 @@ dependencies = [ "time-core", ] +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "tinyvec" version = "1.6.0" @@ -3832,6 +4443,28 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-error" +version = "0.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d686ec1c0f384b1277f097b2f279a2ecc11afe8c133c1aabf036a27cb4cd206e" +dependencies = [ + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70" +dependencies = [ + "sharded-slab", + "thread_local", + "tracing-core", ] [[package]] @@ -3888,6 +4521,12 @@ version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a" +[[package]] +name = "unicode-width" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + [[package]] name = "unicode-xid" version = "0.1.0" @@ -3948,12 +4587,24 @@ dependencies = [ "serde", ] +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + [[package]] name = "vcpkg" version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + [[package]] name = "vergen" version = "7.5.0" @@ -3996,6 +4647,17 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "wana_kana" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5770f395a1c9d6d64bee602f0a36763d7861ef5715f9d4f707cb0086f82dba54" +dependencies = [ + "fnv", + "itertools", + "lazy_static", +] + [[package]] name = "want" version = "0.3.0" diff --git a/Cargo.toml 
b/Cargo.toml index b3be0075a..908544e46 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,12 @@ members = [ "dump", "file-store", "permissive-json-pointer", + "milli", + "filter-parser", + "flatten-serde-json", + "json-depth-checker", + "benchmarks", + "cli" ] [profile.release] diff --git a/assets/logo-black.svg b/assets/logo-black.svg new file mode 100644 index 000000000..2a3fb1d89 --- /dev/null +++ b/assets/logo-black.svg @@ -0,0 +1,6 @@ + + + + + + diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 000000000..1f259516b --- /dev/null +++ b/benchmarks/.gitignore @@ -0,0 +1 @@ +benches/datasets_paths.rs diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml new file mode 100644 index 000000000..1cb63db4a --- /dev/null +++ b/benchmarks/Cargo.toml @@ -0,0 +1,48 @@ +[package] +name = "benchmarks" +version = "0.39.0" +edition = "2018" +publish = false + +[dependencies] +anyhow = "1.0.65" +csv = "1.1.6" +milli = { path = "../milli", default-features = false } +mimalloc = { version = "0.1.29", default-features = false } +serde_json = { version = "1.0.85", features = ["preserve_order"] } + +[dev-dependencies] +criterion = { version = "0.4.0", features = ["html_reports"] } +rand = "0.8.5" +rand_chacha = "0.3.1" +roaring = "0.10.1" + +[build-dependencies] +anyhow = "1.0.65" +bytes = "1.2.1" +convert_case = "0.6.0" +flate2 = "1.0.24" +reqwest = { version = "0.11.12", features = ["blocking", "rustls-tls"], default-features = false } + +[features] +default = ["milli/default"] + +[[bench]] +name = "search_songs" +harness = false + +[[bench]] +name = "search_wiki" +harness = false + +[[bench]] +name = "search_geo" +harness = false + +[[bench]] +name = "indexing" +harness = false + +[[bench]] +name = "formatting" +harness = false diff --git a/benchmarks/README.md b/benchmarks/README.md new file mode 100644 index 000000000..a0634bcb4 --- /dev/null +++ b/benchmarks/README.md @@ -0,0 +1,138 @@ +Benchmarks +========== + +## TOC + +- [Run the 
benchmarks](#run-the-benchmarks) +- [Comparison between benchmarks](#comparison-between-benchmarks) +- [Datasets](#datasets) + +## Run the benchmarks + +### On our private server + +The Meili team has self-hosted its own GitHub runner to run benchmarks on our dedicated bare metal server. + +To trigger the benchmark workflow: +- Go to the `Actions` tab of this repository. +- Select the `Benchmarks` workflow on the left. +- Click on `Run workflow` in the blue banner. +- Select the branch on which you want to run the benchmarks and select the dataset you want (default: `songs`). +- Finally, click on `Run workflow`. + +This GitHub workflow will run the benchmarks and push the `critcmp` report to a DigitalOcean Space (= S3). + +The name of the uploaded file is displayed in the workflow. + +_[More about critcmp](https://github.com/BurntSushi/critcmp)._ + +💡 To compare the just-uploaded benchmark with another one, check out the [next section](#comparison-between-benchmarks). + +### On your machine + +To run all the benchmarks (~5h): + +```bash +cargo bench +``` + +To run only the `search_songs` (~1h), `search_wiki` (~3h), `search_geo` (~20m) or `indexing` (~2h) benchmark: + +```bash +cargo bench --bench <benchmark name> +``` + +By default, the datasets will be downloaded and uncompressed automatically in the target directory.
+If you don't want to download the datasets every time you update something in the code, you can specify a custom directory with the environment variable `MILLI_BENCH_DATASETS_PATH`: + +```bash +mkdir ~/datasets +MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench search_songs # the four datasets are downloaded +touch build.rs +MILLI_BENCH_DATASETS_PATH=~/datasets cargo bench --bench search_songs # the code is compiled again but the datasets are not downloaded +``` + +## Comparison between benchmarks + +The benchmark reports we push are generated with `critcmp`. Thus, we use `critcmp` to show the result of a benchmark, or compare results between multiple benchmarks. + +We provide a script to download and display the comparison report. + +Requirements: +- `grep` +- `curl` +- [`critcmp`](https://github.com/BurntSushi/critcmp) + +List the available files in the DO Space: + +```bash +./benchmarks/scripts/list.sh +``` +```bash +songs_main_09a4321.json +songs_geosearch_24ec456.json +search_songs_main_cb45a10b.json +``` + +Run the comparison script: + +```bash +# we get the result of ONE benchmark, this gives you an idea of how much time an operation took +./benchmarks/scripts/compare.sh songs_geosearch_24ec456.json +# we compare two benchmarks +./benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json +# we compare three benchmarks +./benchmarks/scripts/compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json search_songs_main_cb45a10b.json +``` + +## Datasets + +The benchmarks use the following datasets: +- `smol-songs` +- `smol-wiki` +- `movies` +- `smol-all-countries` + +### Songs + +`smol-songs` is a subset of the [`songs.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/songs.csv.gz). 
+ +It was generated with this command: + +```bash +xsv sample --seed 42 1000000 songs.csv -o smol-songs.csv +``` + +_[Download the generated `smol-songs` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-songs.csv.gz)._ + +### Wiki + +`smol-wiki` is a subset of the [`wiki-articles.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/wiki-articles.csv.gz). + +It was generated with the following command: + +```bash +xsv sample --seed 42 500000 wiki-articles.csv -o smol-wiki-articles.csv +``` + +_[Download the `smol-wiki` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-wiki-articles.csv.gz)._ + +### Movies + +`movies` is a really small dataset we use as our example in the [getting started](https://docs.meilisearch.com/learn/getting_started/) guide. + +_[Download the `movies` dataset](https://docs.meilisearch.com/movies.json)._ + + +### All Countries + +`smol-all-countries` is a subset of the [`all-countries.csv` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/all-countries.csv.gz). +It has been converted to jsonlines and then edited so it matches our format for the `_geo` field. 
+ +It was generated with the following command: +```bash +bat all-countries.csv.gz | gunzip | xsv sample --seed 42 1000000 | csv2json-lite | sd '"latitude":"(.*?)","longitude":"(.*?)"' '"_geo": { "lat": $1, "lng": $2 }' | sd '\[|\]|,$' '' | gzip > smol-all-countries.jsonl.gz +``` + +_[Download the `smol-all-countries` dataset](https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets/smol-all-countries.jsonl.gz)._ + diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs new file mode 100644 index 000000000..3479029f4 --- /dev/null +++ b/benchmarks/benches/formatting.rs @@ -0,0 +1,67 @@ +use std::rc::Rc; + +use criterion::{criterion_group, criterion_main}; +use milli::tokenizer::TokenizerBuilder; +use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; + +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + +struct Conf<'a> { // One formatting benchmark case: a label, the text to format and the matcher builder applied to it. + name: &'a str, // Label passed to `bench_function` for this case. + text: &'a str, // Input text handed to `matching_words.build(..)` on every iteration. + matching_words: MatcherBuilder<'a, Vec>, // NOTE(review): `Vec` has no type argument here; a generic parameter looks lost in extraction, confirm against the repository. +} + +fn bench_formatting(c: &mut criterion::Criterion) { // Benchmarks `format` for every `Conf` under each `FormatOptions` value, one criterion group per option. + #[rustfmt::skip] + let confs = &[ + Conf { + name: "'the door d'", + text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. 
But Theodor said that the doors don't work."#, + matching_words: MatcherBuilder::new(MatchingWords::new(vec![ + (vec![Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap())], vec![0]), + (vec![Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap())], vec![0]), + (vec![Rc::new(MatchingWord::new("door".to_string(), 1, false).unwrap())], vec![1]), + (vec![Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap())], vec![0]), + (vec![Rc::new(MatchingWord::new("thedoor".to_string(), 1, false).unwrap())], vec![0, 1]), + (vec![Rc::new(MatchingWord::new("d".to_string(), 0, true).unwrap())], vec![2]), + (vec![Rc::new(MatchingWord::new("thedoord".to_string(), 1, true).unwrap())], vec![0, 1, 2]), + (vec![Rc::new(MatchingWord::new("doord".to_string(), 1, true).unwrap())], vec![1, 2]), + ] + ), TokenizerBuilder::default().build()), + }, + ]; + + let format_options = &[ + FormatOptions { highlight: false, crop: None }, + FormatOptions { highlight: true, crop: None }, + FormatOptions { highlight: false, crop: Some(10) }, + FormatOptions { highlight: true, crop: Some(10) }, + FormatOptions { highlight: false, crop: Some(20) }, + FormatOptions { highlight: true, crop: Some(20) }, + ]; + + for option in format_options { + let highlight = if option.highlight { "highlight" } else { "no-highlight" }; + + let name = match option.crop { + Some(size) => format!("{}-crop({})", highlight, size), + None => format!("{}-no-crop", highlight), + }; + + let mut group = c.benchmark_group(&name); + for conf in confs { + group.bench_function(conf.name, |b| { + b.iter(|| { + let mut matcher = conf.matching_words.build(conf.text); + matcher.format(option.clone()); + }) + }); + } + group.finish(); + } +} + +criterion_group!(benches, bench_formatting); +criterion_main!(benches); diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs 
new file mode 100644 index 000000000..d567b3da1 --- /dev/null +++ b/benchmarks/benches/indexing.rs @@ -0,0 +1,1380 @@ +mod datasets_paths; +mod utils; + +use std::fs::{create_dir_all, remove_dir_all}; +use std::path::Path; + +use criterion::{criterion_group, criterion_main, Criterion}; +use milli::heed::{EnvOpenOptions, RwTxn}; +use milli::update::{ + DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, +}; +use milli::Index; +use rand::seq::SliceRandom; +use rand_chacha::rand_core::SeedableRng; +use roaring::RoaringBitmap; + +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + +const BENCHMARK_ITERATION: usize = 10; + +fn setup_dir(path: impl AsRef) { + match remove_dir_all(path.as_ref()) { + Ok(_) => (), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), + Err(e) => panic!("{}", e), + } + create_dir_all(path).unwrap(); +} + +fn setup_index() -> Index { + let path = "benches.mmdb"; + setup_dir(&path); + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + options.max_readers(10); + Index::new(options, path).unwrap() +} + +fn setup_settings<'t>( + wtxn: &mut RwTxn<'t, '_>, + index: &'t Index, + primary_key: &str, + searchable_fields: &[&str], + filterable_fields: &[&str], + sortable_fields: &[&str], +) { + let config = IndexerConfig::default(); + let mut builder = Settings::new(wtxn, index, &config); + + builder.set_primary_key(primary_key.to_owned()); + + let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_searchable_fields(searchable_fields); + + let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_filterable_fields(filterable_fields); + + let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect(); + builder.set_sortable_fields(sortable_fields); + + builder.execute(|_| (), || false).unwrap(); +} + +fn setup_index_with_settings<'t>( + primary_key: 
&str, + searchable_fields: &[&str], + filterable_fields: &[&str], + sortable_fields: &[&str], +) -> milli::Index { + let index = setup_index(); + let mut wtxn = index.write_txn().unwrap(); + setup_settings( + &mut wtxn, + &index, + primary_key, + searchable_fields, + filterable_fields, + sortable_fields, + ); + wtxn.commit().unwrap(); + + index +} + +fn choose_document_ids_from_index_batched( + index: &Index, + count: usize, + batch_size: usize, +) -> Vec { + let rtxn = index.read_txn().unwrap(); + // create batch of document ids to delete + let mut rng = rand_chacha::ChaCha8Rng::seed_from_u64(7700); + let document_ids: Vec<_> = index.documents_ids(&rtxn).unwrap().into_iter().collect(); + let document_ids_to_delete: Vec<_> = + document_ids.choose_multiple(&mut rng, count).map(Clone::clone).collect(); + + document_ids_to_delete + .chunks(batch_size) + .map(|c| { + let mut batch = RoaringBitmap::new(); + for id in c { + batch.insert(*id); + } + + batch + }) + .collect() +} + +fn indexing_songs_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Indexing songs with default settings", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "album", "artist"]; + let filterable_fields = + ["released-timestamp", "duration-float", "genre", "country", "artist"]; + let sortable_fields = []; + + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); 
+ user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn reindexing_songs_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Reindexing songs with default settings", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "album", "artist"]; + let filterable_fields = + ["released-timestamp", "duration-float", "genre", "country", "artist"]; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn deleting_songs_in_batches_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + 
group.bench_function("-songs-delete-facetedString-facetedNumber-searchable-", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "album", "artist"]; + let filterable_fields = + ["released-timestamp", "duration-float", "genre", "country", "artist"]; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + // We index the whole `SMOL_SONGS` dataset in the setup part + // as we don't care about the time it takes. + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let indexing_config = IndexDocumentsConfig::default(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let count = 1250; + let batch_size = 250; + let document_ids_to_delete = + choose_document_ids_from_index_batched(&index, count, batch_size); + + (index, document_ids_to_delete) + }, + move |(index, document_ids_to_delete)| { + let mut wtxn = index.write_txn().unwrap(); + + for ids in document_ids_to_delete { + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.delete_documents(&ids); + builder.execute().unwrap(); + } + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn indexing_songs_in_three_batches_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Indexing songs in three batches with default settings", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "album", "artist"]; + let filterable_fields = + 
["released-timestamp", "duration-float", "genre", "country", "artist"]; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it takes. + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let indexing_config = IndexDocumentsConfig::default(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + let indexing_config = IndexDocumentsConfig::default(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + 
group.bench_function("Indexing songs without faceted numbers", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "album", "artist"]; + let filterable_fields = ["genre", "country", "artist"]; + let sortable_fields = []; + + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn indexing_songs_without_faceted_fields(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Indexing songs without any facets", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "album", "artist"]; + let filterable_fields = []; + let sortable_fields = []; + + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + 
wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn indexing_wiki(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Indexing wiki", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "body"]; + let filterable_fields = []; + let sortable_fields = []; + + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn reindexing_wiki(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Reindexing wiki", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "body"]; + let filterable_fields = []; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = 
utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn deleting_wiki_in_batches_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("-wiki-delete-searchable-", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "body"]; + let filterable_fields = []; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + // We index the whole `SMOL_WIKI_ARTICLES` dataset in the setup part + // as we don't care about the time it takes. 
+ let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let count = 1250; + let batch_size = 250; + let document_ids_to_delete = + choose_document_ids_from_index_batched(&index, count, batch_size); + + (index, document_ids_to_delete) + }, + move |(index, document_ids_to_delete)| { + let mut wtxn = index.write_txn().unwrap(); + + for ids in document_ids_to_delete { + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.delete_documents(&ids); + builder.execute().unwrap(); + } + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn indexing_wiki_in_three_batches(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Indexing wiki in three batches", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "body"]; + let filterable_fields = []; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + let mut wtxn = index.write_txn().unwrap(); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it takes. 
+ let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + let documents = + utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = + utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + let indexing_config = + IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = + utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn indexing_movies_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Indexing movies with default settings", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "overview"]; + let filterable_fields = ["release_date", "genres"]; + let 
sortable_fields = []; + + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::MOVIES, "json"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn reindexing_movies_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Reindexing movies with default settings", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "overview"]; + let filterable_fields = ["release_date", "genres"]; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::MOVIES, "json"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + 
indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::MOVIES, "json"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn deleting_movies_in_batches_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("-movies-delete-facetedString-facetedNumber-searchable-", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "overview"]; + let filterable_fields = ["release_date", "genres"]; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it takes. 
+ let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let indexing_config = IndexDocumentsConfig::default(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + let documents = utils::documents_from(datasets_paths::MOVIES, "json"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let count = 1250; + let batch_size = 250; + let document_ids_to_delete = + choose_document_ids_from_index_batched(&index, count, batch_size); + + (index, document_ids_to_delete) + }, + move |(index, document_ids_to_delete)| { + let mut wtxn = index.write_txn().unwrap(); + + for ids in document_ids_to_delete { + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.delete_documents(&ids); + builder.execute().unwrap(); + } + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn indexing_movies_in_three_batches(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Indexing movies in three batches", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = ["title", "overview"]; + let filterable_fields = ["release_date", "genres"]; + let sortable_fields = []; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + let mut wtxn = index.write_txn().unwrap(); + // We index only one half of the dataset in the setup part + // as we don't care about the time it takes. 
+ let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + let indexing_config = IndexDocumentsConfig::default(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn indexing_nested_movies_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Indexing nested movies with default settings", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = [ + "title", + "overview", + "provider_names", + "genres", + "crew.name", + "cast.character", + "cast.name", + ]; + let filterable_fields = [ + "popularity", + "release_date", + "runtime", + "vote_average", + "external_ids", + "keywords", + 
"providers.buy.name", + "providers.rent.name", + "providers.flatrate.name", + "provider_names", + "genres", + "crew.name", + "cast.character", + "cast.name", + ]; + let sortable_fields = ["popularity", "runtime", "vote_average", "release_date"]; + + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn deleting_nested_movies_in_batches_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("-movies-delete-facetedString-facetedNumber-searchable-nested-", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = [ + "title", + "overview", + "provider_names", + "genres", + "crew.name", + "cast.character", + "cast.name", + ]; + let filterable_fields = [ + "popularity", + "release_date", + "runtime", + "vote_average", + "external_ids", + "keywords", + "providers.buy.name", + "providers.rent.name", + "providers.flatrate.name", + "provider_names", + "genres", + "crew.name", + "cast.character", + "cast.name", + ]; + let sortable_fields = ["popularity", "runtime", "vote_average", "release_date"]; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + // We index only one half of the dataset in the setup part + // as we don't care about the 
time it takes. + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let indexing_config = IndexDocumentsConfig::default(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let count = 1250; + let batch_size = 250; + let document_ids_to_delete = + choose_document_ids_from_index_batched(&index, count, batch_size); + + (index, document_ids_to_delete) + }, + move |(index, document_ids_to_delete)| { + let mut wtxn = index.write_txn().unwrap(); + + for ids in document_ids_to_delete { + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.delete_documents(&ids); + builder.execute().unwrap(); + } + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Indexing nested movies without any facets", |b| { + b.iter_with_setup( + move || { + let primary_key = "id"; + let searchable_fields = [ + "title", + "overview", + "provider_names", + "genres", + "crew.name", + "cast.character", + "cast.name", + ]; + let filterable_fields = []; + let sortable_fields = []; + + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = 
utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn indexing_geo(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Indexing geo_point", |b| { + b.iter_with_setup( + move || { + let primary_key = "geonameid"; + let searchable_fields = ["name", "alternatenames", "elevation"]; + let filterable_fields = ["_geo", "population", "elevation"]; + let sortable_fields = ["_geo", "population", "elevation"]; + + setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ) + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn reindexing_geo(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("Reindexing geo_point", |b| { + b.iter_with_setup( + move || { + let primary_key = "geonameid"; + let searchable_fields = ["name", "alternatenames", "elevation"]; + let filterable_fields = ["_geo", "population", "elevation"]; + let sortable_fields = ["_geo", "population", "elevation"]; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + let 
config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index + }, + move |index| { + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + + let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +fn deleting_geo_in_batches_default(c: &mut Criterion) { + let mut group = c.benchmark_group("indexing"); + group.sample_size(BENCHMARK_ITERATION); + group.bench_function("-geo-delete-facetedNumber-facetedGeo-searchable-", |b| { + b.iter_with_setup( + move || { + let primary_key = "geonameid"; + let searchable_fields = ["name", "alternatenames", "elevation"]; + let filterable_fields = ["_geo", "population", "elevation"]; + let sortable_fields = ["_geo", "population", "elevation"]; + + let index = setup_index_with_settings( + &primary_key, + &searchable_fields, + &filterable_fields, + &sortable_fields, + ); + + // We index only one half of the dataset in the setup part + // as we don't care about the time it takes. 
+ let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let indexing_config = IndexDocumentsConfig::default(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &config, + indexing_config, + |_| (), + || false, + ) + .unwrap(); + let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let count = 1250; + let batch_size = 250; + let document_ids_to_delete = + choose_document_ids_from_index_batched(&index, count, batch_size); + + (index, document_ids_to_delete) + }, + move |(index, document_ids_to_delete)| { + let mut wtxn = index.write_txn().unwrap(); + + for ids in document_ids_to_delete { + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.delete_documents(&ids); + builder.execute().unwrap(); + } + + wtxn.commit().unwrap(); + + index.prepare_for_closing().wait(); + }, + ) + }); +} + +criterion_group!( + benches, + indexing_songs_default, + reindexing_songs_default, + deleting_songs_in_batches_default, + indexing_songs_without_faceted_numbers, + indexing_songs_without_faceted_fields, + indexing_songs_in_three_batches_default, + indexing_wiki, + reindexing_wiki, + deleting_wiki_in_batches_default, + indexing_wiki_in_three_batches, + indexing_movies_default, + reindexing_movies_default, + deleting_movies_in_batches_default, + indexing_movies_in_three_batches, + indexing_nested_movies_default, + deleting_nested_movies_in_batches_default, + indexing_nested_movies_without_faceted_fields, + indexing_geo, + reindexing_geo, + deleting_geo_in_batches_default +); +criterion_main!(benches); diff --git a/benchmarks/benches/search_geo.rs b/benchmarks/benches/search_geo.rs new file mode 100644 index 000000000..faea4e3e0 --- /dev/null +++ b/benchmarks/benches/search_geo.rs @@ -0,0 +1,122 @@ +mod datasets_paths; +mod utils; + 
+use criterion::{criterion_group, criterion_main};
+use milli::update::Settings;
+use utils::Conf;
+
+#[global_allocator]
+static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
+fn base_conf(builder: &mut Settings) {
+    let displayed_fields =
+        ["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"]
+            .iter()
+            .map(|s| s.to_string())
+            .collect();
+    builder.set_displayed_fields(displayed_fields);
+
+    let searchable_fields =
+        ["name", "alternatenames", "elevation"].iter().map(|s| s.to_string()).collect();
+    builder.set_searchable_fields(searchable_fields);
+
+    let filterable_fields =
+        ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect();
+    builder.set_filterable_fields(filterable_fields);
+
+    let sortable_fields =
+        ["_geo", "population", "elevation"].iter().map(|s| s.to_string()).collect();
+    builder.set_sortable_fields(sortable_fields);
+}
+
+#[rustfmt::skip]
+const BASE_CONF: Conf = Conf {
+    dataset: datasets_paths::SMOL_ALL_COUNTRIES,
+    dataset_format: "jsonl",
+    queries: &[
+        "",
+    ],
+    configure: base_conf,
+    primary_key: Some("geonameid"),
+    ..Conf::BASE
+};
+
+fn bench_geo(c: &mut criterion::Criterion) {
+    #[rustfmt::skip]
+    let confs = &[
+        // A basic placeholder with no geo
+        utils::Conf {
+            group_name: "placeholder with no geo",
+            ..BASE_CONF
+        },
+        // Medium agglomeration: probably the most common use case
+        utils::Conf {
+            group_name: "asc sort from Lille",
+            sort: Some(vec!["_geoPoint(50.62999333378238, 3.086269263384099):asc"]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "desc sort from Lille",
+            sort: Some(vec!["_geoPoint(50.62999333378238, 3.086269263384099):desc"]),
+            ..BASE_CONF
+        },
+        // Big agglomeration: a lot of documents close to our point
+        utils::Conf {
+            group_name: "asc sort from Tokyo",
+            sort: Some(vec!["_geoPoint(35.749512532692144, 139.61664952543356):asc"]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "desc sort from Tokyo",
+            sort: Some(vec!["_geoPoint(35.749512532692144, 139.61664952543356):desc"]),
+            ..BASE_CONF
+        },
+        // The furthest point from any civilization
+        utils::Conf {
+            group_name: "asc sort from Point Nemo",
+            sort: Some(vec!["_geoPoint(-48.87561645055408, -123.39275749319793):asc"]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "desc sort from Point Nemo",
+            sort: Some(vec!["_geoPoint(-48.87561645055408, -123.39275749319793):desc"]),
+            ..BASE_CONF
+        },
+        // Filters
+        utils::Conf {
+            group_name: "filter of 100km from Lille",
+            filter: Some("_geoRadius(50.62999333378238, 3.086269263384099, 100000)"),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "filter of 1km from Lille",
+            filter: Some("_geoRadius(50.62999333378238, 3.086269263384099, 1000)"),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "filter of 100km from Tokyo",
+            filter: Some("_geoRadius(35.749512532692144, 139.61664952543356, 100000)"),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "filter of 1km from Tokyo",
+            filter: Some("_geoRadius(35.749512532692144, 139.61664952543356, 1000)"),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "filter of 100km from Point Nemo",
+            filter: Some("_geoRadius(-48.87561645055408, -123.39275749319793, 100000)"),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "filter of 1km from Point Nemo",
+            filter: Some("_geoRadius(-48.87561645055408, -123.39275749319793, 1000)"),
+            ..BASE_CONF
+        },
+    ];
+
+    utils::run_benches(c, confs);
+}
+
+criterion_group!(benches, bench_geo);
+criterion_main!(benches);
diff --git a/benchmarks/benches/search_songs.rs b/benchmarks/benches/search_songs.rs
new file mode 100644
index 000000000..a1245528f
--- /dev/null
+++ b/benchmarks/benches/search_songs.rs
@@ -0,0 +1,196 @@
+mod datasets_paths;
+mod utils;
+
+use criterion::{criterion_group, criterion_main};
+use milli::update::Settings;
+use utils::Conf;
+
+#[global_allocator]
+static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
+fn base_conf(builder: &mut Settings) {
+    let displayed_fields =
+        ["id", "title", "album", "artist", "genre", "country", "released", "duration"]
+            .iter()
+            .map(|s| s.to_string())
+            .collect();
+    builder.set_displayed_fields(displayed_fields);
+
+    let searchable_fields = ["title", "album", "artist"].iter().map(|s| s.to_string()).collect();
+    builder.set_searchable_fields(searchable_fields);
+
+    let faceted_fields = ["released-timestamp", "duration-float", "genre", "country", "artist"]
+        .iter()
+        .map(|s| s.to_string())
+        .collect();
+    builder.set_filterable_fields(faceted_fields);
+}
+
+#[rustfmt::skip]
+const BASE_CONF: Conf = Conf {
+    dataset: datasets_paths::SMOL_SONGS,
+    queries: &[
+        "john ",             // 9097
+        "david ",            // 4794
+        "charles ",          // 1957
+        "david bowie ",      // 1200
+        "michael jackson ",  // 600
+        "thelonious monk ",  // 303
+        "charles mingus ",   // 142
+        "marcus miller ",    // 60
+        "tamo ",             // 13
+        "Notstandskomitee ", // 4
+    ],
+    configure: base_conf,
+    primary_key: Some("id"),
+    ..Conf::BASE
+};
+
+fn bench_songs(c: &mut criterion::Criterion) {
+    let default_criterion: Vec<String> =
+        milli::default_criteria().iter().map(|criteria| criteria.to_string()).collect();
+    let default_criterion = default_criterion.iter().map(|s| s.as_str());
+    let asc_default: Vec<&str> =
+        std::iter::once("released-timestamp:asc").chain(default_criterion.clone()).collect();
+    let desc_default: Vec<&str> =
+        std::iter::once("released-timestamp:desc").chain(default_criterion.clone()).collect();
+
+    let basic_with_quote: Vec<String> = BASE_CONF
+        .queries
+        .iter()
+        .map(|s| {
+            s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::<Vec<String>>().join(" ")
+        })
+        .collect();
+    let basic_with_quote: &[&str] =
+        &basic_with_quote.iter().map(|s| s.as_str()).collect::<Vec<&str>>();
+
+    #[rustfmt::skip]
+    let confs = &[
+        /* first we bench each criterion alone */
+        utils::Conf {
+            group_name: "proximity",
+            queries: &[
+                "black saint sinner lady ",
+                "les dangeureuses 1960 ",
+                "The Disneyland Sing-Along Chorus ",
+                "Under Great Northern Lights ",
+                "7000 Danses Un Jour Dans Notre Vie ",
+            ],
+            criterion: Some(&["proximity"]),
+            optional_words: false,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "typo",
+            queries: &[
+                "mongus ",
+                "thelonius monk ",
+                "Disnaylande ",
+                "the white striper ",
+                "indochie ",
+                "indochien ",
+                "klub des loopers ",
+                "fear of the duck ",
+                "michel depech ",
+                "stromal ",
+                "dire straights ",
+                "Arethla Franklin ",
+            ],
+            criterion: Some(&["typo"]),
+            optional_words: false,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "words",
+            queries: &[
+                "the black saint and the sinner lady and the good doggo ", // four words to pop
+                "les liaisons dangeureuses 1793 ",                         // one word to pop
+                "The Disneyland Children's Sing-Alone song ",              // two words to pop
+                "seven nation mummy ",                                     // one word to pop
+                "7000 Danses / Le Baiser / je me trompe de mots ",         // four words to pop
+                "Bring Your Daughter To The Slaughter but now this is not part of the title ", // nine words to pop
+                "whathavenotnsuchforth and a good amount of words to pop to match the first one ", // 13
+            ],
+            criterion: Some(&["words"]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "asc",
+            criterion: Some(&["released-timestamp:asc"]), // ascending, to match the group name
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "desc",
+            criterion: Some(&["released-timestamp:desc"]),
+            ..BASE_CONF
+        },
+
+        /* then we bench the asc and desc criterion on top of the default criterion */
+        utils::Conf {
+            group_name: "asc + default",
+            criterion: Some(&asc_default[..]),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "desc + default",
+            criterion: Some(&desc_default[..]),
+            ..BASE_CONF
+        },
+
+        /* we bench the filters with the default request */
+        utils::Conf {
+            group_name: "basic filter: <=",
+            filter: Some("released-timestamp <= 946728000"), // year 2000
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic filter: TO",
+            filter: Some("released-timestamp 946728000 TO 1262347200"), // year 2000 to 2010
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "big filter",
+            filter: Some("released-timestamp != 1262347200 AND (NOT (released-timestamp = 946728000)) AND (duration-float = 1 OR (duration-float 1.1 TO 1.5 AND released-timestamp > 315576000))"),
+            ..BASE_CONF
+        },
+
+        /* then we bench some global / normal search with all the default criterion in the default
+         * order */
+        utils::Conf {
+            group_name: "basic placeholder",
+            queries: &[""],
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic without quote",
+            queries: &BASE_CONF
+                .queries
+                .iter()
+                .map(|s| s.trim()) // we remove the space at the end of each request
+                .collect::<Vec<_>>(),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic with quote",
+            queries: basic_with_quote,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "prefix search",
+            queries: &[
+                "s", // 500k+ results
+                "a", //
+                "b", //
+                "i", //
+                "x", // only 7k results
+            ],
+            ..BASE_CONF
+        },
+    ];
+
+    utils::run_benches(c, confs);
+}
+
+criterion_group!(benches, bench_songs);
+criterion_main!(benches);
diff --git a/benchmarks/benches/search_wiki.rs b/benchmarks/benches/search_wiki.rs
new file mode 100644
index 000000000..b792c2645
--- /dev/null
+++ b/benchmarks/benches/search_wiki.rs
@@ -0,0 +1,129 @@
+mod datasets_paths;
+mod utils;
+
+use criterion::{criterion_group, criterion_main};
+use milli::update::Settings;
+use utils::Conf;
+
+#[global_allocator]
+static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
+fn base_conf(builder: &mut Settings) {
+    let displayed_fields = ["title", "body", "url"].iter().map(|s| s.to_string()).collect();
+    builder.set_displayed_fields(displayed_fields);
+
+    let searchable_fields = ["title", "body"].iter().map(|s| s.to_string()).collect();
+    builder.set_searchable_fields(searchable_fields);
+}
+
+#[rustfmt::skip]
+const BASE_CONF: Conf = Conf {
+    dataset: datasets_paths::SMOL_WIKI_ARTICLES,
+    queries: &[
+        "mingus ",        // 46 candidates
+        "miles davis ",   // 159
+        "rock and roll ", // 1007
+        "machine ",       // 3448
+        "spain ",         // 7002
+        "japan ",         // 10.593
+        "france ",        // 17.616
+        "film ",          // 24.959
+    ],
+    configure: base_conf,
+    ..Conf::BASE
+};
+
+fn bench_songs(c: &mut criterion::Criterion) {
+    let basic_with_quote: Vec<String> = BASE_CONF
+        .queries
+        .iter()
+        .map(|s| {
+            s.trim().split(' ').map(|s| format!(r#""{}""#, s)).collect::<Vec<String>>().join(" ")
+        })
+        .collect();
+    let basic_with_quote: &[&str] =
+        &basic_with_quote.iter().map(|s| s.as_str()).collect::<Vec<&str>>();
+
+    #[rustfmt::skip]
+    let confs = &[
+        /* first we bench each criterion alone */
+        utils::Conf {
+            group_name: "proximity",
+            queries: &[
+                "herald sings ",
+                "april paris ",
+                "tea two ",
+                "diesel engine ",
+            ],
+            criterion: Some(&["proximity"]),
+            optional_words: false,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "typo",
+            queries: &[
+                "migrosoft ",
+                "linax ",
+                "Disnaylande ",
+                "phytogropher ",
+                "nympalidea ",
+                "aritmetric ",
+                "the fronce ",
+                "sisan ",
+            ],
+            criterion: Some(&["typo"]),
+            optional_words: false,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "words",
+            queries: &[
+                "the black saint and the sinner lady and the good doggo ", // four words to pop, 27 results
+                "Kameya Tokujirō mingus monk ",                            // two words to pop, 55
+                "Ulrich Hensel meilisearch milli ",                        // two words to pop, 306
+                "Idaho Bellevue pizza ",                                   // one word to pop, 800
+                "Abraham machin ",                                         // one word to pop, 1141
+            ],
+            criterion: Some(&["words"]),
+            ..BASE_CONF
+        },
+        /* then we bench some global / normal search with all the default criterion in the default
+         * order */
+        utils::Conf {
+            group_name: "basic placeholder",
+            queries: &[""],
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic without quote",
+            queries: &BASE_CONF
+                .queries
+                .iter()
+                .map(|s| s.trim()) // we remove the space at the end of each request
+                .collect::<Vec<_>>(),
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "basic with quote",
+            queries: basic_with_quote,
+            ..BASE_CONF
+        },
+        utils::Conf {
+            group_name: "prefix search",
+            queries: &[
+                "t", // 453k results
+                "c", // 405k
+                "g", // 318k
+                "j", // 227k
+                "q", // 71k
+                "x", // 17k
+            ],
+            ..BASE_CONF
+        },
+    ];
+
+    utils::run_benches(c, confs);
+}
+
+criterion_group!(benches, bench_songs); +criterion_main!(benches); diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs new file mode 100644 index 000000000..470d2030d --- /dev/null +++ b/benchmarks/benches/utils.rs @@ -0,0 +1,256 @@ +#![allow(dead_code)] + +use std::fs::{create_dir_all, remove_dir_all, File}; +use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; +use std::num::ParseFloatError; +use std::path::Path; +use std::str::FromStr; + +use criterion::BenchmarkId; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; +use milli::heed::EnvOpenOptions; +use milli::update::{ + IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, +}; +use milli::{Criterion, Filter, Index, Object, TermsMatchingStrategy}; +use serde_json::Value; + +pub struct Conf<'a> { + /// where we are going to create our database.mmdb directory + /// each benchmark will first try to delete it and then recreate it + pub database_name: &'a str, + /// the dataset to be used, it must be an uncompressed csv + pub dataset: &'a str, + /// The format of the dataset + pub dataset_format: &'a str, + pub group_name: &'a str, + pub queries: &'a [&'a str], + /// here you can change which criterion are used and in which order. 
+ /// - if you specify something all the base configuration will be thrown out + /// - if you don't specify anything (None) the default configuration will be kept + pub criterion: Option<&'a [&'a str]>, + /// the last chance to configure your database as you want + pub configure: fn(&mut Settings), + pub filter: Option<&'a str>, + pub sort: Option>, + /// enable or disable the optional words on the query + pub optional_words: bool, + /// primary key, if there is None we'll auto-generate docids for every documents + pub primary_key: Option<&'a str>, +} + +impl Conf<'_> { + pub const BASE: Self = Conf { + database_name: "benches.mmdb", + dataset_format: "csv", + dataset: "", + group_name: "", + queries: &[], + criterion: None, + configure: |_| (), + filter: None, + sort: None, + optional_words: true, + primary_key: None, + }; +} + +pub fn base_setup(conf: &Conf) -> Index { + match remove_dir_all(&conf.database_name) { + Ok(_) => (), + Err(e) if e.kind() == std::io::ErrorKind::NotFound => (), + Err(e) => panic!("{}", e), + } + create_dir_all(&conf.database_name).unwrap(); + + let mut options = EnvOpenOptions::new(); + options.map_size(100 * 1024 * 1024 * 1024); // 100 GB + options.max_readers(10); + let index = Index::new(options, conf.database_name).unwrap(); + + let config = IndexerConfig::default(); + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + if let Some(primary_key) = conf.primary_key { + builder.set_primary_key(primary_key.to_string()); + } + + if let Some(criterion) = conf.criterion { + builder.reset_filterable_fields(); + builder.reset_criteria(); + builder.reset_stop_words(); + + let criterion = criterion.iter().map(|s| Criterion::from_str(s).unwrap()).collect(); + builder.set_criteria(criterion); + } + + (conf.configure)(&mut builder); + + builder.execute(|_| (), || false).unwrap(); + wtxn.commit().unwrap(); + + let config = IndexerConfig::default(); + let mut wtxn = 
index.write_txn().unwrap(); + let indexing_config = IndexDocumentsConfig { + autogenerate_docids: conf.primary_key.is_none(), + update_method: IndexDocumentsMethod::ReplaceDocuments, + ..Default::default() + }; + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); + let documents = documents_from(conf.dataset, conf.dataset_format); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + index +} + +pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { + for conf in confs { + let index = base_setup(conf); + + let file_name = Path::new(conf.dataset).file_name().and_then(|f| f.to_str()).unwrap(); + let name = format!("{}: {}", file_name, conf.group_name); + let mut group = c.benchmark_group(&name); + + for &query in conf.queries { + group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { + b.iter(|| { + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + search.query(query).terms_matching_strategy(TermsMatchingStrategy::default()); + if let Some(filter) = conf.filter { + let filter = Filter::from_str(filter).unwrap().unwrap(); + search.filter(filter); + } + if let Some(sort) = &conf.sort { + let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect(); + search.sort_criteria(sort); + } + let _ids = search.execute().unwrap(); + }); + }); + } + group.finish(); + + index.prepare_for_closing().wait(); + } +} + +pub fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader { + let reader = + File::open(filename).expect(&format!("could not find the dataset in: {}", filename)); + let reader = BufReader::new(reader); + let documents = match filetype { + "csv" => documents_from_csv(reader).unwrap(), + "json" => documents_from_json(reader).unwrap(), + "jsonl" => documents_from_jsonl(reader).unwrap(), + otherwise => panic!("invalid update 
format {:?}", otherwise), + }; + DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap() +} + +fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { + let object = result?; + documents.append_json_object(&object)?; + } + + documents.into_inner().map_err(Into::into) +} + +fn documents_from_json(reader: impl BufRead) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + + documents.append_json_array(reader)?; + + documents.into_inner().map_err(Into::into) +} + +fn documents_from_csv(reader: impl BufRead) -> anyhow::Result> { + let csv = csv::Reader::from_reader(reader); + + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) +} + +enum AllowedType { + String, + Number, +} + +fn parse_csv_header(header: &str) -> (String, AllowedType) { + // if there are several separators we only split on the last one. + match header.rsplit_once(':') { + Some((field_name, field_type)) => match field_type { + "string" => (field_name.to_string(), AllowedType::String), + "number" => (field_name.to_string(), AllowedType::Number), + // we may return an error in this case. 
+ _otherwise => (header.to_string(), AllowedType::String), + }, + None => (header.to_string(), AllowedType::String), + } +} + +struct CSVDocumentDeserializer +where + R: Read, +{ + documents: csv::StringRecordsIntoIter, + headers: Vec<(String, AllowedType)>, +} + +impl CSVDocumentDeserializer { + fn from_reader(reader: R) -> io::Result { + let mut records = csv::Reader::from_reader(reader); + + let headers = records.headers()?.into_iter().map(parse_csv_header).collect(); + + Ok(Self { documents: records.into_records(), headers }) + } +} + +impl Iterator for CSVDocumentDeserializer { + type Item = anyhow::Result; + + fn next(&mut self) -> Option { + let csv_document = self.documents.next()?; + + match csv_document { + Ok(csv_document) => { + let mut document = Object::new(); + + for ((field_name, field_type), value) in + self.headers.iter().zip(csv_document.into_iter()) + { + let parsed_value: Result = match field_type { + AllowedType::Number => { + value.parse::().map(Value::from).map_err(Into::into) + } + AllowedType::String => Ok(Value::String(value.to_string())), + }; + + match parsed_value { + Ok(value) => drop(document.insert(field_name.to_string(), value)), + Err(_e) => { + return Some(Err(anyhow::anyhow!( + "Value '{}' is not a valid number", + value + ))) + } + } + } + + Some(Ok(document)) + } + Err(e) => Some(Err(anyhow::anyhow!("Error parsing csv document: {}", e))), + } + } +} diff --git a/benchmarks/build.rs b/benchmarks/build.rs new file mode 100644 index 000000000..d7b99db37 --- /dev/null +++ b/benchmarks/build.rs @@ -0,0 +1,115 @@ +use std::fs::File; +use std::io::{Cursor, Read, Seek, Write}; +use std::path::{Path, PathBuf}; +use std::{env, fs}; + +use bytes::Bytes; +use convert_case::{Case, Casing}; +use flate2::read::GzDecoder; +use reqwest::IntoUrl; + +const BASE_URL: &str = "https://milli-benchmarks.fra1.digitaloceanspaces.com/datasets"; + +const DATASET_SONGS: (&str, &str) = ("smol-songs", "csv"); +const DATASET_SONGS_1_2: (&str, &str) = 
("smol-songs-1_2", "csv"); +const DATASET_SONGS_3_4: (&str, &str) = ("smol-songs-3_4", "csv"); +const DATASET_SONGS_4_4: (&str, &str) = ("smol-songs-4_4", "csv"); +const DATASET_WIKI: (&str, &str) = ("smol-wiki-articles", "csv"); +const DATASET_WIKI_1_2: (&str, &str) = ("smol-wiki-articles-1_2", "csv"); +const DATASET_WIKI_3_4: (&str, &str) = ("smol-wiki-articles-3_4", "csv"); +const DATASET_WIKI_4_4: (&str, &str) = ("smol-wiki-articles-4_4", "csv"); +const DATASET_MOVIES: (&str, &str) = ("movies", "json"); +const DATASET_MOVIES_1_2: (&str, &str) = ("movies-1_2", "json"); +const DATASET_MOVIES_3_4: (&str, &str) = ("movies-3_4", "json"); +const DATASET_MOVIES_4_4: (&str, &str) = ("movies-4_4", "json"); +const DATASET_NESTED_MOVIES: (&str, &str) = ("nested_movies", "json"); +const DATASET_GEO: (&str, &str) = ("smol-all-countries", "jsonl"); + +const ALL_DATASETS: &[(&str, &str)] = &[ + DATASET_SONGS, + DATASET_SONGS_1_2, + DATASET_SONGS_3_4, + DATASET_SONGS_4_4, + DATASET_WIKI, + DATASET_WIKI_1_2, + DATASET_WIKI_3_4, + DATASET_WIKI_4_4, + DATASET_MOVIES, + DATASET_MOVIES_1_2, + DATASET_MOVIES_3_4, + DATASET_MOVIES_4_4, + DATASET_NESTED_MOVIES, + DATASET_GEO, +]; + +/// The name of the environment variable used to select the path +/// of the directory containing the datasets +const BASE_DATASETS_PATH_KEY: &str = "MILLI_BENCH_DATASETS_PATH"; + +fn main() -> anyhow::Result<()> { + let out_dir = PathBuf::from(env::var(BASE_DATASETS_PATH_KEY).unwrap_or(env::var("OUT_DIR")?)); + + let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches"); + let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?; + write!( + manifest_paths_file, + r#"//! This file is generated by the build script. +//! Do not modify by hand, use the build.rs file. 
+#![allow(dead_code)] +"# + )?; + writeln!(manifest_paths_file)?; + + for (dataset, extension) in ALL_DATASETS { + let out_path = out_dir.join(dataset); + let out_file = out_path.with_extension(extension); + + writeln!( + &mut manifest_paths_file, + r#"pub const {}: &str = {:?};"#, + dataset.to_case(Case::ScreamingSnake), + out_file.display(), + )?; + + if out_file.exists() { + eprintln!( + "The dataset {} already exists on the file system and will not be downloaded again", + out_path.display(), + ); + continue; + } + let url = format!("{}/{}.{}.gz", BASE_URL, dataset, extension); + eprintln!("downloading: {}", url); + let bytes = retry(|| download_dataset(url.clone()), 10)?; + eprintln!("{} downloaded successfully", url); + eprintln!("uncompressing in {}", out_file.display()); + uncompress_in_file(bytes, &out_file)?; + } + + Ok(()) +} + +fn retry(fun: impl Fn() -> Result, times: usize) -> Result { + for _ in 0..times { + if let ok @ Ok(_) = fun() { + return ok; + } + } + fun() +} + +fn download_dataset(url: U) -> anyhow::Result> { + let bytes = + reqwest::blocking::Client::builder().timeout(None).build()?.get(url).send()?.bytes()?; + Ok(Cursor::new(bytes)) +} + +fn uncompress_in_file>(bytes: R, path: P) -> anyhow::Result<()> { + let path = path.as_ref(); + let mut gz = GzDecoder::new(bytes); + let mut dataset = Vec::new(); + gz.read_to_end(&mut dataset)?; + + fs::write(path, dataset)?; + Ok(()) +} diff --git a/benchmarks/scripts/compare.sh b/benchmarks/scripts/compare.sh new file mode 100755 index 000000000..ff49144f3 --- /dev/null +++ b/benchmarks/scripts/compare.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +# Requirements: +# - critcmp. See: https://github.com/BurntSushi/critcmp +# - curl + +# Usage +# $ bash compare.sh json_file1 json_file1 +# ex: bash compare.sh songs_main_09a4321.json songs_geosearch_24ec456.json + +# Checking that critcmp is installed +command -v critcmp > /dev/null 2>&1 +if [[ "$?" 
-ne 0 ]]; then + echo 'You must install critcmp to make this script work.' + echo 'See: https://github.com/BurntSushi/critcmp' + echo ' $ cargo install critcmp' + exit 1 +fi + +s3_url='https://milli-benchmarks.fra1.digitaloceanspaces.com/critcmp_results' + +for file in $@ +do + file_s3_url="$s3_url/$file" + file_local_path="/tmp/$file" + + if [[ ! -f $file_local_path ]]; then + curl $file_s3_url --output $file_local_path --silent + if [[ "$?" -ne 0 ]]; then + echo 'curl command failed.' + exit 1 + fi + fi +done + +path_list=$(echo " $@" | sed 's/ / \/tmp\//g') + +critcmp $path_list diff --git a/benchmarks/scripts/list.sh b/benchmarks/scripts/list.sh new file mode 100755 index 000000000..764193329 --- /dev/null +++ b/benchmarks/scripts/list.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +# Requirements: +# - curl +# - grep + +res=$(curl -s https://milli-benchmarks.fra1.digitaloceanspaces.com | grep -o '[^<]\+' | cut -c 5- | grep critcmp_results/ | cut -c 18-) + +for pattern in "$@" +do + res=$(echo "$res" | grep $pattern) +done + +echo "$res" diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs new file mode 100644 index 000000000..4281ec115 --- /dev/null +++ b/benchmarks/src/lib.rs @@ -0,0 +1,5 @@ +//! This library is only used to isolate the benchmarks +//! from the original milli library. +//! +//! It does not include interesting functions for milli library +//! users only for milli contributors. 
diff --git a/cli/Cargo.toml b/cli/Cargo.toml new file mode 100644 index 000000000..f52b64973 --- /dev/null +++ b/cli/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "cli" +version = "0.39.0" +edition = "2018" +description = "A CLI to interact with a milli index" +publish = false + +[dependencies] +bimap = "0.6.2" +byte-unit = { version = "4.0.14", default-features = false, features = ["std", "serde"] } +color-eyre = "0.6.2" +csv = "1.1.6" +eyre = "0.6.8" +indicatif = "0.17.1" +milli = { path = "../milli", default-features = false } +mimalloc = { version = "0.1.29", default-features = false } +serde = "1.0.145" +serde_json = "1.0.85" +stderrlog = "0.5.3" +structopt = "0.3.26" + +[features] +default = ["milli/default"] diff --git a/cli/src/main.rs b/cli/src/main.rs new file mode 100644 index 000000000..09ee7f984 --- /dev/null +++ b/cli/src/main.rs @@ -0,0 +1,559 @@ +use std::collections::BTreeMap; +use std::fmt::Display; +use std::fs::File; +use std::io::{stdin, BufRead, BufReader, Cursor, Read, Write}; +use std::path::PathBuf; +use std::str::FromStr; +use std::time::{Duration, Instant}; + +use byte_unit::Byte; +use eyre::Result; +use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; +use milli::update::UpdateIndexingStep::{ + ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, +}; +use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; +use milli::{heed, CriterionImplementationStrategy, Index, Object}; +use structopt::StructOpt; + +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + +#[derive(Debug, StructOpt)] +#[structopt(name = "Milli CLI", about = "A simple CLI to manipulate a milli index.")] +struct Cli { + #[structopt(short, long, default_value = ".")] + index_path: PathBuf, + #[structopt(short = "s", long, default_value = "100GiB")] + index_size: Byte, + /// Verbose mode (-v, -vv, -vvv, 
etc.) + #[structopt(short, long, parse(from_occurrences))] + verbose: usize, + #[structopt(subcommand)] + subcommand: Command, +} + +#[derive(Debug, StructOpt)] +enum Command { + Documents { + #[structopt(subcommand)] + cmd: Documents, + }, + Search(Search), + Settings { + #[structopt(subcommand)] + cmd: Settings, + }, +} + +impl Performer for Command { + fn perform(self, index: Index) -> Result<()> { + match self { + Command::Documents { cmd } => cmd.perform(index), + Command::Search(cmd) => cmd.perform(index), + Command::Settings { cmd } => cmd.perform(index), + } + } +} + +#[derive(Debug, StructOpt)] +enum Settings { + Update(SettingsUpdate), + Show, +} + +impl Settings { + fn show(&self, index: Index) -> Result<()> { + let txn = index.read_txn()?; + let displayed_attributes = index + .displayed_fields(&txn)? + .map(|fields| fields.into_iter().map(String::from).collect()); + + let searchable_attributes: Option> = index + .searchable_fields(&txn)? + .map(|fields| fields.into_iter().map(String::from).collect()); + + let filterable_attributes: Vec<_> = index.filterable_fields(&txn)?.into_iter().collect(); + + let sortable_attributes: Vec<_> = index.sortable_fields(&txn)?.into_iter().collect(); + + let criteria: Vec<_> = index.criteria(&txn)?.into_iter().map(|c| c.to_string()).collect(); + + let stop_words = index + .stop_words(&txn)? + .map(|stop_words| -> Result> { + Ok(stop_words.stream().into_strs()?.into_iter().collect()) + }) + .transpose()? + .unwrap_or_else(Vec::new); + let distinct_field = index.distinct_field(&txn)?.map(String::from); + + // in milli each word in the synonyms map were split on their separator. Since we lost + // this information we are going to put space between words. + let synonyms: BTreeMap<_, Vec<_>> = index + .synonyms(&txn)? 
+ .iter() + .map(|(key, values)| { + (key.join(" "), values.iter().map(|value| value.join(" ")).collect()) + }) + .collect(); + + let exact_attributes = index.exact_attributes(&txn)?; + + println!( + "displayed attributes:\n\t{}\nsearchable attributes:\n\t{}\nfilterable attributes:\n\t{}\nsortable attributes:\n\t{}\ncriterion:\n\t{}\nstop words:\n\t{}\ndistinct fields:\n\t{}\nsynonyms:\n\t{}\nexact attributes:\n\t{}\n", + displayed_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), + searchable_attributes.unwrap_or(vec!["*".to_owned()]).join("\n\t"), + filterable_attributes.join("\n\t"), + sortable_attributes.join("\n\t"), + criteria.join("\n\t"), + stop_words.join("\n\t"), + distinct_field.unwrap_or_default(), + synonyms.into_iter().map(|(k, v)| format!("\n\t{}:\n{:?}", k, v)).collect::(), + exact_attributes.join("\n\t"), + ); + Ok(()) + } +} + +impl Performer for Settings { + fn perform(self, index: Index) -> Result<()> { + match self { + Settings::Update(update) => update.perform(index), + Settings::Show => self.show(index), + } + } +} + +#[derive(Debug, StructOpt)] +enum Documents { + Add(DocumentAddition), +} + +impl Performer for Documents { + fn perform(self, index: Index) -> Result<()> { + match self { + Self::Add(addition) => addition.perform(index), + } + } +} + +trait Performer { + fn perform(self, index: Index) -> Result<()>; +} + +fn setup(opt: &Cli) -> Result<()> { + color_eyre::install()?; + stderrlog::new() + .verbosity(opt.verbose) + .show_level(false) + .timestamp(stderrlog::Timestamp::Off) + .init()?; + Ok(()) +} + +fn main() -> Result<()> { + let command = Cli::from_args(); + + setup(&command)?; + + let mut options = heed::EnvOpenOptions::new(); + options.map_size(command.index_size.get_bytes() as usize); + let index = milli::Index::new(options, command.index_path)?; + + command.subcommand.perform(index)?; + + Ok(()) +} + +#[derive(Debug)] +enum DocumentAdditionFormat { + Csv, + Json, + Jsonl, +} + +impl FromStr for DocumentAdditionFormat 
{ + type Err = eyre::Error; + + fn from_str(s: &str) -> Result { + match s { + "csv" => Ok(Self::Csv), + "jsonl" => Ok(Self::Jsonl), + "json" => Ok(Self::Json), + other => eyre::bail!("invalid format: {}", other), + } + } +} + +#[derive(Debug, StructOpt)] +struct DocumentAddition { + #[structopt(short, long, default_value = "json", possible_values = &["csv", "jsonl", "json"])] + format: DocumentAdditionFormat, + /// Path to the update file, if not present, will read from stdin. + #[structopt(short, long)] + path: Option, + /// Specify the primary key. + #[structopt(long)] + primary: Option, + /// Whether to generate missing document ids. + #[structopt(short, long)] + autogen_docids: bool, + /// Whether to update or replace the documents if they already exist. + #[structopt(short, long)] + update_documents: bool, +} + +impl Performer for DocumentAddition { + fn perform(self, index: milli::Index) -> Result<()> { + let reader: Box = match self.path { + Some(ref path) => { + let file = File::open(path)?; + Box::new(file) + } + None => Box::new(stdin()), + }; + + println!("parsing documents..."); + + let reader = BufReader::new(reader); + + let documents = match self.format { + DocumentAdditionFormat::Csv => documents_from_csv(reader)?, + DocumentAdditionFormat::Json => documents_from_json(reader)?, + DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?, + }; + + let reader = DocumentsBatchReader::from_reader(Cursor::new(documents))?; + + println!("Adding {} documents to the index.", reader.documents_count()); + + let mut txn = index.write_txn()?; + let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() }; + let update_method = if self.update_documents { + IndexDocumentsMethod::UpdateDocuments + } else { + IndexDocumentsMethod::ReplaceDocuments + }; + + if let Some(primary) = self.primary { + let mut builder = update::Settings::new(&mut txn, &index, &config); + builder.set_primary_key(primary); + builder.execute(|_| (), || 
false).unwrap(); + } + + let indexing_config = IndexDocumentsConfig { + update_method, + autogenerate_docids: self.autogen_docids, + ..Default::default() + }; + let mut bars = Vec::new(); + let progesses = MultiProgress::new(); + for _ in 0..4 { + let bar = ProgressBar::hidden(); + let bar = progesses.add(bar); + bars.push(bar); + } + let addition = milli::update::IndexDocuments::new( + &mut txn, + &index, + &config, + indexing_config, + |step| indexing_callback(step, &bars), + || false, + ) + .unwrap(); + let (addition, user_error) = addition.add_documents(reader)?; + if let Err(error) = user_error { + return Err(error.into()); + } + + let result = addition.execute()?; + + txn.commit()?; + + println!("{:?}", result); + Ok(()) + } +} + +fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBar]) { + let step_index = step.step(); + let bar = &bars[step_index]; + if step_index > 0 { + let prev = &bars[step_index - 1]; + if !prev.is_finished() { + prev.disable_steady_tick(); + prev.finish(); + } + } + + let style = ProgressStyle::default_bar() + .progress_chars("##-") + .template("[eta: {eta_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}") + .unwrap(); + + match step { + RemapDocumentAddition { documents_seen } => { + bar.set_style(ProgressStyle::default_spinner()); + bar.set_message(format!("remapped {} documents so far.", documents_seen)); + } + ComputeIdsAndMergeDocuments { documents_seen, total_documents } => { + bar.set_style(style); + bar.set_length(total_documents as u64); + bar.set_message("Merging documents..."); + bar.set_position(documents_seen as u64); + } + IndexDocuments { documents_seen, total_documents } => { + bar.set_style(style); + bar.set_length(total_documents as u64); + bar.set_message("Indexing documents..."); + bar.set_position(documents_seen as u64); + } + MergeDataIntoFinalDatabase { databases_seen, total_databases } => { + bar.set_style(style); + bar.set_length(total_databases as u64); + bar.set_message("Merging 
databases..."); + bar.set_position(databases_seen as u64); + } + } + bar.enable_steady_tick(Duration::from_millis(200)); +} + +fn documents_from_jsonl(reader: impl Read) -> Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let reader = BufReader::new(reader); + + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { + let object = result?; + documents.append_json_object(&object)?; + } + + documents.into_inner().map_err(Into::into) +} + +fn documents_from_json(reader: impl Read) -> Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + + documents.append_json_array(reader)?; + + documents.into_inner().map_err(Into::into) +} + +fn documents_from_csv(reader: impl Read) -> Result> { + let csv = csv::Reader::from_reader(reader); + + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) +} + +#[derive(Debug, Clone, Copy)] +struct SearchStrategyOption(CriterionImplementationStrategy); +impl FromStr for SearchStrategyOption { + type Err = String; + fn from_str(s: &str) -> Result { + match s.to_lowercase().as_str() { + "dynamic" => Ok(SearchStrategyOption(CriterionImplementationStrategy::Dynamic)), + "set" => Ok(SearchStrategyOption(CriterionImplementationStrategy::OnlySetBased)), + "iterative" => Ok(SearchStrategyOption(CriterionImplementationStrategy::OnlyIterative)), + _ => Err("could not parse {s} as a criterion implementation strategy, available options are `dynamic`, `set`, and `iterative`".to_owned()), + } + } +} +impl Display for SearchStrategyOption { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self.0 { + CriterionImplementationStrategy::OnlyIterative => Display::fmt("iterative", f), + CriterionImplementationStrategy::OnlySetBased => Display::fmt("set", f), + CriterionImplementationStrategy::Dynamic => Display::fmt("dynamic", f), + } + } +} + +#[derive(Debug, StructOpt)] +struct Search 
{ + query: Option, + #[structopt(short, long)] + filter: Option, + #[structopt(short, long)] + offset: Option, + #[structopt(short, long)] + limit: Option, + #[structopt(short, long, conflicts_with = "query")] + interactive: bool, + #[structopt(short, long)] + strategy: Option, +} + +impl Performer for Search { + fn perform(self, index: milli::Index) -> Result<()> { + if self.interactive { + let stdin = std::io::stdin(); + let mut lines = stdin.lock().lines(); + loop { + eprint!("> "); + std::io::stdout().flush()?; + match lines.next() { + Some(Ok(line)) => { + let now = Instant::now(); + let jsons = Self::perform_single_search( + &index, + &Some(line), + &self.filter, + &self.offset, + &self.limit, + &self.strategy, + )?; + + let time = now.elapsed(); + + let hits = serde_json::to_string_pretty(&jsons)?; + + println!("{}", hits); + + eprintln!("found {} results in {:.02?}", jsons.len(), time); + } + _ => break, + } + } + } else { + let now = Instant::now(); + let jsons = Self::perform_single_search( + &index, + &self.query, + &self.filter, + &self.offset, + &self.limit, + &self.strategy, + )?; + + let time = now.elapsed(); + + let hits = serde_json::to_string_pretty(&jsons)?; + + println!("{}", hits); + eprintln!("found {} results in {:.02?}", jsons.len(), time); + } + + Ok(()) + } +} + +impl Search { + fn perform_single_search( + index: &milli::Index, + query: &Option, + filter: &Option, + offset: &Option, + limit: &Option, + strategy: &Option, + ) -> Result> { + let txn = index.read_txn()?; + let mut search = index.search(&txn); + + if let Some(ref query) = query { + search.query(query); + } + + if let Some(ref filter) = filter { + if let Some(condition) = milli::Filter::from_str(filter)? 
{ + search.filter(condition); + } + } + + if let Some(offset) = offset { + search.offset(*offset); + } + + if let Some(limit) = limit { + search.limit(*limit); + } + if let Some(strategy) = strategy { + search.criterion_implementation_strategy(strategy.0); + } + + let result = search.execute()?; + + let fields_ids_map = index.fields_ids_map(&txn)?; + let displayed_fields = + index.displayed_fields_ids(&txn)?.unwrap_or_else(|| fields_ids_map.ids().collect()); + let documents = index.documents(&txn, result.documents_ids)?; + let mut jsons = Vec::new(); + for (_, obkv) in documents { + let json = milli::obkv_to_json(&displayed_fields, &fields_ids_map, obkv)?; + jsons.push(json); + } + + Ok(jsons) + } +} + +#[derive(Debug, StructOpt)] +struct SettingsUpdate { + #[structopt(long)] + filterable_attributes: Option>, + #[structopt(long)] + criteria: Option>, + #[structopt(long)] + exact_attributes: Option>, + #[structopt(long)] + distinct_attribute: Option, +} + +impl Performer for SettingsUpdate { + fn perform(self, index: milli::Index) -> Result<()> { + let mut txn = index.write_txn()?; + + let config = IndexerConfig { log_every_n: Some(100), ..Default::default() }; + + let mut update = milli::update::Settings::new(&mut txn, &index, &config); + + if let Some(ref filterable_attributes) = self.filterable_attributes { + if !filterable_attributes.is_empty() { + update.set_filterable_fields(filterable_attributes.iter().cloned().collect()); + } else { + update.reset_filterable_fields(); + } + } + + if let Some(criteria) = self.criteria { + if !criteria.is_empty() { + update.set_criteria(criteria.iter().map(|c| c.parse()).collect::>()?); + } else { + update.reset_criteria(); + } + } + + if let Some(exact_attributes) = self.exact_attributes { + if !exact_attributes.is_empty() { + update.set_exact_attributes(exact_attributes.into_iter().collect()); + } else { + update.reset_exact_attributes(); + } + } + + if let Some(distinct_attr) = self.distinct_attribute { + if 
!distinct_attr.is_empty() { + update.set_distinct_field(distinct_attr); + } else { + update.reset_distinct_field(); + } + } + + let mut bars = Vec::new(); + let progesses = MultiProgress::new(); + for _ in 0..4 { + let bar = ProgressBar::hidden(); + let bar = progesses.add(bar); + bars.push(bar); + } + + update.execute(|step| indexing_callback(step, &bars), || false)?; + + txn.commit()?; + Ok(()) + } +} diff --git a/filter-parser/Cargo.toml b/filter-parser/Cargo.toml new file mode 100644 index 000000000..9202c3875 --- /dev/null +++ b/filter-parser/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "filter-parser" +version = "0.39.0" +edition = "2021" +description = "The parser for the Meilisearch filter syntax" +publish = false + +[dependencies] +nom = "7.1.1" +nom_locate = "4.0.0" + +[dev-dependencies] +insta = "1.21.0" diff --git a/filter-parser/README.md b/filter-parser/README.md new file mode 100644 index 000000000..b4ddda3d3 --- /dev/null +++ b/filter-parser/README.md @@ -0,0 +1,36 @@ +# Filter parser + +This workspace is dedicated to the parsing of the Meilisearch filters. + +Most of the code and explanation are in the [`lib.rs`](./src/lib.rs). Especially, the BNF of the filters at the top of this file. + +The parser use [nom](https://docs.rs/nom/) to do most of its work and [nom-locate](https://docs.rs/nom_locate/) to keep track of what we were doing when we encountered an error. + +## Cli +A simple main is provided to quick-test if a filter can be parsed or not without bringing milli. +It takes one argument and try to parse it. +``` +cargo run -- 'field = value' # success +cargo run -- 'field = "doggo' # error => missing closing delimiter " +``` + +## Fuzz +The workspace have been fuzzed with [cargo-fuzz](https://rust-fuzz.github.io/book/cargo-fuzz.html). + +### Setup +You'll need rust-nightly to execute the fuzzer. + +``` +cargo install cargo-fuzz +``` + +### Run +When the filter parser is executed by the fuzzer it's triggering a stackoverflow really fast. 
We can avoid this problem by limiting the `max_len` of [libfuzzer](https://llvm.org/docs/LibFuzzer.html) at 500 characters. +``` +cargo fuzz run parse -- -max_len=500 +``` + +## What to do if you find a bug in the parser + +- Write a test at the end of the [`lib.rs`](./src/lib.rs) to ensure it never happens again. +- Add a file in [the corpus directory](./fuzz/corpus/parse/) with your filter to help the fuzzer find new bugs. Since this directory is going to be heavily polluted by the execution of the fuzzer it's in the gitignore and you'll need to force push your new test. diff --git a/filter-parser/fuzz/.gitignore b/filter-parser/fuzz/.gitignore new file mode 100644 index 000000000..084aa18c1 --- /dev/null +++ b/filter-parser/fuzz/.gitignore @@ -0,0 +1,3 @@ +/corpus/ +/artifacts/ +/target/ diff --git a/filter-parser/fuzz/Cargo.toml b/filter-parser/fuzz/Cargo.toml new file mode 100644 index 000000000..246276f2c --- /dev/null +++ b/filter-parser/fuzz/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "filter-parser-fuzz" +version = "0.0.0" +authors = ["Automatically generated"] +publish = false +edition = "2018" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" + +[dependencies.filter-parser] +path = ".." 
+ +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[[bin]] +name = "parse" +path = "fuzz_targets/parse.rs" +test = false +doc = false diff --git a/filter-parser/fuzz/corpus/parse/test_1 b/filter-parser/fuzz/corpus/parse/test_1 new file mode 100644 index 000000000..2523a328e --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_1 @@ -0,0 +1 @@ +channel = Ponce diff --git a/filter-parser/fuzz/corpus/parse/test_10 b/filter-parser/fuzz/corpus/parse/test_10 new file mode 100644 index 000000000..d0e9f1e51 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_10 @@ -0,0 +1 @@ +channel != ponce diff --git a/filter-parser/fuzz/corpus/parse/test_11 b/filter-parser/fuzz/corpus/parse/test_11 new file mode 100644 index 000000000..ca3db9223 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_11 @@ -0,0 +1 @@ +NOT channel = ponce diff --git a/filter-parser/fuzz/corpus/parse/test_12 b/filter-parser/fuzz/corpus/parse/test_12 new file mode 100644 index 000000000..325f848c1 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_12 @@ -0,0 +1 @@ +subscribers < 1000 diff --git a/filter-parser/fuzz/corpus/parse/test_13 b/filter-parser/fuzz/corpus/parse/test_13 new file mode 100644 index 000000000..ca7b96f30 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_13 @@ -0,0 +1 @@ +subscribers > 1000 diff --git a/filter-parser/fuzz/corpus/parse/test_14 b/filter-parser/fuzz/corpus/parse/test_14 new file mode 100644 index 000000000..f72f48bdb --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_14 @@ -0,0 +1 @@ +subscribers <= 1000 diff --git a/filter-parser/fuzz/corpus/parse/test_15 b/filter-parser/fuzz/corpus/parse/test_15 new file mode 100644 index 000000000..75073fc74 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_15 @@ -0,0 +1 @@ +subscribers >= 1000 diff --git a/filter-parser/fuzz/corpus/parse/test_16 b/filter-parser/fuzz/corpus/parse/test_16 new file mode 100644 index 000000000..bdd39241b --- /dev/null +++ 
b/filter-parser/fuzz/corpus/parse/test_16 @@ -0,0 +1 @@ +NOT subscribers < 1000 diff --git a/filter-parser/fuzz/corpus/parse/test_17 b/filter-parser/fuzz/corpus/parse/test_17 new file mode 100644 index 000000000..4487643e4 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_17 @@ -0,0 +1 @@ +NOT subscribers > 1000 diff --git a/filter-parser/fuzz/corpus/parse/test_18 b/filter-parser/fuzz/corpus/parse/test_18 new file mode 100644 index 000000000..150604012 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_18 @@ -0,0 +1 @@ +NOT subscribers <= 1000 diff --git a/filter-parser/fuzz/corpus/parse/test_19 b/filter-parser/fuzz/corpus/parse/test_19 new file mode 100644 index 000000000..11bc15103 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_19 @@ -0,0 +1 @@ +NOT subscribers >= 1000 diff --git a/filter-parser/fuzz/corpus/parse/test_2 b/filter-parser/fuzz/corpus/parse/test_2 new file mode 100644 index 000000000..8ac19cad4 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_2 @@ -0,0 +1 @@ +subscribers = 12 diff --git a/filter-parser/fuzz/corpus/parse/test_20 b/filter-parser/fuzz/corpus/parse/test_20 new file mode 100644 index 000000000..f52ad8ff2 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_20 @@ -0,0 +1 @@ +subscribers 100 TO 1000 diff --git a/filter-parser/fuzz/corpus/parse/test_21 b/filter-parser/fuzz/corpus/parse/test_21 new file mode 100644 index 000000000..e86e6b89d --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_21 @@ -0,0 +1 @@ +NOT subscribers 100 TO 1000 diff --git a/filter-parser/fuzz/corpus/parse/test_22 b/filter-parser/fuzz/corpus/parse/test_22 new file mode 100644 index 000000000..8ceeb6c1a --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_22 @@ -0,0 +1 @@ +_geoRadius(12, 13, 14) diff --git a/filter-parser/fuzz/corpus/parse/test_23 b/filter-parser/fuzz/corpus/parse/test_23 new file mode 100644 index 000000000..614effb98 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_23 @@ -0,0 +1 @@ +NOT 
_geoRadius(12, 13, 14) diff --git a/filter-parser/fuzz/corpus/parse/test_24 b/filter-parser/fuzz/corpus/parse/test_24 new file mode 100644 index 000000000..2b8b39279 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_24 @@ -0,0 +1 @@ +channel = ponce AND 'dog race' != 'bernese mountain' diff --git a/filter-parser/fuzz/corpus/parse/test_25 b/filter-parser/fuzz/corpus/parse/test_25 new file mode 100644 index 000000000..8f6fef74a --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_25 @@ -0,0 +1 @@ +channel = ponce OR 'dog race' != 'bernese mountain' diff --git a/filter-parser/fuzz/corpus/parse/test_26 b/filter-parser/fuzz/corpus/parse/test_26 new file mode 100644 index 000000000..5134b354d --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_26 @@ -0,0 +1 @@ +channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000 diff --git a/filter-parser/fuzz/corpus/parse/test_27 b/filter-parser/fuzz/corpus/parse/test_27 new file mode 100644 index 000000000..b63559b9f --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_27 @@ -0,0 +1 @@ +channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > 1000 ) diff --git a/filter-parser/fuzz/corpus/parse/test_28 b/filter-parser/fuzz/corpus/parse/test_28 new file mode 100644 index 000000000..5bc97fb2b --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_28 @@ -0,0 +1 @@ +(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, 14) diff --git a/filter-parser/fuzz/corpus/parse/test_29 b/filter-parser/fuzz/corpus/parse/test_29 new file mode 100644 index 000000000..7713618bb --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_29 @@ -0,0 +1 @@ +channel = Ponce = 12 diff --git a/filter-parser/fuzz/corpus/parse/test_3 b/filter-parser/fuzz/corpus/parse/test_3 new file mode 100644 index 000000000..2533e8fcf --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_3 @@ -0,0 +1 @@ +channel = 'Mister Mv' diff --git 
a/filter-parser/fuzz/corpus/parse/test_30 b/filter-parser/fuzz/corpus/parse/test_30 new file mode 100644 index 000000000..c35941150 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_30 @@ -0,0 +1 @@ +channel = diff --git a/filter-parser/fuzz/corpus/parse/test_31 b/filter-parser/fuzz/corpus/parse/test_31 new file mode 100644 index 000000000..f7982669f --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_31 @@ -0,0 +1 @@ +channel = 🐻 diff --git a/filter-parser/fuzz/corpus/parse/test_32 b/filter-parser/fuzz/corpus/parse/test_32 new file mode 100644 index 000000000..c4a102dc8 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_32 @@ -0,0 +1 @@ +OR diff --git a/filter-parser/fuzz/corpus/parse/test_33 b/filter-parser/fuzz/corpus/parse/test_33 new file mode 100644 index 000000000..eb80eb4e6 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_33 @@ -0,0 +1 @@ +AND diff --git a/filter-parser/fuzz/corpus/parse/test_34 b/filter-parser/fuzz/corpus/parse/test_34 new file mode 100644 index 000000000..60fc05e7f --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_34 @@ -0,0 +1 @@ +channel Ponce diff --git a/filter-parser/fuzz/corpus/parse/test_35 b/filter-parser/fuzz/corpus/parse/test_35 new file mode 100644 index 000000000..4a868f1d8 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_35 @@ -0,0 +1 @@ +channel = Ponce OR diff --git a/filter-parser/fuzz/corpus/parse/test_36 b/filter-parser/fuzz/corpus/parse/test_36 new file mode 100644 index 000000000..d7a0abac7 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_36 @@ -0,0 +1 @@ +_geoRadius diff --git a/filter-parser/fuzz/corpus/parse/test_37 b/filter-parser/fuzz/corpus/parse/test_37 new file mode 100644 index 000000000..44b5105b6 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_37 @@ -0,0 +1 @@ +_geoRadius = 12 diff --git a/filter-parser/fuzz/corpus/parse/test_38 b/filter-parser/fuzz/corpus/parse/test_38 new file mode 100644 index 000000000..ab45b973f --- /dev/null +++ 
b/filter-parser/fuzz/corpus/parse/test_38 @@ -0,0 +1 @@ +_geoPoint(12, 13, 14) diff --git a/filter-parser/fuzz/corpus/parse/test_39 b/filter-parser/fuzz/corpus/parse/test_39 new file mode 100644 index 000000000..283095326 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_39 @@ -0,0 +1 @@ +position <= _geoPoint(12, 13, 14) diff --git a/filter-parser/fuzz/corpus/parse/test_4 b/filter-parser/fuzz/corpus/parse/test_4 new file mode 100644 index 000000000..9c2716e79 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_4 @@ -0,0 +1 @@ +channel = "Mister Mv" diff --git a/filter-parser/fuzz/corpus/parse/test_40 b/filter-parser/fuzz/corpus/parse/test_40 new file mode 100644 index 000000000..c4c038c15 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_40 @@ -0,0 +1 @@ +position <= _geoRadius(12, 13, 14) diff --git a/filter-parser/fuzz/corpus/parse/test_41 b/filter-parser/fuzz/corpus/parse/test_41 new file mode 100644 index 000000000..6952aa87e --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_41 @@ -0,0 +1 @@ +channel = 'ponce diff --git a/filter-parser/fuzz/corpus/parse/test_42 b/filter-parser/fuzz/corpus/parse/test_42 new file mode 100644 index 000000000..485d8da96 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_42 @@ -0,0 +1 @@ +channel = "ponce diff --git a/filter-parser/fuzz/corpus/parse/test_43 b/filter-parser/fuzz/corpus/parse/test_43 new file mode 100644 index 000000000..728c8aa22 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_43 @@ -0,0 +1 @@ +channel = mv OR (followers >= 1000 diff --git a/filter-parser/fuzz/corpus/parse/test_5 b/filter-parser/fuzz/corpus/parse/test_5 new file mode 100644 index 000000000..89f5ec8ee --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_5 @@ -0,0 +1 @@ +'dog race' = Borzoi diff --git a/filter-parser/fuzz/corpus/parse/test_6 b/filter-parser/fuzz/corpus/parse/test_6 new file mode 100644 index 000000000..be3e203cb --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_6 @@ -0,0 +1 @@ +"dog 
race" = Chusky diff --git a/filter-parser/fuzz/corpus/parse/test_7 b/filter-parser/fuzz/corpus/parse/test_7 new file mode 100644 index 000000000..eb77a2875 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_7 @@ -0,0 +1 @@ +"dog race" = "Bernese Mountain" diff --git a/filter-parser/fuzz/corpus/parse/test_8 b/filter-parser/fuzz/corpus/parse/test_8 new file mode 100644 index 000000000..a25477648 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_8 @@ -0,0 +1 @@ +'dog race' = 'Bernese Mountain' diff --git a/filter-parser/fuzz/corpus/parse/test_9 b/filter-parser/fuzz/corpus/parse/test_9 new file mode 100644 index 000000000..c347e68f5 --- /dev/null +++ b/filter-parser/fuzz/corpus/parse/test_9 @@ -0,0 +1 @@ +"dog race" = 'Bernese Mountain' diff --git a/filter-parser/fuzz/fuzz_targets/parse.rs b/filter-parser/fuzz/fuzz_targets/parse.rs new file mode 100644 index 000000000..6d0069c15 --- /dev/null +++ b/filter-parser/fuzz/fuzz_targets/parse.rs @@ -0,0 +1,18 @@ +#![no_main] +use filter_parser::{ErrorKind, FilterCondition}; +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + if let Ok(s) = std::str::from_utf8(data) { + // When we are fuzzing the parser we can get a stack overflow very easily. + // But since this doesn't happens with a normal build we are just going to limit the fuzzer to 500 characters. + if s.len() < 500 { + match FilterCondition::parse(s) { + Err(e) if matches!(e.kind(), ErrorKind::InternalError(_)) => { + panic!("Found an internal error: `{:?}`", e) + } + _ => (), + } + } + } +}); diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs new file mode 100644 index 000000000..735ffec0e --- /dev/null +++ b/filter-parser/src/condition.rs @@ -0,0 +1,67 @@ +//! BNF grammar: +//! +//! ```text +//! condition = value ("==" | ">" ...) value +//! to = value value TO value +//! 
``` + +use nom::branch::alt; +use nom::bytes::complete::tag; +use nom::character::complete::multispace1; +use nom::combinator::cut; +use nom::sequence::{terminated, tuple}; +use Condition::*; + +use crate::{parse_value, FilterCondition, IResult, Span, Token}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Condition<'a> { + GreaterThan(Token<'a>), + GreaterThanOrEqual(Token<'a>), + Equal(Token<'a>), + NotEqual(Token<'a>), + Exists, + LowerThan(Token<'a>), + LowerThanOrEqual(Token<'a>), + Between { from: Token<'a>, to: Token<'a> }, +} + +/// condition = value ("==" | ">" ...) value +pub fn parse_condition(input: Span) -> IResult { + let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); + let (input, (fid, op, value)) = tuple((parse_value, operator, cut(parse_value)))(input)?; + + let condition = match *op.fragment() { + "<=" => FilterCondition::Condition { fid, op: LowerThanOrEqual(value) }, + ">=" => FilterCondition::Condition { fid, op: GreaterThanOrEqual(value) }, + "!=" => FilterCondition::Condition { fid, op: NotEqual(value) }, + "<" => FilterCondition::Condition { fid, op: LowerThan(value) }, + ">" => FilterCondition::Condition { fid, op: GreaterThan(value) }, + "=" => FilterCondition::Condition { fid, op: Equal(value) }, + _ => unreachable!(), + }; + + Ok((input, condition)) +} + +/// exist = value "EXISTS" +pub fn parse_exists(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; + + Ok((input, FilterCondition::Condition { fid: key, op: Exists })) +} +/// exist = value "NOT" WS+ "EXISTS" +pub fn parse_not_exists(input: Span) -> IResult { + let (input, key) = parse_value(input)?; + + let (input, _) = tuple((tag("NOT"), multispace1, tag("EXISTS")))(input)?; + Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Exists })))) +} + +/// to = value value "TO" WS+ value +pub fn parse_to(input: Span) -> IResult { + let (input, (key, from, _, _, to)) = + 
tuple((parse_value, parse_value, tag("TO"), multispace1, cut(parse_value)))(input)?; + + Ok((input, FilterCondition::Condition { fid: key, op: Between { from, to } })) +} diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs new file mode 100644 index 000000000..ea95caba7 --- /dev/null +++ b/filter-parser/src/error.rs @@ -0,0 +1,198 @@ +use std::fmt::Display; + +use nom::error::{self, ParseError}; +use nom::Parser; + +use crate::{IResult, Span}; + +pub trait NomErrorExt { + fn is_failure(&self) -> bool; + fn map_err E>(self, op: O) -> nom::Err; + fn map_fail E>(self, op: O) -> nom::Err; +} + +impl NomErrorExt for nom::Err { + fn is_failure(&self) -> bool { + matches!(self, Self::Failure(_)) + } + + fn map_err E>(self, op: O) -> nom::Err { + match self { + e @ Self::Failure(_) => e, + e => e.map(op), + } + } + + fn map_fail E>(self, op: O) -> nom::Err { + match self { + e @ Self::Error(_) => e, + e => e.map(op), + } + } +} + +/// cut a parser and map the error +pub fn cut_with_err<'a, O>( + mut parser: impl FnMut(Span<'a>) -> IResult<'a, O>, + mut with: impl FnMut(Error<'a>) -> Error<'a>, +) -> impl FnMut(Span<'a>) -> IResult { + move |input| match parser.parse(input) { + Err(nom::Err::Error(e)) => Err(nom::Err::Failure(with(e))), + rest => rest, + } +} + +#[derive(Debug)] +pub struct Error<'a> { + context: Span<'a>, + kind: ErrorKind<'a>, +} + +#[derive(Debug)] +pub enum ExpectedValueKind { + ReservedKeyword, + Other, +} + +#[derive(Debug)] +pub enum ErrorKind<'a> { + ReservedGeo(&'a str), + Geo, + MisusedGeo, + InvalidPrimary, + ExpectedEof, + ExpectedValue(ExpectedValueKind), + MalformedValue, + InOpeningBracket, + InClosingBracket, + NonFiniteFloat, + InExpectedValue(ExpectedValueKind), + ReservedKeyword(String), + MissingClosingDelimiter(char), + Char(char), + InternalError(error::ErrorKind), + DepthLimitReached, + External(String), +} + +impl<'a> Error<'a> { + pub fn kind(&self) -> &ErrorKind<'a> { + &self.kind + } + + pub fn context(&self) -> 
&Span<'a> { + &self.context + } + + pub fn new_from_kind(context: Span<'a>, kind: ErrorKind<'a>) -> Self { + Self { context, kind } + } + + pub fn new_from_external(context: Span<'a>, error: impl std::error::Error) -> Self { + Self::new_from_kind(context, ErrorKind::External(error.to_string())) + } + + pub fn char(self) -> char { + match self.kind { + ErrorKind::Char(c) => c, + error => panic!("Internal filter parser error: {:?}", error), + } + } +} + +impl<'a> ParseError> for Error<'a> { + fn from_error_kind(input: Span<'a>, kind: error::ErrorKind) -> Self { + let kind = match kind { + error::ErrorKind::Eof => ErrorKind::ExpectedEof, + kind => ErrorKind::InternalError(kind), + }; + Self { context: input, kind } + } + + fn append(_input: Span<'a>, _kind: error::ErrorKind, other: Self) -> Self { + other + } + + fn from_char(input: Span<'a>, c: char) -> Self { + Self { context: input, kind: ErrorKind::Char(c) } + } +} + +impl<'a> Display for Error<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let input = self.context.fragment(); + // When printing our error message we want to escape all `\n` to be sure we keep our format with the + // first line being the diagnostic and the second line being the incriminated filter. + let escaped_input = input.escape_debug(); + + match &self.kind { + ErrorKind::ExpectedValue(_) if input.trim().is_empty() => { + writeln!(f, "Was expecting a value but instead got nothing.")? + } + ErrorKind::ExpectedValue(ExpectedValueKind::ReservedKeyword) => { + writeln!(f, "Was expecting a value but instead got `{escaped_input}`, which is a reserved keyword. To use `{escaped_input}` as a field name or a value, surround it by quotes.")? + } + ErrorKind::ExpectedValue(ExpectedValueKind::Other) => { + writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)? + } + ErrorKind::MalformedValue => { + writeln!(f, "Malformed value: `{}`.", escaped_input)? 
+ } + ErrorKind::MissingClosingDelimiter(c) => { + writeln!(f, "Expression `{}` is missing the following closing delimiter: `{}`.", escaped_input, c)? + } + ErrorKind::InvalidPrimary if input.trim().is_empty() => { + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing.")? + } + ErrorKind::InvalidPrimary => { + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `{}`.", escaped_input)? + } + ErrorKind::ExpectedEof => { + writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? + } + ErrorKind::Geo => { + writeln!(f, "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`.")? + } + ErrorKind::ReservedGeo(name) => { + writeln!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates.", name.escape_debug())? + } + ErrorKind::MisusedGeo => { + writeln!(f, "The `_geoRadius` filter is an operation and can't be used as a value.")? + } + ErrorKind::ReservedKeyword(word) => { + writeln!(f, "`{word}` is a reserved keyword and thus cannot be used as a field name unless it is put inside quotes. Use \"{word}\" or \'{word}\' instead.")? + } + ErrorKind::InOpeningBracket => { + writeln!(f, "Expected `[` after `IN` keyword.")? + } + ErrorKind::InClosingBracket => { + writeln!(f, "Expected matching `]` after the list of field names given to `IN[`")? + } + ErrorKind::NonFiniteFloat => { + writeln!(f, "Non finite floats are not supported")? + } + ErrorKind::InExpectedValue(ExpectedValueKind::ReservedKeyword) => { + writeln!(f, "Expected only comma-separated field names inside `IN[..]` but instead found `{escaped_input}`, which is a keyword. 
To use `{escaped_input}` as a field name or a value, surround it by quotes.")? + } + ErrorKind::InExpectedValue(ExpectedValueKind::Other) => { + writeln!(f, "Expected only comma-separated field names inside `IN[..]` but instead found `{escaped_input}`.")? + } + ErrorKind::Char(c) => { + panic!("Tried to display a char error with `{}`", c) + } + ErrorKind::DepthLimitReached => writeln!( + f, + "The filter exceeded the maximum depth limit. Try rewriting the filter so that it contains fewer nested conditions." + )?, + ErrorKind::InternalError(kind) => writeln!( + f, + "Encountered an internal `{:?}` error while parsing your filter. Please fill an issue", kind + )?, + ErrorKind::External(ref error) => writeln!(f, "{}", error)?, + } + let base_column = self.context.get_utf8_column(); + let size = self.context.fragment().chars().count(); + + write!(f, "{}:{} {}", base_column, base_column + size, self.context.extra) + } +} diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs new file mode 100644 index 000000000..d10136ace --- /dev/null +++ b/filter-parser/src/lib.rs @@ -0,0 +1,739 @@ +//! BNF grammar: +//! +//! ```text +//! filter = expression EOF +//! expression = or +//! or = and ("OR" WS+ and)* +//! and = not ("AND" WS+ not)* +//! not = ("NOT" WS+ not) | primary +//! primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | in | condition | exists | not_exists | to +//! in = value "IN" WS* "[" value_list "]" +//! condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value +//! exists = value "EXISTS" +//! not_exists = value "NOT" WS+ "EXISTS" +//! to = value value "TO" WS+ value +//! value = WS* ( word | singleQuoted | doubleQuoted) WS+ +//! value_list = (value ("," value)* ","?)? +//! singleQuoted = "'" .* all but quotes "'" +//! doubleQuoted = "\"" .* all but double quotes "\"" +//! word = (alphanumeric | _ | - | .)+ +//! geoRadius = "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" +//! ``` +//! +//! 
Other BNF grammar used to handle some specific errors: +//! ```text +//! geoPoint = WS* "_geoPoint(" (float ",")* ")" +//! ``` +//! +//! Specific errors: +//! ================ +//! - If a user try to use a geoPoint, as a primary OR as a value we must throw an error. +//! ```text +//! field = _geoPoint(12, 13, 14) +//! field < 12 AND _geoPoint(1, 2) +//! ``` +//! +//! - If a user try to use a geoRadius as a value we must throw an error. +//! ```text +//! field = _geoRadius(12, 13, 14) +//! ``` +//! + +mod condition; +mod error; +mod value; + +use std::fmt::Debug; + +pub use condition::{parse_condition, parse_to, Condition}; +use condition::{parse_exists, parse_not_exists}; +use error::{cut_with_err, ExpectedValueKind, NomErrorExt}; +pub use error::{Error, ErrorKind}; +use nom::branch::alt; +use nom::bytes::complete::tag; +use nom::character::complete::{char, multispace0}; +use nom::combinator::{cut, eof, map, opt}; +use nom::multi::{many0, separated_list1}; +use nom::number::complete::recognize_float; +use nom::sequence::{delimited, preceded, terminated, tuple}; +use nom::Finish; +use nom_locate::LocatedSpan; +pub(crate) use value::parse_value; +use value::word_exact; + +pub type Span<'a> = LocatedSpan<&'a str, &'a str>; + +type IResult<'a, Ret> = nom::IResult, Ret, Error<'a>>; + +const MAX_FILTER_DEPTH: usize = 200; + +#[derive(Debug, Clone, Eq)] +pub struct Token<'a> { + /// The token in the original input, it should be used when possible. + span: Span<'a>, + /// If you need to modify the original input you can use the `value` field + /// to store your modified input. 
+ value: Option, +} + +impl<'a> PartialEq for Token<'a> { + fn eq(&self, other: &Self) -> bool { + self.span.fragment() == other.span.fragment() + } +} + +impl<'a> Token<'a> { + pub fn new(span: Span<'a>, value: Option) -> Self { + Self { span, value } + } + + pub fn lexeme(&self) -> &str { + &self.span + } + + pub fn value(&self) -> &str { + self.value.as_ref().map_or(&self.span, |value| value) + } + + pub fn as_external_error(&self, error: impl std::error::Error) -> Error<'a> { + Error::new_from_external(self.span, error) + } + + pub fn parse_finite_float(&self) -> Result { + let value: f64 = self.span.parse().map_err(|e| self.as_external_error(e))?; + if value.is_finite() { + Ok(value) + } else { + Err(Error::new_from_kind(self.span, ErrorKind::NonFiniteFloat)) + } + } +} + +impl<'a> From> for Token<'a> { + fn from(span: Span<'a>) -> Self { + Self { span, value: None } + } +} + +/// Allow [Token] to be constructed from &[str] +impl<'a> From<&'a str> for Token<'a> { + fn from(s: &'a str) -> Self { + Token::from(Span::new_extra(s, s)) + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FilterCondition<'a> { + Not(Box), + Condition { fid: Token<'a>, op: Condition<'a> }, + In { fid: Token<'a>, els: Vec> }, + Or(Vec), + And(Vec), + GeoLowerThan { point: [Token<'a>; 2], radius: Token<'a> }, +} + +impl<'a> FilterCondition<'a> { + /// Returns the first token found at the specified depth, `None` if no token at this depth. + pub fn token_at_depth(&self, depth: usize) -> Option<&Token> { + match self { + FilterCondition::Condition { fid, .. 
} if depth == 0 => Some(fid), + FilterCondition::Or(subfilters) => { + let depth = depth.saturating_sub(1); + for f in subfilters.iter() { + if let Some(t) = f.token_at_depth(depth) { + return Some(t); + } + } + None + } + FilterCondition::And(subfilters) => { + let depth = depth.saturating_sub(1); + for f in subfilters.iter() { + if let Some(t) = f.token_at_depth(depth) { + return Some(t); + } + } + None + } + FilterCondition::GeoLowerThan { point: [point, _], .. } if depth == 0 => Some(point), + _ => None, + } + } + + pub fn parse(input: &'a str) -> Result, Error> { + if input.trim().is_empty() { + return Ok(None); + } + let span = Span::new_extra(input, input); + parse_filter(span).finish().map(|(_rem, output)| Some(output)) + } +} + +/// remove OPTIONAL whitespaces before AND after the provided parser. +fn ws<'a, O>( + inner: impl FnMut(Span<'a>) -> IResult<'a, O>, +) -> impl FnMut(Span<'a>) -> IResult<'a, O> { + delimited(multispace0, inner, multispace0) +} + +/// value_list = (value ("," value)* ","?)? 
+fn parse_value_list(input: Span) -> IResult> { + let (input, first_value) = opt(parse_value)(input)?; + if let Some(first_value) = first_value { + let value_list_el_parser = preceded(ws(tag(",")), parse_value); + + let (input, mut values) = many0(value_list_el_parser)(input)?; + let (input, _) = opt(ws(tag(",")))(input)?; + values.insert(0, first_value); + + Ok((input, values)) + } else { + Ok((input, vec![])) + } +} + +/// "IN" WS* "[" value_list "]" +fn parse_in_body(input: Span) -> IResult> { + let (input, _) = ws(word_exact("IN"))(input)?; + + // everything after `IN` can be a failure + let (input, _) = + cut_with_err(tag("["), |_| Error::new_from_kind(input, ErrorKind::InOpeningBracket))( + input, + )?; + + let (input, content) = cut(parse_value_list)(input)?; + + // everything after `IN` can be a failure + let (input, _) = cut_with_err(ws(tag("]")), |_| { + if eof::<_, ()>(input).is_ok() { + Error::new_from_kind(input, ErrorKind::InClosingBracket) + } else { + let expected_value_kind = match parse_value(input) { + Err(nom::Err::Error(e)) => match e.kind() { + ErrorKind::ReservedKeyword(_) => ExpectedValueKind::ReservedKeyword, + _ => ExpectedValueKind::Other, + }, + _ => ExpectedValueKind::Other, + }; + Error::new_from_kind(input, ErrorKind::InExpectedValue(expected_value_kind)) + } + })(input)?; + + Ok((input, content)) +} + +/// in = value "IN" "[" value_list "]" +fn parse_in(input: Span) -> IResult { + let (input, value) = parse_value(input)?; + let (input, content) = parse_in_body(input)?; + + let filter = FilterCondition::In { fid: value, els: content }; + Ok((input, filter)) +} + +/// in = value "NOT" WS* "IN" "[" value_list "]" +fn parse_not_in(input: Span) -> IResult { + let (input, value) = parse_value(input)?; + let (input, _) = word_exact("NOT")(input)?; + let (input, content) = parse_in_body(input)?; + + let filter = FilterCondition::Not(Box::new(FilterCondition::In { fid: value, els: content })); + Ok((input, filter)) +} + +/// or = and ("OR" 
and) +fn parse_or(input: Span, depth: usize) -> IResult { + if depth > MAX_FILTER_DEPTH { + return Err(nom::Err::Error(Error::new_from_kind(input, ErrorKind::DepthLimitReached))); + } + let (input, first_filter) = parse_and(input, depth + 1)?; + // if we found a `OR` then we MUST find something next + let (input, mut ors) = + many0(preceded(ws(word_exact("OR")), cut(|input| parse_and(input, depth + 1))))(input)?; + + let filter = if ors.is_empty() { + first_filter + } else { + ors.insert(0, first_filter); + FilterCondition::Or(ors) + }; + + Ok((input, filter)) +} + +/// and = not ("AND" not)* +fn parse_and(input: Span, depth: usize) -> IResult { + if depth > MAX_FILTER_DEPTH { + return Err(nom::Err::Error(Error::new_from_kind(input, ErrorKind::DepthLimitReached))); + } + let (input, first_filter) = parse_not(input, depth + 1)?; + // if we found a `AND` then we MUST find something next + let (input, mut ands) = + many0(preceded(ws(word_exact("AND")), cut(|input| parse_not(input, depth + 1))))(input)?; + + let filter = if ands.is_empty() { + first_filter + } else { + ands.insert(0, first_filter); + FilterCondition::And(ands) + }; + + Ok((input, filter)) +} + +/// not = ("NOT" WS+ not) | primary +/// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. +/// If we parse a `NOT` we MUST parse something behind. +fn parse_not(input: Span, depth: usize) -> IResult { + if depth > MAX_FILTER_DEPTH { + return Err(nom::Err::Error(Error::new_from_kind(input, ErrorKind::DepthLimitReached))); + } + alt(( + map( + preceded(ws(word_exact("NOT")), cut(|input| parse_not(input, depth + 1))), + |e| match e { + FilterCondition::Not(e) => *e, + _ => FilterCondition::Not(Box::new(e)), + }, + ), + |input| parse_primary(input, depth + 1), + ))(input) +} + +/// geoRadius = WS* "_geoRadius(float WS* "," WS* float WS* "," WS* float) +/// If we parse `_geoRadius` we MUST parse the rest of the expression. 
+fn parse_geo_radius(input: Span) -> IResult { + // we want to allow space BEFORE the _geoRadius but not after + let parsed = preceded( + tuple((multispace0, word_exact("_geoRadius"))), + // if we were able to parse `_geoRadius` and can't parse the rest of the input we return a failure + cut(delimited(char('('), separated_list1(tag(","), ws(recognize_float)), char(')'))), + )(input) + .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::Geo))); + + let (input, args) = parsed?; + + if args.len() != 3 { + return Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::Geo))); + } + + let res = FilterCondition::GeoLowerThan { + point: [args[0].into(), args[1].into()], + radius: args[2].into(), + }; + Ok((input, res)) +} + +/// geoPoint = WS* "_geoPoint(float WS* "," WS* float WS* "," WS* float) +fn parse_geo_point(input: Span) -> IResult { + // we want to forbid space BEFORE the _geoPoint but not after + tuple(( + multispace0, + tag("_geoPoint"), + // if we were able to parse `_geoPoint` we are going to return a Failure whatever happens next. 
+ cut(delimited(char('('), separated_list1(tag(","), ws(recognize_float)), char(')'))), + ))(input) + .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))))?; + // if we succeeded we still return a `Failure` because geoPoints are not allowed + Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) +} + +fn parse_error_reserved_keyword(input: Span) -> IResult { + match parse_condition(input) { + Ok(result) => Ok(result), + Err(nom::Err::Error(inner) | nom::Err::Failure(inner)) => match inner.kind() { + ErrorKind::ExpectedValue(ExpectedValueKind::ReservedKeyword) => { + Err(nom::Err::Failure(inner)) + } + _ => Err(nom::Err::Error(inner)), + }, + Err(e) => Err(e), + } +} + +/// primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to +fn parse_primary(input: Span, depth: usize) -> IResult { + if depth > MAX_FILTER_DEPTH { + return Err(nom::Err::Error(Error::new_from_kind(input, ErrorKind::DepthLimitReached))); + } + alt(( + // if we find a first parenthesis, then we must parse an expression and find the closing parenthesis + delimited( + ws(char('(')), + cut(|input| parse_expression(input, depth + 1)), + cut_with_err(ws(char(')')), |c| { + Error::new_from_kind(input, ErrorKind::MissingClosingDelimiter(c.char())) + }), + ), + parse_geo_radius, + parse_in, + parse_not_in, + parse_condition, + parse_exists, + parse_not_exists, + parse_to, + // the next lines are only for error handling and are written at the end to have the less possible performance impact + parse_geo_point, + parse_error_reserved_keyword, + ))(input) + // if the inner parsers did not match enough information to return an accurate error + .map_err(|e| e.map_err(|_| Error::new_from_kind(input, ErrorKind::InvalidPrimary))) +} + +/// expression = or +pub fn parse_expression(input: Span, depth: usize) -> IResult { + parse_or(input, depth) +} + +/// filter = expression EOF +pub fn parse_filter(input: 
Span) -> IResult { + terminated(|input| parse_expression(input, 0), eof)(input) +} + +#[cfg(test)] +pub mod tests { + use super::*; + + /// Create a raw [Token]. You must specify the string that appear BEFORE your element followed by your element + pub fn rtok<'a>(before: &'a str, value: &'a str) -> Token<'a> { + // if the string is empty we still need to return 1 for the line number + let lines = before.is_empty().then(|| 1).unwrap_or_else(|| before.lines().count()); + let offset = before.chars().count(); + // the extra field is not checked in the tests so we can set it to nothing + unsafe { Span::new_from_raw_offset(offset, lines as u32, value, "") }.into() + } + + #[test] + fn parse() { + use FilterCondition as Fc; + + fn p(s: &str) -> impl std::fmt::Display + '_ { + Fc::parse(s).unwrap().unwrap() + } + + // Test equal + insta::assert_display_snapshot!(p("channel = Ponce"), @"{channel} = {Ponce}"); + insta::assert_display_snapshot!(p("subscribers = 12"), @"{subscribers} = {12}"); + insta::assert_display_snapshot!(p("channel = 'Mister Mv'"), @"{channel} = {Mister Mv}"); + insta::assert_display_snapshot!(p("channel = \"Mister Mv\""), @"{channel} = {Mister Mv}"); + insta::assert_display_snapshot!(p("'dog race' = Borzoi"), @"{dog race} = {Borzoi}"); + insta::assert_display_snapshot!(p("\"dog race\" = Chusky"), @"{dog race} = {Chusky}"); + insta::assert_display_snapshot!(p("\"dog race\" = \"Bernese Mountain\""), @"{dog race} = {Bernese Mountain}"); + insta::assert_display_snapshot!(p("'dog race' = 'Bernese Mountain'"), @"{dog race} = {Bernese Mountain}"); + insta::assert_display_snapshot!(p("\"dog race\" = 'Bernese Mountain'"), @"{dog race} = {Bernese Mountain}"); + + // Test IN + insta::assert_display_snapshot!(p("colour IN[]"), @"{colour} IN[]"); + insta::assert_display_snapshot!(p("colour IN[green]"), @"{colour} IN[{green}, ]"); + insta::assert_display_snapshot!(p("colour IN[green,]"), @"{colour} IN[{green}, ]"); + insta::assert_display_snapshot!(p("colour NOT 
IN[green,blue]"), @"NOT ({colour} IN[{green}, {blue}, ])"); + insta::assert_display_snapshot!(p(" colour IN [ green , blue , ]"), @"{colour} IN[{green}, {blue}, ]"); + + // Test IN + OR/AND/() + insta::assert_display_snapshot!(p(" colour IN [green, blue] AND color = green "), @"AND[{colour} IN[{green}, {blue}, ], {color} = {green}, ]"); + insta::assert_display_snapshot!(p("NOT (colour IN [green, blue]) AND color = green "), @"AND[NOT ({colour} IN[{green}, {blue}, ]), {color} = {green}, ]"); + insta::assert_display_snapshot!(p("x = 1 OR NOT (colour IN [green, blue] OR color = green) "), @"OR[{x} = {1}, NOT (OR[{colour} IN[{green}, {blue}, ], {color} = {green}, ]), ]"); + + // Test whitespace start/end + insta::assert_display_snapshot!(p(" colour = green "), @"{colour} = {green}"); + insta::assert_display_snapshot!(p(" (colour = green OR colour = red) "), @"OR[{colour} = {green}, {colour} = {red}, ]"); + insta::assert_display_snapshot!(p(" colour IN [green, blue] AND color = green "), @"AND[{colour} IN[{green}, {blue}, ], {color} = {green}, ]"); + insta::assert_display_snapshot!(p(" colour NOT IN [green, blue] "), @"NOT ({colour} IN[{green}, {blue}, ])"); + insta::assert_display_snapshot!(p(" colour IN [green, blue] "), @"{colour} IN[{green}, {blue}, ]"); + + // Test conditions + insta::assert_display_snapshot!(p("channel != ponce"), @"{channel} != {ponce}"); + insta::assert_display_snapshot!(p("NOT channel = ponce"), @"NOT ({channel} = {ponce})"); + insta::assert_display_snapshot!(p("subscribers < 1000"), @"{subscribers} < {1000}"); + insta::assert_display_snapshot!(p("subscribers > 1000"), @"{subscribers} > {1000}"); + insta::assert_display_snapshot!(p("subscribers <= 1000"), @"{subscribers} <= {1000}"); + insta::assert_display_snapshot!(p("subscribers >= 1000"), @"{subscribers} >= {1000}"); + insta::assert_display_snapshot!(p("subscribers <= 1000"), @"{subscribers} <= {1000}"); + insta::assert_display_snapshot!(p("subscribers 100 TO 1000"), @"{subscribers} {100} 
TO {1000}"); + + // Test NOT + EXISTS + insta::assert_display_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS"); + insta::assert_display_snapshot!(p("NOT subscribers < 1000"), @"NOT ({subscribers} < {1000})"); + insta::assert_display_snapshot!(p("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)"); + insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); + insta::assert_display_snapshot!(p("NOT subscribers NOT EXISTS"), @"{subscribers} EXISTS"); + insta::assert_display_snapshot!(p("subscribers NOT EXISTS"), @"NOT ({subscribers} EXISTS)"); + insta::assert_display_snapshot!(p("NOT subscribers 100 TO 1000"), @"NOT ({subscribers} {100} TO {1000})"); + + // Test nested NOT + insta::assert_display_snapshot!(p("NOT NOT NOT NOT x = 5"), @"{x} = {5}"); + insta::assert_display_snapshot!(p("NOT NOT (NOT NOT x = 5)"), @"{x} = {5}"); + + // Test geo radius + insta::assert_display_snapshot!(p("_geoRadius(12, 13, 14)"), @"_geoRadius({12}, {13}, {14})"); + insta::assert_display_snapshot!(p("NOT _geoRadius(12, 13, 14)"), @"NOT (_geoRadius({12}, {13}, {14}))"); + + // Test OR + AND + insta::assert_display_snapshot!(p("channel = ponce AND 'dog race' != 'bernese mountain'"), @"AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ]"); + insta::assert_display_snapshot!(p("channel = ponce OR 'dog race' != 'bernese mountain'"), @"OR[{channel} = {ponce}, {dog race} != {bernese mountain}, ]"); + insta::assert_display_snapshot!(p("channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000"), @"OR[AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ], {subscribers} > {1000}, ]"); + insta::assert_display_snapshot!( + p("channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000 OR colour = red OR colour = blue AND size = 7"), + @"OR[AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ], {subscribers} > {1000}, {colour} = {red}, AND[{colour} = {blue}, {size} = {7}, ], ]" + ); + + // Test 
parentheses + insta::assert_display_snapshot!(p("channel = ponce AND ( 'dog race' != 'bernese mountain' OR subscribers > 1000 )"), @"AND[{channel} = {ponce}, OR[{dog race} != {bernese mountain}, {subscribers} > {1000}, ], ]"); + insta::assert_display_snapshot!(p("(channel = ponce AND 'dog race' != 'bernese mountain' OR subscribers > 1000) AND _geoRadius(12, 13, 14)"), @"AND[OR[AND[{channel} = {ponce}, {dog race} != {bernese mountain}, ], {subscribers} > {1000}, ], _geoRadius({12}, {13}, {14}), ]"); + + // Test recursion + // This is the most that is allowed + insta::assert_display_snapshot!( + p("(((((((((((((((((((((((((((((((((((((((((((((((((x = 1)))))))))))))))))))))))))))))))))))))))))))))))))"), + @"{x} = {1}" + ); + insta::assert_display_snapshot!( + px = 1"), + @"NOT ({x} = {1})" + ); + + // Confusing keywords + insta::assert_display_snapshot!(p(r#"NOT "OR" EXISTS AND "EXISTS" NOT EXISTS"#), @"AND[NOT ({OR} EXISTS), NOT ({EXISTS} EXISTS), ]"); + } + + #[test] + fn error() { + use FilterCondition as Fc; + + fn p(s: &str) -> impl std::fmt::Display + '_ { + Fc::parse(s).unwrap_err().to_string() + } + + insta::assert_display_snapshot!(p("channel = Ponce = 12"), @r###" + Found unexpected characters at the end of the filter: `= 12`. You probably forgot an `OR` or an `AND` rule. + 17:21 channel = Ponce = 12 + "###); + + insta::assert_display_snapshot!(p("channel = "), @r###" + Was expecting a value but instead got nothing. + 14:14 channel = + "###); + + insta::assert_display_snapshot!(p("channel = 🐻"), @r###" + Was expecting a value but instead got `🐻`. + 11:12 channel = 🐻 + "###); + + insta::assert_display_snapshot!(p("channel = 🐻 AND followers < 100"), @r###" + Was expecting a value but instead got `🐻`. + 11:12 channel = 🐻 AND followers < 100 + "###); + + insta::assert_display_snapshot!(p("'OR'"), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `\'OR\'`. 
+ 1:5 'OR' + "###); + + insta::assert_display_snapshot!(p("OR"), @r###" + Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. + 1:3 OR + "###); + + insta::assert_display_snapshot!(p("channel Ponce"), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`. + 1:14 channel Ponce + "###); + + insta::assert_display_snapshot!(p("channel = Ponce OR"), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing. + 19:19 channel = Ponce OR + "###); + + insta::assert_display_snapshot!(p("_geoRadius"), @r###" + The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`. + 1:11 _geoRadius + "###); + + insta::assert_display_snapshot!(p("_geoRadius = 12"), @r###" + The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`. + 1:16 _geoRadius = 12 + "###); + + insta::assert_display_snapshot!(p("_geoPoint(12, 13, 14)"), @r###" + `_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates. + 1:22 _geoPoint(12, 13, 14) + "###); + + insta::assert_display_snapshot!(p("position <= _geoPoint(12, 13, 14)"), @r###" + `_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates. + 13:34 position <= _geoPoint(12, 13, 14) + "###); + + insta::assert_display_snapshot!(p("position <= _geoRadius(12, 13, 14)"), @r###" + The `_geoRadius` filter is an operation and can't be used as a value. 
+ 13:35 position <= _geoRadius(12, 13, 14) + "###); + + insta::assert_display_snapshot!(p("channel = 'ponce"), @r###" + Expression `\'ponce` is missing the following closing delimiter: `'`. + 11:17 channel = 'ponce + "###); + + insta::assert_display_snapshot!(p("channel = \"ponce"), @r###" + Expression `\"ponce` is missing the following closing delimiter: `"`. + 11:17 channel = "ponce + "###); + + insta::assert_display_snapshot!(p("channel = mv OR (followers >= 1000"), @r###" + Expression `(followers >= 1000` is missing the following closing delimiter: `)`. + 17:35 channel = mv OR (followers >= 1000 + "###); + + insta::assert_display_snapshot!(p("channel = mv OR followers >= 1000)"), @r###" + Found unexpected characters at the end of the filter: `)`. You probably forgot an `OR` or an `AND` rule. + 34:35 channel = mv OR followers >= 1000) + "###); + + insta::assert_display_snapshot!(p("colour NOT EXIST"), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`. + 1:17 colour NOT EXIST + "###); + + insta::assert_display_snapshot!(p("subscribers 100 TO1000"), @r###" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `subscribers 100 TO1000`. + 1:23 subscribers 100 TO1000 + "###); + + insta::assert_display_snapshot!(p("channel = ponce ORdog != 'bernese mountain'"), @r###" + Found unexpected characters at the end of the filter: `ORdog != \'bernese mountain\'`. You probably forgot an `OR` or an `AND` rule. + 17:44 channel = ponce ORdog != 'bernese mountain' + "###); + + insta::assert_display_snapshot!(p("colour IN blue, green]"), @r###" + Expected `[` after `IN` keyword. + 11:23 colour IN blue, green] + "###); + + insta::assert_display_snapshot!(p("colour IN [blue, green, 'blue' > 2]"), @r###" + Expected only comma-separated field names inside `IN[..]` but instead found `> 2]`. 
+ 32:36 colour IN [blue, green, 'blue' > 2] + "###); + + insta::assert_display_snapshot!(p("colour IN [blue, green, AND]"), @r###" + Expected only comma-separated field names inside `IN[..]` but instead found `AND]`. + 25:29 colour IN [blue, green, AND] + "###); + + insta::assert_display_snapshot!(p("colour IN [blue, green"), @r###" + Expected matching `]` after the list of field names given to `IN[` + 23:23 colour IN [blue, green + "###); + + insta::assert_display_snapshot!(p("colour IN ['blue, green"), @r###" + Expression `\'blue, green` is missing the following closing delimiter: `'`. + 12:24 colour IN ['blue, green + "###); + + insta::assert_display_snapshot!(p("x = EXISTS"), @r###" + Was expecting a value but instead got `EXISTS`, which is a reserved keyword. To use `EXISTS` as a field name or a value, surround it by quotes. + 5:11 x = EXISTS + "###); + + insta::assert_display_snapshot!(p("AND = 8"), @r###" + Was expecting a value but instead got `AND`, which is a reserved keyword. To use `AND` as a field name or a value, surround it by quotes. + 1:4 AND = 8 + "###); + + insta::assert_display_snapshot!(p("((((((((((((((((((((((((((((((((((((((((((((((((((x = 1))))))))))))))))))))))))))))))))))))))))))))))))))"), @r###" + The filter exceeded the maximum depth limit. Try rewriting the filter so that it contains fewer nested conditions. + 51:106 ((((((((((((((((((((((((((((((((((((((((((((((((((x = 1)))))))))))))))))))))))))))))))))))))))))))))))))) + "###); + + insta::assert_display_snapshot!( + px = 1"), + @r###" + The filter exceeded the maximum depth limit. Try rewriting the filter so that it contains fewer nested conditionsx = 1 + "### + ); + + insta::assert_display_snapshot!(p(r#"NOT OR EXISTS AND EXISTS NOT EXISTS"#), @r###" + Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. 
+ 5:7 NOT OR EXISTS AND EXISTS NOT EXISTS + "###); + } + + #[test] + fn depth() { + let filter = FilterCondition::parse("account_ids=1 OR account_ids=2 OR account_ids=3 OR account_ids=4 OR account_ids=5 OR account_ids=6").unwrap().unwrap(); + assert!(filter.token_at_depth(1).is_some()); + assert!(filter.token_at_depth(2).is_none()); + + let filter = FilterCondition::parse("(account_ids=1 OR (account_ids=2 AND account_ids=3) OR (account_ids=4 AND account_ids=5) OR account_ids=6)").unwrap().unwrap(); + assert!(filter.token_at_depth(2).is_some()); + assert!(filter.token_at_depth(3).is_none()); + + let filter = FilterCondition::parse("account_ids=1 OR account_ids=2 AND account_ids=3 OR account_ids=4 AND account_ids=5 OR account_ids=6").unwrap().unwrap(); + assert!(filter.token_at_depth(2).is_some()); + assert!(filter.token_at_depth(3).is_none()); + } + + #[test] + fn token_from_str() { + let s = "test string that should not be parsed"; + let token: Token = s.into(); + assert_eq!(token.value(), s); + } +} + +impl<'a> std::fmt::Display for FilterCondition<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + FilterCondition::Not(filter) => { + write!(f, "NOT ({filter})") + } + FilterCondition::Condition { fid, op } => { + write!(f, "{fid} {op}") + } + FilterCondition::In { fid, els } => { + write!(f, "{fid} IN[")?; + for el in els { + write!(f, "{el}, ")?; + } + write!(f, "]") + } + FilterCondition::Or(els) => { + write!(f, "OR[")?; + for el in els { + write!(f, "{el}, ")?; + } + write!(f, "]") + } + FilterCondition::And(els) => { + write!(f, "AND[")?; + for el in els { + write!(f, "{el}, ")?; + } + write!(f, "]") + } + FilterCondition::GeoLowerThan { point, radius } => { + write!(f, "_geoRadius({}, {}, {})", point[0], point[1], radius) + } + } + } +} +impl<'a> std::fmt::Display for Condition<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Condition::GreaterThan(token) => write!(f, "> 
{token}"), + Condition::GreaterThanOrEqual(token) => write!(f, ">= {token}"), + Condition::Equal(token) => write!(f, "= {token}"), + Condition::NotEqual(token) => write!(f, "!= {token}"), + Condition::Exists => write!(f, "EXISTS"), + Condition::LowerThan(token) => write!(f, "< {token}"), + Condition::LowerThanOrEqual(token) => write!(f, "<= {token}"), + Condition::Between { from, to } => write!(f, "{from} TO {to}"), + } + } +} +impl<'a> std::fmt::Display for Token<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{{{}}}", self.value()) + } +} diff --git a/filter-parser/src/main.rs b/filter-parser/src/main.rs new file mode 100644 index 000000000..15ab86188 --- /dev/null +++ b/filter-parser/src/main.rs @@ -0,0 +1,16 @@ +fn main() { + let input = std::env::args().nth(1).expect("You must provide a filter to test"); + + println!("Trying to execute the following filter:\n{}\n", input); + + match filter_parser::FilterCondition::parse(&input) { + Ok(filter) => { + println!("✅ Valid filter"); + println!("{:#?}", filter); + } + Err(e) => { + println!("❎ Invalid filter"); + println!("{}", e); + } + } +} diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs new file mode 100644 index 000000000..73ef61480 --- /dev/null +++ b/filter-parser/src/value.rs @@ -0,0 +1,341 @@ +use nom::branch::alt; +use nom::bytes::complete::{take_till, take_while, take_while1}; +use nom::character::complete::{char, multispace0}; +use nom::combinator::cut; +use nom::sequence::{delimited, terminated}; +use nom::{InputIter, InputLength, InputTake, Slice}; + +use crate::error::{ExpectedValueKind, NomErrorExt}; +use crate::{parse_geo_point, parse_geo_radius, Error, ErrorKind, IResult, Span, Token}; + +/// This function goes through all characters in the [Span] if it finds any escaped character (`\`). +/// It generates a new string with all `\` removed from the [Span]. 
+fn unescape(buf: Span, char_to_escape: char) -> String { + let to_escape = format!("\\{}", char_to_escape); + buf.replace(&to_escape, &char_to_escape.to_string()) +} + +/// Parse a quoted value. If it encounters an escaped quote, it unescapes it. +fn quoted_by(quote: char, input: Span) -> IResult { + // empty fields / values are valid in json + if input.is_empty() { + return Ok((input.slice(input.input_len()..), input.into())); + } + + let mut escaped = false; + let mut i = input.iter_indices(); + + while let Some((idx, c)) = i.next() { + if c == quote { + let (rem, output) = input.take_split(idx); + return Ok((rem, Token::new(output, escaped.then(|| unescape(output, quote))))); + } else if c == '\\' { + if let Some((_, c)) = i.next() { + escaped |= c == quote; + } else { + return Err(nom::Err::Error(Error::new_from_kind( + input, + ErrorKind::MalformedValue, + ))); + } + } + // if it was preceded by a `\` or if it was anything else we can continue to advance + } + + Ok(( + input.slice(input.input_len()..), + Token::new(input, escaped.then(|| unescape(input, quote))), + )) +} + +// word = (alphanumeric | _ | - | .)+ except for reserved keywords +pub fn word_not_keyword<'a>(input: Span<'a>) -> IResult> { + let (input, word): (_, Token<'a>) = + take_while1(is_value_component)(input).map(|(s, t)| (s, t.into()))?; + if is_keyword(word.value()) { + return Err(nom::Err::Error(Error::new_from_kind( + input, + ErrorKind::ReservedKeyword(word.value().to_owned()), + ))); + } + Ok((input, word)) +} + +// word = {tag} +pub fn word_exact<'a, 'b: 'a>(tag: &'b str) -> impl Fn(Span<'a>) -> IResult<'a, Token<'a>> { + move |input| { + let (input, word): (_, Token<'a>) = + take_while1(is_value_component)(input).map(|(s, t)| (s, t.into()))?; + if word.value() == tag { + Ok((input, word)) + } else { + Err(nom::Err::Error(Error::new_from_kind( + input, + ErrorKind::InternalError(nom::error::ErrorKind::Tag), + ))) + } + } +} + +/// value = WS* ( word | singleQuoted | doubleQuoted) 
WS+ +pub fn parse_value(input: Span) -> IResult { + // to get a better diagnostic message we are going to strip the leading whitespace from the input right now + let (input, _) = take_while(char::is_whitespace)(input)?; + + // then, we want to check if the user is misusing a geo expression + // This expression can’t finish without error. + // We want to return an error in case of failure. + if let Err(err) = parse_geo_point(input) { + if err.is_failure() { + return Err(err); + } + } + match parse_geo_radius(input) { + Ok(_) => return Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::MisusedGeo))), + // if we encountered a failure it means the user badly wrote a _geoRadius filter. + // But instead of showing them how to fix their syntax we are going to tell them they should not use this filter as a value. + Err(e) if e.is_failure() => { + return Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::MisusedGeo))) + } + _ => (), + } + + // this parser is only used when an error is encountered and it parses the + // largest string possible that does not contain any “language” syntax. + // If we try to parse `name = 🦀 AND language = rust` we want to return an + // error saying we could not parse `🦀`. Not that no value was found or that + // we could not parse `🦀 AND language = rust`. 
+ // we want to remove the space before entering the alt because if we don't, + // when we create the errors from the output of the alt we have spaces everywhere + let error_word = take_till::<_, _, Error>(is_syntax_component); + + let (input, value) = terminated( + alt(( + delimited(char('\''), cut(|input| quoted_by('\'', input)), cut(char('\''))), + delimited(char('"'), cut(|input| quoted_by('"', input)), cut(char('"'))), + word_not_keyword, + )), + multispace0, + )(input) + // if we found nothing in the alt it means the user specified something that was not recognized as a value + .map_err(|e: nom::Err| { + e.map_err(|error| { + let expected_value_kind = if matches!(error.kind(), ErrorKind::ReservedKeyword(_)) { + ExpectedValueKind::ReservedKeyword + } else { + ExpectedValueKind::Other + }; + Error::new_from_kind( + error_word(input).unwrap().1, + ErrorKind::ExpectedValue(expected_value_kind), + ) + }) + }) + .map_err(|e| { + e.map_fail(|failure| { + // if we encountered a char failure it means the user had an unmatched quote + if matches!(failure.kind(), ErrorKind::Char(_)) { + Error::new_from_kind(input, ErrorKind::MissingClosingDelimiter(failure.char())) + } else { + // else we leave the failure untouched + failure + } + }) + })?; + + Ok((input, value)) +} + +fn is_value_component(c: char) -> bool { + c.is_alphanumeric() || ['_', '-', '.'].contains(&c) +} + +fn is_syntax_component(c: char) -> bool { + c.is_whitespace() || ['(', ')', '=', '<', '>', '!'].contains(&c) +} + +fn is_keyword(s: &str) -> bool { + matches!(s, "AND" | "OR" | "IN" | "NOT" | "TO" | "EXISTS" | "_geoRadius") +} + +#[cfg(test)] +pub mod test { + use nom::Finish; + + use super::*; + use crate::tests::rtok; + + #[test] + fn test_span() { + let test_case = [ + ("channel", rtok("", "channel")), + (".private", rtok("", ".private")), + ("I-love-kebab", rtok("", "I-love-kebab")), + ("but_snakes_is_also_good", rtok("", "but_snakes_is_also_good")), + ("parens(", rtok("", "parens")), + 
("parens)", rtok("", "parens")), + ("not!", rtok("", "not")), + (" channel", rtok(" ", "channel")), + ("channel ", rtok("", "channel")), + (" channel ", rtok(" ", "channel")), + ("'channel'", rtok("'", "channel")), + ("\"channel\"", rtok("\"", "channel")), + ("'cha)nnel'", rtok("'", "cha)nnel")), + ("'cha\"nnel'", rtok("'", "cha\"nnel")), + ("\"cha'nnel\"", rtok("\"", "cha'nnel")), + ("\" some spaces \"", rtok("\"", " some spaces ")), + ("\"cha'nnel\"", rtok("'", "cha'nnel")), + ("\"cha'nnel\"", rtok("'", "cha'nnel")), + ("I'm tamo", rtok("'m tamo", "I")), + ("\"I'm \\\"super\\\" tamo\"", rtok("\"", "I'm \\\"super\\\" tamo")), + ]; + + for (input, expected) in test_case { + let input = Span::new_extra(input, input); + let result = parse_value(input); + + assert!( + result.is_ok(), + "Filter `{:?}` was supposed to be parsed but failed with the following error: `{}`", + expected, + result.unwrap_err() + ); + let token = result.unwrap().1; + assert_eq!(token, expected, "Filter `{}` failed.", input); + } + } + + #[test] + fn test_escape_inside_double_quote() { + // (input, remaining, expected output token, output value) + let test_case = [ + ("aaaa", "", rtok("", "aaaa"), "aaaa"), + (r#"aa"aa"#, r#""aa"#, rtok("", "aa"), "aa"), + (r#"aa\"aa"#, r#""#, rtok("", r#"aa\"aa"#), r#"aa"aa"#), + (r#"aa\\\aa"#, r#""#, rtok("", r#"aa\\\aa"#), r#"aa\\\aa"#), + (r#"aa\\"\aa"#, r#""\aa"#, rtok("", r#"aa\\"#), r#"aa\\"#), + (r#"aa\\\"\aa"#, r#""#, rtok("", r#"aa\\\"\aa"#), r#"aa\\"\aa"#), + (r#"\"\""#, r#""#, rtok("", r#"\"\""#), r#""""#), + ]; + + for (input, remaining, expected_tok, expected_val) in test_case { + let span = Span::new_extra(input, ""); + let result = quoted_by('"', span); + assert!(result.is_ok()); + + let (rem, output) = result.unwrap(); + assert_eq!(rem.to_string(), remaining); + assert_eq!(output, expected_tok); + assert_eq!(output.value(), expected_val.to_string()); + } + } + + #[test] + fn test_unescape() { + // double quote + assert_eq!( + 
unescape(Span::new_extra(r#"Hello \"World\""#, ""), '"'), + r#"Hello "World""#.to_string() + ); + assert_eq!( + unescape(Span::new_extra(r#"Hello \\\"World\\\""#, ""), '"'), + r#"Hello \\"World\\""#.to_string() + ); + // simple quote + assert_eq!( + unescape(Span::new_extra(r#"Hello \'World\'"#, ""), '\''), + r#"Hello 'World'"#.to_string() + ); + assert_eq!( + unescape(Span::new_extra(r#"Hello \\\'World\\\'"#, ""), '\''), + r#"Hello \\'World\\'"#.to_string() + ); + } + + #[test] + fn test_value() { + let test_case = [ + // (input, expected value, if a string was generated to hold the new value) + ("channel", "channel", false), + // All the base test, no escaped string should be generated + (".private", ".private", false), + ("I-love-kebab", "I-love-kebab", false), + ("but_snakes_is_also_good", "but_snakes_is_also_good", false), + ("parens(", "parens", false), + ("parens)", "parens", false), + ("not!", "not", false), + (" channel", "channel", false), + ("channel ", "channel", false), + (" channel ", "channel", false), + ("'channel'", "channel", false), + ("\"channel\"", "channel", false), + ("'cha)nnel'", "cha)nnel", false), + ("'cha\"nnel'", "cha\"nnel", false), + ("\"cha'nnel\"", "cha'nnel", false), + ("\" some spaces \"", " some spaces ", false), + ("\"cha'nnel\"", "cha'nnel", false), + ("\"cha'nnel\"", "cha'nnel", false), + ("I'm tamo", "I", false), + // escaped thing but not quote + (r#""\\""#, r#"\\"#, false), + (r#""\\\\\\""#, r#"\\\\\\"#, false), + (r#""aa\\aa""#, r#"aa\\aa"#, false), + // with double quote + (r#""Hello \"world\"""#, r#"Hello "world""#, true), + (r#""Hello \\\"world\\\"""#, r#"Hello \\"world\\""#, true), + (r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true), + (r#""\"\"""#, r#""""#, true), + // with simple quote + (r#"'Hello \'world\''"#, r#"Hello 'world'"#, true), + (r#"'Hello \\\'world\\\''"#, r#"Hello \\'world\\'"#, true), + (r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true), + (r#"'\'\''"#, r#"''"#, true), + ]; + + for (input, 
expected, escaped) in test_case { + let input = Span::new_extra(input, input); + let result = parse_value(input); + + assert!( + result.is_ok(), + "Filter `{:?}` was supposed to be parsed but failed with the following error: `{}`", + expected, + result.unwrap_err() + ); + let token = result.unwrap().1; + assert_eq!( + token.value.is_some(), + escaped, + "Filter `{}` was not supposed to be escaped", + input + ); + assert_eq!(token.value(), expected, "Filter `{}` failed.", input); + } + } + + #[test] + fn diagnostic() { + let test_case = [ + ("🦀", "🦀"), + (" 🦀", "🦀"), + ("🦀 AND crab = truc", "🦀"), + ("🦀_in_name", "🦀_in_name"), + (" (name = ...", ""), + ]; + + for (input, expected) in test_case { + let input = Span::new_extra(input, input); + let result = parse_value(input); + + assert!( + result.is_err(), + "Filter `{}` wasn’t supposed to be parsed but it did with the following result: `{:?}`", + expected, + result.unwrap() + ); + // get the inner string referenced in the error + let value = *result.finish().unwrap_err().context().fragment(); + assert_eq!(value, expected, "Filter `{}` was supposed to fail with the following value: `{}`, but it failed with: `{}`.", input, expected, value); + } + } +} diff --git a/flatten-serde-json/Cargo.toml b/flatten-serde-json/Cargo.toml new file mode 100644 index 000000000..2fb668f86 --- /dev/null +++ b/flatten-serde-json/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "flatten-serde-json" +version = "0.39.0" +edition = "2021" +description = "Flatten serde-json objects like elastic search" +readme = "README.md" +publish = false + +[dependencies] +serde_json = "1.0" + +[dev-dependencies] +criterion = { version = "0.4.0", features = ["html_reports"] } + +[[bench]] +name = "benchmarks" +harness = false diff --git a/flatten-serde-json/README.md b/flatten-serde-json/README.md new file mode 100644 index 000000000..a1dd7d275 --- /dev/null +++ b/flatten-serde-json/README.md @@ -0,0 +1,153 @@ +# Flatten serde Json + +This crate flatten 
[`serde_json`](https://docs.rs/serde_json/latest/serde_json/) `Object` in a format +similar to [elastic search](https://www.elastic.co/guide/en/elasticsearch/reference/current/nested.html). + +## Examples + +### There is nothing to do + +```json +{ + "id": "287947", + "title": "Shazam!", + "release_date": 1553299200, + "genres": [ + "Action", + "Comedy", + "Fantasy" + ] +} +``` + +Flattens to: +```json +{ + "id": "287947", + "title": "Shazam!", + "release_date": 1553299200, + "genres": [ + "Action", + "Comedy", + "Fantasy" + ] +} +``` + +------------ + +### Objects + +```json +{ + "a": { + "b": "c", + "d": "e", + "f": "g" + } +} +``` + +Flattens to: +```json +{ + "a.b": "c", + "a.d": "e", + "a.f": "g" +} +``` + +------------ + +### Array of objects + +```json +{ + "a": [ + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + ] +} +``` + +Flattens to: +```json +{ + "a.b": ["c", "d", "e"], +} +``` + +------------ + +### Array of objects with normal value in the array + +```json +{ + "a": [ + 42, + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + ] +} +``` + +Flattens to: +```json +{ + "a": 42, + "a.b": ["c", "d", "e"], +} +``` + +------------ + +### Array of objects of array of objects of ... 
+ +```json +{ + "a": [ + "b", + ["c", "d"], + { "e": ["f", "g"] }, + [ + { "h": "i" }, + { "e": ["j", { "z": "y" }] }, + ], + ["l"], + "m", + ] +} +``` + +Flattens to: +```json +{ + "a": ["b", "c", "d", "l", "m"], + "a.e": ["f", "g", "j"], + "a.h": "i", + "a.e.z": "y", +} +``` + +------------ + +### Collision between a generated field name and an already existing field + +```json +{ + "a": { + "b": "c", + }, + "a.b": "d", +} +``` + +Flattens to: +```json +{ + "a.b": ["c", "d"], +} +``` + diff --git a/flatten-serde-json/benches/benchmarks.rs b/flatten-serde-json/benches/benchmarks.rs new file mode 100644 index 000000000..6536bb513 --- /dev/null +++ b/flatten-serde-json/benches/benchmarks.rs @@ -0,0 +1,42 @@ +use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; +use flatten_serde_json::flatten; +use serde_json::json; + +pub fn flatten_simple(c: &mut Criterion) { + let mut input = json!({ + "a": { + "b": "c", + "d": "e", + "f": "g" + } + }); + let object = input.as_object_mut().unwrap(); + + c.bench_with_input(BenchmarkId::new("flatten", "simple"), &object, |b, input| { + b.iter(|| flatten(input)) + }); +} + +pub fn flatten_complex(c: &mut Criterion) { + let mut input = json!({ + "a": [ + "b", + ["c", "d"], + { "e": ["f", "g"] }, + [ + { "h": "i" }, + { "e": ["j", { "z": "y" }] }, + ], + ["l"], + "m", + ] + }); + let object = input.as_object_mut().unwrap(); + + c.bench_with_input(BenchmarkId::new("flatten", "complex"), &object, |b, input| { + b.iter(|| flatten(input)) + }); +} + +criterion_group!(benches, flatten_simple, flatten_complex); +criterion_main!(benches); diff --git a/flatten-serde-json/fuzz/Cargo.toml b/flatten-serde-json/fuzz/Cargo.toml new file mode 100644 index 000000000..52b514785 --- /dev/null +++ b/flatten-serde-json/fuzz/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "flatten-serde-json-fuzz" +version = "0.0.0" +authors = ["Automatically generated"] +publish = false +edition = "2018" + +[package.metadata] +cargo-fuzz = true + 
+[dependencies] +libfuzzer-sys = "0.4" +arbitrary-json = "0.1.1" +json-depth-checker = { path = "../../json-depth-checker" } + +[dependencies.flatten-serde-json] +path = ".." + +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[[bin]] +name = "flatten" +path = "fuzz_targets/flatten.rs" +test = false +doc = false diff --git a/flatten-serde-json/fuzz/fuzz_targets/flatten.rs b/flatten-serde-json/fuzz/fuzz_targets/flatten.rs new file mode 100644 index 000000000..97969dfab --- /dev/null +++ b/flatten-serde-json/fuzz/fuzz_targets/flatten.rs @@ -0,0 +1,12 @@ +#![no_main] +use arbitrary_json::ArbitraryObject; +use flatten_serde_json::flatten; +use json_depth_checker::should_flatten_from_value; +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|object: ArbitraryObject| { + let object = flatten(&object); + if !object.is_empty() { + assert!(object.values().any(|value| !should_flatten_from_value(value))); + } +}); diff --git a/flatten-serde-json/src/lib.rs b/flatten-serde-json/src/lib.rs new file mode 100644 index 000000000..e1b2b20c7 --- /dev/null +++ b/flatten-serde-json/src/lib.rs @@ -0,0 +1,305 @@ +#![doc = include_str!("../README.md")] + +use serde_json::{Map, Value}; + +pub fn flatten(json: &Map) -> Map { + let mut obj = Map::new(); + let mut all_keys = vec![]; + insert_object(&mut obj, None, json, &mut all_keys); + for key in all_keys { + obj.entry(key).or_insert(Value::Array(vec![])); + } + obj +} + +fn insert_object( + base_json: &mut Map, + base_key: Option<&str>, + object: &Map, + all_keys: &mut Vec, +) { + for (key, value) in object { + let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}")); + all_keys.push(new_key.clone()); + if let Some(array) = value.as_array() { + insert_array(base_json, &new_key, array, all_keys); + } else if let Some(object) = value.as_object() { + insert_object(base_json, Some(&new_key), object, all_keys); + } else { + insert_value(base_json, &new_key, value.clone()); + } + } 
+} + +fn insert_array( + base_json: &mut Map, + base_key: &str, + array: &Vec, + all_keys: &mut Vec, +) { + for value in array { + if let Some(object) = value.as_object() { + insert_object(base_json, Some(base_key), object, all_keys); + } else if let Some(sub_array) = value.as_array() { + insert_array(base_json, base_key, sub_array, all_keys); + } else { + insert_value(base_json, base_key, value.clone()); + } + } +} + +fn insert_value(base_json: &mut Map, key: &str, to_insert: Value) { + debug_assert!(!to_insert.is_object()); + debug_assert!(!to_insert.is_array()); + + // does the field already exists? + if let Some(value) = base_json.get_mut(key) { + // is it already an array + if let Some(array) = value.as_array_mut() { + array.push(to_insert); + // or is there a collision + } else { + let value = std::mem::take(value); + base_json[key] = Value::Array(vec![value, to_insert]); + } + // if it does not exist we can push the value untouched + } else { + base_json.insert(key.to_string(), to_insert); + } +} + +#[cfg(test)] +mod tests { + use serde_json::json; + + use super::*; + + #[test] + fn no_flattening() { + let mut base: Value = json!({ + "id": "287947", + "title": "Shazam!", + "release_date": 1553299200, + "genres": [ + "Action", + "Comedy", + "Fantasy" + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + println!( + "got:\n{}\nexpected:\n{}\n", + serde_json::to_string_pretty(&flat).unwrap(), + serde_json::to_string_pretty(&json).unwrap() + ); + + assert_eq!(flat, json); + } + + #[test] + fn flatten_object() { + let mut base: Value = json!({ + "a": { + "b": "c", + "d": "e", + "f": "g" + } + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": [], + "a.b": "c", + "a.d": "e", + "a.f": "g" + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn flatten_array() { + let mut base: Value = json!({ + "a": [ + 1, + "b", + [], + [{}], + { 
"b": "c" }, + { "b": "d" }, + { "b": "e" }, + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": [1, "b"], + "a.b": ["c", "d", "e"], + }) + .as_object() + .unwrap() + ); + + // here we must keep 42 in "a" + let mut base: Value = json!({ + "a": [ + 42, + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": 42, + "a.b": ["c", "d", "e"], + }) + .as_object() + .unwrap() + ); + + // here we must keep 42 in "a" + let mut base: Value = json!({ + "a": [ + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + null, + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": null, + "a.b": ["c", "d", "e"], + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn collision_with_object() { + let mut base: Value = json!({ + "a": { + "b": "c", + }, + "a.b": "d", + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": [], + "a.b": ["c", "d"], + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn collision_with_array() { + let mut base: Value = json!({ + "a": [ + { "b": "c" }, + { "b": "d", "c": "e" }, + [35], + ], + "a.b": "f", + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a.b": ["c", "d", "f"], + "a.c": "e", + "a": 35, + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn flatten_nested_arrays() { + let mut base: Value = json!({ + "a": [ + ["b", "c"], + { "d": "e" }, + ["f", "g"], + [ + { "h": "i" }, + { "d": "j" }, + ], + ["k", "l"], + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": ["b", "c", "f", "g", "k", "l"], + "a.d": 
["e", "j"], + "a.h": "i", + }) + .as_object() + .unwrap() + ); + } + + #[test] + fn flatten_nested_arrays_and_objects() { + let mut base: Value = json!({ + "a": [ + "b", + ["c", "d"], + { "e": ["f", "g"] }, + [ + { "h": "i" }, + { "e": ["j", { "z": "y" }] }, + ], + ["l"], + "m", + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + println!("{}", serde_json::to_string_pretty(&flat).unwrap()); + + assert_eq!( + &flat, + json!({ + "a": ["b", "c", "d", "l", "m"], + "a.e": ["f", "g", "j"], + "a.h": "i", + "a.e.z": "y", + }) + .as_object() + .unwrap() + ); + } +} diff --git a/flatten-serde-json/src/main.rs b/flatten-serde-json/src/main.rs new file mode 100644 index 000000000..dabb386f1 --- /dev/null +++ b/flatten-serde-json/src/main.rs @@ -0,0 +1,11 @@ +use std::io::stdin; + +use flatten_serde_json::flatten; +use serde_json::{Map, Value}; + +fn main() { + let json: Map = serde_json::from_reader(stdin()).unwrap(); + + let result = flatten(&json); + println!("{}", serde_json::to_string_pretty(&result).unwrap()); +} diff --git a/json-depth-checker/Cargo.toml b/json-depth-checker/Cargo.toml new file mode 100644 index 000000000..feb245e5e --- /dev/null +++ b/json-depth-checker/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "json-depth-checker" +version = "0.39.0" +edition = "2021" +description = "A library that indicates if a JSON must be flattened" +publish = false + +[dependencies] +serde_json = "1.0" + +[dev-dependencies] +criterion = "0.4.0" + +[[bench]] +name = "depth" +harness = false diff --git a/json-depth-checker/benches/depth.rs b/json-depth-checker/benches/depth.rs new file mode 100644 index 000000000..e11bc1a68 --- /dev/null +++ b/json-depth-checker/benches/depth.rs @@ -0,0 +1,59 @@ +use criterion::{criterion_group, criterion_main, Criterion}; +use json_depth_checker::should_flatten_from_unchecked_slice; +use serde_json::json; + +fn criterion_benchmark(c: &mut Criterion) { + let null = 
serde_json::to_vec(&json!(null)).unwrap(); + let bool_true = serde_json::to_vec(&json!(true)).unwrap(); + let bool_false = serde_json::to_vec(&json!(false)).unwrap(); + let integer = serde_json::to_vec(&json!(42)).unwrap(); + let float = serde_json::to_vec(&json!(1456.258)).unwrap(); + let string = serde_json::to_vec(&json!("hello world")).unwrap(); + let object = serde_json::to_vec(&json!({ "hello": "world",})).unwrap(); + let complex_object = serde_json::to_vec(&json!({ + "doggos": [ + { "bernard": true }, + { "michel": 42 }, + false, + ], + "bouvier": true, + "caniche": null, + })) + .unwrap(); + let simple_array = serde_json::to_vec(&json!([ + 1, + 2, + 3, + "viva", + "l\"algeria", + true, + "[array]", + "escaped string \"" + ])) + .unwrap(); + let array_of_array = serde_json::to_vec(&json!([1, [2, [3]]])).unwrap(); + let array_of_object = serde_json::to_vec(&json!([1, [2, [3]], {}])).unwrap(); + + c.bench_function("null", |b| b.iter(|| should_flatten_from_unchecked_slice(&null))); + c.bench_function("true", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_true))); + c.bench_function("false", |b| b.iter(|| should_flatten_from_unchecked_slice(&bool_false))); + c.bench_function("integer", |b| b.iter(|| should_flatten_from_unchecked_slice(&integer))); + c.bench_function("float", |b| b.iter(|| should_flatten_from_unchecked_slice(&float))); + c.bench_function("string", |b| b.iter(|| should_flatten_from_unchecked_slice(&string))); + c.bench_function("object", |b| b.iter(|| should_flatten_from_unchecked_slice(&object))); + c.bench_function("complex object", |b| { + b.iter(|| should_flatten_from_unchecked_slice(&complex_object)) + }); + c.bench_function("simple array", |b| { + b.iter(|| should_flatten_from_unchecked_slice(&simple_array)) + }); + c.bench_function("array of array", |b| { + b.iter(|| should_flatten_from_unchecked_slice(&array_of_array)) + }); + c.bench_function("array of object", |b| { + b.iter(|| 
should_flatten_from_unchecked_slice(&array_of_object)) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/json-depth-checker/fuzz/Cargo.toml b/json-depth-checker/fuzz/Cargo.toml new file mode 100644 index 000000000..e36657ec2 --- /dev/null +++ b/json-depth-checker/fuzz/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "json-depth-checker" +version = "0.0.0" +authors = ["Automatically generated"] +publish = false +edition = "2018" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" +arbitrary-json = "0.1.1" +serde_json = "1.0.79" + +[dependencies.json-depth-checker] +path = ".." + +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[[bin]] +name = "depth" +path = "fuzz_targets/depth.rs" +test = false +doc = false diff --git a/json-depth-checker/fuzz/fuzz_targets/depth.rs b/json-depth-checker/fuzz/fuzz_targets/depth.rs new file mode 100644 index 000000000..6c3a6efe7 --- /dev/null +++ b/json-depth-checker/fuzz/fuzz_targets/depth.rs @@ -0,0 +1,13 @@ +#![no_main] +use arbitrary_json::ArbitraryValue; +use json_depth_checker::*; +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|value: ArbitraryValue| { + let value = serde_json::Value::from(value); + let left = should_flatten_from_value(&value); + let value = serde_json::to_vec(&value).unwrap(); + let right = should_flatten_from_unchecked_slice(&value); + + assert_eq!(left, right); +}); diff --git a/json-depth-checker/src/lib.rs b/json-depth-checker/src/lib.rs new file mode 100644 index 000000000..d571a0ca5 --- /dev/null +++ b/json-depth-checker/src/lib.rs @@ -0,0 +1,114 @@ +use serde_json::Value; + +/// Your json MUST BE valid and generated by `serde_json::to_vec` before being +/// sent in this function. This function is DUMB and FAST but makes a lot of +/// asumption about the way `serde_json` will generate its input. 
+/// +/// Will return `true` if the JSON contains an object, an array of array +/// or an array containing an object. Returns `false` for everything else. +pub fn should_flatten_from_unchecked_slice(json: &[u8]) -> bool { + if json.is_empty() { + return false; + } + + // since the json we receive has been generated by serde_json we know + // it doesn't contains any whitespace at the beginning thus we can check + // directly if we're looking at an object. + if json[0] == b'{' { + return true; + } else if json[0] != b'[' { + // if the json isn't an object or an array it means it's a simple value. + return false; + } + + // The array case is a little bit more complex. We are looking for a second + // `[` but we need to ensure that it doesn't appear inside of a string. Thus + // we need to keep track of if we're in a string or not. + + // will be used when we met a `\` to skip the next character. + let mut skip_next = false; + let mut in_string = false; + + for byte in json.iter().skip(1) { + match byte { + // handle the backlash. + _ if skip_next => skip_next = false, + b'\\' => skip_next = true, + + // handle the strings. + byte if in_string => { + if *byte == b'"' { + in_string = false; + } + } + b'"' => in_string = true, + + // handle the arrays. + b'[' => return true, + // since we know the json is valid we don't need to ensure the + // array is correctly closed + + // handle the objects. + b'{' => return true, + + // ignore everything else + _ => (), + } + } + + false +} + +/// Consider using [`should_flatten_from_unchecked_slice`] when you can. +/// Will returns `true` if the json contains an object, an array of array +/// or an array containing an object. +/// Returns `false` for everything else. +/// This function has been written to test the [`should_flatten_from_unchecked_slice`]. +pub fn should_flatten_from_value(json: &Value) -> bool { + match json { + Value::Object(..) 
=> true, + Value::Array(array) => array.iter().any(|value| value.is_array() || value.is_object()), + _ => false, + } +} + +#[cfg(test)] +mod tests { + use serde_json::*; + + use super::*; + + #[test] + fn test_shouldnt_flatten() { + let shouldnt_flatten = vec![ + json!(null), + json!(true), + json!(false), + json!("a superb string"), + json!("a string escaping other \"string\""), + json!([null, true, false]), + json!(["hello", "world", "!"]), + json!(["a \"string\" escaping 'an other'", "\"[\"", "\"{\""]), + ]; + for value in shouldnt_flatten { + assert!(!should_flatten_from_value(&value)); + let value = serde_json::to_vec(&value).unwrap(); + assert!(!should_flatten_from_unchecked_slice(&value)); + } + } + + #[test] + fn test_should_flatten() { + let should_flatten = vec![ + json!({}), + json!({ "hello": "world" }), + json!(["hello", ["world"]]), + json!([true, true, true, true, true, true, true, true, true, {}]), + ]; + for value in should_flatten { + assert!(should_flatten_from_value(&value)); + let value = serde_json::to_vec(&value).unwrap(); + assert!(should_flatten_from_unchecked_slice(&value)); + } + } +} diff --git a/milli/Cargo.toml b/milli/Cargo.toml new file mode 100644 index 000000000..e23051b69 --- /dev/null +++ b/milli/Cargo.toml @@ -0,0 +1,80 @@ +[package] +name = "milli" +version = "0.39.0" +authors = ["Kerollmops "] +edition = "2018" + +[dependencies] +bimap = { version = "0.6.2", features = ["serde"] } +bincode = "1.3.3" +bstr = "1.0.1" +byteorder = "1.4.3" +charabia = { version = "0.7.0", default-features = false } +concat-arrays = "0.1.2" +crossbeam-channel = "0.5.6" +deserr = "0.1.4" +either = "1.8.0" +flatten-serde-json = { path = "../flatten-serde-json" } +fst = "0.4.7" +fxhash = "0.2.1" +geoutils = "0.5.1" +grenad = { version = "0.4.3", default-features = false, features = ["tempfile"] } +heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.4", default-features = false, features = ["lmdb", "sync-read-txn"] } +json-depth-checker 
= { path = "../json-depth-checker" } +levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } +memmap2 = "0.5.7" +obkv = "0.2.0" +once_cell = "1.15.0" +ordered-float = "3.2.0" +rayon = "1.5.3" +roaring = "0.10.1" +rstar = { version = "0.9.3", features = ["serde"] } +serde = { version = "1.0.145", features = ["derive"] } +serde_json = { version = "1.0.85", features = ["preserve_order"] } +slice-group-by = "0.3.0" +smallstr = { version = "0.3.0", features = ["serde"] } +smallvec = "1.10.0" +smartstring = "1.0.1" +tempfile = "3.3.0" +thiserror = "1.0.37" +time = { version = "0.3.15", features = ["serde-well-known", "formatting", "parsing", "macros"] } +uuid = { version = "1.1.2", features = ["v4"] } + +filter-parser = { path = "../filter-parser" } + +# documents words self-join +itertools = "0.10.5" + +# logging +log = "0.4.17" +logging_timer = "1.1.0" +csv = "1.1.6" + +[dev-dependencies] +big_s = "1.0.2" +insta = "1.21.0" +maplit = "1.0.2" +md5 = "0.7.0" +rand = {version = "0.8.5", features = ["small_rng"] } + +[target.'cfg(fuzzing)'.dev-dependencies] +fuzzcheck = "0.12.1" + +[features] +default = [ "charabia/default" ] + +# allow chinese specialized tokenization +chinese = ["charabia/chinese"] + +# allow hebrew specialized tokenization +hebrew = ["charabia/hebrew"] + +# allow japanese specialized tokenization +japanese = ["charabia/japanese"] +japanese-transliteration = ["charabia/japanese-transliteration"] + +# allow korean specialized tokenization +korean = ["charabia/korean"] + +# allow thai specialized tokenization +thai = ["charabia/thai"] diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs new file mode 100644 index 000000000..21065da36 --- /dev/null +++ b/milli/src/asc_desc.rs @@ -0,0 +1,297 @@ +//! This module provides the `AscDesc` type and defines all the errors related to this type. 
+ +use std::fmt; +use std::str::FromStr; + +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +use crate::error::is_reserved_keyword; +use crate::{CriterionError, Error, UserError}; + +/// This error type is never supposed to be shown to the end user. +/// You must always cast it to a sort error or a criterion error. +#[derive(Debug)] +pub enum AscDescError { + InvalidLatitude, + InvalidLongitude, + InvalidSyntax { name: String }, + ReservedKeyword { name: String }, +} + +impl fmt::Display for AscDescError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::InvalidLatitude => { + write!(f, "Latitude must be contained between -90 and 90 degrees.",) + } + Self::InvalidLongitude => { + write!(f, "Longitude must be contained between -180 and 180 degrees.",) + } + Self::InvalidSyntax { name } => { + write!(f, "Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name) + } + Self::ReservedKeyword { name } => { + write!( + f, + "`{}` is a reserved keyword and thus can't be used as a asc/desc rule.", + name + ) + } + } + } +} + +impl From for CriterionError { + fn from(error: AscDescError) -> Self { + match error { + AscDescError::InvalidLatitude | AscDescError::InvalidLongitude => { + CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() } + } + AscDescError::InvalidSyntax { name } => CriterionError::InvalidName { name }, + AscDescError::ReservedKeyword { name } if name.starts_with("_geoPoint") => { + CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() } + } + AscDescError::ReservedKeyword { name } if name.starts_with("_geoRadius") => { + CriterionError::ReservedNameForFilter { name: "_geoRadius".to_string() } + } + AscDescError::ReservedKeyword { name } => CriterionError::ReservedName { name }, + } + } +} + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] +pub enum Member { + Field(String), + Geo([f64; 2]), +} + +impl FromStr for Member 
{ + type Err = AscDescError; + + fn from_str(text: &str) -> Result { + match text.strip_prefix("_geoPoint(").and_then(|text| text.strip_suffix(')')) { + Some(point) => { + let (lat, lng) = point + .split_once(',') + .ok_or_else(|| AscDescError::ReservedKeyword { name: text.to_string() }) + .and_then(|(lat, lng)| { + lat.trim() + .parse() + .and_then(|lat| lng.trim().parse().map(|lng| (lat, lng))) + .map_err(|_| AscDescError::ReservedKeyword { name: text.to_string() }) + })?; + if !(-90.0..=90.0).contains(&lat) { + return Err(AscDescError::InvalidLatitude)?; + } else if !(-180.0..=180.0).contains(&lng) { + return Err(AscDescError::InvalidLongitude)?; + } + Ok(Member::Geo([lat, lng])) + } + None => { + if is_reserved_keyword(text) || text.starts_with("_geoRadius(") { + return Err(AscDescError::ReservedKeyword { name: text.to_string() })?; + } + Ok(Member::Field(text.to_string())) + } + } + } +} + +impl fmt::Display for Member { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Member::Field(name) => f.write_str(name), + Member::Geo([lat, lng]) => write!(f, "_geoPoint({}, {})", lat, lng), + } + } +} + +impl Member { + pub fn field(&self) -> Option<&str> { + match self { + Member::Field(field) => Some(field), + Member::Geo(_) => None, + } + } + + pub fn geo_point(&self) -> Option<&[f64; 2]> { + match self { + Member::Geo(point) => Some(point), + Member::Field(_) => None, + } + } +} + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] +pub enum AscDesc { + Asc(Member), + Desc(Member), +} + +impl AscDesc { + pub fn member(&self) -> &Member { + match self { + AscDesc::Asc(member) => member, + AscDesc::Desc(member) => member, + } + } + + pub fn field(&self) -> Option<&str> { + self.member().field() + } +} + +impl FromStr for AscDesc { + type Err = AscDescError; + + fn from_str(text: &str) -> Result { + match text.rsplit_once(':') { + Some((left, "asc")) => Ok(AscDesc::Asc(left.parse()?)), + Some((left, "desc")) => 
Ok(AscDesc::Desc(left.parse()?)), + _ => Err(AscDescError::InvalidSyntax { name: text.to_string() }), + } + } +} + +#[derive(Error, Debug)] +pub enum SortError { + #[error("{}", AscDescError::InvalidLatitude)] + InvalidLatitude, + #[error("{}", AscDescError::InvalidLongitude)] + InvalidLongitude, + #[error("Invalid syntax for the geo parameter: expected expression formated like \ + `_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{name}`.")] + BadGeoPointUsage { name: String }, + #[error("Invalid syntax for the sort parameter: expected expression ending by `:asc` or `:desc`, found `{name}`.")] + InvalidName { name: String }, + #[error("`{name}` is a reserved keyword and thus can't be used as a sort expression.")] + ReservedName { name: String }, + #[error("`{name}` is a reserved keyword and thus can't be used as a sort expression. \ + Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.")] + ReservedNameForSettings { name: String }, + #[error("`{name}` is a reserved keyword and thus can't be used as a sort expression. 
\ + Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.")] + ReservedNameForFilter { name: String }, +} + +impl From for SortError { + fn from(error: AscDescError) -> Self { + match error { + AscDescError::InvalidLatitude => SortError::InvalidLatitude, + AscDescError::InvalidLongitude => SortError::InvalidLongitude, + AscDescError::InvalidSyntax { name } => SortError::InvalidName { name }, + AscDescError::ReservedKeyword { name } if name.starts_with("_geoPoint") => { + SortError::BadGeoPointUsage { name } + } + AscDescError::ReservedKeyword { name } if &name == "_geo" => { + SortError::ReservedNameForSettings { name } + } + AscDescError::ReservedKeyword { name } if name.starts_with("_geoRadius") => { + SortError::ReservedNameForFilter { name: String::from("_geoRadius") } + } + AscDescError::ReservedKeyword { name } => SortError::ReservedName { name }, + } + } +} + +impl From for Error { + fn from(error: SortError) -> Self { + Self::UserError(UserError::SortError(error)) + } +} + +#[cfg(test)] +mod tests { + use big_s::S; + use AscDesc::*; + use AscDescError::*; + use Member::*; + + use super::*; + + #[test] + fn parse_asc_desc() { + let valid_req = [ + ("truc:asc", Asc(Field(S("truc")))), + ("bidule:desc", Desc(Field(S("bidule")))), + ("a-b:desc", Desc(Field(S("a-b")))), + ("a:b:desc", Desc(Field(S("a:b")))), + ("a12:asc", Asc(Field(S("a12")))), + ("42:asc", Asc(Field(S("42")))), + ("_geoPoint(42, 59):asc", Asc(Geo([42., 59.]))), + ("_geoPoint(42.459, 59):desc", Desc(Geo([42.459, 59.]))), + ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))), + ("_geoPoint(42, 59.895):desc", Desc(Geo([42., 59.895]))), + ("_geoPoint(90.000000000, 180):desc", Desc(Geo([90., 180.]))), + ("_geoPoint(-90, -180.0000000000):asc", Asc(Geo([-90., -180.]))), + ("_geoPoint(42.0002, 59.895):desc", Desc(Geo([42.0002, 59.895]))), + ("_geoPoint(42., 59.):desc", Desc(Geo([42., 59.]))), + ("truc(12, 13):desc", Desc(Field(S("truc(12, 13)")))), + ]; + + for 
(req, expected) in valid_req { + let res = req.parse::(); + assert!( + res.is_ok(), + "Failed to parse `{}`, was expecting `{:?}` but instead got `{:?}`", + req, + expected, + res + ); + assert_eq!(res.unwrap(), expected); + } + + let invalid_req = [ + ("truc:machin", InvalidSyntax { name: S("truc:machin") }), + ("truc:deesc", InvalidSyntax { name: S("truc:deesc") }), + ("truc:asc:deesc", InvalidSyntax { name: S("truc:asc:deesc") }), + ("42desc", InvalidSyntax { name: S("42desc") }), + ("_geoPoint:asc", ReservedKeyword { name: S("_geoPoint") }), + ("_geoDistance:asc", ReservedKeyword { name: S("_geoDistance") }), + ("_geoPoint(42.12 , 59.598)", InvalidSyntax { name: S("_geoPoint(42.12 , 59.598)") }), + ( + "_geoPoint(42.12 , 59.598):deesc", + InvalidSyntax { name: S("_geoPoint(42.12 , 59.598):deesc") }, + ), + ( + "_geoPoint(42.12 , 59.598):machin", + InvalidSyntax { name: S("_geoPoint(42.12 , 59.598):machin") }, + ), + ( + "_geoPoint(42.12 , 59.598):asc:aasc", + InvalidSyntax { name: S("_geoPoint(42.12 , 59.598):asc:aasc") }, + ), + ( + "_geoPoint(42,12 , 59,598):desc", + ReservedKeyword { name: S("_geoPoint(42,12 , 59,598)") }, + ), + ("_geoPoint(35, 85, 75):asc", ReservedKeyword { name: S("_geoPoint(35, 85, 75)") }), + ("_geoPoint(18):asc", ReservedKeyword { name: S("_geoPoint(18)") }), + ("_geoPoint(200, 200):asc", InvalidLatitude), + ("_geoPoint(90.000001, 0):asc", InvalidLatitude), + ("_geoPoint(0, -180.000001):desc", InvalidLongitude), + ("_geoPoint(159.256, 130):asc", InvalidLatitude), + ("_geoPoint(12, -2021):desc", InvalidLongitude), + ]; + + for (req, expected_error) in invalid_req { + let res = req.parse::(); + assert!( + res.is_err(), + "Should no be able to parse `{}`, was expecting an error but instead got: `{:?}`", + req, + res, + ); + let res = res.unwrap_err(); + assert_eq!( + res.to_string(), + expected_error.to_string(), + "Bad error for input {}: got `{:?}` instead of `{:?}`", + req, + res, + expected_error + ); + } + } +} diff --git 
a/milli/src/criterion.rs b/milli/src/criterion.rs new file mode 100644 index 000000000..c02cd2525 --- /dev/null +++ b/milli/src/criterion.rs @@ -0,0 +1,183 @@ +use std::fmt; +use std::str::FromStr; + +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +use crate::{AscDesc, Member}; + +#[derive(Error, Debug)] +pub enum CriterionError { + #[error("`{name}` ranking rule is invalid. Valid ranking rules are words, typo, sort, proximity, attribute, exactness and custom ranking rules.")] + InvalidName { name: String }, + #[error("`{name}` is a reserved keyword and thus can't be used as a ranking rule")] + ReservedName { name: String }, + #[error( + "`{name}` is a reserved keyword and thus can't be used as a ranking rule. \ +`{name}` can only be used for sorting at search time" + )] + ReservedNameForSort { name: String }, + #[error( + "`{name}` is a reserved keyword and thus can't be used as a ranking rule. \ +`{name}` can only be used for filtering at search time" + )] + ReservedNameForFilter { name: String }, +} + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] +pub enum Criterion { + /// Sorted by decreasing number of matched query terms. + /// Query words at the front of an attribute is considered better than if it was at the back. + Words, + /// Sorted by increasing number of typos. + Typo, + /// Sorted by increasing distance between matched query terms. + Proximity, + /// Documents with quey words contained in more important + /// attributes are considered better. + Attribute, + /// Dynamically sort at query time the documents. None, one or multiple Asc/Desc sortable + /// attributes can be used in place of this criterion at query time. + Sort, + /// Sorted by the similarity of the matched words with the query words. + Exactness, + /// Sorted by the increasing value of the field specified. + Asc(String), + /// Sorted by the decreasing value of the field specified. 
+ Desc(String), +} + +impl Criterion { + /// Returns the field name parameter of this criterion. + pub fn field_name(&self) -> Option<&str> { + match self { + Criterion::Asc(name) | Criterion::Desc(name) => Some(name), + _otherwise => None, + } + } +} + +impl FromStr for Criterion { + type Err = CriterionError; + + fn from_str(text: &str) -> Result { + match text { + "words" => Ok(Criterion::Words), + "typo" => Ok(Criterion::Typo), + "proximity" => Ok(Criterion::Proximity), + "attribute" => Ok(Criterion::Attribute), + "sort" => Ok(Criterion::Sort), + "exactness" => Ok(Criterion::Exactness), + text => match AscDesc::from_str(text)? { + AscDesc::Asc(Member::Field(field)) => Ok(Criterion::Asc(field)), + AscDesc::Desc(Member::Field(field)) => Ok(Criterion::Desc(field)), + AscDesc::Asc(Member::Geo(_)) | AscDesc::Desc(Member::Geo(_)) => { + Err(CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() })? + } + }, + } + } +} + +pub fn default_criteria() -> Vec { + vec![ + Criterion::Words, + Criterion::Typo, + Criterion::Proximity, + Criterion::Attribute, + Criterion::Sort, + Criterion::Exactness, + ] +} + +impl fmt::Display for Criterion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + use Criterion::*; + + match self { + Words => f.write_str("words"), + Typo => f.write_str("typo"), + Proximity => f.write_str("proximity"), + Attribute => f.write_str("attribute"), + Sort => f.write_str("sort"), + Exactness => f.write_str("exactness"), + Asc(attr) => write!(f, "{}:asc", attr), + Desc(attr) => write!(f, "{}:desc", attr), + } + } +} + +#[cfg(test)] +mod tests { + use big_s::S; + use CriterionError::*; + + use super::*; + + #[test] + fn parse_criterion() { + let valid_criteria = [ + ("words", Criterion::Words), + ("typo", Criterion::Typo), + ("proximity", Criterion::Proximity), + ("attribute", Criterion::Attribute), + ("sort", Criterion::Sort), + ("exactness", Criterion::Exactness), + ("price:asc", Criterion::Asc(S("price"))), + ("price:desc", 
Criterion::Desc(S("price"))), + ("price:asc:desc", Criterion::Desc(S("price:asc"))), + ("truc:machin:desc", Criterion::Desc(S("truc:machin"))), + ("hello-world!:desc", Criterion::Desc(S("hello-world!"))), + ("it's spacy over there:asc", Criterion::Asc(S("it's spacy over there"))), + ]; + + for (input, expected) in valid_criteria { + let res = input.parse::(); + assert!( + res.is_ok(), + "Failed to parse `{}`, was expecting `{:?}` but instead got `{:?}`", + input, + expected, + res + ); + assert_eq!(res.unwrap(), expected); + } + + let invalid_criteria = [ + ("words suffix", InvalidName { name: S("words suffix") }), + ("prefix typo", InvalidName { name: S("prefix typo") }), + ("proximity attribute", InvalidName { name: S("proximity attribute") }), + ("price", InvalidName { name: S("price") }), + ("asc:price", InvalidName { name: S("asc:price") }), + ("price:deesc", InvalidName { name: S("price:deesc") }), + ("price:aasc", InvalidName { name: S("price:aasc") }), + ("price:asc and desc", InvalidName { name: S("price:asc and desc") }), + ("price:asc:truc", InvalidName { name: S("price:asc:truc") }), + ("_geo:asc", ReservedName { name: S("_geo") }), + ("_geoDistance:asc", ReservedName { name: S("_geoDistance") }), + ("_geoPoint:asc", ReservedNameForSort { name: S("_geoPoint") }), + ("_geoPoint(42, 75):asc", ReservedNameForSort { name: S("_geoPoint") }), + ("_geoRadius:asc", ReservedNameForFilter { name: S("_geoRadius") }), + ("_geoRadius(42, 75, 59):asc", ReservedNameForFilter { name: S("_geoRadius") }), + ]; + + for (input, expected) in invalid_criteria { + let res = input.parse::(); + assert!( + res.is_err(), + "Should no be able to parse `{}`, was expecting an error but instead got: `{:?}`", + input, + res + ); + let res = res.unwrap_err(); + assert_eq!( + res.to_string(), + expected.to_string(), + "Bad error for input {}: got `{:?}` instead of `{:?}`", + input, + res, + expected + ); + } + } +} diff --git a/milli/src/documents/builder.rs 
b/milli/src/documents/builder.rs
new file mode 100644
index 000000000..1fa59168e
--- /dev/null
+++ b/milli/src/documents/builder.rs
@@ -0,0 +1,579 @@
+use std::io::{self, Write};
+
+use grenad::{CompressionType, WriterBuilder};
+use serde::de::Deserializer;
+use serde_json::{to_writer, Value};
+
+use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
+use crate::documents::serde_impl::DocumentVisitor;
+use crate::Object;
+
+/// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
+/// format used by milli.
+///
+/// The writer used by the `DocumentsBatchBuilder` can be read using a `DocumentsBatchReader`
+/// to iterate over the documents.
+///
+/// ## example:
+/// ```
+/// use serde_json::json;
+/// use milli::documents::DocumentsBatchBuilder;
+///
+/// let json = json!({ "id": 1, "name": "foo" });
+///
+/// let mut builder = DocumentsBatchBuilder::new(Vec::new());
+/// builder.append_json_object(json.as_object().unwrap()).unwrap();
+/// let _vector = builder.into_inner().unwrap();
+/// ```
+pub struct DocumentsBatchBuilder<W> {
+    /// The inner grenad writer, the last value must always be the `DocumentsBatchIndex`.
+    writer: grenad::Writer<W>,
+    /// A map that creates the relation between field ids and field names.
+    fields_index: DocumentsBatchIndex,
+    /// The number of documents that were added to this builder,
+    /// it doesn't take the primary key of the documents into account at this point.
+    documents_count: u32,
+
+    /// A buffer to store a temporary obkv buffer and avoid reallocating.
+    obkv_buffer: Vec<u8>,
+    /// A buffer to serialize the values and avoid reallocating,
+    /// serialized values are stored in an obkv.
+    value_buffer: Vec<u8>,
+}
+
+impl<W: Write> DocumentsBatchBuilder<W> {
+    pub fn new(writer: W) -> DocumentsBatchBuilder<W> {
+        DocumentsBatchBuilder {
+            writer: WriterBuilder::new().compression_type(CompressionType::None).build(writer),
+            fields_index: DocumentsBatchIndex::default(),
+            documents_count: 0,
+            obkv_buffer: Vec::new(),
+            value_buffer: Vec::new(),
+        }
+    }
+
+    /// Returns the number of documents inserted into this builder.
+    pub fn documents_count(&self) -> u32 {
+        self.documents_count
+    }
+
+    /// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly.
+    pub fn append_json_object(&mut self, object: &Object) -> io::Result<()> {
+        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
+        let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(k)).collect();
+        fields_ids.sort_unstable();
+
+        self.obkv_buffer.clear();
+        let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer);
+        for field_id in fields_ids {
+            let key = self.fields_index.name(field_id).unwrap();
+            self.value_buffer.clear();
+            to_writer(&mut self.value_buffer, &object[key])?;
+            writer.insert(field_id, &self.value_buffer)?;
+        }
+
+        let internal_id = self.documents_count.to_be_bytes();
+        let document_bytes = writer.into_inner()?;
+        self.writer.insert(internal_id, &document_bytes)?;
+        self.documents_count += 1;
+
+        Ok(())
+    }
+
+    /// Appends a new JSON array of objects into the batch and updates the `DocumentsBatchIndex` accordingly.
+    pub fn append_json_array<R: io::Read>(&mut self, reader: R) -> Result<(), Error> {
+        let mut de = serde_json::Deserializer::from_reader(reader);
+        let mut visitor = DocumentVisitor::new(self);
+        de.deserialize_any(&mut visitor)?
+    }
+
+    /// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly.
+    pub fn append_csv<R: io::Read>(&mut self, mut reader: csv::Reader<R>) -> Result<(), Error> {
+        // Parse the typed headers, register each column name and remember its position in a record.
+        let mut typed_fields_ids: Vec<_> = reader
+            .headers()?
+            .into_iter()
+            .map(parse_csv_header)
+            .map(|(k, t)| (self.fields_index.insert(k), t))
+            .enumerate()
+            .collect();
+        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
+        typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid);
+
+        let mut record = csv::StringRecord::new();
+        let mut line = 0;
+        while reader.read_record(&mut record)? {
+            // We increment here and not at the end of the while loop to take
+            // the header offset into account.
+            line += 1;
+
+            self.obkv_buffer.clear();
+            let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer);
+
+            for (i, (field_id, type_)) in typed_fields_ids.iter() {
+                self.value_buffer.clear();
+
+                let value = &record[*i];
+                match type_ {
+                    AllowedType::Number => {
+                        if value.trim().is_empty() {
+                            to_writer(&mut self.value_buffer, &Value::Null)?;
+                        } else if let Ok(integer) = value.trim().parse::<i64>() {
+                            to_writer(&mut self.value_buffer, &integer)?;
+                        } else {
+                            match value.trim().parse::<f64>() {
+                                Ok(float) => {
+                                    to_writer(&mut self.value_buffer, &float)?;
+                                }
+                                Err(error) => {
+                                    return Err(Error::ParseFloat {
+                                        error,
+                                        line,
+                                        value: value.to_string(),
+                                    });
+                                }
+                            }
+                        }
+                    }
+                    AllowedType::String => {
+                        if value.is_empty() {
+                            to_writer(&mut self.value_buffer, &Value::Null)?;
+                        } else {
+                            to_writer(&mut self.value_buffer, value)?;
+                        }
+                    }
+                }
+
+                // We insert into the obkv writer the value buffer that has been filled just above.
+                writer.insert(*field_id, &self.value_buffer)?;
+            }
+
+            let internal_id = self.documents_count.to_be_bytes();
+            let document_bytes = writer.into_inner()?;
+            self.writer.insert(internal_id, &document_bytes)?;
+            self.documents_count += 1;
+        }
+
+        Ok(())
+    }
+
+    /// Stores the final version of the `DocumentsBatchIndex` and returns the underlying writer.
+    pub fn into_inner(mut self) -> io::Result<W> {
+        let DocumentsBatchBuilder { mut writer, fields_index, .. } = self;
+
+        // We serialize and insert the `DocumentsBatchIndex` as the last key of the grenad writer.
+        self.value_buffer.clear();
+        to_writer(&mut self.value_buffer, &fields_index)?;
+        writer.insert(DOCUMENTS_BATCH_INDEX_KEY, &self.value_buffer)?;
+
+        writer.into_inner()
+    }
+}
+
+#[derive(Debug)]
+enum AllowedType {
+    String,
+    Number,
+}
+
+fn parse_csv_header(header: &str) -> (&str, AllowedType) {
+    // if there are several separators we only split on the last one.
+    match header.rsplit_once(':') {
+        Some((field_name, field_type)) => match field_type {
+            "string" => (field_name, AllowedType::String),
+            "number" => (field_name, AllowedType::Number),
+            // if the pattern isn't recognized, we keep the whole field.
+            _otherwise => (header, AllowedType::String),
+        },
+        None => (header, AllowedType::String),
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::io::Cursor;
+
+    use serde_json::json;
+
+    use super::*;
+    use crate::documents::{obkv_to_object, DocumentsBatchReader};
+
+    #[test]
+    fn add_single_documents_json() {
+        let json = serde_json::json!({
+            "id": 1,
+            "field": "hello!",
+        });
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_json_object(json.as_object().unwrap()).unwrap();
+
+        let json = serde_json::json!({
+            "blabla": false,
+            "field": "hello!",
+            "id": 1,
+        });
+
+        builder.append_json_object(json.as_object().unwrap()).unwrap();
+
+        assert_eq!(builder.documents_count(), 2);
+        let vector = builder.into_inner().unwrap();
+
+        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+        assert_eq!(index.len(), 3);
+
+        let document = cursor.next_document().unwrap().unwrap();
+        assert_eq!(document.iter().count(), 2);
+
+        let document = cursor.next_document().unwrap().unwrap();
+        assert_eq!(document.iter().count(), 3);
+
+        assert!(cursor.next_document().unwrap().is_none());
+    }
+
+    #[test]
+    fn add_documents_csv() {
+        let csv_content =
"id:number,field:string\n1,hello!\n2,blabla"; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + assert_eq!(builder.documents_count(), 2); + let vector = builder.into_inner().unwrap(); + + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + assert_eq!(index.len(), 2); + + let document = cursor.next_document().unwrap().unwrap(); + assert_eq!(document.iter().count(), 2); + + let document = cursor.next_document().unwrap().unwrap(); + assert_eq!(document.iter().count(), 2); + + assert!(cursor.next_document().unwrap().is_none()); + } + + #[test] + fn simple_csv_document() { + let csv_content = r#"city,country,pop +"Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + + assert!(cursor.next_document().unwrap().is_none()); + } + + #[test] + fn coma_in_field() { + let csv_content = r#"city,country,pop +"Boston","United, States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + + let doc = cursor.next_document().unwrap().unwrap(); + let 
val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United, States", + "pop": "4628910", + }) + ); + } + + #[test] + fn quote_in_field() { + let csv_content = r#"city,country,pop +"Boston","United"" States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United\" States", + "pop": "4628910", + }) + ); + } + + #[test] + fn integer_in_field() { + let csv_content = r#"city,country,pop:number +"Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United States", + "pop": 4628910, + }) + ); + } + + #[test] + fn integer_as_id() { + let csv_content = r#""id:number","title:string","comment:string" +"1239","Pride and Prejudice","A great book""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let (mut cursor, index) = 
DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + + assert_eq!( + val, + json!({ + "id": 1239, + "title": "Pride and Prejudice", + "comment": "A great book", + }) + ); + } + + #[test] + fn float_in_field() { + let csv_content = r#"city,country,pop:number +"Boston","United States","4628910.01""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United States", + "pop": 4628910.01, + }) + ); + } + + #[test] + fn several_colon_in_header() { + let csv_content = r#"city:love:string,country:state,pop +"Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + + assert_eq!( + val, + json!({ + "city:love": "Boston", + "country:state": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn ending_by_colon_in_header() { + let csv_content = r#"city:,country,pop +"Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut 
builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + + assert_eq!( + val, + json!({ + "city:": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn starting_by_colon_in_header() { + let csv_content = r#":city,country,pop +"Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + + assert_eq!( + val, + json!({ + ":city": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[ignore] + #[test] + fn starting_by_colon_in_header2() { + let csv_content = r#":string,country,pop +"Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let (mut cursor, _) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + + assert!(cursor.next_document().is_err()); + } + + #[test] + fn double_colon_in_header() { + let csv_content = r#"city::string,country,pop +"Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = 
DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); + + assert_eq!( + val, + json!({ + "city:": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn bad_type_in_header() { + let csv_content = r#"city,country:number,pop +"Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + assert!(builder.append_csv(csv).is_err()); + } + + #[test] + fn bad_column_count1() { + let csv_content = r#"city,country,pop +"Boston","United States","4628910", "too much + let csv = csv::Reader::from_reader(Cursor::new(csv_content"#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + assert!(builder.append_csv(csv).is_err()); + } + + #[test] + fn bad_column_count2() { + let csv_content = r#"city,country,pop +"Boston","United States""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + assert!(builder.append_csv(csv).is_err()); + } +} diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs new file mode 100644 index 000000000..fa21c0f87 --- /dev/null +++ b/milli/src/documents/enriched.rs @@ -0,0 +1,109 @@ +use std::fs::File; +use std::{io, str}; + +use obkv::KvReader; + +use super::{ + DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader, + Error, +}; +use crate::update::DocumentId; +use crate::FieldId; + +/// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have +/// been created 
with a `DocumentsBatchBuilder` and, for the enriched data,
+/// a simple `grenad::Reader<File>`.
+///
+/// The documents are returned in the form of `obkv::Reader` where each field is identified with a
+/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
+pub struct EnrichedDocumentsBatchReader<R> {
+    documents: DocumentsBatchReader<R>,
+    primary_key: String,
+    external_ids: grenad::ReaderCursor<File>,
+}
+
+impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
+    pub fn new(
+        documents: DocumentsBatchReader<R>,
+        primary_key: String,
+        external_ids: grenad::Reader<File>,
+    ) -> Result<Self, Error> {
+        if documents.documents_count() as u64 == external_ids.len() {
+            Ok(EnrichedDocumentsBatchReader {
+                documents,
+                primary_key,
+                external_ids: external_ids.into_cursor()?,
+            })
+        } else {
+            Err(Error::InvalidEnrichedData)
+        }
+    }
+
+    pub fn documents_count(&self) -> u32 {
+        self.documents.documents_count()
+    }
+
+    pub fn primary_key(&self) -> &str {
+        &self.primary_key
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.documents.is_empty()
+    }
+
+    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
+        self.documents.documents_batch_index()
+    }
+
+    /// This method returns a forward cursor over the enriched documents.
+    pub fn into_cursor_and_fields_index(
+        self,
+    ) -> (EnrichedDocumentsBatchCursor<R>, DocumentsBatchIndex) {
+        let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self;
+        let (documents, fields_index) = documents.into_cursor_and_fields_index();
+        external_ids.reset();
+        (EnrichedDocumentsBatchCursor { documents, primary_key, external_ids }, fields_index)
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct EnrichedDocument<'a> {
+    pub document: KvReader<'a, FieldId>,
+    pub document_id: DocumentId,
+}
+
+pub struct EnrichedDocumentsBatchCursor<R> {
+    documents: DocumentsBatchCursor<R>,
+    primary_key: String,
+    external_ids: grenad::ReaderCursor<File>,
+}
+
+impl<R> EnrichedDocumentsBatchCursor<R> {
+    pub fn primary_key(&self) -> &str {
+        &self.primary_key
+    }
+    /// Resets the cursor to be able to read from the start again.
+    pub fn reset(&mut self) {
+        self.documents.reset();
+        self.external_ids.reset();
+    }
+}
+
+impl<R: io::Read + io::Seek> EnrichedDocumentsBatchCursor<R> {
+    /// Returns the next enriched document, starting from the first one. Subsequent calls to
+    /// `next_enriched_document` advance both cursors; `None` is returned once either is exhausted.
+    pub fn next_enriched_document(
+        &mut self,
+    ) -> Result<Option<EnrichedDocument>, DocumentsBatchCursorError> {
+        let document = self.documents.next_document()?;
+        let document_id = match self.external_ids.move_on_next()? {
+            Some((_, bytes)) => serde_json::from_slice(bytes).map(Some)?,
+            None => None,
+        };
+
+        match document.zip(document_id) {
+            Some((document, document_id)) => Ok(Some(EnrichedDocument { document, document_id })),
+            None => Ok(None),
+        }
+    }
+}
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
new file mode 100644
index 000000000..da3a07942
--- /dev/null
+++ b/milli/src/documents/mod.rs
@@ -0,0 +1,292 @@
+mod builder;
+mod enriched;
+mod reader;
+mod serde_impl;
+
+use std::fmt::{self, Debug};
+use std::io;
+use std::str::Utf8Error;
+
+use bimap::BiHashMap;
+pub use builder::DocumentsBatchBuilder;
+pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
+use obkv::KvReader;
+pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
+use serde::{Deserialize, Serialize};
+
+use crate::error::{FieldIdMapMissingEntry, InternalError};
+use crate::{FieldId, Object, Result};
+
+/// The key that is used to store the `DocumentsBatchIndex` datastructure,
+/// it is the absolute last key of the list.
+const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes();
+
+/// Helper function to convert an obkv reader into a JSON object.
+pub fn obkv_to_object(obkv: &KvReader<FieldId>, index: &DocumentsBatchIndex) -> Result<Object> {
+    obkv.iter()
+        .map(|(field_id, value)| {
+            let field_name = index
+                .name(field_id)
+                .ok_or(FieldIdMapMissingEntry::FieldId { field_id, process: "obkv_to_object" })?;
+            let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?;
+            Ok((field_name.to_string(), value))
+        })
+        .collect()
+}
+
+/// A bidirectional map that links field ids to their name in a document batch.
+#[derive(Default, Clone, Debug, Serialize, Deserialize)]
+pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>);
+
+impl DocumentsBatchIndex {
+    /// Insert the field in the map, or return its field id if it already exists.
+    pub fn insert(&mut self, field: &str) -> FieldId {
+        match self.0.get_by_right(field) {
+            Some(field_id) => *field_id,
+            None => {
+                let field_id = self.0.len() as FieldId;
+                self.0.insert(field_id, field.to_string());
+                field_id
+            }
+        }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+
+    pub fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    pub fn iter(&self) -> bimap::hash::Iter<FieldId, String> {
+        self.0.iter()
+    }
+
+    pub fn name(&self, id: FieldId) -> Option<&str> {
+        self.0.get_by_left(&id).map(AsRef::as_ref)
+    }
+
+    pub fn id(&self, name: &str) -> Option<FieldId> {
+        self.0.get_by_right(name).cloned()
+    }
+
+    pub fn recreate_json(&self, document: &obkv::KvReaderU16) -> Result<Object> {
+        let mut map = Object::new();
+
+        for (k, v) in document.iter() {
+            // A field id unknown to this index is reported the same way `obkv_to_object` does.
+            let missing = FieldIdMapMissingEntry::FieldId { field_id: k, process: "recreate_json" };
+            let key = self.0.get_by_left(&k).ok_or(missing)?.clone();
+            let value = serde_json::from_slice::<serde_json::Value>(v)
+                .map_err(crate::error::InternalError::SerdeJson)?;
+            map.insert(key, value);
+        }
+
+        Ok(map)
+    }
+}
+
+#[derive(Debug)]
+pub enum Error {
+    ParseFloat { error: std::num::ParseFloatError, line: usize, value: String },
+    InvalidDocumentFormat,
+    InvalidEnrichedData,
+    InvalidUtf8(Utf8Error),
+    Csv(csv::Error),
+    Json(serde_json::Error),
+    Serialize(serde_json::Error),
+    Grenad(grenad::Error),
+    Io(io::Error),
+}
+
+impl From<csv::Error> for Error {
+    fn from(e: csv::Error) -> Self {
+        Self::Csv(e)
+    }
+}
+
+impl From<io::Error> for Error {
+    fn from(other: io::Error) -> Self {
+        Self::Io(other)
+    }
+}
+
+impl From<serde_json::Error> for Error {
+    fn from(other: serde_json::Error) -> Self {
+        Self::Json(other)
+    }
+}
+
+impl From<grenad::Error> for Error {
+    fn from(other: grenad::Error) -> Self {
+        Self::Grenad(other)
+    }
+}
+
+impl From<Utf8Error> for Error {
+    fn from(other: Utf8Error) -> Self {
+        Self::InvalidUtf8(other)
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match self {
+            Error::ParseFloat { error, line, value } => {
+                write!(f, "Error parsing number {:?} at line {}: {}", value, line, error)
+            }
+            Error::InvalidDocumentFormat => {
+                f.write_str("Invalid document addition format, missing the documents batch index.")
+            }
+            Error::InvalidEnrichedData => f.write_str("Invalid enriched data."),
+            Error::InvalidUtf8(e) => write!(f, "{}", e),
+            Error::Io(e) => write!(f, "{}", e),
+            Error::Serialize(e) => write!(f, "{}", e),
+            Error::Grenad(e) => write!(f, "{}", e),
+            Error::Csv(e) => write!(f, "{}", e),
+            Error::Json(e) => write!(f, "{}", e),
+        }
+    }
+}
+
+impl std::error::Error for Error {}
+
+#[cfg(test)]
+pub fn objects_from_json_value(json: serde_json::Value) -> Vec<crate::Object> {
+    let documents = match json {
+        object @ serde_json::Value::Object(_) => vec![object],
+        serde_json::Value::Array(objects) => objects,
+        invalid => {
+            panic!("an array of objects must be specified, {:#?} is not an array", invalid)
+        }
+    };
+    let mut objects = vec![];
+    for document in documents {
+        let object = match document {
+            serde_json::Value::Object(object) => object,
+            invalid => panic!("an object must be specified, {:#?} is not an object", invalid),
+        };
+        objects.push(object);
+    }
+    objects
+}
+
+/// Macro used to generate documents, with the same syntax as `serde_json::json`
+#[cfg(test)]
+macro_rules! documents {
+    ($data:tt) => {{
+        let documents = serde_json::json!($data);
+        let documents = $crate::documents::objects_from_json_value(documents);
+        $crate::documents::documents_batch_reader_from_objects(documents)
+    }};
+}
+
+#[cfg(test)]
+pub fn documents_batch_reader_from_objects(
+    objects: impl IntoIterator<Item = Object>,
+) -> DocumentsBatchReader<std::io::Cursor<Vec<u8>>> {
+    let mut builder = DocumentsBatchBuilder::new(Vec::new());
+    for object in objects {
+        builder.append_json_object(&object).unwrap();
+    }
+    let vector = builder.into_inner().unwrap();
+    DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap()
+}
+
+#[cfg(test)]
+mod test {
+    use std::io::Cursor;
+
+    use serde_json::{json, Value};
+
+    use super::*;
+
+    #[test]
+    fn create_documents_no_errors() {
+        let value = json!({
+            "number": 1,
+            "string": "this is a field",
+            "array": ["an", "array"],
+            "object": {
+                "key": "value",
+            },
+            "bool": true
+        });
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_json_object(value.as_object().unwrap()).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let (mut documents, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+
+        assert_eq!(index.iter().count(), 5);
+        let reader = documents.next_document().unwrap().unwrap();
+        assert_eq!(reader.iter().count(), 5);
+        assert!(documents.next_document().unwrap().is_none());
+    }
+
+    #[test]
+    fn test_add_multiple_documents() {
+        let doc1 = json!({
+            "bool": true,
+        });
+        let doc2 = json!({
+            "toto": false,
+        });
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_json_object(doc1.as_object().unwrap()).unwrap();
+        builder.append_json_object(doc2.as_object().unwrap()).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let (mut documents, index) = DocumentsBatchReader::from_reader(io::Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+        assert_eq!(index.iter().count(), 2);
+        let reader = documents.next_document().unwrap().unwrap();
+        assert_eq!(reader.iter().count(), 1);
+        assert!(documents.next_document().unwrap().is_some());
+        assert!(documents.next_document().unwrap().is_none());
+    }
+
+    #[test]
+    fn test_nested() {
+        let docs_reader = documents!([{
+            "hello": {
+                "toto": ["hello"]
+            }
+        }]);
+
+        let (mut cursor, _) = docs_reader.into_cursor_and_fields_index();
+        let doc = cursor.next_document().unwrap().unwrap();
+        let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
+        assert_eq!(nested, json!({ "toto": ["hello"] }));
+    }
+
+    #[test]
+    fn out_of_order_json_fields() {
+        let _documents = documents!([
+            {"id": 1,"b": 0},
+            {"id": 2,"a": 0,"b": 0},
+        ]);
+    }
+
+    #[test]
+    fn out_of_order_csv_fields() {
+        let csv1_content = "id:number,b\n1,0";
+        let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content));
+
+        let csv2_content = "id:number,a,b\n2,0,0";
+        let csv2 = csv::Reader::from_reader(Cursor::new(csv2_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv1).unwrap();
+        builder.append_csv(csv2).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
+    }
+}
diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs
new file mode 100644
index 000000000..a8a4c662d
--- /dev/null
+++ b/milli/src/documents/reader.rs
@@ -0,0 +1,116 @@
+use std::convert::TryInto;
+use std::{error, fmt, io};
+
+use obkv::KvReader;
+
+use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
+use crate::FieldId;
+
+/// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with
+/// a `DocumentsBatchBuilder`.
+///
+/// The documents are returned in the form of `obkv::Reader` where each field is identified with a
+/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
+pub struct DocumentsBatchReader<R> {
+    cursor: grenad::ReaderCursor<R>,
+    fields_index: DocumentsBatchIndex,
+}
+
+impl<R: io::Read + io::Seek> DocumentsBatchReader<R> {
+    pub fn new(cursor: DocumentsBatchCursor<R>, fields_index: DocumentsBatchIndex) -> Self {
+        Self { cursor: cursor.cursor, fields_index }
+    }
+
+    /// Construct a `DocumentsBatchReader` from a reader.
+    ///
+    /// It first retrieves the fields index stored under `DOCUMENTS_BATCH_INDEX_KEY`. Use the
+    /// `into_cursor_and_fields_index` method to iterate over the documents, from first to last.
+    pub fn from_reader(reader: R) -> Result<Self, Error> {
+        let reader = grenad::Reader::new(reader)?;
+        let mut cursor = reader.into_cursor()?;
+
+        let fields_index = match cursor.move_on_key_equal_to(DOCUMENTS_BATCH_INDEX_KEY)? {
+            Some((_, value)) => serde_json::from_slice(value).map_err(Error::Serialize)?,
+            None => return Err(Error::InvalidDocumentFormat),
+        };
+
+        Ok(DocumentsBatchReader { cursor, fields_index })
+    }
+
+    pub fn documents_count(&self) -> u32 {
+        self.cursor.len().saturating_sub(1).try_into().expect("Invalid number of documents")
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.cursor.len().saturating_sub(1) == 0
+    }
+
+    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
+        &self.fields_index
+    }
+
+    /// This method returns a forward cursor over the documents.
+    pub fn into_cursor_and_fields_index(self) -> (DocumentsBatchCursor<R>, DocumentsBatchIndex) {
+        let DocumentsBatchReader { cursor, fields_index } = self;
+        let mut cursor = DocumentsBatchCursor { cursor };
+        cursor.reset();
+        (cursor, fields_index)
+    }
+}
+
+/// A forward cursor over the documents in a `DocumentsBatchReader`.
+pub struct DocumentsBatchCursor<R> {
+    cursor: grenad::ReaderCursor<R>,
+}
+
+impl<R> DocumentsBatchCursor<R> {
+    /// Resets the cursor to be able to read from the start again.
+    pub fn reset(&mut self) {
+        self.cursor.reset();
+    }
+}
+
+impl<R: io::Read + io::Seek> DocumentsBatchCursor<R> {
+    /// Returns the next document, starting from the first one. Subsequent calls to
+    /// `next_document` advance the document reader until all the documents have been read.
+    pub fn next_document(
+        &mut self,
+    ) -> Result<Option<KvReader<FieldId>>, DocumentsBatchCursorError> {
+        match self.cursor.move_on_next()? {
+            Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => {
+                Ok(Some(KvReader::new(value)))
+            }
+            _otherwise => Ok(None),
+        }
+    }
+}
+
+/// The possible errors returned by the `DocumentsBatchCursor` when iterating on the documents.
+#[derive(Debug)]
+pub enum DocumentsBatchCursorError {
+    Grenad(grenad::Error),
+    SerdeJson(serde_json::Error),
+}
+
+impl From<grenad::Error> for DocumentsBatchCursorError {
+    fn from(error: grenad::Error) -> DocumentsBatchCursorError {
+        DocumentsBatchCursorError::Grenad(error)
+    }
+}
+
+impl From<serde_json::Error> for DocumentsBatchCursorError {
+    fn from(error: serde_json::Error) -> DocumentsBatchCursorError {
+        DocumentsBatchCursorError::SerdeJson(error)
+    }
+}
+
+impl error::Error for DocumentsBatchCursorError {}
+
+impl fmt::Display for DocumentsBatchCursorError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            DocumentsBatchCursorError::Grenad(e) => e.fmt(f),
+            DocumentsBatchCursorError::SerdeJson(e) => e.fmt(f),
+        }
+    }
+}
diff --git a/milli/src/documents/serde_impl.rs b/milli/src/documents/serde_impl.rs
new file mode 100644
index 000000000..d4abdc844
--- /dev/null
+++ b/milli/src/documents/serde_impl.rs
@@ -0,0 +1,76 @@
+use std::fmt;
+use std::io::Write;
+
+use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
+
+use super::Error;
+use crate::documents::DocumentsBatchBuilder;
+use crate::Object;
+
+macro_rules! tri {
+    ($e:expr) => {
+        match $e {
+            Ok(r) => r,
+            Err(e) => return Ok(Err(e.into())),
+        }
+    };
+}
+
+pub struct DocumentVisitor<'a, W> {
+    inner: &'a mut DocumentsBatchBuilder<W>,
+    object: Object,
+}
+
+impl<'a, W> DocumentVisitor<'a, W> {
+    pub fn new(inner: &'a mut DocumentsBatchBuilder<W>) -> Self {
+        DocumentVisitor { inner, object: Object::new() }
+    }
+}
+
+impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
+    /// The visitor yields no data: every visited document is appended to the inner builder.
+    type Value = Result<(), Error>;
+
+    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
+    where
+        A: SeqAccess<'de>,
+    {
+        while let Some(v) = seq.next_element_seed(&mut *self)? {
+            tri!(v)
+        }
+
+        Ok(Ok(()))
+    }
+
+    fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
+    where
+        A: MapAccess<'de>,
+    {
+        self.object.clear();
+        while let Some((key, value)) = map.next_entry()? {
+            self.object.insert(key, value);
+        }
+
+        tri!(self.inner.append_json_object(&self.object));
+
+        Ok(Ok(()))
+    }
+
+    fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "a documents, or a sequence of documents.")
+    }
+}
+
+impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W>
+where
+    W: Write,
+{
+    type Value = Result<(), Error>;
+
+    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        deserializer.deserialize_map(self)
+    }
+}
diff --git a/milli/src/error.rs b/milli/src/error.rs
new file mode 100644
index 000000000..8734cb540
--- /dev/null
+++ b/milli/src/error.rs
@@ -0,0 +1,272 @@
+use std::collections::BTreeSet;
+use std::convert::Infallible;
+use std::{io, str};
+
+use heed::{Error as HeedError, MdbError};
+use rayon::ThreadPoolBuildError;
+use serde_json::Value;
+use thiserror::Error;
+
+use crate::documents::{self, DocumentsBatchCursorError};
+use crate::{CriterionError, DocumentId, FieldId, Object, SortError};
+
+pub fn is_reserved_keyword(keyword: &str) -> bool {
+    ["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword)
+} + +#[derive(Error, Debug)] +pub enum Error { + #[error("internal: {0}.")] + InternalError(#[from] InternalError), + #[error(transparent)] + IoError(#[from] io::Error), + #[error(transparent)] + UserError(#[from] UserError), +} + +#[derive(Error, Debug)] +pub enum InternalError { + #[error("{}", HeedError::DatabaseClosing)] + DatabaseClosing, + #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))] + DatabaseMissingEntry { db_name: &'static str, key: Option<&'static str> }, + #[error(transparent)] + FieldIdMapMissingEntry(#[from] FieldIdMapMissingEntry), + #[error("Missing {key} in the field id mapping.")] + FieldIdMappingMissingEntry { key: FieldId }, + #[error(transparent)] + Fst(#[from] fst::Error), + #[error(transparent)] + DocumentsError(#[from] documents::Error), + #[error("Invalid compression type have been specified to grenad.")] + GrenadInvalidCompressionType, + #[error("Invalid grenad file with an invalid version format.")] + GrenadInvalidFormatVersion, + #[error("Invalid merge while processing {process}.")] + IndexingMergingKeys { process: &'static str }, + #[error("{}", HeedError::InvalidDatabaseTyping)] + InvalidDatabaseTyping, + #[error(transparent)] + RayonThreadPool(#[from] ThreadPoolBuildError), + #[error(transparent)] + SerdeJson(#[from] serde_json::Error), + #[error(transparent)] + Serialization(#[from] SerializationError), + #[error(transparent)] + Store(#[from] MdbError), + #[error(transparent)] + Utf8(#[from] str::Utf8Error), + #[error("An indexation process was explicitly aborted.")] + AbortedIndexation, +} + +#[derive(Error, Debug)] +pub enum SerializationError { + #[error("{}", match .db_name { + Some(name) => format!("decoding from the {name} database failed"), + None => "decoding failed".to_string(), + })] + Decoding { db_name: Option<&'static str> }, + #[error("{}", match .db_name { + Some(name) => format!("encoding into the {name} database failed"), + None => "encoding failed".to_string(), + })] + Encoding { db_name: 
Option<&'static str> }, + #[error("number is not a valid finite number")] + InvalidNumberSerialization, +} + +#[derive(Error, Debug)] +pub enum FieldIdMapMissingEntry { + #[error("unknown field id {field_id} coming from the {process} process")] + FieldId { field_id: FieldId, process: &'static str }, + #[error("unknown field name {field_name} coming from the {process} process")] + FieldName { field_name: String, process: &'static str }, +} + +#[derive(Error, Debug)] +pub enum UserError { + #[error("A soft deleted internal document id have been used: `{document_id}`.")] + AccessingSoftDeletedDocument { document_id: DocumentId }, + #[error("A document cannot contain more than 65,535 fields.")] + AttributeLimitReached, + #[error(transparent)] + CriterionError(#[from] CriterionError), + #[error("Maximum number of documents reached.")] + DocumentLimitReached, + #[error( + "Document identifier `{}` is invalid. \ +A document identifier can be of type integer or string, \ +only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).", .document_id.to_string() + )] + InvalidDocumentId { document_id: Value }, + #[error("Invalid facet distribution, the fields `{}` are not set as filterable.", + .invalid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", ") + )] + InvalidFacetsDistribution { invalid_facets_name: BTreeSet }, + #[error(transparent)] + InvalidGeoField(#[from] GeoError), + #[error("{0}")] + InvalidFilter(String), + #[error("Attribute `{}` is not sortable. 
{}", + .field, + match .valid_fields.is_empty() { + true => "This index does not have configured sortable attributes.".to_string(), + false => format!("Available sortable attributes are: `{}`.", + valid_fields.iter().map(AsRef::as_ref).collect::>().join(", ") + ), + } + )] + InvalidSortableAttribute { field: String, valid_fields: BTreeSet }, + #[error("{}", HeedError::BadOpenOptions)] + InvalidLmdbOpenOptions, + #[error("The sort ranking rule must be specified in the ranking rules settings to use the sort parameter at search time.")] + SortRankingRuleMissing, + #[error("The database file is in an invalid state.")] + InvalidStoreFile, + #[error("Maximum database size has been reached.")] + MaxDatabaseSizeReached, + #[error("Document doesn't have a `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] + MissingDocumentId { primary_key: String, document: Object }, + #[error("Document have too many matching `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] + TooManyDocumentIds { primary_key: String, document: Object }, + #[error("The primary key inference failed as the engine did not find any field ending with `id` in its name. Please specify the primary key manually using the `primaryKey` query parameter.")] + NoPrimaryKeyCandidateFound, + #[error("The primary key inference failed as the engine found {} fields ending with `id` in their names: '{}' and '{}'. Please specify the primary key manually using the `primaryKey` query parameter.", .candidates.len(), .candidates.get(0).unwrap(), .candidates.get(1).unwrap())] + MultiplePrimaryKeyCandidatesFound { candidates: Vec }, + #[error("There is no more space left on the device. 
Consider increasing the size of the disk/partition.")] + NoSpaceLeftOnDevice, + #[error("Index already has a primary key: `{0}`.")] + PrimaryKeyCannotBeChanged(String), + #[error(transparent)] + SerdeJson(serde_json::Error), + #[error(transparent)] + SortError(#[from] SortError), + #[error("An unknown internal document id have been used: `{document_id}`.")] + UnknownInternalDocumentId { document_id: DocumentId }, + #[error("`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {0}` and twoTypos: {1}`.")] + InvalidMinTypoWordLenSetting(u8, u8), +} + +#[derive(Error, Debug)] +pub enum GeoError { + #[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")] + NotAnObject { document_id: Value, value: Value }, + #[error("Could not find latitude nor longitude in the document with the id: `{document_id}`. Was expecting `_geo.lat` and `_geo.lng` fields.")] + MissingLatitudeAndLongitude { document_id: Value }, + #[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")] + MissingLatitude { document_id: Value }, + #[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")] + MissingLongitude { document_id: Value }, + #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting finite numbers but instead got `{lat}` and `{lng}`.")] + BadLatitudeAndLongitude { document_id: Value, lat: Value, lng: Value }, + #[error("Could not parse latitude in the document with the id: `{document_id}`. 
Was expecting a finite number but instead got `{value}`.")] + BadLatitude { document_id: Value, value: Value }, + #[error("Could not parse longitude in the document with the id: `{document_id}`. Was expecting a finite number but instead got `{value}`.")] + BadLongitude { document_id: Value, value: Value }, +} + +/// A little macro helper to autogenerate From implementation that needs two `Into`. +/// Given the following parameters: `error_from_sub_error!(FieldIdMapMissingEntry => InternalError)` +/// the macro will create the following code: +/// ```ignore +/// impl From for Error { +/// fn from(error: FieldIdMapMissingEntry) -> Error { +/// Error::from(InternalError::from(error)) +/// } +/// } +/// ``` +macro_rules! error_from_sub_error { + () => {}; + ($sub:ty => $intermediate:ty) => { + impl From<$sub> for Error { + fn from(error: $sub) -> Error { + Error::from(<$intermediate>::from(error)) + } + } + }; + ($($sub:ty => $intermediate:ty $(,)?),+) => { + $(error_from_sub_error!($sub => $intermediate);)+ + }; +} + +error_from_sub_error! 
{ + FieldIdMapMissingEntry => InternalError, + fst::Error => InternalError, + documents::Error => InternalError, + str::Utf8Error => InternalError, + ThreadPoolBuildError => InternalError, + SerializationError => InternalError, + GeoError => UserError, + CriterionError => UserError, +} + +impl From> for Error +where + Error: From, +{ + fn from(error: grenad::Error) -> Error { + match error { + grenad::Error::Io(error) => Error::IoError(error), + grenad::Error::Merge(error) => Error::from(error), + grenad::Error::InvalidCompressionType => { + Error::InternalError(InternalError::GrenadInvalidCompressionType) + } + grenad::Error::InvalidFormatVersion => { + Error::InternalError(InternalError::GrenadInvalidFormatVersion) + } + } + } +} + +impl From for Error { + fn from(error: DocumentsBatchCursorError) -> Error { + match error { + DocumentsBatchCursorError::Grenad(e) => Error::from(e), + DocumentsBatchCursorError::SerdeJson(e) => Error::from(InternalError::from(e)), + } + } +} + +impl From for Error { + fn from(_error: Infallible) -> Error { + unreachable!() + } +} + +impl From for Error { + fn from(error: HeedError) -> Error { + use self::Error::*; + use self::InternalError::*; + use self::SerializationError::*; + use self::UserError::*; + + match error { + HeedError::Io(error) => Error::from(error), + HeedError::Mdb(MdbError::MapFull) => UserError(MaxDatabaseSizeReached), + HeedError::Mdb(MdbError::Invalid) => UserError(InvalidStoreFile), + HeedError::Mdb(error) => InternalError(Store(error)), + HeedError::Encoding => InternalError(Serialization(Encoding { db_name: None })), + HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })), + HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping), + HeedError::DatabaseClosing => InternalError(DatabaseClosing), + HeedError::BadOpenOptions => UserError(InvalidLmdbOpenOptions), + } + } +} + +#[test] +fn conditionally_lookup_for_error_message() { + let prefix = "Attribute `name` is not 
sortable."; + let messages = vec![ + (BTreeSet::new(), "This index does not have configured sortable attributes."), + (BTreeSet::from(["age".to_string()]), "Available sortable attributes are: `age`."), + ]; + + for (list, suffix) in messages { + let err = + UserError::InvalidSortableAttribute { field: "name".to_string(), valid_fields: list }; + + assert_eq!(err.to_string(), format!("{} {}", prefix, suffix)); + } +} diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs new file mode 100644 index 000000000..2cecd1abe --- /dev/null +++ b/milli/src/external_documents_ids.rs @@ -0,0 +1,151 @@ +use std::borrow::Cow; +use std::collections::HashMap; +use std::convert::TryInto; +use std::{fmt, str}; + +use fst::map::IndexedValue; +use fst::{IntoStreamer, Streamer}; +use roaring::RoaringBitmap; + +const DELETED_ID: u64 = u64::MAX; + +pub struct ExternalDocumentsIds<'a> { + pub(crate) hard: fst::Map>, + pub(crate) soft: fst::Map>, + soft_deleted_docids: RoaringBitmap, +} + +impl<'a> ExternalDocumentsIds<'a> { + pub fn new( + hard: fst::Map>, + soft: fst::Map>, + soft_deleted_docids: RoaringBitmap, + ) -> ExternalDocumentsIds<'a> { + ExternalDocumentsIds { hard, soft, soft_deleted_docids } + } + + pub fn into_static(self) -> ExternalDocumentsIds<'static> { + ExternalDocumentsIds { + hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), + soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), + soft_deleted_docids: self.soft_deleted_docids, + } + } + + /// Returns `true` if hard and soft external documents lists are empty. 
+ pub fn is_empty(&self) -> bool { + self.hard.is_empty() && self.soft.is_empty() + } + + pub fn get>(&self, external_id: A) -> Option { + let external_id = external_id.as_ref(); + match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { + Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => { + Some(id.try_into().unwrap()) + } + _otherwise => None, + } + } + + /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they + /// don't contain any soft deleted document id. + pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> { + let mut new_hard_builder = fst::MapBuilder::memory(); + + let union_op = self.hard.op().add(&self.soft).r#union(); + let mut iter = union_op.into_stream(); + while let Some((external_id, docids)) = iter.next() { + // prefer selecting the ids from soft, always + let id = indexed_last_value(docids).unwrap(); + if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) { + new_hard_builder.insert(external_id, id)?; + } + } + drop(iter); + + // Delete soft map completely + self.soft = fst::Map::default().map_data(Cow::Owned)?; + // We save the new map as the new hard map. + self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?; + + Ok(()) + } + + pub fn insert_ids>(&mut self, other: &fst::Map) -> fst::Result<()> { + let union_op = self.soft.op().add(other).r#union(); + + let mut new_soft_builder = fst::MapBuilder::memory(); + let mut iter = union_op.into_stream(); + while let Some((external_id, marked_docids)) = iter.next() { + let id = indexed_last_value(marked_docids).unwrap(); + new_soft_builder.insert(external_id, id)?; + } + + drop(iter); + + // We save the new map as the new soft map. + self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?; + self.merge_soft_into_hard() + } + + /// An helper function to debug this type, returns an `HashMap` of both, + /// soft and hard fst maps, combined. 
+ pub fn to_hash_map(&self) -> HashMap { + let mut map = HashMap::new(); + + let union_op = self.hard.op().add(&self.soft).r#union(); + let mut iter = union_op.into_stream(); + while let Some((external_id, marked_docids)) = iter.next() { + let id = indexed_last_value(marked_docids).unwrap(); + if id != DELETED_ID { + let external_id = str::from_utf8(external_id).unwrap(); + map.insert(external_id.to_owned(), id.try_into().unwrap()); + } + } + + map + } + + fn merge_soft_into_hard(&mut self) -> fst::Result<()> { + if self.soft.len() >= self.hard.len() / 2 { + let union_op = self.hard.op().add(&self.soft).r#union(); + + let mut iter = union_op.into_stream(); + let mut new_hard_builder = fst::MapBuilder::memory(); + while let Some((external_id, marked_docids)) = iter.next() { + let value = indexed_last_value(marked_docids).unwrap(); + if value != DELETED_ID { + new_hard_builder.insert(external_id, value)?; + } + } + + drop(iter); + + self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?; + self.soft = fst::Map::default().map_data(Cow::Owned)?; + } + + Ok(()) + } +} + +impl fmt::Debug for ExternalDocumentsIds<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish() + } +} + +impl Default for ExternalDocumentsIds<'static> { + fn default() -> Self { + ExternalDocumentsIds { + hard: fst::Map::default().map_data(Cow::Owned).unwrap(), + soft: fst::Map::default().map_data(Cow::Owned).unwrap(), + soft_deleted_docids: RoaringBitmap::new(), + } + } +} + +/// Returns the value of the `IndexedValue` with the highest _index_. 
+fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option { + indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value) +} diff --git a/milli/src/facet/facet_type.rs b/milli/src/facet/facet_type.rs new file mode 100644 index 000000000..51dd448e2 --- /dev/null +++ b/milli/src/facet/facet_type.rs @@ -0,0 +1,45 @@ +use std::error::Error; +use std::fmt; +use std::str::FromStr; + +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum FacetType { + String, + Number, +} + +impl fmt::Display for FacetType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + FacetType::String => f.write_str("string"), + FacetType::Number => f.write_str("number"), + } + } +} + +impl FromStr for FacetType { + type Err = InvalidFacetType; + + fn from_str(s: &str) -> Result { + if s.trim().eq_ignore_ascii_case("string") { + Ok(FacetType::String) + } else if s.trim().eq_ignore_ascii_case("number") { + Ok(FacetType::Number) + } else { + Err(InvalidFacetType) + } + } +} + +#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub struct InvalidFacetType; + +impl fmt::Display for InvalidFacetType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(r#"Invalid facet type, must be "string" or "number""#) + } +} + +impl Error for InvalidFacetType {} diff --git a/milli/src/facet/facet_value.rs b/milli/src/facet/facet_value.rs new file mode 100644 index 000000000..eb7fb3c5e --- /dev/null +++ b/milli/src/facet/facet_value.rs @@ -0,0 +1,56 @@ +use ordered_float::OrderedFloat; +use serde::{Serialize, Serializer}; + +#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub enum FacetValue { + String(String), + Number(OrderedFloat), +} + +impl From for FacetValue { + fn from(string: String) -> FacetValue { + FacetValue::String(string) + } +} + +impl From<&str> for FacetValue { + fn from(string: &str) -> FacetValue { + 
FacetValue::String(string.to_owned()) + } +} + +impl From for FacetValue { + fn from(float: f64) -> FacetValue { + FacetValue::Number(OrderedFloat(float)) + } +} + +impl From> for FacetValue { + fn from(float: OrderedFloat) -> FacetValue { + FacetValue::Number(float) + } +} + +impl From for FacetValue { + fn from(integer: i64) -> FacetValue { + FacetValue::Number(OrderedFloat(integer as f64)) + } +} + +/// We implement Serialize ourselves because we need to always serialize it as a string, +/// JSON object keys must be strings not numbers. +// TODO remove this impl and convert them into string, by hand, when required. +impl Serialize for FacetValue { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self { + FacetValue::String(string) => serializer.serialize_str(string), + FacetValue::Number(number) => { + let string = number.to_string(); + serializer.serialize_str(&string) + } + } + } +} diff --git a/milli/src/facet/mod.rs b/milli/src/facet/mod.rs new file mode 100644 index 000000000..274d2588d --- /dev/null +++ b/milli/src/facet/mod.rs @@ -0,0 +1,6 @@ +mod facet_type; +mod facet_value; +pub mod value_encoding; + +pub use self::facet_type::FacetType; +pub use self::facet_value::FacetValue; diff --git a/milli/src/facet/value_encoding.rs b/milli/src/facet/value_encoding.rs new file mode 100644 index 000000000..31c00bd2d --- /dev/null +++ b/milli/src/facet/value_encoding.rs @@ -0,0 +1,49 @@ +// https://stackoverflow.com/a/43305015/1941280 +#[inline] +pub fn f64_into_bytes(float: f64) -> Option<[u8; 8]> { + if float.is_finite() { + if float == 0.0 || float == -0.0 { + return Some(xor_first_bit(0.0_f64.to_be_bytes())); + } else if float.is_sign_negative() { + return Some(xor_all_bits(float.to_be_bytes())); + } else if float.is_sign_positive() { + return Some(xor_first_bit(float.to_be_bytes())); + } + } + None +} + +#[inline] +fn xor_first_bit(mut x: [u8; 8]) -> [u8; 8] { + x[0] ^= 0x80; + x +} + +#[inline] +fn xor_all_bits(mut x: [u8; 
8]) -> [u8; 8] { + x.iter_mut().for_each(|b| *b ^= 0xff); + x +} + +#[cfg(test)] +mod tests { + use std::cmp::Ordering::Less; + + use super::*; + + fn is_sorted(x: &[T]) -> bool { + x.windows(2).map(|x| x[0].cmp(&x[1])).all(|o| o == Less) + } + + #[test] + fn ordered_f64_bytes() { + let a = -13_f64; + let b = -10.0; + let c = -0.0; + let d = 1.0; + let e = 43.0; + + let vec: Vec<_> = [a, b, c, d, e].iter().cloned().map(f64_into_bytes).collect(); + assert!(is_sorted(&vec), "{:?}", vec); + } +} diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs new file mode 100644 index 000000000..810ff755b --- /dev/null +++ b/milli/src/fields_ids_map.rs @@ -0,0 +1,123 @@ +use std::collections::BTreeMap; + +use serde::{Deserialize, Serialize}; + +use crate::FieldId; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FieldsIdsMap { + names_ids: BTreeMap, + ids_names: BTreeMap, + next_id: Option, +} + +impl FieldsIdsMap { + pub fn new() -> FieldsIdsMap { + FieldsIdsMap { names_ids: BTreeMap::new(), ids_names: BTreeMap::new(), next_id: Some(0) } + } + + /// Returns the number of fields ids in the map. + pub fn len(&self) -> usize { + self.names_ids.len() + } + + /// Returns `true` if the map is empty. + pub fn is_empty(&self) -> bool { + self.names_ids.is_empty() + } + + /// Returns the field id related to a field name, it will create a new field id if the + /// name is not already known. Returns `None` if the maximum field id as been reached. + pub fn insert(&mut self, name: &str) -> Option { + match self.names_ids.get(name) { + Some(id) => Some(*id), + None => { + let id = self.next_id?; + self.next_id = id.checked_add(1); + self.names_ids.insert(name.to_owned(), id); + self.ids_names.insert(id, name.to_owned()); + Some(id) + } + } + } + + /// Get the id of a field based on its name. + pub fn id(&self, name: &str) -> Option { + self.names_ids.get(name).copied() + } + + /// Get the name of a field based on its id. 
+ pub fn name(&self, id: FieldId) -> Option<&str> { + self.ids_names.get(&id).map(String::as_str) + } + + /// Remove a field name and id based on its name. + pub fn remove(&mut self, name: &str) -> Option { + match self.names_ids.remove(name) { + Some(id) => self.ids_names.remove_entry(&id).map(|(id, _)| id), + None => None, + } + } + + /// Iterate over the ids and names in the ids order. + pub fn iter(&self) -> impl Iterator { + self.ids_names.iter().map(|(id, name)| (*id, name.as_str())) + } + + /// Iterate over the ids in the order of the ids. + pub fn ids(&'_ self) -> impl Iterator + '_ { + self.ids_names.keys().copied() + } + + /// Iterate over the names in the order of the ids. + pub fn names(&self) -> impl Iterator { + self.ids_names.values().map(AsRef::as_ref) + } +} + +impl Default for FieldsIdsMap { + fn default() -> FieldsIdsMap { + FieldsIdsMap::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn fields_ids_map() { + let mut map = FieldsIdsMap::new(); + + assert_eq!(map.insert("id"), Some(0)); + assert_eq!(map.insert("title"), Some(1)); + assert_eq!(map.insert("description"), Some(2)); + assert_eq!(map.insert("id"), Some(0)); + assert_eq!(map.insert("title"), Some(1)); + assert_eq!(map.insert("description"), Some(2)); + + assert_eq!(map.id("id"), Some(0)); + assert_eq!(map.id("title"), Some(1)); + assert_eq!(map.id("description"), Some(2)); + assert_eq!(map.id("date"), None); + + assert_eq!(map.len(), 3); + + assert_eq!(map.name(0), Some("id")); + assert_eq!(map.name(1), Some("title")); + assert_eq!(map.name(2), Some("description")); + assert_eq!(map.name(4), None); + + assert_eq!(map.remove("title"), Some(1)); + + assert_eq!(map.id("title"), None); + assert_eq!(map.insert("title"), Some(3)); + assert_eq!(map.len(), 3); + + let mut iter = map.iter(); + assert_eq!(iter.next(), Some((0, "id"))); + assert_eq!(iter.next(), Some((2, "description"))); + assert_eq!(iter.next(), Some((3, "title"))); + assert_eq!(iter.next(), None); + } +} 
diff --git a/milli/src/heed_codec/beu32_str_codec.rs b/milli/src/heed_codec/beu32_str_codec.rs
new file mode 100644
index 000000000..c525d6b5b
--- /dev/null
+++ b/milli/src/heed_codec/beu32_str_codec.rs
@@ -0,0 +1,38 @@
+use std::borrow::Cow;
+use std::convert::TryInto;
+use std::str;
+
+/// A codec for `(u32, &str)` pairs: the integer is written as 4 big-endian
+/// bytes, immediately followed by the UTF-8 bytes of the string.
+pub struct BEU32StrCodec;
+
+impl<'a> heed::BytesDecode<'a> for BEU32StrCodec {
+    type DItem = (u32, &'a str);
+
+    /// Decodes the 4-byte big-endian prefix and the UTF-8 string that follows.
+    ///
+    /// Returns `None` when the input is shorter than 4 bytes or when the
+    /// remaining bytes are not valid UTF-8.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        // `split_at` panics when the slice is too short, so bail out first.
+        if bytes.len() < 4 {
+            return None;
+        }
+        let (n_bytes, str_bytes) = bytes.split_at(4);
+        let n = n_bytes.try_into().map(u32::from_be_bytes).ok()?;
+        let s = str::from_utf8(str_bytes).ok()?;
+        Some((n, s))
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for BEU32StrCodec {
+    type EItem = (u32, &'a str);
+
+    /// Writes the integer as 4 big-endian bytes followed by the string bytes.
+    fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> {
+        let mut bytes = Vec::with_capacity(s.len() + 4);
+        bytes.extend_from_slice(&n.to_be_bytes());
+        bytes.extend_from_slice(s.as_bytes());
+        Some(Cow::Owned(bytes))
+    }
+}
diff --git a/milli/src/heed_codec/byte_slice_ref.rs b/milli/src/heed_codec/byte_slice_ref.rs
new file mode 100644
index 000000000..48eda63c5
--- /dev/null
+++ b/milli/src/heed_codec/byte_slice_ref.rs
@@ -0,0 +1,25 @@
+use std::borrow::Cow;
+
+use heed::{BytesDecode, BytesEncode};
+
+/// A codec for values of type `&[u8]`. Unlike `ByteSlice`, its `EItem` and `DItem` associated
+/// types are equivalent (= `&'a [u8]`) and these values can reside within another structure.
+pub struct ByteSliceRefCodec;
+
+impl<'a> BytesEncode<'a> for ByteSliceRefCodec {
+    type EItem = &'a [u8];
+
+    /// Borrows the slice as-is; never returns `None`.
+    fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
+        Some(Cow::Borrowed(item))
+    }
+}
+
+impl<'a> BytesDecode<'a> for ByteSliceRefCodec {
+    type DItem = &'a [u8];
+
+    /// Hands the input slice back unchanged; never returns `None`.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        Some(bytes)
+    }
+}
diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs
new file mode 100644
index 000000000..cc9919ad2
--- /dev/null
+++ b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs
@@ -0,0 +1,48 @@
+use std::borrow::Cow;
+use std::marker::PhantomData;
+
+use heed::{BytesDecode, BytesEncode};
+
+use crate::{try_split_array_at, DocumentId, FieldId};
+
+/// A codec for `(FieldId, DocumentId, value)` triples: a big-endian `u16` field id,
+/// a big-endian `u32` document id, then the value as encoded by `C`.
+pub struct FieldDocIdFacetCodec<C>(PhantomData<C>);
+
+impl<'a, C> BytesDecode<'a> for FieldDocIdFacetCodec<C>
+where
+    C: BytesDecode<'a>,
+{
+    type DItem = (FieldId, DocumentId, C::DItem);
+
+    /// Returns `None` as soon as one of the three parts fails to decode.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
+        let field_id = u16::from_be_bytes(field_id_bytes);
+
+        let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
+        let document_id = u32::from_be_bytes(document_id_bytes);
+
+        let value = C::bytes_decode(bytes)?;
+
+        Some((field_id, document_id, value))
+    }
+}
+
+impl<'a, C> BytesEncode<'a> for FieldDocIdFacetCodec<C>
+where
+    C: BytesEncode<'a>,
+{
+    type EItem = (FieldId, DocumentId, C::EItem);
+
+    /// Returns `None` only when `C` fails to encode the value.
+    fn bytes_encode((field_id, document_id, value): &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
+        let mut bytes = Vec::with_capacity(32);
+        bytes.extend_from_slice(&field_id.to_be_bytes()); // 2 bytes
+        bytes.extend_from_slice(&document_id.to_be_bytes()); // 4 bytes
+        let value_bytes = C::bytes_encode(value)?;
+        // variable length, if f64 -> 16 bytes, if string -> large, potentially
+        bytes.extend_from_slice(&value_bytes);
+        Some(Cow::Owned(bytes))
+    }
+}
diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs
new file
mode 100644
index 000000000..d36ec8434
--- /dev/null
+++ b/milli/src/heed_codec/facet/mod.rs
@@ -0,0 +1,113 @@
+mod field_doc_id_facet_codec;
+mod ordered_f64_codec;
+
+use std::borrow::Cow;
+use std::convert::TryFrom;
+use std::marker::PhantomData;
+
+use heed::types::{DecodeIgnore, OwnedType};
+use heed::{BytesDecode, BytesEncode};
+use roaring::RoaringBitmap;
+
+pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec;
+pub use self::ordered_f64_codec::OrderedF64Codec;
+use super::StrRefCodec;
+use crate::{CboRoaringBitmapCodec, BEU16};
+
+pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec<OrderedF64Codec>;
+pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec<StrRefCodec>;
+pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec<DecodeIgnore>;
+
+pub type FieldIdCodec = OwnedType<BEU16>;
+
+/// Tries to split a slice in half at the given middle point,
+/// `None` if the slice is too short.
+pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
+    if slice.len() >= mid {
+        Some(slice.split_at(mid))
+    } else {
+        None
+    }
+}
+
+/// The key in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`]
+/// databases.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] // TODO: try removing PartialOrd and Ord
+pub struct FacetGroupKey<T> {
+    pub field_id: u16,
+    pub level: u8,
+    pub left_bound: T,
+}
+
+/// The value in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`]
+/// databases.
+#[derive(Debug)]
+pub struct FacetGroupValue {
+    pub size: u8,
+    pub bitmap: RoaringBitmap,
+}
+
+/// A codec for [`FacetGroupKey`]: a big-endian `u16` field id, one level byte,
+/// then the left bound as encoded by `T`.
+pub struct FacetGroupKeyCodec<T> {
+    _phantom: PhantomData<T>,
+}
+
+impl<'a, T> heed::BytesEncode<'a> for FacetGroupKeyCodec<T>
+where
+    T: BytesEncode<'a>,
+    T::EItem: Sized,
+{
+    type EItem = FacetGroupKey<T::EItem>;
+
+    /// Returns `None` only when `T` fails to encode the left bound.
+    fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
+        let mut v = vec![];
+        v.extend_from_slice(&value.field_id.to_be_bytes());
+        v.extend_from_slice(&[value.level]);
+
+        let bound = T::bytes_encode(&value.left_bound)?;
+        v.extend_from_slice(&bound);
+
+        Some(Cow::Owned(v))
+    }
+}
+impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec<T>
+where
+    T: BytesDecode<'a>,
+{
+    type DItem = FacetGroupKey<T::DItem>;
+
+    /// Returns `None` when the input is shorter than the 3-byte header or
+    /// when `T` fails to decode the left bound.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        // Direct indexing panics on truncated input, `get` reports it as `None`.
+        let fid = u16::from_be_bytes(<[u8; 2]>::try_from(bytes.get(0..2)?).ok()?);
+        let level = *bytes.get(2)?;
+        let bound = T::bytes_decode(bytes.get(3..)?)?;
+        Some(FacetGroupKey { field_id: fid, level, left_bound: bound })
+    }
+}
+
+/// A codec for [`FacetGroupValue`]: one size byte followed by the bitmap as
+/// serialized by `CboRoaringBitmapCodec`.
+pub struct FacetGroupValueCodec;
+impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec {
+    type EItem = FacetGroupValue;
+
+    fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
+        let mut v = vec![value.size];
+        CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v);
+        Some(Cow::Owned(v))
+    }
+}
+impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec {
+    type DItem = FacetGroupValue;
+    /// Returns `None` when the input is empty or deserializing the bitmap fails.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        // `split_first` yields `None` on empty input instead of panicking.
+        let (size, bitmap_bytes) = bytes.split_first()?;
+        let bitmap = CboRoaringBitmapCodec::deserialize_from(bitmap_bytes).ok()?;
+        Some(FacetGroupValue { size: *size, bitmap })
+    }
+}
diff --git a/milli/src/heed_codec/facet/ordered_f64_codec.rs b/milli/src/heed_codec/facet/ordered_f64_codec.rs
new file mode 100644
index 000000000..5ac9ffcfc
--- /dev/null
+++ b/milli/src/heed_codec/facet/ordered_f64_codec.rs
@@ -0,0 +1,42 @@
+use std::borrow::Cow;
+use std::convert::TryInto;
+
+use heed::BytesDecode;
+
+use crate::facet::value_encoding::f64_into_bytes;
+
+/// A codec for `f64` values: 8 bytes produced by `f64_into_bytes`, followed by
+/// the 8 big-endian bytes of the float itself (16 bytes in total).
+pub struct OrderedF64Codec;
+
+impl<'a> BytesDecode<'a> for OrderedF64Codec {
+    type DItem = f64;
+
+    /// Reads the float back from the bytes that follow the first 8; returns
+    /// `None` unless the input is exactly 16 bytes long.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        if bytes.len() < 16 {
+            return None;
+        }
+        let f = bytes[8..].try_into().ok().map(f64::from_be_bytes)?;
+        Some(f)
+    }
+}
+
+impl heed::BytesEncode<'_> for OrderedF64Codec {
+    type EItem = f64;
+
+    /// Returns `None` when `f64_into_bytes` rejects the float.
+    fn bytes_encode(f: &Self::EItem) -> Option<Cow<[u8]>> {
+        let mut buffer = [0u8; 16];
+
+        // write the globally ordered float
+        let bytes = f64_into_bytes(*f)?;
+        buffer[..8].copy_from_slice(&bytes[..]);
+        // Then the f64 value just to be able to read it back
+        let bytes = f.to_be_bytes();
+        buffer[8..16].copy_from_slice(&bytes[..]);
+
+        Some(Cow::Owned(buffer.to_vec()))
+    }
+}
diff --git a/milli/src/heed_codec/field_id_word_count_codec.rs b/milli/src/heed_codec/field_id_word_count_codec.rs
new file mode 100644
index 000000000..aca7a80c4
--- /dev/null
+++ b/milli/src/heed_codec/field_id_word_count_codec.rs
@@ -0,0 +1,29 @@
+use std::borrow::Cow;
+
+use crate::{try_split_array_at, FieldId};
+
+/// A codec for `(FieldId, u8)` pairs: a big-endian `u16` field id followed by
+/// a single word-count byte.
+pub struct FieldIdWordCountCodec;
+
+impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec {
+    type DItem = (FieldId, u8);
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
+        let field_id = u16::from_be_bytes(field_id_bytes);
+        let ([word_count], _nothing) = try_split_array_at(bytes)?;
+        Some((field_id, word_count))
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec {
+    type EItem = (FieldId, u8);
+
+    fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> {
+        let mut bytes = Vec::with_capacity(2 + 1);
+        bytes.extend_from_slice(&field_id.to_be_bytes());
+        bytes.push(*word_count);
+        Some(Cow::Owned(bytes))
+    }
+}
diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
new file mode 100644
index 000000000..702dcf661
--- /dev/null
+++ b/milli/src/heed_codec/mod.rs
@@ -0,0 +1,23 @@
+mod beu32_str_codec;
+mod byte_slice_ref;
+pub mod facet;
+mod field_id_word_count_codec;
+mod obkv_codec;
+mod
roaring_bitmap; +mod roaring_bitmap_length; +mod str_beu32_codec; +mod str_ref; +mod str_str_u8_codec; + +pub use byte_slice_ref::ByteSliceRefCodec; +pub use str_ref::StrRefCodec; + +pub use self::beu32_str_codec::BEU32StrCodec; +pub use self::field_id_word_count_codec::FieldIdWordCountCodec; +pub use self::obkv_codec::ObkvCodec; +pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; +pub use self::roaring_bitmap_length::{ + BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, +}; +pub use self::str_beu32_codec::StrBEU32Codec; +pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; diff --git a/milli/src/heed_codec/obkv_codec.rs b/milli/src/heed_codec/obkv_codec.rs new file mode 100644 index 000000000..6dad771a8 --- /dev/null +++ b/milli/src/heed_codec/obkv_codec.rs @@ -0,0 +1,21 @@ +use std::borrow::Cow; + +use obkv::{KvReaderU16, KvWriterU16}; + +pub struct ObkvCodec; + +impl<'a> heed::BytesDecode<'a> for ObkvCodec { + type DItem = KvReaderU16<'a>; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + Some(KvReaderU16::new(bytes)) + } +} + +impl heed::BytesEncode<'_> for ObkvCodec { + type EItem = KvWriterU16>; + + fn bytes_encode(item: &Self::EItem) -> Option> { + item.clone().into_inner().map(Cow::Owned).ok() + } +} diff --git a/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs new file mode 100644 index 000000000..994e23b39 --- /dev/null +++ b/milli/src/heed_codec/roaring_bitmap/bo_roaring_bitmap_codec.rs @@ -0,0 +1,39 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::mem::size_of; + +use roaring::RoaringBitmap; + +pub struct BoRoaringBitmapCodec; + +impl BoRoaringBitmapCodec { + pub fn serialize_into(bitmap: &RoaringBitmap, out: &mut Vec) { + out.reserve(bitmap.len() as usize * size_of::()); + bitmap.iter().map(u32::to_ne_bytes).for_each(|bytes| out.extend_from_slice(&bytes)); + } +} + +impl 
heed::BytesDecode<'_> for BoRoaringBitmapCodec { + type DItem = RoaringBitmap; + + fn bytes_decode(bytes: &[u8]) -> Option { + let mut bitmap = RoaringBitmap::new(); + + for chunk in bytes.chunks(size_of::()) { + let bytes = chunk.try_into().ok()?; + bitmap.push(u32::from_ne_bytes(bytes)); + } + + Some(bitmap) + } +} + +impl heed::BytesEncode<'_> for BoRoaringBitmapCodec { + type EItem = RoaringBitmap; + + fn bytes_encode(item: &Self::EItem) -> Option> { + let mut out = Vec::new(); + BoRoaringBitmapCodec::serialize_into(item, &mut out); + Some(Cow::Owned(out)) + } +} diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs new file mode 100644 index 000000000..1bd132974 --- /dev/null +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -0,0 +1,186 @@ +use std::borrow::Cow; +use std::io; +use std::mem::size_of; + +use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; +use roaring::RoaringBitmap; + +/// This is the limit where using a byteorder became less size efficient +/// than using a direct roaring encoding, it is also the point where we are able +/// to determine the encoding used only by using the array of bytes length. +pub const THRESHOLD: usize = 7; + +/// A conditionnal codec that either use the RoaringBitmap +/// or a lighter ByteOrder en/decoding method. +pub struct CboRoaringBitmapCodec; + +impl CboRoaringBitmapCodec { + pub fn serialized_size(roaring: &RoaringBitmap) -> usize { + if roaring.len() <= THRESHOLD as u64 { + roaring.len() as usize * size_of::() + } else { + roaring.serialized_size() + } + } + + pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec) { + if roaring.len() <= THRESHOLD as u64 { + // If the number of items (u32s) to encode is less than or equal to the threshold + // it means that it would weigh the same or less than the RoaringBitmap + // header, so we directly encode them using ByteOrder instead. 
+ for integer in roaring { + vec.write_u32::(integer).unwrap(); + } + } else { + // Otherwise, we use the classic RoaringBitmapCodec that writes a header. + roaring.serialize_into(vec).unwrap(); + } + } + + pub fn deserialize_from(mut bytes: &[u8]) -> io::Result { + if bytes.len() <= THRESHOLD * size_of::() { + // If there is threshold or less than threshold integers that can fit into this array + // of bytes it means that we used the ByteOrder codec serializer. + let mut bitmap = RoaringBitmap::new(); + while let Ok(integer) = bytes.read_u32::() { + bitmap.insert(integer); + } + Ok(bitmap) + } else { + // Otherwise, it means we used the classic RoaringBitmapCodec and + // that the header takes threshold integers. + RoaringBitmap::deserialize_from(bytes) + } + } + + /// Merge serialized CboRoaringBitmaps in a buffer. + /// + /// if the merged values length is under the threshold, values are directly + /// serialized in the buffer else a RoaringBitmap is created from the + /// values and is serialized in the buffer. + pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec) -> io::Result<()> { + let mut roaring = RoaringBitmap::new(); + let mut vec = Vec::new(); + + for bytes in slices { + if bytes.len() <= THRESHOLD * size_of::() { + let mut reader = bytes.as_ref(); + while let Ok(integer) = reader.read_u32::() { + vec.push(integer); + } + } else { + roaring |= RoaringBitmap::deserialize_from(bytes.as_ref())?; + } + } + + if roaring.is_empty() { + vec.sort_unstable(); + vec.dedup(); + + if vec.len() <= THRESHOLD { + for integer in vec { + buffer.extend_from_slice(&integer.to_ne_bytes()); + } + } else { + // We can unwrap safely because the vector is sorted upper. 
+ let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap(); + roaring.serialize_into(buffer)?; + } + } else { + roaring.extend(vec); + roaring.serialize_into(buffer)?; + } + + Ok(()) + } +} + +impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { + type DItem = RoaringBitmap; + + fn bytes_decode(bytes: &[u8]) -> Option { + Self::deserialize_from(bytes).ok() + } +} + +impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { + type EItem = RoaringBitmap; + + fn bytes_encode(item: &Self::EItem) -> Option> { + let mut vec = Vec::with_capacity(Self::serialized_size(item)); + Self::serialize_into(item, &mut vec); + Some(Cow::Owned(vec)) + } +} + +#[cfg(test)] +mod tests { + use std::iter::FromIterator; + + use heed::{BytesDecode, BytesEncode}; + + use super::*; + + #[test] + fn verify_encoding_decoding() { + let input = RoaringBitmap::from_iter(0..THRESHOLD as u32); + let bytes = CboRoaringBitmapCodec::bytes_encode(&input).unwrap(); + let output = CboRoaringBitmapCodec::bytes_decode(&bytes).unwrap(); + assert_eq!(input, output); + } + + #[test] + fn verify_threshold() { + let input = RoaringBitmap::from_iter(0..THRESHOLD as u32); + + // use roaring bitmap + let mut bytes = Vec::new(); + input.serialize_into(&mut bytes).unwrap(); + let roaring_size = bytes.len(); + + // use byteorder directly + let mut bytes = Vec::new(); + for integer in input { + bytes.write_u32::(integer).unwrap(); + } + let bo_size = bytes.len(); + + assert!(roaring_size > bo_size); + } + + #[test] + fn merge_cbo_roaring_bitmaps() { + let mut buffer = Vec::new(); + + let small_data = vec![ + RoaringBitmap::from_sorted_iter(1..4).unwrap(), + RoaringBitmap::from_sorted_iter(2..5).unwrap(), + RoaringBitmap::from_sorted_iter(4..6).unwrap(), + RoaringBitmap::from_sorted_iter(1..3).unwrap(), + ]; + + let small_data: Vec<_> = + small_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect(); + CboRoaringBitmapCodec::merge_into(small_data.as_slice(), &mut buffer).unwrap(); + 
let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap(); + let expected = RoaringBitmap::from_sorted_iter(1..6).unwrap(); + assert_eq!(bitmap, expected); + + let medium_data = vec![ + RoaringBitmap::from_sorted_iter(1..4).unwrap(), + RoaringBitmap::from_sorted_iter(2..5).unwrap(), + RoaringBitmap::from_sorted_iter(4..8).unwrap(), + RoaringBitmap::from_sorted_iter(0..3).unwrap(), + RoaringBitmap::from_sorted_iter(7..23).unwrap(), + ]; + + let medium_data: Vec<_> = + medium_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect(); + buffer.clear(); + CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap(); + + let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap(); + let expected = RoaringBitmap::from_sorted_iter(0..23).unwrap(); + assert_eq!(bitmap, expected); + } +} diff --git a/milli/src/heed_codec/roaring_bitmap/mod.rs b/milli/src/heed_codec/roaring_bitmap/mod.rs new file mode 100644 index 000000000..6f8045c92 --- /dev/null +++ b/milli/src/heed_codec/roaring_bitmap/mod.rs @@ -0,0 +1,7 @@ +mod bo_roaring_bitmap_codec; +pub mod cbo_roaring_bitmap_codec; +mod roaring_bitmap_codec; + +pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec; +pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec; +pub use self::roaring_bitmap_codec::RoaringBitmapCodec; diff --git a/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs new file mode 100644 index 000000000..8fae9b8fd --- /dev/null +++ b/milli/src/heed_codec/roaring_bitmap/roaring_bitmap_codec.rs @@ -0,0 +1,23 @@ +use std::borrow::Cow; + +use roaring::RoaringBitmap; + +pub struct RoaringBitmapCodec; + +impl heed::BytesDecode<'_> for RoaringBitmapCodec { + type DItem = RoaringBitmap; + + fn bytes_decode(bytes: &[u8]) -> Option { + RoaringBitmap::deserialize_from(bytes).ok() + } +} + +impl heed::BytesEncode<'_> for RoaringBitmapCodec { + type EItem = RoaringBitmap; + + 
fn bytes_encode(item: &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(item.serialized_size()); + item.serialize_into(&mut bytes).ok()?; + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs new file mode 100644 index 000000000..e749680a0 --- /dev/null +++ b/milli/src/heed_codec/roaring_bitmap_length/bo_roaring_bitmap_len_codec.rs @@ -0,0 +1,11 @@ +use std::mem; + +pub struct BoRoaringBitmapLenCodec; + +impl heed::BytesDecode<'_> for BoRoaringBitmapLenCodec { + type DItem = u64; + + fn bytes_decode(bytes: &[u8]) -> Option { + Some((bytes.len() / mem::size_of::()) as u64) + } +} diff --git a/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs new file mode 100644 index 000000000..4f728f1cd --- /dev/null +++ b/milli/src/heed_codec/roaring_bitmap_length/cbo_roaring_bitmap_len_codec.rs @@ -0,0 +1,22 @@ +use std::mem; + +use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec}; +use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD; + +pub struct CboRoaringBitmapLenCodec; + +impl heed::BytesDecode<'_> for CboRoaringBitmapLenCodec { + type DItem = u64; + + fn bytes_decode(bytes: &[u8]) -> Option { + if bytes.len() <= THRESHOLD * mem::size_of::() { + // If there is threshold or less than threshold integers that can fit into this array + // of bytes it means that we used the ByteOrder codec serializer. + BoRoaringBitmapLenCodec::bytes_decode(bytes) + } else { + // Otherwise, it means we used the classic RoaringBitmapCodec and + // that the header takes threshold integers. 
+ RoaringBitmapLenCodec::bytes_decode(bytes) + } + } +} diff --git a/milli/src/heed_codec/roaring_bitmap_length/mod.rs b/milli/src/heed_codec/roaring_bitmap_length/mod.rs new file mode 100644 index 000000000..e503c5c7a --- /dev/null +++ b/milli/src/heed_codec/roaring_bitmap_length/mod.rs @@ -0,0 +1,7 @@ +mod bo_roaring_bitmap_len_codec; +mod cbo_roaring_bitmap_len_codec; +mod roaring_bitmap_len_codec; + +pub use self::bo_roaring_bitmap_len_codec::BoRoaringBitmapLenCodec; +pub use self::cbo_roaring_bitmap_len_codec::CboRoaringBitmapLenCodec; +pub use self::roaring_bitmap_len_codec::RoaringBitmapLenCodec; diff --git a/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs new file mode 100644 index 000000000..4d266e413 --- /dev/null +++ b/milli/src/heed_codec/roaring_bitmap_length/roaring_bitmap_len_codec.rs @@ -0,0 +1,77 @@ +use std::io::{self, BufRead, Read}; +use std::mem; + +use byteorder::{LittleEndian, ReadBytesExt}; + +const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346; +const SERIAL_COOKIE: u16 = 12347; + +pub struct RoaringBitmapLenCodec; + +impl RoaringBitmapLenCodec { + // FIXME should be exported in the RoaringBitmap crate + fn deserialize_from_slice(mut bytes: &[u8]) -> io::Result { + let (size, has_offsets) = { + let cookie = bytes.read_u32::()?; + if cookie == SERIAL_COOKIE_NO_RUNCONTAINER { + (bytes.read_u32::()? 
as usize, true) + } else if (cookie as u16) == SERIAL_COOKIE { + return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported")); + } else { + return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value")); + } + }; + + if size > u16::max_value() as usize + 1 { + return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported")); + } + + let mut description_bytes = vec![0u8; size * 4]; + bytes.read_exact(&mut description_bytes)?; + let description_bytes = &mut &description_bytes[..]; + + if has_offsets { + bytes.consume(size * 4); + } + + let mut length = 0; + for _ in 0..size { + let _key = description_bytes.read_u16::()?; + let len = u64::from(description_bytes.read_u16::()?) + 1; + length += len; + + if len <= 4096 { + bytes.consume(len as usize * mem::size_of::()); + } else { + bytes.consume(1024 * mem::size_of::()) + } + } + + Ok(length) + } +} + +impl heed::BytesDecode<'_> for RoaringBitmapLenCodec { + type DItem = u64; + + fn bytes_decode(bytes: &[u8]) -> Option { + RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok() + } +} + +#[cfg(test)] +mod tests { + use heed::BytesEncode; + use roaring::RoaringBitmap; + + use super::*; + use crate::heed_codec::RoaringBitmapCodec; + + #[test] + fn deserialize_roaring_bitmap_length() { + let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect(); + let bytes = RoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); + let len = RoaringBitmapLenCodec::deserialize_from_slice(&bytes).unwrap(); + assert_eq!(bitmap.len(), len); + } +} diff --git a/milli/src/heed_codec/str_beu32_codec.rs b/milli/src/heed_codec/str_beu32_codec.rs new file mode 100644 index 000000000..d1f379bdc --- /dev/null +++ b/milli/src/heed_codec/str_beu32_codec.rs @@ -0,0 +1,38 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::mem::size_of; +use std::str; + +pub struct StrBEU32Codec; + +impl<'a> heed::BytesDecode<'a> for StrBEU32Codec { + type DItem = (&'a str, u32); 
+ + fn bytes_decode(bytes: &'a [u8]) -> Option { + let footer_len = size_of::(); + + if bytes.len() < footer_len { + return None; + } + + let (word, bytes) = bytes.split_at(bytes.len() - footer_len); + let word = str::from_utf8(word).ok()?; + let pos = bytes.try_into().map(u32::from_be_bytes).ok()?; + + Some((word, pos)) + } +} + +impl<'a> heed::BytesEncode<'a> for StrBEU32Codec { + type EItem = (&'a str, u32); + + fn bytes_encode((word, pos): &Self::EItem) -> Option> { + let pos = pos.to_be_bytes(); + + let mut bytes = Vec::with_capacity(word.len() + pos.len()); + bytes.extend_from_slice(word.as_bytes()); + bytes.extend_from_slice(&pos[..]); + + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/heed_codec/str_ref.rs b/milli/src/heed_codec/str_ref.rs new file mode 100644 index 000000000..ced5cc65e --- /dev/null +++ b/milli/src/heed_codec/str_ref.rs @@ -0,0 +1,22 @@ +use std::borrow::Cow; + +use heed::{BytesDecode, BytesEncode}; + +/// A codec for values of type `&str`. Unlike `Str`, its `EItem` and `DItem` associated +/// types are equivalent (= `&'a str`) and these values can reside within another structure. 
+pub struct StrRefCodec; +impl<'a> BytesEncode<'a> for StrRefCodec { + type EItem = &'a str; + + fn bytes_encode(item: &'a &'a str) -> Option> { + Some(Cow::Borrowed(item.as_bytes())) + } +} +impl<'a> BytesDecode<'a> for StrRefCodec { + type DItem = &'a str; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let s = std::str::from_utf8(bytes).ok()?; + Some(s) + } +} diff --git a/milli/src/heed_codec/str_str_u8_codec.rs b/milli/src/heed_codec/str_str_u8_codec.rs new file mode 100644 index 000000000..60be8ddc7 --- /dev/null +++ b/milli/src/heed_codec/str_str_u8_codec.rs @@ -0,0 +1,57 @@ +use std::borrow::Cow; +use std::str; + +pub struct U8StrStrCodec; + +impl<'a> heed::BytesDecode<'a> for U8StrStrCodec { + type DItem = (u8, &'a str, &'a str); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (n, bytes) = bytes.split_first()?; + let s1_end = bytes.iter().position(|b| *b == 0)?; + let (s1_bytes, rest) = bytes.split_at(s1_end); + let s2_bytes = &rest[1..]; + let s1 = str::from_utf8(s1_bytes).ok()?; + let s2 = str::from_utf8(s2_bytes).ok()?; + Some((*n, s1, s2)) + } +} + +impl<'a> heed::BytesEncode<'a> for U8StrStrCodec { + type EItem = (u8, &'a str, &'a str); + + fn bytes_encode((n, s1, s2): &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); + bytes.push(*n); + bytes.extend_from_slice(s1.as_bytes()); + bytes.push(0); + bytes.extend_from_slice(s2.as_bytes()); + Some(Cow::Owned(bytes)) + } +} +pub struct UncheckedU8StrStrCodec; + +impl<'a> heed::BytesDecode<'a> for UncheckedU8StrStrCodec { + type DItem = (u8, &'a [u8], &'a [u8]); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (n, bytes) = bytes.split_first()?; + let s1_end = bytes.iter().position(|b| *b == 0)?; + let (s1_bytes, rest) = bytes.split_at(s1_end); + let s2_bytes = &rest[1..]; + Some((*n, s1_bytes, s2_bytes)) + } +} + +impl<'a> heed::BytesEncode<'a> for UncheckedU8StrStrCodec { + type EItem = (u8, &'a [u8], &'a [u8]); + + fn bytes_encode((n, s1, s2): 
&Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); + bytes.push(*n); + bytes.extend_from_slice(s1); + bytes.push(0); + bytes.extend_from_slice(s2); + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs new file mode 100644 index 000000000..46f8eb6a3 --- /dev/null +++ b/milli/src/index.rs @@ -0,0 +1,2295 @@ +use std::borrow::Cow; +use std::collections::{HashMap, HashSet}; +use std::fs::File; +use std::mem::size_of; +use std::path::Path; + +use heed::flags::Flags; +use heed::types::*; +use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn}; +use roaring::RoaringBitmap; +use rstar::RTree; +use time::OffsetDateTime; + +use crate::error::{InternalError, UserError}; +use crate::facet::FacetType; +use crate::fields_ids_map::FieldsIdsMap; +use crate::heed_codec::facet::{ + FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, + FieldIdCodec, OrderedF64Codec, +}; +use crate::heed_codec::StrRefCodec; +use crate::{ + default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, + DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, + FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, + Search, StrBEU32Codec, U8StrStrCodec, BEU16, BEU32, +}; + +pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; +pub const DEFAULT_MIN_WORD_LEN_TWO_TYPOS: u8 = 9; + +pub mod main_key { + pub const CRITERIA_KEY: &str = "criteria"; + pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; + pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; + pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; + pub const SOFT_DELETED_DOCUMENTS_IDS_KEY: &str = "soft-deleted-documents-ids"; + pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields"; + pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; + pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; + pub 
const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; + pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; + pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids"; + pub const GEO_RTREE_KEY: &str = "geo-rtree"; + pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; + pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; + pub const PRIMARY_KEY_KEY: &str = "primary-key"; + pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; + pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields"; + pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; + pub const STOP_WORDS_KEY: &str = "stop-words"; + pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids"; + pub const SYNONYMS_KEY: &str = "synonyms"; + pub const WORDS_FST_KEY: &str = "words-fst"; + pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; + pub const CREATED_AT_KEY: &str = "created-at"; + pub const UPDATED_AT_KEY: &str = "updated-at"; + pub const AUTHORIZE_TYPOS: &str = "authorize-typos"; + pub const ONE_TYPO_WORD_LEN: &str = "one-typo-word-len"; + pub const TWO_TYPOS_WORD_LEN: &str = "two-typos-word-len"; + pub const EXACT_WORDS: &str = "exact-words"; + pub const EXACT_ATTRIBUTES: &str = "exact-attributes"; + pub const MAX_VALUES_PER_FACET: &str = "max-values-per-facet"; + pub const PAGINATION_MAX_TOTAL_HITS: &str = "pagination-max-total-hits"; +} + +pub mod db_name { + pub const MAIN: &str = "main"; + pub const WORD_DOCIDS: &str = "word-docids"; + pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; + pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; + pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids"; + pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; + pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; + pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: 
&str = "word-prefix-pair-proximity-docids"; + pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids"; + pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; + pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; + pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; + pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; + pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids"; + pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; + pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; + pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; + pub const DOCUMENTS: &str = "documents"; +} + +#[derive(Clone)] +pub struct Index { + /// The LMDB environment which this index is associated with. + pub(crate) env: heed::Env, + + /// Contains many different types (e.g. the fields ids map). + pub(crate) main: PolyDatabase, + + /// A word and all the documents ids containing the word. + pub word_docids: Database, + + /// A word and all the documents ids containing the word, from attributes for which typos are not allowed. + pub exact_word_docids: Database, + + /// A prefix of word and all the documents ids containing this prefix. + pub word_prefix_docids: Database, + + /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed. + pub exact_word_prefix_docids: Database, + + /// Maps a word and a document id (u32) to all the positions where the given word appears. + pub docid_word_positions: Database, + + /// Maps the proximity between a pair of words with all the docids where this relation appears. + pub word_pair_proximity_docids: Database, + /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. 
+ pub word_prefix_pair_proximity_docids: Database, + /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears. + pub prefix_word_pair_proximity_docids: Database, + + /// Maps the word and the position with the docids that corresponds to it. + pub word_position_docids: Database, + /// Maps the field id and the word count with the docids that corresponds to it. + pub field_id_word_count_docids: Database, + /// Maps the position of a word prefix with all the docids where this prefix appears. + pub word_prefix_position_docids: Database, + + /// Maps the facet field id and the docids for which this field exists + pub facet_id_exists_docids: Database, + + /// Maps the facet field id and ranges of numbers with the docids that corresponds to them. + pub facet_id_f64_docids: Database, FacetGroupValueCodec>, + /// Maps the facet field id and ranges of strings with the docids that corresponds to them. + pub facet_id_string_docids: Database, FacetGroupValueCodec>, + + /// Maps the document id, the facet field id and the numbers. + pub field_id_docid_facet_f64s: Database, + /// Maps the document id, the facet field id and the strings. + pub field_id_docid_facet_strings: Database, + + /// Maps the document id to the document as an obkv store. 
+ pub(crate) documents: Database, ObkvCodec>, +} + +impl Index { + pub fn new_with_creation_dates>( + mut options: heed::EnvOpenOptions, + path: P, + created_at: OffsetDateTime, + updated_at: OffsetDateTime, + ) -> Result { + use db_name::*; + + options.max_dbs(18); + unsafe { options.flag(Flags::MdbAlwaysFreePages) }; + + let env = options.open(path)?; + let main = env.create_poly_database(Some(MAIN))?; + let word_docids = env.create_database(Some(WORD_DOCIDS))?; + let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?; + let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; + let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?; + let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; + let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; + let word_prefix_pair_proximity_docids = + env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; + let prefix_word_pair_proximity_docids = + env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; + let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?; + let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; + let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; + let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; + let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; + let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; + + let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; + let field_id_docid_facet_strings = + env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; + let documents = env.create_database(Some(DOCUMENTS))?; + + Index::set_creation_dates(&env, main, created_at, updated_at)?; + + Ok(Index { + env, + main, + word_docids, + exact_word_docids, + word_prefix_docids, + 
exact_word_prefix_docids, + docid_word_positions, + word_pair_proximity_docids, + word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, + word_position_docids, + word_prefix_position_docids, + field_id_word_count_docids, + facet_id_f64_docids, + facet_id_string_docids, + facet_id_exists_docids, + field_id_docid_facet_f64s, + field_id_docid_facet_strings, + documents, + }) + } + + pub fn new>(options: heed::EnvOpenOptions, path: P) -> Result { + let now = OffsetDateTime::now_utc(); + Self::new_with_creation_dates(options, path, now, now) + } + + fn set_creation_dates( + env: &heed::Env, + main: PolyDatabase, + created_at: OffsetDateTime, + updated_at: OffsetDateTime, + ) -> heed::Result<()> { + let mut txn = env.write_txn()?; + // The db was just created, we update its metadata with the relevant information. + if main.get::<_, Str, SerdeJson>(&txn, main_key::CREATED_AT_KEY)?.is_none() + { + main.put::<_, Str, SerdeJson>( + &mut txn, + main_key::UPDATED_AT_KEY, + &updated_at, + )?; + main.put::<_, Str, SerdeJson>( + &mut txn, + main_key::CREATED_AT_KEY, + &created_at, + )?; + txn.commit()?; + } + Ok(()) + } + + /// Create a write transaction to be able to write into the index. + pub fn write_txn(&self) -> heed::Result { + self.env.write_txn() + } + + /// Create a read transaction to be able to read the index. + pub fn read_txn(&self) -> heed::Result { + self.env.read_txn() + } + + /// Returns the canonicalized path where the heed `Env` of this `Index` lives. + pub fn path(&self) -> &Path { + self.env.path() + } + + /// Returns the size used by the index without the cached pages. + pub fn used_size(&self) -> Result { + Ok(self.env.non_free_pages_size()?) + } + + /// Returns the real size used by the index. + pub fn on_disk_size(&self) -> Result { + Ok(self.env.real_disk_size()?) + } + + /// Returns the map size the underlying environment was opened with, in bytes. + /// + /// This value does not represent the current on-disk size of the index. 
+ /// + /// This value is the maximum between the map size passed during the opening of the index + /// and the on-disk size of the index at the time of opening. + pub fn map_size(&self) -> Result { + Ok(self.env.map_size()?) + } + + pub fn copy_to_path>(&self, path: P, option: CompactionOption) -> Result { + self.env.copy_to_path(path, option).map_err(Into::into) + } + + /// Returns an `EnvClosingEvent` that can be used to wait for the closing event, + /// multiple threads can wait on this event. + /// + /// Make sure that you drop all the copies of `Index`es you have, env closing are triggered + /// when all references are dropped, the last one will eventually close the environment. + pub fn prepare_for_closing(self) -> heed::EnvClosingEvent { + self.env.prepare_for_closing() + } + + /* documents ids */ + + /// Writes the documents ids that corresponds to the user-ids-documents-ids FST. + pub(crate) fn put_documents_ids( + &self, + wtxn: &mut RwTxn, + docids: &RoaringBitmap, + ) -> heed::Result<()> { + self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids) + } + + /// Returns the internal documents ids. + pub fn documents_ids(&self, rtxn: &RoTxn) -> heed::Result { + Ok(self + .main + .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)? + .unwrap_or_default()) + } + + /// Returns the number of documents indexed in the database. + pub fn number_of_documents(&self, rtxn: &RoTxn) -> Result { + let count = + self.main.get::<_, Str, RoaringBitmapLenCodec>(rtxn, main_key::DOCUMENTS_IDS_KEY)?; + Ok(count.unwrap_or_default()) + } + + /* deleted documents ids */ + + /// Writes the soft deleted documents ids. + pub(crate) fn put_soft_deleted_documents_ids( + &self, + wtxn: &mut RwTxn, + docids: &RoaringBitmap, + ) -> heed::Result<()> { + self.main.put::<_, Str, RoaringBitmapCodec>( + wtxn, + main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY, + docids, + ) + } + + /// Returns the soft deleted documents ids. 
+ pub(crate) fn soft_deleted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result { + Ok(self + .main + .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY)? + .unwrap_or_default()) + } + + /* primary key */ + + /// Writes the documents primary key, this is the field name that is used to store the id. + /// Also sets the `updated_at` timestamp of the index to the current UTC time. + pub(crate) fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { + self.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + self.main.put::<_, Str, Str>(wtxn, main_key::PRIMARY_KEY_KEY, primary_key) + } + + /// Deletes the primary key of the documents, this can be done to reset indexes settings. + pub(crate) fn delete_primary_key(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::PRIMARY_KEY_KEY) + } + + /// Returns the documents primary key, `None` if it hasn't been defined. + pub fn primary_key<'t>(&self, rtxn: &'t RoTxn) -> heed::Result> { + self.main.get::<_, Str, Str>(rtxn, main_key::PRIMARY_KEY_KEY) + } + + /* external documents ids */ + + /// Writes the external documents ids and internal ids (i.e. `u32`). + /// The `hard` and `soft` maps are written as raw FST bytes under two separate keys. + pub(crate) fn put_external_documents_ids<'a>( + &self, + wtxn: &mut RwTxn, + external_documents_ids: &ExternalDocumentsIds<'a>, + ) -> heed::Result<()> { + let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids; + let hard = hard.as_fst().as_bytes(); + let soft = soft.as_fst().as_bytes(); + self.main.put::<_, Str, ByteSlice>( + wtxn, + main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY, + hard, + )?; + self.main.put::<_, Str, ByteSlice>( + wtxn, + main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY, + soft, + )?; + Ok(()) + } + + /// Returns the external documents ids map which associate the external ids + /// with the internal ids (i.e. `u32`). 
+ pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result> { + let hard = + self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?; + let soft = + self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?; + // A stored map borrows the bytes read from the transaction; a missing entry becomes an owned default map. + let hard = match hard { + Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?, + None => fst::Map::default().map_data(Cow::Owned)?, + }; + let soft = match soft { + Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?, + None => fst::Map::default().map_data(Cow::Owned)?, + }; + let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?; + Ok(ExternalDocumentsIds::new(hard, soft, soft_deleted_docids)) + } + + /* fields ids map */ + + /// Writes the fields ids map which associates the documents keys with an internal field id + /// (i.e. `FieldId`), this field id is used to identify fields in the obkv documents. + pub(crate) fn put_fields_ids_map( + &self, + wtxn: &mut RwTxn, + map: &FieldsIdsMap, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map) + } + + /// Returns the fields ids map which associates the documents keys with an internal field id + /// (i.e. `FieldId`), this field id is used to identify fields in the obkv documents. + /// A missing entry yields the default map. + pub fn fields_ids_map(&self, rtxn: &RoTxn) -> heed::Result { + Ok(self + .main + .get::<_, Str, SerdeJson>(rtxn, main_key::FIELDS_IDS_MAP_KEY)? + .unwrap_or_default()) + } + + /* geo rtree */ + + /// Writes the provided `rtree` which associates coordinates to documents ids. + pub(crate) fn put_geo_rtree( + &self, + wtxn: &mut RwTxn, + rtree: &RTree, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode>>(wtxn, main_key::GEO_RTREE_KEY, rtree) + } + + /// Delete the `rtree` which associates coordinates to documents ids. 
+ pub(crate) fn delete_geo_rtree(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::GEO_RTREE_KEY) + } + + /// Returns the `rtree` which associates coordinates to documents ids, + /// or `None` when no rtree is stored. + pub fn geo_rtree<'t>(&self, rtxn: &'t RoTxn) -> Result>> { + match self + .main + .get::<_, Str, SerdeBincode>>(rtxn, main_key::GEO_RTREE_KEY)? + { + Some(rtree) => Ok(Some(rtree)), + None => Ok(None), + } + } + + /* geo faceted */ + + /// Writes the documents ids that are faceted with a _geo field. + pub(crate) fn put_geo_faceted_documents_ids( + &self, + wtxn: &mut RwTxn, + docids: &RoaringBitmap, + ) -> heed::Result<()> { + self.main.put::<_, Str, RoaringBitmapCodec>( + wtxn, + main_key::GEO_FACETED_DOCUMENTS_IDS_KEY, + docids, + ) + } + + /// Deletes the documents ids that are faceted with a _geo field. + pub(crate) fn delete_geo_faceted_documents_ids(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY) + } + + /// Retrieves all the documents ids that are faceted with a _geo field. + /// An empty bitmap is returned when no entry is stored. + pub fn geo_faceted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result { + match self + .main + .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY)? + { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + + /* field distribution */ + + /// Writes the field distribution which associates every field name with + /// the number of times it occurs in the documents. + pub(crate) fn put_field_distribution( + &self, + wtxn: &mut RwTxn, + distribution: &FieldDistribution, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson>( + wtxn, + main_key::FIELD_DISTRIBUTION_KEY, + distribution, + ) + } + + /// Returns the field distribution which associates every field name with + /// the number of times it occurs in the documents. 
+ pub fn field_distribution(&self, rtxn: &RoTxn) -> heed::Result { + Ok(self + .main + .get::<_, Str, SerdeJson>(rtxn, main_key::FIELD_DISTRIBUTION_KEY)? + .unwrap_or_default()) + } + + /* displayed fields */ + + /// Writes the fields that must be displayed in the defined order. + /// There must not be any duplicate field. + pub(crate) fn put_displayed_fields( + &self, + wtxn: &mut RwTxn, + fields: &[&str], + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<&[&str]>>( + wtxn, + main_key::DISPLAYED_FIELDS_KEY, + &fields, + ) + } + + /// Deletes the displayed fields, this will make the engine display + /// all the documents attributes in the order of the `FieldsIdsMap`. + pub(crate) fn delete_displayed_fields(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::DISPLAYED_FIELDS_KEY) + } + + /// Returns the displayed fields in the order they were set by the user. If it returns + /// `None` it means that all the attributes are set as displayed in the order of the `FieldsIdsMap`. + pub fn displayed_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { + self.main.get::<_, Str, SerdeBincode>>(rtxn, main_key::DISPLAYED_FIELDS_KEY) + } + + /// Identical to `displayed_fields`, but returns the ids instead. + /// Names that have no id in the fields ids map are skipped. + pub fn displayed_fields_ids(&self, rtxn: &RoTxn) -> Result>> { + match self.displayed_fields(rtxn)? { + Some(fields) => { + let fields_ids_map = self.fields_ids_map(rtxn)?; + let mut fields_ids = Vec::new(); + for name in fields.into_iter() { + if let Some(field_id) = fields_ids_map.id(name) { + fields_ids.push(field_id); + } + } + Ok(Some(fields_ids)) + } + None => Ok(None), + } + } + + /* searchable fields */ + + /// Write the user defined searchable fields and generate the real searchable fields from the specified fields ids map. 
+ pub(crate) fn put_all_searchable_fields_from_fields_ids_map( + &self, + wtxn: &mut RwTxn, + user_fields: &[&str], + fields_ids_map: &FieldsIdsMap, + ) -> heed::Result<()> { + // We can write the user defined searchable fields as-is. + self.put_user_defined_searchable_fields(wtxn, user_fields)?; + + // Now we generate the real searchable fields: + // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. + // 2. Iterate over the user defined searchable fields. + // 3. If a field defined in the fields_ids_map is a subset of a user defined field + // (ie doggo.name is a subset of doggo) then we push it at the end of the fields. + let mut real_fields = user_fields.to_vec(); + + // NOTE(review): a map field matching several user fields is pushed once per match, + // so `real_fields` may contain duplicates; confirm this is acceptable. + for field_from_map in fields_ids_map.names() { + for user_field in user_fields { + if crate::is_faceted_by(field_from_map, user_field) + && !user_fields.contains(&field_from_map) + { + real_fields.push(field_from_map); + } + } + } + + self.put_searchable_fields(wtxn, &real_fields) + } + + /// Deletes both the real and the user defined searchable fields. + /// Returns `true` when at least one of the two deletions reported a removed entry. + pub(crate) fn delete_all_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { + let did_delete_searchable = self.delete_searchable_fields(wtxn)?; + let did_delete_user_defined = self.delete_user_defined_searchable_fields(wtxn)?; + Ok(did_delete_searchable || did_delete_user_defined) + } + + /// Writes the searchable fields, when this list is specified, only these are indexed. + fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<&[&str]>>( + wtxn, + main_key::SEARCHABLE_FIELDS_KEY, + &fields, + ) + } + + /// Deletes the searchable fields, when no fields are specified, all fields are indexed. 
+ fn delete_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::SEARCHABLE_FIELDS_KEY) + } + + /// Returns the searchable fields, those are the fields that are indexed, + /// if the searchable fields aren't there it means that **all** the fields are indexed. + pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result>> { + self.main.get::<_, Str, SerdeBincode>>(rtxn, main_key::SEARCHABLE_FIELDS_KEY) + } + + /// Identical to `searchable_fields`, but returns the ids instead. + /// Names that have no id in the fields ids map are skipped. + pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result>> { + match self.searchable_fields(rtxn)? { + Some(fields) => { + let fields_ids_map = self.fields_ids_map(rtxn)?; + let mut fields_ids = Vec::new(); + for name in fields { + if let Some(field_id) = fields_ids_map.id(name) { + fields_ids.push(field_id); + } + } + Ok(Some(fields_ids)) + } + None => Ok(None), + } + } + + /// Writes the user defined searchable fields, exactly as provided, + /// under `USER_DEFINED_SEARCHABLE_FIELDS_KEY`. + pub(crate) fn put_user_defined_searchable_fields( + &self, + wtxn: &mut RwTxn, + fields: &[&str], + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<_>>( + wtxn, + main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY, + &fields, + ) + } + + /// Deletes the user defined searchable fields. + pub(crate) fn delete_user_defined_searchable_fields( + &self, + wtxn: &mut RwTxn, + ) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY) + } + + /// Returns the user defined searchable fields, `None` if they have not been set. + pub fn user_defined_searchable_fields<'t>( + &self, + rtxn: &'t RoTxn, + ) -> heed::Result>> { + self.main + .get::<_, Str, SerdeBincode>>(rtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY) + } + + /* filterable fields */ + + /// Writes the filterable fields names in the database. 
+ pub(crate) fn put_filterable_fields( + &self, + wtxn: &mut RwTxn, + fields: &HashSet, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields) + } + + /// Deletes the filterable fields names from the database. + pub(crate) fn delete_filterable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::FILTERABLE_FIELDS_KEY) + } + + /// Returns the filterable fields names, an empty set when none are stored. + pub fn filterable_fields(&self, rtxn: &RoTxn) -> heed::Result> { + Ok(self + .main + .get::<_, Str, SerdeJson<_>>(rtxn, main_key::FILTERABLE_FIELDS_KEY)? + .unwrap_or_default()) + } + + /// Identical to `filterable_fields`, but returns ids instead. + /// Names that have no id in the fields ids map are skipped. + pub fn filterable_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.filterable_fields(rtxn)?; + let fields_ids_map = self.fields_ids_map(rtxn)?; + + let mut fields_ids = HashSet::new(); + for name in fields { + if let Some(field_id) = fields_ids_map.id(&name) { + fields_ids.insert(field_id); + } + } + + Ok(fields_ids) + } + + /* sortable fields */ + + /// Writes the sortable fields names in the database. + pub(crate) fn put_sortable_fields( + &self, + wtxn: &mut RwTxn, + fields: &HashSet, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::SORTABLE_FIELDS_KEY, fields) + } + + /// Deletes the sortable fields names from the database. + pub(crate) fn delete_sortable_fields(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::SORTABLE_FIELDS_KEY) + } + + /// Returns the sortable fields names, an empty set when none are stored. + pub fn sortable_fields(&self, rtxn: &RoTxn) -> heed::Result> { + Ok(self + .main + .get::<_, Str, SerdeJson<_>>(rtxn, main_key::SORTABLE_FIELDS_KEY)? + .unwrap_or_default()) + } + + /// Identical to `sortable_fields`, but returns ids instead. 
+ pub fn sortable_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.sortable_fields(rtxn)?; + let fields_ids_map = self.fields_ids_map(rtxn)?; + // Names that have no id in the fields ids map are dropped by `filter_map`. + Ok(fields.into_iter().filter_map(|name| fields_ids_map.id(&name)).collect()) + } + + /* faceted fields */ + + /// Writes the faceted fields in the database (stored under `HIDDEN_FACETED_FIELDS_KEY`). + pub(crate) fn put_faceted_fields( + &self, + wtxn: &mut RwTxn, + fields: &HashSet, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::HIDDEN_FACETED_FIELDS_KEY, fields) + } + + /// Returns the faceted fields names (read from `HIDDEN_FACETED_FIELDS_KEY`), + /// an empty set when none are stored. + pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result> { + Ok(self + .main + .get::<_, Str, SerdeJson<_>>(rtxn, main_key::HIDDEN_FACETED_FIELDS_KEY)? + .unwrap_or_default()) + } + + /// Identical to `faceted_fields`, but returns ids instead. + /// Names that have no id in the fields ids map are skipped. + pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.faceted_fields(rtxn)?; + let fields_ids_map = self.fields_ids_map(rtxn)?; + + let mut fields_ids = HashSet::new(); + for name in fields { + if let Some(field_id) = fields_ids_map.id(&name) { + fields_ids.insert(field_id); + } + } + + Ok(fields_ids) + } + + /* user defined faceted fields */ + + /// Returns the user defined faceted fields names. + /// + /// The user faceted fields are the union of all the filterable, sortable, distinct, and Asc/Desc fields. 
+ pub fn user_defined_faceted_fields(&self, rtxn: &RoTxn) -> Result> { + let filterable_fields = self.filterable_fields(rtxn)?; + let sortable_fields = self.sortable_fields(rtxn)?; + let distinct_field = self.distinct_field(rtxn)?; + let asc_desc_fields = + self.criteria(rtxn)?.into_iter().filter_map(|criterion| match criterion { + Criterion::Asc(field) | Criterion::Desc(field) => Some(field), + _otherwise => None, + }); + + let mut faceted_fields = filterable_fields; + faceted_fields.extend(sortable_fields); + faceted_fields.extend(asc_desc_fields); + if let Some(field) = distinct_field { + faceted_fields.insert(field.to_owned()); + } + + Ok(faceted_fields) + } + + /// Identical to `user_defined_faceted_fields`, but returns ids instead. + pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn) -> Result> { + let fields = self.faceted_fields(rtxn)?; + let fields_ids_map = self.fields_ids_map(rtxn)?; + + let mut fields_ids = HashSet::new(); + for name in fields.into_iter() { + if let Some(field_id) = fields_ids_map.id(&name) { + fields_ids.insert(field_id); + } + } + + Ok(fields_ids) + } + + /* faceted documents ids */ + + /// Writes the documents ids that are faceted under this field id for the given facet type. + pub fn put_faceted_documents_ids( + &self, + wtxn: &mut RwTxn, + field_id: FieldId, + facet_type: FacetType, + docids: &RoaringBitmap, + ) -> heed::Result<()> { + let key = match facet_type { + FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, + FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, + }; + let mut buffer = vec![0u8; key.len() + size_of::()]; + buffer[..key.len()].copy_from_slice(key.as_bytes()); + buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); + self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) + } + + /// Retrieve all the documents ids that are faceted under this field id for the given facet type. 
+ pub fn faceted_documents_ids( + &self, + rtxn: &RoTxn, + field_id: FieldId, + facet_type: FacetType, + ) -> heed::Result { + let key = match facet_type { + FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, + FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, + }; + // The database key is the per-type prefix followed by the big-endian bytes of the field id. + let mut buffer = vec![0u8; key.len() + size_of::()]; + buffer[..key.len()].copy_from_slice(key.as_bytes()); + buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); + match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + + /// Retrieve all the documents which contain this field id. + /// An empty bitmap is returned when the field id has no entry. + pub fn exists_faceted_documents_ids( + &self, + rtxn: &RoTxn, + field_id: FieldId, + ) -> heed::Result { + match self.facet_id_exists_docids.get(rtxn, &BEU16::new(field_id))? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + + /* distinct field */ + + /// Writes the name of the distinct field. + pub(crate) fn put_distinct_field( + &self, + wtxn: &mut RwTxn, + distinct_field: &str, + ) -> heed::Result<()> { + self.main.put::<_, Str, Str>(wtxn, main_key::DISTINCT_FIELD_KEY, distinct_field) + } + + /// Returns the name of the distinct field, `None` if it hasn't been defined. + pub fn distinct_field<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { + self.main.get::<_, Str, Str>(rtxn, main_key::DISTINCT_FIELD_KEY) + } + + /// Deletes the distinct field. + pub(crate) fn delete_distinct_field(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::DISTINCT_FIELD_KEY) + } + + /* criteria */ + + /// Writes the ranking criteria, serialized as JSON. + pub(crate) fn put_criteria( + &self, + wtxn: &mut RwTxn, + criteria: &[Criterion], + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria) + } + + /// Deletes the stored ranking criteria. + pub(crate) fn delete_criteria(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::CRITERIA_KEY) + } + + /// Returns the stored ranking criteria, or `default_criteria()` when none are stored. + pub fn criteria(&self, rtxn: &RoTxn) -> heed::Result> { + match self.main.get::<_, Str, SerdeJson>>(rtxn, main_key::CRITERIA_KEY)? 
{ + Some(criteria) => Ok(criteria), + None => Ok(default_criteria()), + } + } + + /* words fst */ + + /// Writes the FST which is the words dictionary of the engine. + pub(crate) fn put_words_fst>( + &self, + wtxn: &mut RwTxn, + fst: &fst::Set, + ) -> heed::Result<()> { + self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes()) + } + + /// Returns the FST which is the words dictionary of the engine. + /// An empty set is returned when no FST is stored. + pub fn words_fst<'t>(&self, rtxn: &'t RoTxn) -> Result>> { + match self.main.get::<_, Str, ByteSlice>(rtxn, main_key::WORDS_FST_KEY)? { + Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), + None => Ok(fst::Set::default().map_data(Cow::Owned)?), + } + } + + /* stop words */ + + /// Writes the stop words set as raw FST bytes. + pub(crate) fn put_stop_words>( + &self, + wtxn: &mut RwTxn, + fst: &fst::Set, + ) -> heed::Result<()> { + self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes()) + } + + /// Deletes the stop words set. + pub(crate) fn delete_stop_words(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::STOP_WORDS_KEY) + } + + /// Returns the stop words set, `None` when no stop words are stored. + pub fn stop_words<'t>(&self, rtxn: &'t RoTxn) -> Result>> { + match self.main.get::<_, Str, ByteSlice>(rtxn, main_key::STOP_WORDS_KEY)? { + Some(bytes) => Ok(Some(fst::Set::new(bytes)?)), + None => Ok(None), + } + } + + /* synonyms */ + + /// Writes the synonyms map. + pub(crate) fn put_synonyms( + &self, + wtxn: &mut RwTxn, + synonyms: &HashMap, Vec>>, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms) + } + + /// Deletes the synonyms map. + pub(crate) fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY) + } + + /// Returns the synonyms map, the default (empty) map when none is stored. + pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result, Vec>>> { + Ok(self + .main + .get::<_, Str, SerdeBincode<_>>(rtxn, main_key::SYNONYMS_KEY)? 
+ .unwrap_or_default()) + } + + /// Returns the synonyms registered for the exact sequence `words`, if any. + /// The whole synonyms map is loaded through `synonyms` and the matching entry is removed from that copy. + pub fn words_synonyms>( + &self, + rtxn: &RoTxn, + words: &[S], + ) -> heed::Result>>> { + let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); + Ok(self.synonyms(rtxn)?.remove(&words)) + } + + /* words prefixes fst */ + + /// Writes the FST which is the words prefixes dictionary of the engine. + pub(crate) fn put_words_prefixes_fst>( + &self, + wtxn: &mut RwTxn, + fst: &fst::Set, + ) -> heed::Result<()> { + self.main.put::<_, Str, ByteSlice>( + wtxn, + main_key::WORDS_PREFIXES_FST_KEY, + fst.as_fst().as_bytes(), + ) + } + + /// Returns the FST which is the words prefixes dictionary of the engine. + /// An empty set is returned when no FST is stored. + pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn) -> Result>> { + match self.main.get::<_, Str, ByteSlice>(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? { + Some(bytes) => Ok(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?), + None => Ok(fst::Set::default().map_data(Cow::Owned)?), + } + } + + /* word documents count */ + + /// Returns the number of documents ids associated with the given word, + /// it is much faster than deserializing the bitmap and getting the length of it. + pub fn word_documents_count(&self, rtxn: &RoTxn, word: &str) -> heed::Result> { + self.word_docids.remap_data_type::().get(rtxn, word) + } + + /* documents */ + + /// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing + /// or has been soft deleted. + pub fn documents<'t>( + &self, + rtxn: &'t RoTxn, + ids: impl IntoIterator, + ) -> Result)>> { + let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; + let mut documents = Vec::new(); + + for id in ids { + if soft_deleted_documents.contains(id) { + return Err(UserError::AccessingSoftDeletedDocument { document_id: id })?; + } + let kv = self + .documents + .get(rtxn, &BEU32::new(id))? + .ok_or(UserError::UnknownInternalDocumentId { document_id: id })?; + documents.push((id, kv)); + } + + Ok(documents) + } + + /// Returns an iterator over all the documents in the index. 
+ pub fn all_documents<'t>( + &self, + rtxn: &'t RoTxn, + ) -> Result)>>> { + let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?; + + Ok(self + .documents + .iter(rtxn)? + // we cast the BEU32 to a DocumentId + .map(|document| document.map(|(id, obkv)| (id.get(), obkv))) + // soft deleted documents are skipped, iteration errors are passed through + .filter(move |document| { + document.as_ref().map_or(true, |(id, _)| !soft_deleted_docids.contains(*id)) + })) + } + + /// Creates a `FacetDistribution` builder bound to this index and the given read transaction. + pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> { + FacetDistribution::new(rtxn, self) + } + + /// Creates a `Search` builder bound to this index and the given read transaction. + pub fn search<'a>(&'a self, rtxn: &'a RoTxn) -> Search<'a> { + Search::new(rtxn, self) + } + + /// Returns the index creation time. + /// Fails with `InternalError::DatabaseMissingEntry` when the entry is absent. + pub fn created_at(&self, rtxn: &RoTxn) -> Result { + Ok(self + .main + .get::<_, Str, SerdeJson>(rtxn, main_key::CREATED_AT_KEY)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::CREATED_AT_KEY), + })?) + } + + /// Returns the index last updated time. + /// Fails with `InternalError::DatabaseMissingEntry` when the entry is absent. + pub fn updated_at(&self, rtxn: &RoTxn) -> Result { + Ok(self + .main + .get::<_, Str, SerdeJson>(rtxn, main_key::UPDATED_AT_KEY)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::UPDATED_AT_KEY), + })?) + } + + /// Overwrites the index last updated time with `time`. + pub(crate) fn set_updated_at( + &self, + wtxn: &mut RwTxn, + time: &OffsetDateTime, + ) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson>(wtxn, main_key::UPDATED_AT_KEY, time) + } + + /// Returns whether typos are authorized, `true` when nothing is stored. + pub fn authorize_typos(&self, txn: &RoTxn) -> heed::Result { + // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We + // identify 0 as being false, and anything else as true. The absence of a value is true, + // because by default, we authorize typos. + match self.main.get::<_, Str, OwnedType>(txn, main_key::AUTHORIZE_TYPOS)? 
{ + Some(0) => Ok(false), + _ => Ok(true), + } + } + + /// Stores whether typos are authorized. + pub(crate) fn put_authorize_typos(&self, txn: &mut RwTxn, flag: bool) -> heed::Result<()> { + // It is not possible to put a bool in heed with OwnedType, so we put a u8 instead. We + // identify 0 as being false, and anything else as true. The absence of a value is true, + // because by default, we authorize typos. + self.main.put::<_, Str, OwnedType>(txn, main_key::AUTHORIZE_TYPOS, &(flag as u8))?; + + Ok(()) + } + + /// Returns the value stored under `ONE_TYPO_WORD_LEN`. + pub fn min_word_len_one_typo(&self, txn: &RoTxn) -> heed::Result { + // When no value is stored, fall back to `DEFAULT_MIN_WORD_LEN_ONE_TYPO`. + Ok(self + .main + .get::<_, Str, OwnedType>(txn, main_key::ONE_TYPO_WORD_LEN)? + .unwrap_or(DEFAULT_MIN_WORD_LEN_ONE_TYPO)) + } + + /// Stores `val` under `ONE_TYPO_WORD_LEN`. + pub(crate) fn put_min_word_len_one_typo(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { + // The value is written as-is; no validation is performed here. + self.main.put::<_, Str, OwnedType>(txn, main_key::ONE_TYPO_WORD_LEN, &val)?; + Ok(()) + } + + /// Returns the value stored under `TWO_TYPOS_WORD_LEN`. + pub fn min_word_len_two_typos(&self, txn: &RoTxn) -> heed::Result { + // When no value is stored, fall back to `DEFAULT_MIN_WORD_LEN_TWO_TYPOS`. + Ok(self + .main + .get::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN)? + .unwrap_or(DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) + } + + /// Stores `val` under `TWO_TYPOS_WORD_LEN`. + pub(crate) fn put_min_word_len_two_typos(&self, txn: &mut RwTxn, val: u8) -> heed::Result<()> { 
+ // The value is written as-is; no validation is performed here. + self.main.put::<_, Str, OwnedType>(txn, main_key::TWO_TYPOS_WORD_LEN, &val)?; + Ok(()) + } + + /// Lists the words on which typos are not allowed, `None` when no such words are stored. + pub fn exact_words<'t>(&self, txn: &'t RoTxn) -> Result>>> { + match self.main.get::<_, Str, ByteSlice>(txn, main_key::EXACT_WORDS)? { + Some(bytes) => Ok(Some(fst::Set::new(bytes)?.map_data(Cow::Borrowed)?)), + None => Ok(None), + } + } + + /// Writes the exact words set as raw FST bytes. + pub(crate) fn put_exact_words>( + &self, + txn: &mut RwTxn, + words: &fst::Set, + ) -> Result<()> { + self.main.put::<_, Str, ByteSlice>( + txn, + main_key::EXACT_WORDS, + words.as_fst().as_bytes(), + )?; + Ok(()) + } + + /// Returns the exact attributes: attributes for which typo is disallowed. + pub fn exact_attributes<'t>(&self, txn: &'t RoTxn) -> Result> { + Ok(self + .main + .get::<_, Str, SerdeBincode>>(txn, main_key::EXACT_ATTRIBUTES)? + .unwrap_or_default()) + } + + /// Returns the list of exact attributes field ids. + /// Attributes that have no id in the fields ids map are skipped. + pub fn exact_attributes_ids(&self, txn: &RoTxn) -> Result> { + let attrs = self.exact_attributes(txn)?; + let fid_map = self.fields_ids_map(txn)?; + Ok(attrs.iter().filter_map(|attr| fid_map.id(attr)).collect()) + } + + /// Writes the exact attributes to the database. + pub(crate) fn put_exact_attributes(&self, txn: &mut RwTxn, attrs: &[&str]) -> Result<()> { + self.main.put::<_, Str, SerdeBincode<&[&str]>>(txn, main_key::EXACT_ATTRIBUTES, &attrs)?; + Ok(()) + } + + /// Clears the exact attributes from the store. 
+ pub(crate) fn delete_exact_attributes(&self, txn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(txn, main_key::EXACT_ATTRIBUTES) + } + + pub fn max_values_per_facet(&self, txn: &RoTxn) -> heed::Result> { + self.main.get::<_, Str, OwnedType>(txn, main_key::MAX_VALUES_PER_FACET) + } + + pub(crate) fn put_max_values_per_facet(&self, txn: &mut RwTxn, val: usize) -> heed::Result<()> { + self.main.put::<_, Str, OwnedType>(txn, main_key::MAX_VALUES_PER_FACET, &val) + } + + pub(crate) fn delete_max_values_per_facet(&self, txn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(txn, main_key::MAX_VALUES_PER_FACET) + } + + pub fn pagination_max_total_hits(&self, txn: &RoTxn) -> heed::Result> { + self.main.get::<_, Str, OwnedType>(txn, main_key::PAGINATION_MAX_TOTAL_HITS) + } + + pub(crate) fn put_pagination_max_total_hits( + &self, + txn: &mut RwTxn, + val: usize, + ) -> heed::Result<()> { + self.main.put::<_, Str, OwnedType>(txn, main_key::PAGINATION_MAX_TOTAL_HITS, &val) + } + + pub(crate) fn delete_pagination_max_total_hits(&self, txn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(txn, main_key::PAGINATION_MAX_TOTAL_HITS) + } +} + +#[cfg(test)] +pub(crate) mod tests { + use std::collections::HashSet; + use std::ops::Deref; + + use big_s::S; + use heed::{EnvOpenOptions, RwTxn}; + use maplit::hashset; + use tempfile::TempDir; + + use crate::documents::DocumentsBatchReader; + use crate::error::{Error, InternalError}; + use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; + use crate::update::{ + self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig, + IndexDocumentsMethod, IndexerConfig, Settings, + }; + use crate::{db_snap, obkv_to_json, Index, Search, SearchResult}; + + pub(crate) struct TempIndex { + pub inner: Index, + pub indexer_config: IndexerConfig, + pub index_documents_config: IndexDocumentsConfig, + _tempdir: TempDir, + } + + impl Deref for TempIndex { + type Target = Index; 
+ + fn deref(&self) -> &Self::Target { + &self.inner + } + } + + impl TempIndex { + /// Creates a temporary index + pub fn new_with_map_size(size: usize) -> Self { + let mut options = EnvOpenOptions::new(); + options.map_size(size); + let _tempdir = TempDir::new_in(".").unwrap(); + let inner = Index::new(options, _tempdir.path()).unwrap(); + let indexer_config = IndexerConfig::default(); + let index_documents_config = IndexDocumentsConfig::default(); + Self { inner, indexer_config, index_documents_config, _tempdir } + } + /// Creates a temporary index, with a default `4096 * 1000` size. This should be enough for + /// most tests. + pub fn new() -> Self { + Self::new_with_map_size(4096 * 1000) + } + pub fn add_documents_using_wtxn<'t, R>( + &'t self, + wtxn: &mut RwTxn<'t, '_>, + documents: DocumentsBatchReader, + ) -> Result<(), crate::error::Error> + where + R: std::io::Read + std::io::Seek, + { + let builder = IndexDocuments::new( + wtxn, + self, + &self.indexer_config, + self.index_documents_config.clone(), + |_| (), + || false, + ) + .unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error?; + builder.execute()?; + Ok(()) + } + pub fn add_documents( + &self, + documents: DocumentsBatchReader, + ) -> Result<(), crate::error::Error> + where + R: std::io::Read + std::io::Seek, + { + let mut wtxn = self.write_txn().unwrap(); + self.add_documents_using_wtxn(&mut wtxn, documents)?; + wtxn.commit().unwrap(); + Ok(()) + } + + pub fn update_settings( + &self, + update: impl Fn(&mut Settings), + ) -> Result<(), crate::error::Error> { + let mut wtxn = self.write_txn().unwrap(); + self.update_settings_using_wtxn(&mut wtxn, update)?; + wtxn.commit().unwrap(); + Ok(()) + } + pub fn update_settings_using_wtxn<'t>( + &'t self, + wtxn: &mut RwTxn<'t, '_>, + update: impl Fn(&mut Settings), + ) -> Result<(), crate::error::Error> { + let mut builder = update::Settings::new(wtxn, &self.inner, &self.indexer_config); + update(&mut builder); + 
builder.execute(drop, || false)?; + Ok(()) + } + + pub fn delete_document(&self, external_document_id: &str) { + let mut wtxn = self.write_txn().unwrap(); + + let mut delete = DeleteDocuments::new(&mut wtxn, &self).unwrap(); + delete.strategy(self.index_documents_config.deletion_strategy); + + delete.delete_external_id(external_document_id); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + } + } + + #[test] + fn aborting_indexation() { + use std::sync::atomic::AtomicBool; + use std::sync::atomic::Ordering::Relaxed; + + let index = TempIndex::new(); + let mut wtxn = index.inner.write_txn().unwrap(); + + let should_abort = AtomicBool::new(false); + let builder = IndexDocuments::new( + &mut wtxn, + &index.inner, + &index.indexer_config, + index.index_documents_config.clone(), + |_| (), + || should_abort.load(Relaxed), + ) + .unwrap(); + + let (builder, user_error) = builder + .add_documents(documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "bob", "age": 20 }, + { "id": 2, "name": "bob", "age": 20 }, + ])) + .unwrap(); + user_error.unwrap(); + + should_abort.store(true, Relaxed); + let err = builder.execute().unwrap_err(); + + assert!(matches!(err, Error::InternalError(InternalError::AbortedIndexation))); + } + + #[test] + fn initial_field_distribution() { + let index = TempIndex::new(); + index + .add_documents(documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "bob", "age": 20 }, + { "id": 2, "name": "bob", "age": 20 }, + ])) + .unwrap(); + + db_snap!(index, field_distribution, 1); + + db_snap!(index, word_docids, + @r###" + 1 [0, ] + 2 [1, ] + 20 [1, ] + bob [1, ] + kevin [0, ] + "### + ); + + db_snap!(index, field_distribution); + + db_snap!(index, field_distribution, + @" + age 1 + id 2 + name 2 + " + ); + + // snapshot_index!(&index, "1", include: "^field_distribution$"); + + // we add all the documents a second time. 
we are supposed to get the same + // field_distribution in the end + index + .add_documents(documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "bob", "age": 20 }, + { "id": 2, "name": "bob", "age": 20 }, + ])) + .unwrap(); + + db_snap!(index, field_distribution, + @r###" + age 1 + id 2 + name 2 + "### + ); + + // then we update a document by removing one field and another by adding one field + index + .add_documents(documents!([ + { "id": 1, "name": "kevin", "has_dog": true }, + { "id": 2, "name": "bob" } + ])) + .unwrap(); + + db_snap!(index, field_distribution, + @r###" + has_dog 1 + id 2 + name 2 + "### + ); + } + + #[test] + fn put_and_retrieve_disable_typo() { + let index = TempIndex::new(); + let mut txn = index.write_txn().unwrap(); + // default value is true + assert!(index.authorize_typos(&txn).unwrap()); + // set to false + index.put_authorize_typos(&mut txn, false).unwrap(); + txn.commit().unwrap(); + + let txn = index.read_txn().unwrap(); + assert!(!index.authorize_typos(&txn).unwrap()); + } + + #[test] + fn set_min_word_len_for_typos() { + let index = TempIndex::new(); + let mut txn = index.write_txn().unwrap(); + + assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_ONE_TYPO); + assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_TWO_TYPOS); + + index.put_min_word_len_one_typo(&mut txn, 3).unwrap(); + index.put_min_word_len_two_typos(&mut txn, 15).unwrap(); + + txn.commit().unwrap(); + + let txn = index.read_txn().unwrap(); + assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 3); + assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 15); + } + + #[test] + fn add_documents_and_set_searchable_fields() { + let index = TempIndex::new(); + index + .add_documents(documents!([ + { "id": 1, "doggo": "kevin" }, + { "id": 2, "doggo": { "name": "bob", "age": 20 } }, + { "id": 3, "name": "jean", "age": 25 }, + ])) + .unwrap(); + index + .update_settings(|settings| { + 
settings.set_searchable_fields(vec![S("doggo"), S("name")]); + }) + .unwrap(); + + // ensure we get the right real searchable fields + user defined searchable fields + let rtxn = index.read_txn().unwrap(); + + let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]); + + let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); + assert_eq!(user_defined, &["doggo", "name"]); + } + + #[test] + fn set_searchable_fields_and_add_documents() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("doggo"), S("name")]); + }) + .unwrap(); + + // ensure we get the right real searchable fields + user defined searchable fields + let rtxn = index.read_txn().unwrap(); + + let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + assert_eq!(real, &["doggo", "name"]); + let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); + assert_eq!(user_defined, &["doggo", "name"]); + + index + .add_documents(documents!([ + { "id": 1, "doggo": "kevin" }, + { "id": 2, "doggo": { "name": "bob", "age": 20 } }, + { "id": 3, "name": "jean", "age": 25 }, + ])) + .unwrap(); + + // ensure we get the right real searchable fields + user defined searchable fields + let rtxn = index.read_txn().unwrap(); + + let real = index.searchable_fields(&rtxn).unwrap().unwrap(); + assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]); + + let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap(); + assert_eq!(user_defined, &["doggo", "name"]); + } + + #[test] + fn replace_documents_external_ids_and_soft_deletion_check() { + use big_s::S; + use maplit::hashset; + + let mut index = TempIndex::new(); + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; + let index = index; + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + 
settings.set_filterable_fields(hashset! { S("doggo") }); + }) + .unwrap(); + + let mut docs = vec![]; + for i in 0..4 { + docs.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + index.add_documents(documents!(docs)).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 0 + 1 1 + 2 2 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [0, ] + 1 0 1 1 [1, ] + 1 0 2 1 [2, ] + 1 0 3 1 [3, ] + "###); + + let mut docs = vec![]; + for i in 0..3 { + docs.push(serde_json::json!( + { "id": i, "doggo": i + 1 } + )); + } + index.add_documents(documents!(docs)).unwrap(); + + db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[0, 1, 2, ]"); + db_snap!(index, facet_id_f64_docids, 2, @r###" + 1 0 0 1 [0, ] + 1 0 1 1 [1, 4, ] + 1 0 2 1 [2, 5, ] + 1 0 3 1 [3, 6, ] + "###); + + index + .add_documents(documents!([{ "id": 3, "doggo": 4 }, { "id": 3, "doggo": 5 },{ "id": 3, "doggo": 4 }])) + .unwrap(); + + db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + 3 7 + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[0, 1, 2, 3, ]"); + db_snap!(index, facet_id_f64_docids, 3, @r###" + 1 0 0 1 [0, ] + 1 0 1 1 [1, 4, ] + 1 0 2 1 [2, 5, ] + 1 0 3 1 [3, 6, ] + 1 0 4 1 [7, ] + "###); + + index + .update_settings(|settings| { + settings.set_distinct_field("id".to_owned()); + }) + .unwrap(); + + db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + hard: + 0 4 + 1 5 + 2 6 + 3 7 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); + db_snap!(index, facet_id_f64_docids, 3, @r###" + 0 0 0 1 [4, ] + 0 0 1 1 [5, ] 
+ 0 0 2 1 [6, ] + 0 0 3 1 [7, ] + 1 0 1 1 [4, ] + 1 0 2 1 [5, ] + 1 0 3 1 [6, ] + 1 0 4 1 [7, ] + "###); + } + + #[test] + fn replace_documents_in_batches_external_ids_and_soft_deletion_check() { + use big_s::S; + use maplit::hashset; + + let mut index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! { S("doggo") }); + }) + .unwrap(); + + let add_documents = |index: &TempIndex, docs: Vec>| { + let mut wtxn = index.write_txn().unwrap(); + let mut builder = IndexDocuments::new( + &mut wtxn, + index, + &index.indexer_config, + index.index_documents_config.clone(), + |_| (), + || false, + ) + .unwrap(); + for docs in docs { + (builder, _) = builder.add_documents(documents!(docs)).unwrap(); + } + builder.execute().unwrap(); + wtxn.commit().unwrap(); + }; + // First Batch + { + let mut docs1 = vec![]; + for i in 0..4 { + docs1.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + add_documents(&index, vec![docs1]); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 0 + 1 1 + 2 2 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [0, ] + 1 0 1 1 [1, ] + 1 0 2 1 [2, ] + 1 0 3 1 [3, ] + "###); + } + // Second Batch: replace the documents with soft-deletion + { + index.index_documents_config.deletion_strategy = + crate::update::DeletionStrategy::AlwaysSoft; + let mut docs1 = vec![]; + for i in 0..3 { + docs1.push(serde_json::json!( + { "id": i, "doggo": i+1 } + )); + } + let mut docs2 = vec![]; + for i in 0..3 { + docs2.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + add_documents(&index, vec![docs1, docs2]); + + db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 4 + 1 5 + 2 6 + 3 3 + "###); + db_snap!(index, 
soft_deleted_documents_ids, 1, @"[0, 1, 2, ]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [0, 4, ] + 1 0 1 1 [1, 5, ] + 1 0 2 1 [2, 6, ] + 1 0 3 1 [3, ] + "###); + } + let rtxn = index.read_txn().unwrap(); + let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(3), + "doggo": Number(3), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [4]).unwrap()[0]; + + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(0), + "doggo": Number(0), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [5]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(1), + "doggo": Number(1), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [6]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(2), + "doggo": Number(2), + } + "###); + drop(rtxn); + // Third Batch: replace the documents with soft-deletion again + { + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; + let mut docs1 = vec![]; + for i in 0..3 { + docs1.push(serde_json::json!( + { "id": i, "doggo": i+1 } + )); + } + let mut docs2 = vec![]; + for i in 0..4 { + docs2.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + add_documents(&index, vec![docs1, docs2]); + + db_snap!(index, documents_ids, @"[3, 7, 8, 9, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 7 + 1 8 + 2 9 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, 4, 5, 6, ]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [0, 4, 7, ] + 1 0 1 1 
[1, 5, 8, ] + 1 0 2 1 [2, 6, 9, ] + 1 0 3 1 [3, ] + "###); + } + let rtxn = index.read_txn().unwrap(); + let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(3), + "doggo": Number(3), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [7]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(0), + "doggo": Number(0), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [8]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(1), + "doggo": Number(1), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [9]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(2), + "doggo": Number(2), + } + "###); + drop(rtxn); + + // Fourth Batch: replace the documents without soft-deletion + { + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; + let mut docs1 = vec![]; + for i in 0..3 { + docs1.push(serde_json::json!( + { "id": i, "doggo": i+2 } + )); + } + let mut docs2 = vec![]; + for i in 0..1 { + docs2.push(serde_json::json!( + { "id": i, "doggo": i } + )); + } + add_documents(&index, vec![docs1, docs2]); + + db_snap!(index, documents_ids, @"[3, 10, 11, 12, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 0 10 + 1 11 + 2 12 + 3 3 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + db_snap!(index, facet_id_f64_docids, 1, @r###" + 1 0 0 1 [10, ] + 1 0 3 1 [3, 11, ] + 1 0 4 1 [12, ] + "###); + + let rtxn = index.read_txn().unwrap(); + let (_docid, obkv) = index.documents(&rtxn, 
[3]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(3), + "doggo": Number(3), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [10]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(0), + "doggo": Number(0), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [11]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(1), + "doggo": Number(3), + } + "###); + let (_docid, obkv) = index.documents(&rtxn, [12]).unwrap()[0]; + let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "id": Number(2), + "doggo": Number(4), + } + "###); + drop(rtxn); + } + } + + #[test] + fn bug_3021_first() { + // https://github.com/meilisearch/meilisearch/issues/3021 + let mut index = TempIndex::new(); + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("primary_key".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "primary_key": 38 }, + { "primary_key": 34 } + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + + index.delete_document("34"); + + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]"); + + index + .update_settings(|s| { + 
s.set_searchable_fields(vec![]); + }) + .unwrap(); + + // The key point of the test is to verify that the external documents ids + // do not contain any entry for previously soft-deleted document ids + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + hard: + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); + + // So that this document addition works correctly now. + // It would be wrongly interpreted as a replacement before + index.add_documents(documents!({ "primary_key": 34 })).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 4, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); + + // We do the test again, but deleting the document with id 0 instead of id 1 now + index.delete_document("38"); + + db_snap!(index, documents_ids, @"[1, ]"); + db_snap!(index, external_documents_ids, 5, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 5, @"[0, ]"); + + index + .update_settings(|s| { + s.set_searchable_fields(vec!["primary_key".to_owned()]); + }) + .unwrap(); + + db_snap!(index, documents_ids, @"[1, ]"); + db_snap!(index, external_documents_ids, 6, @r###" + soft: + hard: + 34 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); + + // And adding lots of documents afterwards instead of just one. + // These extra subtests don't add much, but it's better than nothing. 
+ index.add_documents(documents!([{ "primary_key": 38 }, { "primary_key": 39 }, { "primary_key": 41 }, { "primary_key": 40 }, { "primary_key": 41 }, { "primary_key": 42 }])).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); + db_snap!(index, external_documents_ids, 7, @r###" + soft: + hard: + 34 1 + 38 0 + 39 2 + 40 4 + 41 3 + 42 5 + "###); + db_snap!(index, soft_deleted_documents_ids, 7, @"[]"); + } + + #[test] + fn bug_3021_second() { + // https://github.com/meilisearch/meilisearch/issues/3021 + let mut index = TempIndex::new(); + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("primary_key".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "primary_key": 30 }, + { "primary_key": 34 } + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 30 0 + 34 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + + index.delete_document("34"); + + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 30 0 + 34 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]"); + + index + .update_settings(|s| { + s.set_searchable_fields(vec![]); + }) + .unwrap(); + + // The key point of the test is to verify that the external documents ids + // do not contain any entry for previously soft-deleted document ids + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + hard: + 30 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); + + // So that when we add a new document + index.add_documents(documents!({ "primary_key": 35, "b": 2 })).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + // The external documents ids don't 
have several external ids pointing to the same + // internal document id + db_snap!(index, external_documents_ids, 4, @r###" + soft: + hard: + 30 0 + 35 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); + + // And when we add 34 again, we don't replace document 35 + index.add_documents(documents!({ "primary_key": 34, "a": 1 })).unwrap(); + + // And document 35 still exists, is not deleted + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); + db_snap!(index, external_documents_ids, 5, @r###" + soft: + hard: + 30 0 + 34 2 + 35 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 5, @"[]"); + + let rtxn = index.read_txn().unwrap(); + let (_docid, obkv) = index.documents(&rtxn, [0]).unwrap()[0]; + let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "primary_key": Number(30), + } + "###); + + // Furthermore, when we retrieve document 34, it is not the result of merging 35 with 34 + let (_docid, obkv) = index.documents(&rtxn, [2]).unwrap()[0]; + let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "primary_key": Number(34), + "a": Number(1), + } + "###); + + drop(rtxn); + + // Add new documents again + index + .add_documents( + documents!([{ "primary_key": 37 }, { "primary_key": 38 }, { "primary_key": 39 }]), + ) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); + db_snap!(index, external_documents_ids, 6, @r###" + soft: + hard: + 30 0 + 34 2 + 35 1 + 37 3 + 38 4 + 39 5 + "###); + db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); + } + + #[test] + fn bug_3021_third() { + // https://github.com/meilisearch/meilisearch/issues/3021 + let mut index = TempIndex::new(); + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + + index + 
.update_settings(|settings| { + settings.set_primary_key("primary_key".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "primary_key": 3 }, + { "primary_key": 4 }, + { "primary_key": 5 } + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 3 0 + 4 1 + 5 2 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + + index.delete_document("3"); + + db_snap!(index, documents_ids, @"[1, 2, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 3 0 + 4 1 + 5 2 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[0, ]"); + + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; + + index.add_documents(documents!([{ "primary_key": "4", "a": 2 }])).unwrap(); + + db_snap!(index, documents_ids, @"[2, 3, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 4 3 + 5 2 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); + + index + .add_documents(documents!([ + { "primary_key": "3" }, + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 3 0 + 4 3 + 5 2 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); + } + + #[test] + fn bug_3021_fourth() { + // https://github.com/meilisearch/meilisearch/issues/3021 + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; + + index + .update_settings(|settings| { + settings.set_primary_key("primary_key".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "primary_key": 11 }, + { "primary_key": 4 }, + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 11 0 + 4 1 + "###); + db_snap!(index, 
soft_deleted_documents_ids, @"[]"); + + index + .add_documents(documents!([ + { "primary_key": 4, "a": 0 }, + { "primary_key": 1 }, + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 1 3 + 11 0 + 4 2 + "###); + db_snap!(index, soft_deleted_documents_ids, @"[1, ]"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.strategy(DeletionStrategy::AlwaysHard); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 1 3 + 11 0 + 4 2 + "###); + db_snap!(index, soft_deleted_documents_ids, @"[]"); + + index + .add_documents(documents!([ + { "primary_key": 4, "a": 1 }, + { "primary_key": 1, "a": 0 }, + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 4, ]"); + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 1 4 + 11 0 + 4 1 + "###); + db_snap!(index, soft_deleted_documents_ids, @"[2, 3, ]"); + + let rtxn = index.read_txn().unwrap(); + let search = Search::new(&rtxn, &index); + let SearchResult { matching_words: _, candidates: _, mut documents_ids } = + search.execute().unwrap(); + let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap(); + documents_ids.sort_unstable(); + let docs = index.documents(&rtxn, documents_ids).unwrap(); + let mut all_ids = HashSet::new(); + for (_docid, obkv) in docs { + let id = obkv.get(primary_key_id).unwrap(); + assert!(all_ids.insert(id)); + } + } +} diff --git a/milli/src/lib.rs b/milli/src/lib.rs new file mode 100644 index 000000000..865195df5 --- /dev/null +++ b/milli/src/lib.rs @@ -0,0 +1,332 @@ +#![cfg_attr(all(test, fuzzing), feature(no_coverage))] +#[macro_use] +pub mod documents; + +mod asc_desc; +mod criterion; +mod error; +mod external_documents_ids; +pub mod facet; +mod fields_ids_map; +pub mod 
heed_codec; +pub mod index; +pub mod proximity; +mod search; +pub mod update; + +#[cfg(test)] +#[macro_use] +pub mod snapshot_tests; + +use std::collections::{BTreeMap, HashMap}; +use std::convert::{TryFrom, TryInto}; +use std::hash::BuildHasherDefault; + +pub use filter_parser::{Condition, FilterCondition, Span, Token}; +use fxhash::{FxHasher32, FxHasher64}; +pub use grenad::CompressionType; +use serde_json::Value; +pub use {charabia as tokenizer, heed}; + +pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; +pub use self::criterion::{default_criteria, Criterion, CriterionError}; +pub use self::error::{ + Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, +}; +pub use self::external_documents_ids::ExternalDocumentsIds; +pub use self::fields_ids_map::FieldsIdsMap; +pub use self::heed_codec::{ + BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, + CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, + RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec, UncheckedU8StrStrCodec, +}; +pub use self::index::Index; +pub use self::search::{ + CriterionImplementationStrategy, FacetDistribution, Filter, FormatOptions, MatchBounds, + MatcherBuilder, MatchingWord, MatchingWords, Search, SearchResult, TermsMatchingStrategy, + DEFAULT_VALUES_PER_FACET, +}; + +pub type Result = std::result::Result; + +pub type Attribute = u32; +pub type BEU16 = heed::zerocopy::U16; +pub type BEU32 = heed::zerocopy::U32; +pub type BEU64 = heed::zerocopy::U64; +pub type DocumentId = u32; +pub type FastMap4 = HashMap>; +pub type FastMap8 = HashMap>; +pub type FieldDistribution = BTreeMap; +pub type FieldId = u16; +pub type Object = serde_json::Map; +pub type Position = u32; +pub type RelativePosition = u16; +pub type SmallString32 = smallstr::SmallString<[u8; 32]>; +pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; +pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; +pub type SmallVec8 = 
smallvec::SmallVec<[T; 8]>; + +/// A GeoPoint is a point in cartesian plan, called xyz_point in the code. Its metadata +/// is a tuple composed of 1. the DocumentId of the associated document and 2. the original point +/// expressed in term of latitude and longitude. +pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>; + +/// The maximum length a LMDB key can be. +/// +/// Note that the actual allowed length is a little bit higher, but +/// we keep a margin of safety. +const MAX_LMDB_KEY_LENGTH: usize = 500; + +/// The maximum length a field value can be when inserted in an LMDB key. +/// +/// This number is determined by the keys of the different facet databases +/// and adding a margin of safety. +pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20; + +/// The maximum length a word can be +pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2; + +pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1; + +// Convert an absolute word position into a relative position. +// Return the field id of the attribute related to the absolute position +// and the relative position in the attribute. +pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, RelativePosition) { + ((absolute >> 16) as u16, (absolute & 0xFFFF) as u16) +} + +// Compute the absolute word position with the field id of the attribute and relative position in the attribute. +pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position { + (field_id as u32) << 16 | (relative as u32) +} + +/// Transform a raw obkv store into a JSON Object. 
+pub fn obkv_to_json( + displayed_fields: &[FieldId], + fields_ids_map: &FieldsIdsMap, + obkv: obkv::KvReaderU16, +) -> Result { + displayed_fields + .iter() + .copied() + .flat_map(|id| obkv.get(id).map(|value| (id, value))) + .map(|(id, value)| { + let name = fields_ids_map.name(id).ok_or(error::FieldIdMapMissingEntry::FieldId { + field_id: id, + process: "obkv_to_json", + })?; + let value = serde_json::from_slice(value).map_err(error::InternalError::SerdeJson)?; + Ok((name.to_owned(), value)) + }) + .collect() +} + +/// Transform every field of a raw obkv store into a JSON Object. +pub fn all_obkv_to_json(obkv: obkv::KvReaderU16, fields_ids_map: &FieldsIdsMap) -> Result { + let all_keys = obkv.iter().map(|(k, _v)| k).collect::>(); + obkv_to_json(all_keys.as_slice(), fields_ids_map, obkv) +} + +/// Transform a JSON value into a string that can be indexed. +pub fn json_to_string(value: &Value) -> Option { + fn inner(value: &Value, output: &mut String) -> bool { + use std::fmt::Write; + match value { + Value::Null => false, + Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(), + Value::Number(number) => write!(output, "{}", number).is_ok(), + Value::String(string) => write!(output, "{}", string).is_ok(), + Value::Array(array) => { + let mut count = 0; + for value in array { + if inner(value, output) { + output.push_str(". "); + count += 1; + } + } + // check that at least one value was written + count != 0 + } + Value::Object(object) => { + let mut buffer = String::new(); + let mut count = 0; + for (key, value) in object { + buffer.clear(); + let _ = write!(&mut buffer, "{}: ", key); + if inner(value, &mut buffer) { + buffer.push_str(". "); + // We write the "key: value. " pair only when + // we are sure that the value can be written. 
+ output.push_str(&buffer); + count += 1; + } + } + // check that at least one value was written + count != 0 + } + } + } + + let mut string = String::new(); + if inner(value, &mut string) { + Some(string) + } else { + None + } +} + +/// Divides one slice into two at an index, returns `None` if mid is out of bounds. +fn try_split_at(slice: &[T], mid: usize) -> Option<(&[T], &[T])> { + if mid <= slice.len() { + Some(slice.split_at(mid)) + } else { + None + } +} + +/// Divides one slice into an array and the tail at an index, +/// returns `None` if `N` is out of bounds. +fn try_split_array_at(slice: &[T]) -> Option<([T; N], &[T])> +where + [T; N]: for<'a> TryFrom<&'a [T]>, +{ + let (head, tail) = try_split_at(slice, N)?; + let head = head.try_into().ok()?; + Some((head, tail)) +} + +/// Return the distance between two points in meters. Each points are composed of two f64, +/// one latitude and one longitude. +pub fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 { + let a = geoutils::Location::new(a[0], a[1]); + let b = geoutils::Location::new(b[0], b[1]); + + a.haversine_distance_to(&b).meters() +} + +/// Convert a point expressed in terms of latitude and longitude to a point in the +/// cartesian coordinate expressed in terms of x, y and z. +pub fn lat_lng_to_xyz(coord: &[f64; 2]) -> [f64; 3] { + let [lat, lng] = coord.map(|f| f.to_radians()); + let x = lat.cos() * lng.cos(); + let y = lat.cos() * lng.sin(); + let z = lat.sin(); + + [x, y, z] +} + +/// Returns `true` if the field match one of the faceted fields. +/// See the function [`is_faceted_by`] below to see what “matching” means. +pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator>) -> bool { + faceted_fields.into_iter().any(|facet| is_faceted_by(field, facet.as_ref())) +} + +/// Returns `true` if the field match the facet. 
+/// ``` +/// use milli::is_faceted_by; +/// // -- the valid basics +/// assert!(is_faceted_by("animaux", "animaux")); +/// assert!(is_faceted_by("animaux.chien", "animaux")); +/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux")); +/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien")); +/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois")); +/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure")); +/// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure.couleur")); +/// +/// // -- the wrongs +/// assert!(!is_faceted_by("chien", "chat")); +/// assert!(!is_faceted_by("animaux", "animaux.chien")); +/// assert!(!is_faceted_by("animaux.chien", "animaux.chat")); +/// +/// // -- the strange edge cases +/// assert!(!is_faceted_by("animaux.chien", "anima")); +/// assert!(!is_faceted_by("animaux.chien", "animau")); +/// assert!(!is_faceted_by("animaux.chien", "animaux.")); +/// assert!(!is_faceted_by("animaux.chien", "animaux.c")); +/// assert!(!is_faceted_by("animaux.chien", "animaux.ch")); +/// assert!(!is_faceted_by("animaux.chien", "animaux.chi")); +/// assert!(!is_faceted_by("animaux.chien", "animaux.chie")); +/// ``` +pub fn is_faceted_by(field: &str, facet: &str) -> bool { + field.starts_with(facet) + && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true) +} + +#[cfg(test)] +mod tests { + use serde_json::json; + + use super::*; + + #[test] + fn json_to_string_object() { + let value = json!({ + "name": "John Doe", + "age": 43, + "not_there": null, + }); + + let string = json_to_string(&value).unwrap(); + assert_eq!(string, "name: John Doe. age: 43. 
"); + } + + #[test] + fn json_to_string_array() { + let value = json!([ + { "name": "John Doe" }, + 43, + "hello", + [ "I", "am", "fine" ], + null, + ]); + + let string = json_to_string(&value).unwrap(); + // We don't care about having two point (.) after the other as + // the distance of hard separators is clamped to 8 anyway. + assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . "); + } + + #[test] + fn test_relative_position_conversion() { + assert_eq!((0x0000, 0x0000), relative_from_absolute_position(0x00000000)); + assert_eq!((0x0000, 0xFFFF), relative_from_absolute_position(0x0000FFFF)); + assert_eq!((0xFFFF, 0x0000), relative_from_absolute_position(0xFFFF0000)); + assert_eq!((0xFF00, 0xFF00), relative_from_absolute_position(0xFF00FF00)); + assert_eq!((0xFF00, 0x00FF), relative_from_absolute_position(0xFF0000FF)); + assert_eq!((0x1234, 0x5678), relative_from_absolute_position(0x12345678)); + assert_eq!((0xFFFF, 0xFFFF), relative_from_absolute_position(0xFFFFFFFF)); + } + + #[test] + fn test_absolute_position_conversion() { + assert_eq!(0x00000000, absolute_from_relative_position(0x0000, 0x0000)); + assert_eq!(0x0000FFFF, absolute_from_relative_position(0x0000, 0xFFFF)); + assert_eq!(0xFFFF0000, absolute_from_relative_position(0xFFFF, 0x0000)); + assert_eq!(0xFF00FF00, absolute_from_relative_position(0xFF00, 0xFF00)); + assert_eq!(0xFF0000FF, absolute_from_relative_position(0xFF00, 0x00FF)); + assert_eq!(0x12345678, absolute_from_relative_position(0x1234, 0x5678)); + assert_eq!(0xFFFFFFFF, absolute_from_relative_position(0xFFFF, 0xFFFF)); + } + + #[test] + fn test_all_obkv_to_json() { + let mut fields_ids_map = FieldsIdsMap::new(); + let id1 = fields_ids_map.insert("field1").unwrap(); + let id2 = fields_ids_map.insert("field2").unwrap(); + + let mut writer = obkv::KvWriterU16::memory(); + writer.insert(id1, b"1234").unwrap(); + writer.insert(id2, b"4321").unwrap(); + let contents = writer.into_inner().unwrap(); + let obkv = 
obkv::KvReaderU16::new(&contents); + + let expected = json!({ + "field1": 1234, + "field2": 4321, + }); + let expected = expected.as_object().unwrap(); + let actual = all_obkv_to_json(obkv, &fields_ids_map).unwrap(); + + assert_eq!(&actual, expected); + } +} diff --git a/milli/src/proximity.rs b/milli/src/proximity.rs new file mode 100644 index 000000000..62f490119 --- /dev/null +++ b/milli/src/proximity.rs @@ -0,0 +1,27 @@ +use std::cmp; + +use crate::{relative_from_absolute_position, Position}; + +pub const MAX_DISTANCE: u32 = 8; + +pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { + if lhs <= rhs { + cmp::min(rhs - lhs, MAX_DISTANCE) + } else { + cmp::min((lhs - rhs) + 1, MAX_DISTANCE) + } +} + +pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 { + let (lhs_attr, lhs_index) = relative_from_absolute_position(lhs); + let (rhs_attr, rhs_index) = relative_from_absolute_position(rhs); + if lhs_attr != rhs_attr { + MAX_DISTANCE + } else { + index_proximity(lhs_index as u32, rhs_index as u32) + } +} + +pub fn path_proximity(path: &[Position]) -> u32 { + path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::() +} diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs new file mode 100644 index 000000000..b5afe6778 --- /dev/null +++ b/milli/src/search/criteria/asc_desc.rs @@ -0,0 +1,535 @@ +use std::mem::take; + +use itertools::Itertools; +use log::debug; +use ordered_float::OrderedFloat; +use roaring::RoaringBitmap; + +use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::facet::FacetType; +use crate::heed_codec::facet::FacetGroupKeyCodec; +use crate::heed_codec::ByteSliceRefCodec; +use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates}; +use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; +use crate::search::query_tree::Operation; +use crate::search::CriterionImplementationStrategy; +use crate::{FieldId, Index, Result}; + +/// Threshold on 
the number of candidates that will make +/// the system to choose between one algorithm or another. +const CANDIDATES_THRESHOLD: u64 = 1000; + +pub struct AscDesc<'t> { + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + field_name: String, + field_id: Option, + is_ascending: bool, + query_tree: Option, + candidates: Box> + 't>, + allowed_candidates: RoaringBitmap, + initial_candidates: InitialCandidates, + faceted_candidates: RoaringBitmap, + implementation_strategy: CriterionImplementationStrategy, + parent: Box, +} + +impl<'t> AscDesc<'t> { + pub fn asc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + parent: Box, + field_name: String, + implementation_strategy: CriterionImplementationStrategy, + ) -> Result { + Self::new(index, rtxn, parent, field_name, true, implementation_strategy) + } + + pub fn desc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + parent: Box, + field_name: String, + implementation_strategy: CriterionImplementationStrategy, + ) -> Result { + Self::new(index, rtxn, parent, field_name, false, implementation_strategy) + } + + fn new( + index: &'t Index, + rtxn: &'t heed::RoTxn, + parent: Box, + field_name: String, + is_ascending: bool, + implementation_strategy: CriterionImplementationStrategy, + ) -> Result { + let fields_ids_map = index.fields_ids_map(rtxn)?; + let field_id = fields_ids_map.id(&field_name); + let faceted_candidates = match field_id { + Some(field_id) => { + let number_faceted = + index.faceted_documents_ids(rtxn, field_id, FacetType::Number)?; + let string_faceted = + index.faceted_documents_ids(rtxn, field_id, FacetType::String)?; + number_faceted | string_faceted + } + None => RoaringBitmap::default(), + }; + + Ok(AscDesc { + index, + rtxn, + field_name, + field_id, + is_ascending, + query_tree: None, + candidates: Box::new(std::iter::empty()), + allowed_candidates: RoaringBitmap::new(), + faceted_candidates, + initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), + implementation_strategy, + parent, + }) + 
} +} + +impl<'t> Criterion for AscDesc<'t> { + #[logging_timer::time("AscDesc::{}")] + fn next(&mut self, params: &mut CriterionParameters) -> Result> { + // remove excluded candidates when next is called, instead of doing it in the loop. + self.allowed_candidates -= params.excluded_candidates; + + loop { + debug!( + "Facet {}({}) iteration", + if self.is_ascending { "Asc" } else { "Desc" }, + self.field_name + ); + + match self.candidates.next().transpose()? { + None if !self.allowed_candidates.is_empty() => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: Some(take(&mut self.allowed_candidates)), + filtered_candidates: None, + initial_candidates: Some(self.initial_candidates.take()), + })); + } + None => match self.parent.next(params)? { + Some(CriterionResult { + query_tree, + candidates, + filtered_candidates, + initial_candidates, + }) => { + self.query_tree = query_tree; + let mut candidates = match (&self.query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => { + let context = CriteriaBuilder::new(self.rtxn, self.index)?; + resolve_query_tree(&context, qt, params.wdcache)? 
+ } + (None, None) => self.index.documents_ids(self.rtxn)?, + }; + + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), + } + + if candidates.is_empty() { + continue; + } + + self.allowed_candidates = &candidates - params.excluded_candidates; + self.candidates = match self.field_id { + Some(field_id) => facet_ordered( + self.index, + self.rtxn, + field_id, + self.is_ascending, + candidates & &self.faceted_candidates, + self.implementation_strategy, + )?, + None => Box::new(std::iter::empty()), + }; + } + None => return Ok(None), + }, + Some(mut candidates) => { + candidates -= params.excluded_candidates; + self.allowed_candidates -= &candidates; + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: Some(candidates), + filtered_candidates: None, + initial_candidates: Some(self.initial_candidates.take()), + })); + } + } + } + } +} + +fn facet_ordered_iterative<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: FieldId, + is_ascending: bool, + candidates: RoaringBitmap, +) -> Result> + 't>> { + let number_iter = iterative_facet_number_ordered_iter( + index, + rtxn, + field_id, + is_ascending, + candidates.clone(), + )?; + let string_iter = + iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; + Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) +} + +fn facet_ordered_set_based<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: FieldId, + is_ascending: bool, + candidates: RoaringBitmap, +) -> Result> + 't>> { + let make_iter = if is_ascending { ascending_facet_sort } else { descending_facet_sort }; + + let number_iter = make_iter( + rtxn, + index.facet_id_f64_docids.remap_key_type::>(), + field_id, + candidates.clone(), + )?; + + let 
string_iter = make_iter( + rtxn, + index.facet_id_string_docids.remap_key_type::>(), + field_id, + candidates, + )?; + + Ok(Box::new(number_iter.chain(string_iter))) +} + +/// Returns an iterator over groups of the given candidates in ascending or descending order. +/// +/// It will either use an iterative or a recursive method on the whole facet database depending +/// on the number of candidates to rank. +fn facet_ordered<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: FieldId, + is_ascending: bool, + candidates: RoaringBitmap, + implementation_strategy: CriterionImplementationStrategy, +) -> Result> + 't>> { + match implementation_strategy { + CriterionImplementationStrategy::OnlyIterative => { + facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates) + } + CriterionImplementationStrategy::OnlySetBased => { + facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates) + } + CriterionImplementationStrategy::Dynamic => { + if candidates.len() <= CANDIDATES_THRESHOLD { + facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates) + } else { + facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates) + } + } + } +} + +/// Fetch the whole list of candidates facet number values one by one and order them by it. +/// +/// This function is fast when the amount of candidates to rank is small. +fn iterative_facet_number_ordered_iter<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: FieldId, + is_ascending: bool, + candidates: RoaringBitmap, +) -> Result + 't> { + let mut docids_values = Vec::with_capacity(candidates.len() as usize); + for docid in candidates.iter() { + let left = (field_id, docid, f64::MIN); + let right = (field_id, docid, f64::MAX); + let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?; + let entry = if is_ascending { iter.next() } else { iter.last() }; + if let Some(((_, _, value), ())) = entry.transpose()? 
{ + docids_values.push((docid, OrderedFloat(value))); + } + } + docids_values.sort_unstable_by_key(|(_, v)| *v); + let iter = docids_values.into_iter(); + let iter = if is_ascending { + Box::new(iter) as Box> + } else { + Box::new(iter.rev()) + }; + + // The itertools GroupBy iterator doesn't provide an owned version, we are therefore + // required to collect the result into an owned collection (a Vec). + // https://github.com/rust-itertools/itertools/issues/499 + #[allow(clippy::needless_collect)] + let vec: Vec<_> = iter + .group_by(|(_, v)| *v) + .into_iter() + .map(|(_, ids)| ids.map(|(id, _)| id).collect()) + .collect(); + + Ok(vec.into_iter()) +} + +/// Fetch the whole list of candidates facet string values one by one and order them by it. +/// +/// This function is fast when the amount of candidates to rank is small. +fn iterative_facet_string_ordered_iter<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: FieldId, + is_ascending: bool, + candidates: RoaringBitmap, +) -> Result + 't> { + let mut docids_values = Vec::with_capacity(candidates.len() as usize); + for docid in candidates.iter() { + let left = (field_id, docid, ""); + let right = (field_id, docid.saturating_add(1), ""); + // FIXME Doing this means that it will never be possible to retrieve + // the document with the largest id, u32::MAX (2^32 - 1): `saturating_add(1)` leaves it + // unchanged, so the exclusive range `left..right` is empty. Not sure this is a real problem. + let mut iter = index.field_id_docid_facet_strings.range(rtxn, &(left..right))?; + let entry = if is_ascending { iter.next() } else { iter.last() }; + if let Some(((_, _, value), _)) = entry.transpose()? { + docids_values.push((docid, value)); + } + } + docids_values.sort_unstable_by_key(|(_, v)| *v); + let iter = docids_values.into_iter(); + let iter = if is_ascending { + Box::new(iter) as Box> + } else { + Box::new(iter.rev()) + }; + + // The itertools GroupBy iterator doesn't provide an owned version, we are therefore + // required to collect the result into an owned collection (a Vec). 
+ // https://github.com/rust-itertools/itertools/issues/499 + #[allow(clippy::needless_collect)] + let vec: Vec<_> = iter + .group_by(|(_, v)| *v) + .into_iter() + .map(|(_, ids)| ids.map(|(id, _)| id).collect()) + .collect(); + + Ok(vec.into_iter()) +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use big_s::S; + use maplit::hashset; + + use crate::index::tests::TempIndex; + use crate::{AscDesc, Criterion, Filter, Search, SearchResult}; + + // Note that in this test, only the iterative sort algorithms are used. Set the CANDIDATES_THRESHOLD + // constant to 0 to ensure that the other sort algorithms are also correct. + #[test] + fn sort_criterion_placeholder() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings + .set_sortable_fields(maplit::hashset! { S("id"), S("mod_10"), S("mod_20") }); + settings.set_criteria(vec![Criterion::Sort]); + }) + .unwrap(); + + let mut docs = vec![]; + for i in 0..100 { + docs.push( + serde_json::json!({ "id": i, "mod_10": format!("{}", i % 10), "mod_20": i % 20 }), + ); + } + + index.add_documents(documents!(docs)).unwrap(); + + let all_ids = (0..100).collect::>(); + + let rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.sort_criteria(vec![AscDesc::from_str("mod_10:desc").unwrap()]); + search.limit(100); + + let SearchResult { mut documents_ids, .. 
} = search.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 19, 29, 39, 49, 59, 69, 79, 89, 99, 8, 18, 28, 38, 48, 58, 68, 78, 88, 98, 7, 17, 27, 37, 47, 57, 67, 77, 87, 97, 6, 16, 26, 36, 46, 56, 66, 76, 86, 96, 5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 4, 14, 24, 34, 44, 54, 64, 74, 84, 94, 3, 13, 23, 33, 43, 53, 63, 73, 83, 93, 2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 0, 10, 20, 30, 40, 50, 60, 70, 80, 90]"); + documents_ids.sort(); + assert_eq!(all_ids, documents_ids); + + let mut search = Search::new(&rtxn, &index); + search.sort_criteria(vec![ + AscDesc::from_str("mod_10:desc").unwrap(), + AscDesc::from_str("id:desc").unwrap(), + ]); + search.limit(100); + + let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[99, 89, 79, 69, 59, 49, 39, 29, 19, 9, 98, 88, 78, 68, 58, 48, 38, 28, 18, 8, 97, 87, 77, 67, 57, 47, 37, 27, 17, 7, 96, 86, 76, 66, 56, 46, 36, 26, 16, 6, 95, 85, 75, 65, 55, 45, 35, 25, 15, 5, 94, 84, 74, 64, 54, 44, 34, 24, 14, 4, 93, 83, 73, 63, 53, 43, 33, 23, 13, 3, 92, 82, 72, 62, 52, 42, 32, 22, 12, 2, 91, 81, 71, 61, 51, 41, 31, 21, 11, 1, 90, 80, 70, 60, 50, 40, 30, 20, 10, 0]"); + documents_ids.sort(); + assert_eq!(all_ids, documents_ids); + + let mut search = Search::new(&rtxn, &index); + search.sort_criteria(vec![ + AscDesc::from_str("mod_10:desc").unwrap(), + AscDesc::from_str("mod_20:asc").unwrap(), + ]); + search.limit(100); + + let SearchResult { mut documents_ids, .. 
} = search.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 29, 49, 69, 89, 19, 39, 59, 79, 99, 8, 28, 48, 68, 88, 18, 38, 58, 78, 98, 7, 27, 47, 67, 87, 17, 37, 57, 77, 97, 6, 26, 46, 66, 86, 16, 36, 56, 76, 96, 5, 25, 45, 65, 85, 15, 35, 55, 75, 95, 4, 24, 44, 64, 84, 14, 34, 54, 74, 94, 3, 23, 43, 63, 83, 13, 33, 53, 73, 93, 2, 22, 42, 62, 82, 12, 32, 52, 72, 92, 1, 21, 41, 61, 81, 11, 31, 51, 71, 91, 0, 20, 40, 60, 80, 10, 30, 50, 70, 90]"); + documents_ids.sort(); + assert_eq!(all_ids, documents_ids); + + let mut search = Search::new(&rtxn, &index); + search.sort_criteria(vec![ + AscDesc::from_str("mod_10:desc").unwrap(), + AscDesc::from_str("mod_20:desc").unwrap(), + ]); + search.limit(100); + + let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 39, 59, 79, 99, 9, 29, 49, 69, 89, 18, 38, 58, 78, 98, 8, 28, 48, 68, 88, 17, 37, 57, 77, 97, 7, 27, 47, 67, 87, 16, 36, 56, 76, 96, 6, 26, 46, 66, 86, 15, 35, 55, 75, 95, 5, 25, 45, 65, 85, 14, 34, 54, 74, 94, 4, 24, 44, 64, 84, 13, 33, 53, 73, 93, 3, 23, 43, 63, 83, 12, 32, 52, 72, 92, 2, 22, 42, 62, 82, 11, 31, 51, 71, 91, 1, 21, 41, 61, 81, 10, 30, 50, 70, 90, 0, 20, 40, 60, 80]"); + documents_ids.sort(); + assert_eq!(all_ids, documents_ids); + + let mut search = Search::new(&rtxn, &index); + search.sort_criteria(vec![ + AscDesc::from_str("mod_10:desc").unwrap(), + AscDesc::from_str("mod_20:desc").unwrap(), + AscDesc::from_str("id:desc").unwrap(), + ]); + search.limit(100); + + let SearchResult { mut documents_ids, .. 
} = search.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[99, 79, 59, 39, 19, 89, 69, 49, 29, 9, 98, 78, 58, 38, 18, 88, 68, 48, 28, 8, 97, 77, 57, 37, 17, 87, 67, 47, 27, 7, 96, 76, 56, 36, 16, 86, 66, 46, 26, 6, 95, 75, 55, 35, 15, 85, 65, 45, 25, 5, 94, 74, 54, 34, 14, 84, 64, 44, 24, 4, 93, 73, 53, 33, 13, 83, 63, 43, 23, 3, 92, 72, 52, 32, 12, 82, 62, 42, 22, 2, 91, 71, 51, 31, 11, 81, 61, 41, 21, 1, 90, 70, 50, 30, 10, 80, 60, 40, 20, 0]"); + documents_ids.sort(); + assert_eq!(all_ids, documents_ids); + } + + // Note that in this test, only the iterative sort algorithms are used. Set the CANDIDATES_THRESHOLD + // constant to 0 to ensure that the other sort algorithms are also correct. + #[test] + fn sort_criterion_non_placeholder() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! { S("id"), S("mod_10"), S("mod_20") }); + settings.set_sortable_fields(hashset! { S("id"), S("mod_10"), S("mod_20") }); + settings.set_criteria(vec![Criterion::Sort]); + }) + .unwrap(); + + let mut docs = vec![]; + for i in 0..100 { + docs.push( + serde_json::json!({ "id": i, "mod_10": format!("{}", i % 10), "mod_20": i % 20 }), + ); + } + + index.add_documents(documents!(docs)).unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.filter( + Filter::from_str("mod_10 IN [1, 0, 2] OR mod_20 IN [10, 13] OR id IN [5, 6]") + .unwrap() + .unwrap(), + ); + search.sort_criteria(vec![ + AscDesc::from_str("mod_10:desc").unwrap(), + AscDesc::from_str("mod_20:asc").unwrap(), + AscDesc::from_str("id:desc").unwrap(), + ]); + search.limit(100); + + let SearchResult { mut documents_ids, .. 
} = search.execute().unwrap(); + // The order should be in decreasing value of the id modulo 10, followed by increasing value of the id modulo 20, followed by decreasing value of the id + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 93, 73, 53, 33, 13, 82, 62, 42, 22, 2, 92, 72, 52, 32, 12, 81, 61, 41, 21, 1, 91, 71, 51, 31, 11, 80, 60, 40, 20, 0, 90, 70, 50, 30, 10]"); + let expected_ids = (0..100) + .filter(|id| { + [1, 0, 2].contains(&(id % 10)) + || [10, 13].contains(&(id % 20)) + || [5, 6].contains(id) + }) + .collect::>(); + documents_ids.sort(); + assert_eq!(expected_ids, documents_ids); + + let mut search = Search::new(&rtxn, &index); + search.filter( + Filter::from_str("mod_10 IN [7, 8, 0] OR mod_20 IN [1, 15, 16] OR id IN [0, 4]") + .unwrap() + .unwrap(), + ); + search.sort_criteria(vec![ + AscDesc::from_str("mod_10:asc").unwrap(), + AscDesc::from_str("mod_20:asc").unwrap(), + AscDesc::from_str("id:desc").unwrap(), + ]); + search.limit(100); + + let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); + // The order should be in increasing value of the id modulo 10, followed by increasing value of the id modulo 20, followed by decreasing value of the id + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[80, 60, 40, 20, 0, 90, 70, 50, 30, 10, 81, 61, 41, 21, 1, 4, 95, 75, 55, 35, 15, 96, 76, 56, 36, 16, 87, 67, 47, 27, 7, 97, 77, 57, 37, 17, 88, 68, 48, 28, 8, 98, 78, 58, 38, 18]"); + let expected_ids = (0..100) + .filter(|id| { + [7, 8, 0].contains(&(id % 10)) + || [1, 15, 16].contains(&(id % 20)) + || [0, 4].contains(id) + }) + .collect::>(); + documents_ids.sort(); + assert_eq!(expected_ids, documents_ids); + + let mut search = Search::new(&rtxn, &index); + search.filter( + Filter::from_str("mod_10 IN [1, 0, 2] OR mod_20 IN [10, 13] OR id IN [5, 6]") + .unwrap() + .unwrap(), + ); + search.sort_criteria(vec![AscDesc::from_str("id:desc").unwrap()]); + search.limit(100); + + let SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + // The order should be in decreasing value of the id + let mut expected_ids = (0..100) + .filter(|id| { + [1, 0, 2].contains(&(id % 10)) + || [10, 13].contains(&(id % 20)) + || [5, 6].contains(id) + }) + .collect::>(); + expected_ids.sort(); + expected_ids.reverse(); + assert_eq!(expected_ids, documents_ids); + } +} diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs new file mode 100644 index 000000000..d7ec0d382 --- /dev/null +++ b/milli/src/search/criteria/attribute.rs @@ -0,0 +1,709 @@ +use std::cmp::{self, Ordering}; +use std::collections::binary_heap::PeekMut; +use std::collections::{btree_map, BTreeMap, BinaryHeap, HashMap}; +use std::iter::Peekable; +use std::mem::take; + +use roaring::RoaringBitmap; + +use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; +use crate::search::criteria::{InitialCandidates, Query}; +use crate::search::query_tree::{Operation, QueryKind}; +use crate::search::{ + build_dfa, word_derivations, CriterionImplementationStrategy, WordDerivationsCache, +}; +use crate::Result; + +/// To be able to divide integers by the number of words in the query +/// we want to find a multiplier that allow us to divide by any number between 1 and 10. +/// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). +const LCM_10_FIRST_NUMBERS: u32 = 2520; + +/// Threshold on the number of candidates that will make +/// the system to choose between one algorithm or another. 
+const CANDIDATES_THRESHOLD: u64 = 500; + +type FlattenedQueryTree = Vec>>; + +pub struct Attribute<'t> { + ctx: &'t dyn Context<'t>, + state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>, + initial_candidates: InitialCandidates, + parent: Box, + linear_buckets: Option>, + set_buckets: Option>>, + implementation_strategy: CriterionImplementationStrategy, +} + +impl<'t> Attribute<'t> { + pub fn new( + ctx: &'t dyn Context<'t>, + parent: Box, + implementation_strategy: CriterionImplementationStrategy, + ) -> Self { + Attribute { + ctx, + state: None, + initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), + parent, + linear_buckets: None, + set_buckets: None, + implementation_strategy, + } + } +} + +impl<'t> Criterion for Attribute<'t> { + #[logging_timer::time("Attribute::{}")] + fn next(&mut self, params: &mut CriterionParameters) -> Result> { + // remove excluded candidates when next is called, instead of doing it in the loop. + if let Some((_, _, allowed_candidates)) = self.state.as_mut() { + *allowed_candidates -= params.excluded_candidates; + } + + loop { + match self.state.take() { + Some((query_tree, _, allowed_candidates)) if allowed_candidates.is_empty() => { + return Ok(Some(CriterionResult { + query_tree: Some(query_tree), + candidates: Some(RoaringBitmap::new()), + filtered_candidates: None, + initial_candidates: Some(self.initial_candidates.take()), + })); + } + Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { + let found_candidates = if matches!( + self.implementation_strategy, + CriterionImplementationStrategy::OnlyIterative + ) || (matches!( + self.implementation_strategy, + CriterionImplementationStrategy::Dynamic + ) && allowed_candidates.len() + < CANDIDATES_THRESHOLD) + { + let linear_buckets = match self.linear_buckets.as_mut() { + Some(linear_buckets) => linear_buckets, + None => { + let new_buckets = initialize_linear_buckets( + self.ctx, + &flattened_query_tree, + &allowed_candidates, + )?; + 
self.linear_buckets.get_or_insert(new_buckets.into_iter()) + } + }; + + match linear_buckets.next() { + Some((_score, candidates)) => candidates, + None => { + return Ok(Some(CriterionResult { + query_tree: Some(query_tree), + candidates: Some(RoaringBitmap::new()), + filtered_candidates: None, + initial_candidates: Some(self.initial_candidates.take()), + })); + } + } + } else { + let set_buckets = match self.set_buckets.as_mut() { + Some(set_buckets) => set_buckets, + None => { + let new_buckets = initialize_set_buckets( + self.ctx, + &flattened_query_tree, + &allowed_candidates, + params.wdcache, + )?; + self.set_buckets.get_or_insert(new_buckets) + } + }; + + match set_compute_candidates(set_buckets, &allowed_candidates)? { + Some((_score, candidates)) => candidates, + None => { + return Ok(Some(CriterionResult { + query_tree: Some(query_tree), + candidates: Some(RoaringBitmap::new()), + filtered_candidates: None, + initial_candidates: Some(self.initial_candidates.take()), + })); + } + } + }; + + allowed_candidates -= &found_candidates; + + self.state = + Some((query_tree.clone(), flattened_query_tree, allowed_candidates)); + + return Ok(Some(CriterionResult { + query_tree: Some(query_tree), + candidates: Some(found_candidates), + filtered_candidates: None, + initial_candidates: Some(self.initial_candidates.take()), + })); + } + None => match self.parent.next(params)? { + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + initial_candidates, + }) => { + let mut candidates = match candidates { + Some(candidates) => candidates, + None => { + resolve_query_tree(self.ctx, &query_tree, params.wdcache)? 
+ - params.excluded_candidates + } + }; + + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + + let flattened_query_tree = flatten_query_tree(&query_tree); + + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), + } + + self.state = Some((query_tree, flattened_query_tree, candidates)); + self.linear_buckets = None; + } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + initial_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + initial_candidates, + })); + } + None => return Ok(None), + }, + } + } + } +} + +/// QueryPositionIterator is an Iterator over positions of a Query, +/// It contains iterators over words positions. +struct QueryPositionIterator<'t> { + #[allow(clippy::type_complexity)] + inner: + Vec> + 't>>>, +} + +impl<'t> QueryPositionIterator<'t> { + fn new( + ctx: &'t dyn Context<'t>, + queries: &[Query], + wdcache: &mut WordDerivationsCache, + ) -> Result { + let mut inner = Vec::with_capacity(queries.len()); + for query in queries { + let in_prefix_cache = query.prefix && ctx.in_prefix_cache(query.kind.word()); + match &query.kind { + QueryKind::Exact { word, .. } => { + if !query.prefix || in_prefix_cache { + let word = query.kind.word(); + let iter = ctx.word_position_iterator(word, in_prefix_cache)?; + inner.push(iter.peekable()); + } else { + for (word, _) in word_derivations(word, true, 0, ctx.words_fst(), wdcache)? + { + let iter = ctx.word_position_iterator(word, in_prefix_cache)?; + inner.push(iter.peekable()); + } + } + } + QueryKind::Tolerant { typo, word } => { + for (word, _) in + word_derivations(word, query.prefix, *typo, ctx.words_fst(), wdcache)? 
{ + let iter = ctx.word_position_iterator(word, in_prefix_cache)?; + inner.push(iter.peekable()); + } + } + }; + } + + Ok(Self { inner }) + } +} + +impl<'t> Iterator for QueryPositionIterator<'t> { + type Item = heed::Result<(u32, RoaringBitmap)>; + + fn next(&mut self) -> Option { + // sort inner words from the closest next position to the farthest next position. + let expected_pos = self + .inner + .iter_mut() + .filter_map(|wli| match wli.peek() { + Some(Ok(((_, pos), _))) => Some(*pos), + _ => None, + }) + .min()?; + + let mut candidates = None; + for wli in self.inner.iter_mut() { + if let Some(Ok(((_, pos), _))) = wli.peek() { + if *pos > expected_pos { + continue; + } + } + + match wli.next() { + Some(Ok((_, docids))) => { + candidates = match candidates.take() { + Some(candidates) => Some(candidates | docids), + None => Some(docids), + } + } + Some(Err(e)) => return Some(Err(e)), + None => continue, + } + } + + candidates.map(|candidates| Ok((expected_pos, candidates))) + } +} + +/// A Branch represents a possible alternative of the original query and is built with the Query Tree, +/// This branch allows us to iterate over meta-interval of positions. 
+struct Branch<'t> { + query_level_iterator: Vec<(u32, RoaringBitmap, Peekable>)>, + last_result: (u32, RoaringBitmap), + branch_size: u32, +} + +impl<'t> Branch<'t> { + fn new( + ctx: &'t dyn Context<'t>, + flatten_branch: &[Vec], + wdcache: &mut WordDerivationsCache, + allowed_candidates: &RoaringBitmap, + ) -> Result { + let mut query_level_iterator = Vec::new(); + for queries in flatten_branch { + let mut qli = QueryPositionIterator::new(ctx, queries, wdcache)?.peekable(); + let (pos, docids) = qli.next().transpose()?.unwrap_or((0, RoaringBitmap::new())); + query_level_iterator.push((pos, docids & allowed_candidates, qli)); + } + + let mut branch = Self { + query_level_iterator, + last_result: (0, RoaringBitmap::new()), + branch_size: flatten_branch.len() as u32, + }; + + branch.update_last_result(); + + Ok(branch) + } + + /// return the next meta-interval of the branch, + /// and update inner interval in order to be ranked by the BinaryHeap. + fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result { + // update the first query. + let index = self.lowest_iterator_index(); + match self.query_level_iterator.get_mut(index) { + Some((cur_pos, cur_docids, qli)) => match qli.next().transpose()? 
{ + Some((next_pos, next_docids)) => { + *cur_pos = next_pos; + *cur_docids |= next_docids & allowed_candidates; + self.update_last_result(); + Ok(true) + } + None => Ok(false), + }, + None => Ok(false), + } + } + + fn lowest_iterator_index(&mut self) -> usize { + let (index, _) = self + .query_level_iterator + .iter_mut() + .map(|(pos, docids, qli)| { + if docids.is_empty() { + 0 + } else { + match qli.peek() { + Some(result) => { + result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0) + } + None => u32::MAX, + } + } + }) + .enumerate() + .min_by_key(|(_, diff)| *diff) + .unwrap_or((0, 0)); + + index + } + + fn update_last_result(&mut self) { + let mut result_pos = 0; + let mut result_docids = None; + + for (pos, docids, _qli) in self.query_level_iterator.iter() { + result_pos += pos; + result_docids = result_docids + .take() + .map_or_else(|| Some(docids.clone()), |candidates| Some(candidates & docids)); + } + + // remove last result docids from inner iterators + if let Some(docids) = result_docids.as_ref() { + for (_, query_docids, _) in self.query_level_iterator.iter_mut() { + *query_docids -= docids; + } + } + + self.last_result = (result_pos, result_docids.unwrap_or_default()); + } + + /// return the score of the current inner interval. + fn compute_rank(&self) -> u32 { + // we compute a rank from the position. + let (pos, _) = self.last_result; + pos.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size + } + + fn cmp(&self, other: &Self) -> Ordering { + let self_rank = self.compute_rank(); + let other_rank = other.compute_rank(); + + // lower rank is better, and because BinaryHeap give the higher ranked branch, we reverse it. 
+ self_rank.cmp(&other_rank).reverse() + } +} + +impl<'t> Ord for Branch<'t> { + fn cmp(&self, other: &Self) -> Ordering { + self.cmp(other) + } +} + +impl<'t> PartialOrd for Branch<'t> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'t> PartialEq for Branch<'t> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl<'t> Eq for Branch<'t> {} + +fn initialize_set_buckets<'t>( + ctx: &'t dyn Context<'t>, + branches: &FlattenedQueryTree, + allowed_candidates: &RoaringBitmap, + wdcache: &mut WordDerivationsCache, +) -> Result>> { + let mut heap = BinaryHeap::new(); + for flatten_branch in branches { + let branch = Branch::new(ctx, flatten_branch, wdcache, allowed_candidates)?; + heap.push(branch); + } + + Ok(heap) +} + +fn set_compute_candidates( + branches_heap: &mut BinaryHeap, + allowed_candidates: &RoaringBitmap, +) -> Result> { + let mut final_candidates: Option<(u32, RoaringBitmap)> = None; + let mut allowed_candidates = allowed_candidates.clone(); + + while let Some(mut branch) = branches_heap.peek_mut() { + // if current is worst than best we break to return + // candidates that correspond to the best rank + let branch_rank = branch.compute_rank(); + if let Some((best_rank, _)) = final_candidates { + if branch_rank > best_rank { + break; + } + } + + let candidates = take(&mut branch.last_result.1); + if candidates.is_empty() { + // we don't have candidates, get next interval. + if !branch.next(&allowed_candidates)? 
{ + PeekMut::pop(branch); + } + } else { + allowed_candidates -= &candidates; + final_candidates = match final_candidates.take() { + // we add current candidates to best candidates + Some((best_rank, mut best_candidates)) => { + best_candidates |= candidates; + branch.next(&allowed_candidates)?; + Some((best_rank, best_candidates)) + } + // we take current candidates as best candidates + None => { + branch.next(&allowed_candidates)?; + Some((branch_rank, candidates)) + } + }; + } + } + + Ok(final_candidates) +} + +fn initialize_linear_buckets( + ctx: &dyn Context, + branches: &FlattenedQueryTree, + allowed_candidates: &RoaringBitmap, +) -> Result> { + fn compute_candidate_rank( + branches: &FlattenedQueryTree, + words_positions: HashMap, + ) -> u64 { + let mut min_rank = u64::max_value(); + for branch in branches { + let branch_len = branch.len(); + let mut branch_rank = Vec::with_capacity(branch_len); + for derivates in branch { + let mut position = None; + for Query { prefix, kind } in derivates { + // find the best position of the current word in the document. + let current_position = match kind { + QueryKind::Exact { word, .. } => { + if *prefix { + word_derivations(word, true, 0, &words_positions) + .flat_map(|positions| positions.iter().next()) + .min() + } else { + words_positions + .get(word) + .and_then(|positions| positions.iter().next()) + } + } + QueryKind::Tolerant { typo, word } => { + word_derivations(word, *prefix, *typo, &words_positions) + .flat_map(|positions| positions.iter().next()) + .min() + } + }; + + match (position, current_position) { + (Some(p), Some(cp)) => position = Some(cmp::min(p, cp)), + (None, Some(cp)) => position = Some(cp), + _ => (), + } + } + + // if a position is found, we add it to the branch score, + // otherwise the branch is considered as unfindable in this document and we break. 
+ if let Some(position) = position { + branch_rank.push(position as u64); + } else { + branch_rank.clear(); + break; + } + } + + if !branch_rank.is_empty() { + branch_rank.sort_unstable(); + // because several words of the same query can't all match at position 0, + // we subtract the word index from the position. + // The subtraction saturates: two query words can resolve to the same best + // position (e.g. a word repeated in the query), and `r - i` would then underflow. + let branch_rank: u64 = + branch_rank.into_iter().enumerate().map(|(i, r)| r.saturating_sub(i as u64)).sum(); + // here we compute the mean over the words of the branch + min_rank = + min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); + } + } + + min_rank + } + + fn word_derivations<'a>( + word: &str, + is_prefix: bool, + max_typo: u8, + words_positions: &'a HashMap, + ) -> impl Iterator { + let dfa = build_dfa(word, max_typo, is_prefix); + words_positions.iter().filter_map(move |(document_word, positions)| { + use levenshtein_automata::Distance; + match dfa.eval(document_word) { + Distance::Exact(_) => Some(positions), + Distance::AtLeast(_) => None, + } + }) + } + + let mut candidates = BTreeMap::new(); + for docid in allowed_candidates { + let words_positions = ctx.docid_words_positions(docid)?; + let rank = compute_candidate_rank(branches, words_positions); + candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid); + } + + Ok(candidates) +} + +// TODO can we keep refs of Query +fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { + use crate::search::criteria::Operation::{And, Or, Phrase}; + + fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree { + match tail.split_first() { + Some((thead, tail)) => { + let tail = and_recurse(thead, tail); + let mut out = Vec::new(); + for array in recurse(head) { + for tail_array in &tail { + let mut array = array.clone(); + array.extend(tail_array.iter().cloned()); + out.push(array); + } + } + out + } + None => recurse(head), + } + } + + fn recurse(op: &Operation) -> FlattenedQueryTree { + match op { + And(ops) => ops.split_first().map_or_else(Vec::new, 
|(h, t)| and_recurse(h, t)), + Or(_, ops) => { + if ops.iter().all(|op| op.query().is_some()) { + vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] + } else { + ops.iter().flat_map(recurse).collect() + } + } + Phrase(words) => { + let queries = words + .iter() + .filter_map(|w| w.as_ref()) + .map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }]) + .collect(); + vec![queries] + } + Operation::Query(query) => vec![vec![vec![query.clone()]]], + } + } + + recurse(query_tree) +} + +#[cfg(test)] +mod tests { + use big_s::S; + + use super::*; + use crate::search::criteria::QueryKind; + + #[test] + fn simple_flatten_query_tree() { + let query_tree = Operation::Or( + false, + vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), + ]), + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), + Operation::Or( + false, + vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact(S("thefish")), + }), + Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact(S("the")), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact(S("fish")), + }), + ]), + ], + ), + ]), + ], + ); + let result = flatten_query_tree(&query_tree); + + insta::assert_debug_snapshot!(result, @r###" + [ + [ + [ + Exact { + word: "manythefish", + }, + ], + ], + [ + [ + Exact { + word: "manythe", + }, + ], + [ + Exact { + word: "fish", + }, + ], + ], + [ + [ + Exact { + word: "many", + }, + ], + [ + Exact { + word: "thefish", + }, + ], + ], + [ + [ + Exact { + word: "many", + }, + ], + [ + Exact { + word: "the", + }, + ], + [ + Exact { + word: "fish", + }, + ], + ], + ] + "###); + } +} diff --git 
a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs new file mode 100644 index 000000000..078a9cd6c --- /dev/null +++ b/milli/src/search/criteria/exactness.rs @@ -0,0 +1,766 @@ +use std::collections::btree_map::Entry; +use std::collections::BTreeMap; +use std::convert::TryFrom; +use std::mem::take; + +use log::debug; +use roaring::{MultiOps, RoaringBitmap}; + +use crate::search::criteria::{ + resolve_phrase, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, + InitialCandidates, +}; +use crate::search::query_tree::{Operation, PrimitiveQueryPart}; +use crate::{absolute_from_relative_position, FieldId, Result}; + +pub struct Exactness<'t> { + ctx: &'t dyn Context<'t>, + query_tree: Option, + state: Option, + initial_candidates: InitialCandidates, + parent: Box, + query: Vec, + cache: Option, +} + +impl<'t> Exactness<'t> { + pub fn new( + ctx: &'t dyn Context<'t>, + parent: Box, + primitive_query: &[PrimitiveQueryPart], + ) -> heed::Result { + let mut query: Vec<_> = Vec::with_capacity(primitive_query.len()); + for part in primitive_query { + query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?); + } + + Ok(Exactness { + ctx, + query_tree: None, + state: None, + initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), + parent, + query, + cache: None, + }) + } +} + +impl<'t> Criterion for Exactness<'t> { + #[logging_timer::time("Exactness::{}")] + fn next(&mut self, params: &mut CriterionParameters) -> Result> { + // remove excluded candidates when next is called, instead of doing it in the loop. 
+ if let Some(state) = self.state.as_mut() { + state.difference_with(params.excluded_candidates); + } + loop { + debug!("Exactness at state {:?}", self.state); + + match self.state.as_mut() { + Some(state) if state.is_empty() => { + // reset state + self.state = None; + self.query_tree = None; + // we don't need to reset the combinations cache since it only depends on + // the primitive query, which does not change + } + Some(state) => { + let (candidates, state) = + resolve_state(self.ctx, take(state), &self.query, &mut self.cache)?; + self.state = state; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: Some(candidates), + filtered_candidates: None, + initial_candidates: Some(self.initial_candidates.take()), + })); + } + None => match self.parent.next(params)? { + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + initial_candidates, + }) => { + let mut candidates = match candidates { + Some(candidates) => candidates, + None => { + resolve_query_tree(self.ctx, &query_tree, params.wdcache)? + - params.excluded_candidates + } + }; + + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), + } + + self.state = Some(State::new(candidates)); + self.query_tree = Some(query_tree); + } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + initial_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + initial_candidates, + })); + } + None => return Ok(None), + }, + } + } + } +} + +#[derive(Debug)] +enum State { + /// Extract the documents that have an attribute that contains exactly the query. 
+ ExactAttribute(RoaringBitmap), + /// Extract the documents that have an attribute that starts with exactly the query. + AttributeStartsWith(RoaringBitmap), + /// Rank the remaining documents by the number of exact words contained. + ExactWords(RoaringBitmap), + Remainings(Vec), +} + +impl State { + fn new(candidates: RoaringBitmap) -> Self { + Self::ExactAttribute(candidates) + } + + fn difference_with(&mut self, lhs: &RoaringBitmap) { + match self { + Self::ExactAttribute(candidates) + | Self::AttributeStartsWith(candidates) + | Self::ExactWords(candidates) => *candidates -= lhs, + Self::Remainings(candidates_array) => { + candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs); + candidates_array.retain(|candidates| !candidates.is_empty()); + } + } + } + + fn is_empty(&self) -> bool { + match self { + Self::ExactAttribute(candidates) + | Self::AttributeStartsWith(candidates) + | Self::ExactWords(candidates) => candidates.is_empty(), + Self::Remainings(candidates_array) => { + candidates_array.iter().all(RoaringBitmap::is_empty) + } + } + } +} + +impl Default for State { + fn default() -> Self { + Self::Remainings(vec![]) + } +} +#[logging_timer::time("Exactness::{}")] +fn resolve_state( + ctx: &dyn Context, + state: State, + query: &[ExactQueryPart], + cache: &mut Option, +) -> Result<(RoaringBitmap, Option)> { + use State::*; + match state { + ExactAttribute(mut allowed_candidates) => { + let mut candidates = RoaringBitmap::new(); + if let Ok(query_len) = u8::try_from(query.len()) { + let attributes_ids = ctx.searchable_fields_ids()?; + for id in attributes_ids { + if let Some(attribute_allowed_docids) = + ctx.field_id_word_count_docids(id, query_len)? 
+ { + let mut attribute_candidates_array = + attribute_start_with_docids(ctx, id, query)?; + attribute_candidates_array.push(attribute_allowed_docids); + + candidates |= MultiOps::intersection(attribute_candidates_array); + } + } + + // only keep allowed candidates + candidates &= &allowed_candidates; + // remove current candidates from allowed candidates + allowed_candidates -= &candidates; + } + + Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) + } + AttributeStartsWith(mut allowed_candidates) => { + let mut candidates = RoaringBitmap::new(); + let attributes_ids = ctx.searchable_fields_ids()?; + for id in attributes_ids { + let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; + candidates |= MultiOps::intersection(attribute_candidates_array); + } + + // only keep allowed candidates + candidates &= &allowed_candidates; + // remove current candidates from allowed candidates + allowed_candidates -= &candidates; + Ok((candidates, Some(ExactWords(allowed_candidates)))) + } + ExactWords(allowed_candidates) => { + // Retrieve the cache if it already exist, otherwise create it. + let owned_cache = if let Some(cache) = cache.take() { + cache + } else { + compute_combinations(ctx, query)? + }; + // The cache contains the sets of documents which contain exactly 1,2,3,.. exact words + // from the query. It cannot be empty. All the candidates in it are disjoint. 
+ + let mut candidates_array = owned_cache.combinations.clone(); + for candidates in candidates_array.iter_mut() { + *candidates &= &allowed_candidates; + } + *cache = Some(owned_cache); + + let best_candidates = candidates_array.pop().unwrap(); + + candidates_array.insert(0, allowed_candidates); + Ok((best_candidates, Some(Remainings(candidates_array)))) + } + // pop remainings candidates until the emptiness + Remainings(mut candidates_array) => { + let candidates = candidates_array.pop().unwrap_or_default(); + if !candidates_array.is_empty() { + Ok((candidates, Some(Remainings(candidates_array)))) + } else { + Ok((candidates, None)) + } + } + } +} + +/// Collects, for the attribute `attribute_id`, one docids bitmap per constrained query +/// position, walking the query parts from the first position of the attribute. +/// +/// The bitmaps are returned un-intersected; callers intersect them to obtain the +/// documents whose attribute starts with the query. A position for which no document +/// matches yields an empty bitmap, for synonyms and phrase words alike, so that the +/// intersection is empty. Unset phrase words (`None`) only advance the position. +fn attribute_start_with_docids( + ctx: &dyn Context, + attribute_id: FieldId, + query: &[ExactQueryPart], +) -> heed::Result> { + let mut attribute_candidates_array = Vec::new(); + // start from attribute first position + let mut pos = absolute_from_relative_position(attribute_id, 0); + for part in query { + use ExactQueryPart::*; + match part { + Synonyms(synonyms) => { + // any of the synonyms may sit at this position: union their docids. + let mut synonyms_candidates = RoaringBitmap::new(); + for word in synonyms { + let wc = ctx.word_position_docids(word, pos)?; + if let Some(word_candidates) = wc { + synonyms_candidates |= word_candidates; + } + } + attribute_candidates_array.push(synonyms_candidates); + pos += 1; + } + Phrase(phrase) => { + for word in phrase { + if let Some(word) = word { + let wc = ctx.word_position_docids(word, pos)?; + // a phrase word found at this position in no document must still + // constrain the result: push an empty bitmap instead of skipping it. + attribute_candidates_array.push(wc.unwrap_or_default()); + } + pos += 1; + } + } + } + } + + Ok(attribute_candidates_array) +} + +#[derive(Debug, Clone)] +pub enum ExactQueryPart { + Phrase(Vec>), + Synonyms(Vec), +} + +impl ExactQueryPart { + fn from_primitive_query_part( + ctx: &dyn Context, + part: &PrimitiveQueryPart, + ) -> heed::Result { + let part = match part { + PrimitiveQueryPart::Word(word, _) => { + match ctx.synonyms(word)? 
{ + Some(synonyms) => { + let mut synonyms: Vec<_> = synonyms + .into_iter() + .filter_map(|mut array| { + // keep 1 word synonyms only. + match array.pop() { + Some(word) if array.is_empty() => Some(word), + _ => None, + } + }) + .collect(); + synonyms.push(word.clone()); + ExactQueryPart::Synonyms(synonyms) + } + None => ExactQueryPart::Synonyms(vec![word.clone()]), + } + } + PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()), + }; + + Ok(part) + } +} + +struct ExactWordsCombinationCache { + // index 0 is only 1 word + combinations: Vec, +} + +fn compute_combinations( + ctx: &dyn Context, + query: &[ExactQueryPart], +) -> Result { + let number_of_part = query.len(); + let mut parts_candidates_array = Vec::with_capacity(number_of_part); + for part in query { + let mut candidates = RoaringBitmap::new(); + use ExactQueryPart::*; + match part { + Synonyms(synonyms) => { + for synonym in synonyms { + if let Some(synonym_candidates) = ctx.word_docids(synonym)? { + candidates |= synonym_candidates; + } + } + } + // compute intersection on pair of words with a proximity of 0. + Phrase(phrase) => { + candidates |= resolve_phrase(ctx, phrase)?; + } + } + parts_candidates_array.push(candidates); + } + let combinations = create_disjoint_combinations(parts_candidates_array); + + Ok(ExactWordsCombinationCache { combinations }) +} + +/// Given a list of bitmaps `b0,b1,...,bn` , compute the list of bitmaps `X0,X1,...,Xn` +/// such that `Xi` contains all the elements that are contained in **at least** `i+1` bitmaps among `b0,b1,...,bn`. +/// +/// The returned vector is guaranteed to be of length `n`. It is equal to `vec![X0, X1, ..., Xn]`. +/// +/// ## Implementation +/// +/// We do so by iteratively building a map containing the union of all the different ways to intersect `J` bitmaps among `b0,b1,...,bn`. 
+/// - The key of the map is the index `i` of the last bitmap in the intersections +/// - The value is the union of all the possible intersections of J bitmaps such that the last bitmap in the intersection is `bi` +/// +/// For example, with the bitmaps `b0,b1,b2,b3`, this map should look like this +/// ```text +/// Map 0: (first iteration, contains all the combinations of 1 bitmap) +/// // What follows are unions of intersections of bitmaps associated with the index of their last component +/// 0: [b0] +/// 1: [b1] +/// 2: [b2] +/// 3: [b3] +/// Map 1: (second iteration, combinations of 2 bitmaps) +/// 1: [b0&b1] +/// 2: [b0&b2 | b1&b2] +/// 3: [b0&b3 | b1&b3 | b2&b3] +/// Map 2: (third iteration, combinations of 3 bitmaps) +/// 2: [b0&b1&b2] +/// 3: [b0&b1&b3 | b0&b2&b3 | b1&b2&b3] +/// Map 3: (fourth iteration, combinations of 4 bitmaps) +/// 3: [b0&b1&b2&b3] +/// ``` +/// +/// These maps are built one by one from the content of the preceding map. +/// For example, to create Map 2, we look at each line of Map 1, such as: +/// ```text +/// 2: [b0&b2 | b1&b2] +/// ``` +/// And then for each i > 2, we compute `(b0&b2 | b1&b2) & bi = b0&b2&bi | b1&b2&bi` +/// and then add it to the new map (Map 2) under the key `i` (if it is not empty): +/// ```text +/// 3: [b0&b2&b3 | b1&b2&b3] +/// 4: [b0&b2&b4 | b1&b2&b4] +/// 5: [b0&b2&b5 | b1&b2&b5] +/// etc. +/// ``` +/// We only keep two maps in memory at any one point. As soon as Map J is built, we flatten Map J-1 into +/// a single bitmap by taking the union of all of its values. This union gives us Xj-1. +/// +/// ## Memory Usage +/// This function is expected to be called on a maximum of 10 bitmaps. The worst case thus happens when +/// 10 identical large bitmaps are given. +/// +/// In the context of Meilisearch, let's imagine that we are given 10 bitmaps containing all +/// the document ids. If the dataset contains 16 million documents, then each bitmap will take +/// around 2MB of memory. 
+/// +/// When creating Map 3, we will have, in memory: +/// 1. The 10 original bitmaps (20MB) +/// 2. X0 : 2MB +/// 3. Map 1, containing 9 bitmaps: 18MB +/// 4. Map 2, containing 8 bitmaps: 16MB +/// 5. X1: 2MB +/// for a total of around 60MB of memory. This roughly represents the maximum memory usage of this function. +/// +/// ## Time complexity +/// Let N be the size of the given list of bitmaps and M the length of each individual bitmap. +/// +/// We need to create N new bitmaps. The most expensive one to create is the second one, where we need to +/// iterate over the N keys of Map 1, and for each of those keys `k_i`, we perform `N-k_i` bitmap unions. +/// Unioning two bitmaps is O(M), and we need to do it O(N^2) times. +/// +/// Therefore the time complexity is O(N^3 * M). +fn create_non_disjoint_combinations(bitmaps: Vec) -> Vec { + let nbr_parts = bitmaps.len(); + if nbr_parts == 1 { + return bitmaps; + } + let mut flattened_levels = vec![]; + let mut last_level: BTreeMap = + bitmaps.clone().into_iter().enumerate().collect(); + + for _ in 2..=nbr_parts { + let mut new_level = BTreeMap::new(); + for (last_part_index, base_combination) in last_level.iter() { + #[allow(clippy::needless_range_loop)] + for new_last_part_index in last_part_index + 1..nbr_parts { + let new_combination = base_combination & &bitmaps[new_last_part_index]; + if !new_combination.is_empty() { + match new_level.entry(new_last_part_index) { + Entry::Occupied(mut b) => { + *b.get_mut() |= new_combination; + } + Entry::Vacant(entry) => { + entry.insert(new_combination); + } + } + } + } + } + // Now flatten the last level to save memory + let flattened_last_level = MultiOps::union(last_level.into_values()); + flattened_levels.push(flattened_last_level); + last_level = new_level; + } + // Flatten the last level + let flattened_last_level = MultiOps::union(last_level.into_values()); + flattened_levels.push(flattened_last_level); + flattened_levels +} + +/// Given a list of bitmaps 
`b0,b1,...,bn` , compute the list of bitmaps `X0,X1,...,Xn` +/// such that `Xi` contains all the elements that are contained in **exactly** `i+1` bitmaps among `b0,b1,...,bn`. +/// +/// The returned vector is guaranteed to be of length `n`. It is equal to `vec![X0, X1, ..., Xn]`. +fn create_disjoint_combinations(parts_candidates_array: Vec) -> Vec { + let non_disjoint_combinations = create_non_disjoint_combinations(parts_candidates_array); + let mut disjoint_combinations = vec![]; + let mut combinations = non_disjoint_combinations.into_iter().peekable(); + while let Some(mut combination) = combinations.next() { + if let Some(forbidden) = combinations.peek() { + combination -= forbidden; + } + disjoint_combinations.push(combination) + } + + disjoint_combinations +} + +#[cfg(test)] +mod tests { + use big_s::S; + use roaring::RoaringBitmap; + + use crate::index::tests::TempIndex; + use crate::search::criteria::exactness::{ + create_disjoint_combinations, create_non_disjoint_combinations, + }; + use crate::snapshot_tests::display_bitmap; + use crate::{Criterion, SearchResult}; + + #[test] + fn test_exact_words_subcriterion() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key(S("id")); + settings.set_criteria(vec![Criterion::Exactness]); + }) + .unwrap(); + + index + .add_documents(documents!([ + // not relevant + { "id": "0", "text": "cat good dog bad" }, + // 1 exact word + { "id": "1", "text": "they said: cats arebetter thandogs" }, + // 3 exact words + { "id": "2", "text": "they said: cats arebetter than dogs" }, + // 5 exact words + { "id": "3", "text": "they said: cats are better than dogs" }, + // attribute starts with the exact words + { "id": "4", "text": "cats are better than dogs except on Saturday" }, + // attribute equal to the exact words + { "id": "5", "text": "cats are better than dogs" }, + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let SearchResult { matching_words: _, candidates: 
_, documents_ids } = + index.search(&rtxn).query("cats are better than dogs").execute().unwrap(); + + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 4, 3, 2, 1]"); + } + + fn print_combinations(rbs: &[RoaringBitmap]) -> String { + let mut s = String::new(); + for rb in rbs { + s.push_str(&format!("{}\n", &display_bitmap(rb))); + } + s + } + + // In these unit tests, the test bitmaps always contain all the multiple of a certain number. + // This makes it easy to check the validity of the results of `create_disjoint_combinations` by + // counting the number of dividers of elements in the returned bitmaps. + fn assert_correct_combinations(combinations: &[RoaringBitmap], dividers: &[u32]) { + for (i, set) in combinations.iter().enumerate() { + let expected_nbr_dividers = i + 1; + for el in set { + let nbr_dividers = dividers.iter().map(|d| usize::from(el % d == 0)).sum::(); + assert_eq!( + nbr_dividers, expected_nbr_dividers, + "{el} is divisible by {nbr_dividers} elements, not {expected_nbr_dividers}." 
+ ); + } + } + } + + #[test] + fn compute_combinations_1() { + let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); + + let parts_candidates = vec![b0]; + + let combinations = create_disjoint_combinations(parts_candidates); + insta::assert_snapshot!(print_combinations(&combinations), @r###" + [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, ] + "###); + + assert_correct_combinations(&combinations, &[2]); + } + + #[test] + fn compute_combinations_2() { + let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); + let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); + + let parts_candidates = vec![b0, b1]; + + let combinations = create_disjoint_combinations(parts_candidates); + insta::assert_snapshot!(print_combinations(&combinations), @r###" + [2, 3, 4, 8, 9, 10, 14, 15, 16, 20, 21, 22, 26, 27, 28, 32, 33, 34, 38, 39, 40, 44, 45, 46, 50, 51, 52, 56, 57, 58, 62, 63, 64, 68, 69, 70, 74, 75, 76, 80, 81, 82, 86, 87, 88, 92, 93, 94, 98, 99, 100, 104, 105, 106, 110, 111, 112, 116, 117, 118, 122, 123, 124, 128, 129, 130, 134, 135, 136, 140, 141, 142, 146, 147, 148, ] + [0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108, 114, 120, 126, 132, 138, 144, ] + "###); + } + + #[test] + fn compute_combinations_4() { + let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); + let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); + let b2: RoaringBitmap = (0..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect(); + let b3: RoaringBitmap = (0..).into_iter().map(|x| 7 * x).take_while(|x| *x < 
150).collect(); + + let parts_candidates = vec![b0, b1, b2, b3]; + + let combinations = create_disjoint_combinations(parts_candidates); + + insta::assert_snapshot!(print_combinations(&combinations), @r###" + [2, 3, 4, 5, 7, 8, 9, 16, 22, 25, 26, 27, 32, 33, 34, 38, 39, 44, 46, 49, 51, 52, 55, 57, 58, 62, 64, 65, 68, 69, 74, 76, 77, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 95, 99, 104, 106, 111, 115, 116, 117, 118, 119, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 145, 146, 148, ] + [6, 10, 12, 14, 15, 18, 20, 21, 24, 28, 35, 36, 40, 45, 48, 50, 54, 56, 63, 66, 72, 75, 78, 80, 96, 98, 100, 102, 108, 110, 112, 114, 130, 132, 135, 138, 144, 147, ] + [30, 42, 60, 70, 84, 90, 105, 120, 126, 140, ] + [0, ] + "###); + + // But we also check it programmatically + assert_correct_combinations(&combinations, &[2, 3, 5, 7]); + } + #[test] + fn compute_combinations_4_with_empty_results_at_end() { + let b0: RoaringBitmap = (1..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); + let b1: RoaringBitmap = (1..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); + let b2: RoaringBitmap = (1..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect(); + let b3: RoaringBitmap = (1..).into_iter().map(|x| 7 * x).take_while(|x| *x < 150).collect(); + + let parts_candidates = vec![b0, b1, b2, b3]; + + let combinations = create_disjoint_combinations(parts_candidates); + + insta::assert_snapshot!(print_combinations(&combinations), @r###" + [2, 3, 4, 5, 7, 8, 9, 16, 22, 25, 26, 27, 32, 33, 34, 38, 39, 44, 46, 49, 51, 52, 55, 57, 58, 62, 64, 65, 68, 69, 74, 76, 77, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 95, 99, 104, 106, 111, 115, 116, 117, 118, 119, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 145, 146, 148, ] + [6, 10, 12, 14, 15, 18, 20, 21, 24, 28, 35, 36, 40, 45, 48, 50, 54, 56, 63, 66, 72, 75, 78, 80, 96, 98, 100, 102, 108, 110, 112, 114, 130, 132, 135, 138, 144, 147, ] + [30, 42, 60, 70, 84, 90, 105, 120, 126, 140, ] + [] + "###); + + 
// But we also check it programmatically + assert_correct_combinations(&combinations, &[2, 3, 5, 7]); + } + + #[test] + fn compute_combinations_4_with_some_equal_bitmaps() { + let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); + let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); + let b2: RoaringBitmap = (0..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect(); + // b3 == b1 + let b3: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); + + let parts_candidates = vec![b0, b1, b2, b3]; + + let combinations = create_disjoint_combinations(parts_candidates); + + insta::assert_snapshot!(print_combinations(&combinations), @r###" + [2, 4, 5, 8, 14, 16, 22, 25, 26, 28, 32, 34, 35, 38, 44, 46, 52, 55, 56, 58, 62, 64, 65, 68, 74, 76, 82, 85, 86, 88, 92, 94, 95, 98, 104, 106, 112, 115, 116, 118, 122, 124, 125, 128, 134, 136, 142, 145, 146, 148, ] + [3, 9, 10, 20, 21, 27, 33, 39, 40, 50, 51, 57, 63, 69, 70, 80, 81, 87, 93, 99, 100, 110, 111, 117, 123, 129, 130, 140, 141, 147, ] + [6, 12, 15, 18, 24, 36, 42, 45, 48, 54, 66, 72, 75, 78, 84, 96, 102, 105, 108, 114, 126, 132, 135, 138, 144, ] + [0, 30, 60, 90, 120, ] + "###); + + // But we also check it programmatically + assert_correct_combinations(&combinations, &[2, 3, 5, 3]); + } + + #[test] + fn compute_combinations_10() { + let dividers = [2, 3, 5, 7, 11, 6, 15, 35, 18, 14]; + let parts_candidates: Vec<RoaringBitmap> = dividers + .iter() + .map(|&divider| { + (0..).into_iter().map(|x| divider * x).take_while(|x| *x <= 210).collect() + }) + .collect(); + + let combinations = create_disjoint_combinations(parts_candidates); + insta::assert_snapshot!(print_combinations(&combinations), @r###" + [2, 3, 4, 5, 7, 8, 9, 11, 16, 25, 26, 27, 32, 34, 38, 39, 46, 49, 51, 52, 57, 58, 62, 64, 65, 68, 69, 74, 76, 81, 82, 85, 86, 87, 91, 92, 93, 94, 95, 104, 106, 111, 115, 116, 117, 118, 119, 121, 122, 123, 124, 125, 128, 129, 133, 134, 
136, 141, 142, 143, 145, 146, 148, 152, 153, 155, 158, 159, 161, 164, 166, 171, 172, 177, 178, 183, 184, 185, 187, 188, 194, 201, 202, 203, 205, 206, 207, 208, 209, ] + [10, 20, 21, 22, 33, 40, 44, 50, 55, 63, 77, 80, 88, 99, 100, 130, 147, 160, 170, 176, 189, 190, 200, ] + [6, 12, 14, 15, 24, 28, 35, 45, 48, 56, 75, 78, 96, 98, 102, 110, 112, 114, 135, 138, 156, 174, 175, 182, 186, 192, 195, 196, 204, ] + [18, 36, 54, 66, 72, 108, 132, 144, 154, 162, 165, ] + [30, 42, 60, 70, 84, 105, 120, 140, 150, 168, 198, ] + [90, 126, 180, ] + [] + [210, ] + [] + [0, ] + "###); + + assert_correct_combinations(&combinations, &dividers); + } + + #[test] + fn compute_combinations_30() { + let dividers: [u32; 30] = [ + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, + 5, + ]; + let parts_candidates: Vec<RoaringBitmap> = dividers + .iter() + .map(|divider| { + (0..).into_iter().map(|x| divider * x).take_while(|x| *x <= 100).collect() + }) + .collect(); + + let combinations = create_non_disjoint_combinations(parts_candidates.clone()); + insta::assert_snapshot!(print_combinations(&combinations), @r###" + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] + [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] + [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 
63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] + [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] + [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] + [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] + [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] + [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ] + [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ] + [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ] + [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ] + [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 
24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ] + [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ] + [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ] + [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ] + [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ] + [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ] + [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ] + [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ] + [0, 60, ] + [0, 60, ] + [0, 60, ] + [0, 60, ] + [0, 60, ] + [0, 60, ] + "###); + + let combinations = create_disjoint_combinations(parts_candidates); + insta::assert_snapshot!(print_combinations(&combinations), @r###" + [] + [] + [] + [] + [] + [1, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 49, 53, 59, 61, 67, 71, 73, 77, 79, 83, 89, 91, 97, ] + [] + [] + [] + [] + [] + [2, 3, 5, 9, 14, 21, 22, 25, 26, 27, 33, 34, 35, 38, 39, 46, 51, 55, 57, 58, 62, 63, 65, 69, 74, 81, 82, 85, 86, 87, 93, 94, 95, 98, 99, ] + [] + [] + [] + [] + [] + [4, 6, 8, 10, 15, 16, 18, 28, 32, 42, 44, 45, 50, 52, 54, 56, 64, 66, 68, 70, 75, 76, 78, 88, 92, ] + [] + [] + [] + [] + [] + [12, 20, 24, 30, 36, 40, 48, 72, 80, 84, 90, 96, 100, ] + [] + [] + [] + [] + [] + [0, 60, ] + "###); + + assert_correct_combinations(&combinations, &dividers); + } +} diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs new file mode 100644 index 000000000..9f7a147b8 --- /dev/null +++ b/milli/src/search/criteria/final.rs @@ -0,0 +1,77 @@ +use log::debug; +use roaring::RoaringBitmap; + +use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; +use crate::search::criteria::InitialCandidates; +use crate::search::query_tree::Operation; +use crate::search::WordDerivationsCache; +use 
crate::Result; + +/// The result of a call to the fetcher. +#[derive(Debug, Clone, PartialEq)] +pub struct FinalResult { + /// The query tree corresponding to the current bucket of the last criterion. + pub query_tree: Option, + /// The candidates of the current bucket of the last criterion. + pub candidates: RoaringBitmap, + /// Candidates that comes from the current bucket of the initial criterion. + pub initial_candidates: InitialCandidates, +} + +pub struct Final<'t> { + ctx: &'t dyn Context<'t>, + parent: Box, + wdcache: WordDerivationsCache, + returned_candidates: RoaringBitmap, +} + +impl<'t> Final<'t> { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Final<'t> { + Final { + ctx, + parent, + wdcache: WordDerivationsCache::new(), + returned_candidates: RoaringBitmap::new(), + } + } + + #[logging_timer::time("Final::{}")] + pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> Result> { + debug!("Final iteration"); + let excluded_candidates = &self.returned_candidates | excluded_candidates; + let mut criterion_parameters = CriterionParameters { + wdcache: &mut self.wdcache, + // returned_candidates is merged with excluded_candidates to avoid duplicas + excluded_candidates: &excluded_candidates, + }; + + match self.parent.next(&mut criterion_parameters)? { + Some(CriterionResult { + query_tree, + candidates, + filtered_candidates, + initial_candidates, + }) => { + let mut candidates = match (candidates, query_tree.as_ref()) { + (Some(candidates), _) => candidates, + (None, Some(qt)) => { + resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates + } + (None, None) => self.ctx.documents_ids()? 
- excluded_candidates, + }; + + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + + let initial_candidates = initial_candidates + .unwrap_or_else(|| InitialCandidates::Estimated(candidates.clone())); + + self.returned_candidates |= &candidates; + + Ok(Some(FinalResult { query_tree, candidates, initial_candidates })) + } + None => Ok(None), + } + } +} diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs new file mode 100644 index 000000000..0b33e6b2f --- /dev/null +++ b/milli/src/search/criteria/geo.rs @@ -0,0 +1,154 @@ +use std::iter; + +use roaring::RoaringBitmap; +use rstar::RTree; + +use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates}; +use crate::{lat_lng_to_xyz, GeoPoint, Index, Result}; + +pub struct Geo<'t> { + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + ascending: bool, + parent: Box, + candidates: Box>, + allowed_candidates: RoaringBitmap, + initial_candidates: InitialCandidates, + rtree: Option>, + point: [f64; 2], +} + +impl<'t> Geo<'t> { + pub fn asc( + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + parent: Box, + point: [f64; 2], + ) -> Result { + Self::new(index, rtxn, parent, point, true) + } + + pub fn desc( + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + parent: Box, + point: [f64; 2], + ) -> Result { + Self::new(index, rtxn, parent, point, false) + } + + fn new( + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + parent: Box, + point: [f64; 2], + ascending: bool, + ) -> Result { + let candidates = Box::new(iter::empty()); + let allowed_candidates = index.geo_faceted_documents_ids(rtxn)?; + let initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new()); + let rtree = index.geo_rtree(rtxn)?; + + Ok(Self { + index, + rtxn, + ascending, + parent, + candidates, + allowed_candidates, + initial_candidates, + rtree, + point, + }) + } +} + +impl Criterion for 
Geo<'_> { + fn next(&mut self, params: &mut CriterionParameters) -> Result> { + let rtree = self.rtree.as_ref(); + + loop { + match self.candidates.next() { + Some(mut candidates) => { + candidates -= params.excluded_candidates; + self.allowed_candidates -= &candidates; + return Ok(Some(CriterionResult { + query_tree: None, + candidates: Some(candidates), + filtered_candidates: None, + initial_candidates: Some(self.initial_candidates.clone()), + })); + } + None => match self.parent.next(params)? { + Some(CriterionResult { + query_tree, + candidates, + filtered_candidates, + initial_candidates, + }) => { + let mut candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => { + let context = CriteriaBuilder::new(self.rtxn, self.index)?; + resolve_query_tree(&context, qt, params.wdcache)? + } + (None, None) => self.index.documents_ids(self.rtxn)?, + }; + + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), + } + + if candidates.is_empty() { + continue; + } + self.allowed_candidates = &candidates - params.excluded_candidates; + self.candidates = match rtree { + Some(rtree) => geo_point( + rtree, + self.allowed_candidates.clone(), + self.point, + self.ascending, + ), + None => Box::new(std::iter::empty()), + }; + } + None => return Ok(None), + }, + } + } + } +} + +fn geo_point( + rtree: &RTree, + mut candidates: RoaringBitmap, + point: [f64; 2], + ascending: bool, +) -> Box> { + let point = lat_lng_to_xyz(&point); + + let mut results = Vec::new(); + for point in rtree.nearest_neighbor_iter(&point) { + if candidates.remove(point.data.0) { + results.push(std::iter::once(point.data.0).collect()); + if candidates.is_empty() { + break; + } + } + } + + if ascending { + Box::new(results.into_iter()) + } 
else { + Box::new(results.into_iter().rev()) + } +} diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs new file mode 100644 index 000000000..0826a9f68 --- /dev/null +++ b/milli/src/search/criteria/initial.rs @@ -0,0 +1,82 @@ +use roaring::RoaringBitmap; + +use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::search::criteria::{resolve_query_tree, Context, InitialCandidates}; +use crate::search::query_tree::Operation; +use crate::search::Distinct; +use crate::Result; +/// Initial is a mandatory criterion, it is always the first +/// and is meant to initalize the CriterionResult used by the other criteria. +/// It behave like an [Once Iterator](https://doc.rust-lang.org/std/iter/struct.Once.html) and will return Some(CriterionResult) only one time. +pub struct Initial<'t, D> { + ctx: &'t dyn Context<'t>, + answer: Option, + exhaustive_number_hits: bool, + distinct: Option, +} + +impl<'t, D> Initial<'t, D> { + pub fn new( + ctx: &'t dyn Context<'t>, + query_tree: Option, + filtered_candidates: Option, + exhaustive_number_hits: bool, + distinct: Option, + ) -> Initial { + let answer = CriterionResult { + query_tree, + candidates: None, + filtered_candidates, + initial_candidates: None, + }; + Initial { ctx, answer: Some(answer), exhaustive_number_hits, distinct } + } +} + +impl Criterion for Initial<'_, D> { + #[logging_timer::time("Initial::{}")] + fn next(&mut self, params: &mut CriterionParameters) -> Result> { + self.answer + .take() + .map(|mut answer| { + if self.exhaustive_number_hits { + // resolve the whole query tree to retrieve an exhaustive list of documents matching the query. + let candidates = answer + .query_tree + .as_ref() + .map(|query_tree| resolve_query_tree(self.ctx, query_tree, params.wdcache)) + .transpose()?; + + // then intersect the candidates with the potential filtered candidates. 
+ let mut candidates = match (candidates, answer.filtered_candidates.take()) { + (Some(candidates), Some(filtered)) => candidates & filtered, + (Some(candidates), None) => candidates, + (None, Some(filtered)) => filtered, + (None, None) => self.ctx.documents_ids()?, + }; + + // then remove the potential soft deleted documents. + candidates -= params.excluded_candidates; + + // because the initial_candidates should be an exhaustive count of the matching documents, + // we precompute the distinct attributes. + let initial_candidates = match &mut self.distinct { + Some(distinct) => { + let mut initial_candidates = RoaringBitmap::new(); + for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) { + initial_candidates.insert(c?); + } + initial_candidates + } + None => candidates.clone(), + }; + + answer.candidates = Some(candidates); + answer.initial_candidates = + Some(InitialCandidates::Exhaustive(initial_candidates)); + } + Ok(answer) + }) + .transpose() + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs new file mode 100644 index 000000000..0a5bfd664 --- /dev/null +++ b/milli/src/search/criteria/mod.rs @@ -0,0 +1,1051 @@ +use std::borrow::Cow; +use std::collections::HashMap; +use std::mem::take; +use std::ops::{BitOr, BitOrAssign}; + +use roaring::RoaringBitmap; + +use self::asc_desc::AscDesc; +use self::attribute::Attribute; +use self::exactness::Exactness; +use self::initial::Initial; +use self::proximity::Proximity; +use self::r#final::Final; +use self::typo::Typo; +use self::words::Words; +use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; +use super::CriterionImplementationStrategy; +use crate::search::criteria::geo::Geo; +use crate::search::{word_derivations, Distinct, WordDerivationsCache}; +use crate::update::{MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB}; +use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; + +mod asc_desc; +mod 
attribute; +mod exactness; +pub mod r#final; +mod geo; +mod initial; +mod proximity; +mod typo; +mod words; + +pub trait Criterion { + fn next(&mut self, params: &mut CriterionParameters) -> Result>; +} + +/// The result of a call to the parent criterion. +#[derive(Debug, Clone, PartialEq)] +pub struct CriterionResult { + /// The query tree that must be used by the children criterion to fetch candidates. + query_tree: Option, + /// The candidates that this criterion is allowed to return subsets of, + /// if None, it is up to the child to compute the candidates itself. + candidates: Option, + /// The candidates, coming from facet filters, that this criterion is allowed to return subsets of. + filtered_candidates: Option, + /// Candidates that comes from the current bucket of the initial criterion. + initial_candidates: Option, +} + +#[derive(Debug, PartialEq)] +pub struct CriterionParameters<'a> { + wdcache: &'a mut WordDerivationsCache, + excluded_candidates: &'a RoaringBitmap, +} + +/// Either a set of candidates that defines the candidates +/// that are allowed to be returned, +/// or the candidates that must never be returned. +#[derive(Debug)] +enum Candidates { + Allowed(RoaringBitmap), + Forbidden(RoaringBitmap), +} + +impl Default for Candidates { + fn default() -> Self { + Self::Forbidden(RoaringBitmap::new()) + } +} + +/// Either a set of candidates that defines the estimated set of candidates +/// that could be returned, +/// or the Exhaustive set of candidates that will be returned if all possible results are fetched. +#[derive(Debug, Clone, PartialEq)] +pub enum InitialCandidates { + Estimated(RoaringBitmap), + Exhaustive(RoaringBitmap), +} + +impl InitialCandidates { + fn take(&mut self) -> Self { + match self { + Self::Estimated(c) => Self::Estimated(take(c)), + Self::Exhaustive(c) => Self::Exhaustive(take(c)), + } + } + + /// modify the containing roaring bitmap inplace if the set isn't already Exhaustive. 
+ pub fn map_inplace(&mut self, f: F) + where + F: FnOnce(RoaringBitmap) -> RoaringBitmap, + { + if let Self::Estimated(c) = self { + *c = f(take(c)) + } + } + + pub fn into_inner(self) -> RoaringBitmap { + match self { + Self::Estimated(c) => c, + Self::Exhaustive(c) => c, + } + } +} + +impl BitOrAssign for InitialCandidates { + /// Make an union between the containing roaring bitmaps if the set isn't already Exhaustive. + /// In the case of rhs is Exhaustive and not self, then rhs replaces self. + fn bitor_assign(&mut self, rhs: Self) { + if let Self::Estimated(c) = self { + *self = match rhs { + Self::Estimated(rhs) => Self::Estimated(rhs | &*c), + Self::Exhaustive(rhs) => Self::Exhaustive(rhs), + } + } + } +} + +impl BitOr for InitialCandidates { + type Output = Self; + + /// Make an union between the containing roaring bitmaps if the set isn't already Exhaustive. + /// In the case of rhs is Exhaustive and not self, then rhs replaces self. + fn bitor(self, rhs: Self) -> Self::Output { + if let Self::Estimated(c) = self { + match rhs { + Self::Estimated(rhs) => Self::Estimated(rhs | c), + Self::Exhaustive(rhs) => Self::Exhaustive(rhs), + } + } else { + self.clone() + } + } +} + +pub trait Context<'c> { + fn documents_ids(&self) -> heed::Result; + fn word_docids(&self, word: &str) -> heed::Result>; + fn exact_word_docids(&self, word: &str) -> heed::Result>; + fn word_prefix_docids(&self, word: &str) -> heed::Result>; + fn exact_word_prefix_docids(&self, word: &str) -> heed::Result>; + + fn word_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result>; + fn word_prefix_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result>; + fn prefix_word_pair_proximity_docids( + &self, + prefix: &str, + right: &str, + proximity: u8, + ) -> heed::Result>; + fn words_fst<'t>(&self) -> &'t fst::Set>; + fn in_prefix_cache(&self, word: &str) -> bool; + fn docid_words_positions( + &self, + docid: 
DocumentId, + ) -> heed::Result>; + #[allow(clippy::type_complexity)] + fn word_position_iterator( + &self, + word: &str, + in_prefix_cache: bool, + ) -> heed::Result> + 'c>>; + fn synonyms(&self, word: &str) -> heed::Result>>>; + fn searchable_fields_ids(&self) -> Result>; + fn field_id_word_count_docids( + &self, + field_id: FieldId, + word_count: u8, + ) -> heed::Result>; + fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result>; +} + +pub struct CriteriaBuilder<'t> { + rtxn: &'t heed::RoTxn<'t>, + index: &'t Index, + words_fst: fst::Set>, + words_prefixes_fst: fst::Set>, +} + +/// Return the docids for the following word pairs and proximities using [`Context::word_pair_proximity_docids`]. +/// * `left, right, prox` (leftward proximity) +/// * `right, left, prox-1` (rightward proximity) +/// +/// ## Example +/// For a document with the text `the good fox eats the apple`, we have: +/// * `rightward_proximity(the, eats) = 3` +/// * `leftward_proximity(eats, the) = 1` +/// +/// So both the expressions `word_pair_overall_proximity_docids(ctx, the, eats, 3)` +/// and `word_pair_overall_proximity_docids(ctx, the, eats, 2)` would return a bitmap containing +/// the id of this document. +fn word_pair_overall_proximity_docids( + ctx: &dyn Context, + left: &str, + right: &str, + prox: u8, +) -> heed::Result> { + let rightward = ctx.word_pair_proximity_docids(left, right, prox)?; + let leftward = + if prox > 1 { ctx.word_pair_proximity_docids(right, left, prox - 1)? } else { None }; + if let Some(mut all) = rightward { + if let Some(leftward) = leftward { + all |= leftward; + } + Ok(Some(all)) + } else { + Ok(leftward) + } +} + +/// This function works identically to [`word_pair_overall_proximity_docids`] except that the +/// right word is replaced by a prefix string. +/// +/// It will return None if no documents were found or if the prefix does not exist in the +/// `word_prefix_pair_proximity_docids` database. 
+fn word_prefix_pair_overall_proximity_docids( + ctx: &dyn Context, + left: &str, + prefix: &str, + proximity: u8, +) -> heed::Result> { + // We retrieve the docids for the original and swapped word pairs: + // A: word1 prefix2 proximity + // B: prefix2 word1 proximity-1 + let rightward = ctx.word_prefix_pair_proximity_docids(left, prefix, proximity)?; + + let leftward = if proximity > 1 { + ctx.prefix_word_pair_proximity_docids(prefix, left, proximity - 1)? + } else { + None + }; + if let Some(mut all) = rightward { + if let Some(leftward) = leftward { + all |= leftward; + } + Ok(Some(all)) + } else { + Ok(leftward) + } +} + +impl<'c> Context<'c> for CriteriaBuilder<'c> { + fn documents_ids(&self) -> heed::Result { + self.index.documents_ids(self.rtxn) + } + + fn word_docids(&self, word: &str) -> heed::Result> { + self.index.word_docids.get(self.rtxn, word) + } + + fn exact_word_docids(&self, word: &str) -> heed::Result> { + self.index.exact_word_docids.get(self.rtxn, word) + } + + fn word_prefix_docids(&self, word: &str) -> heed::Result> { + self.index.word_prefix_docids.get(self.rtxn, word) + } + + fn exact_word_prefix_docids(&self, word: &str) -> heed::Result> { + self.index.exact_word_prefix_docids.get(self.rtxn, word) + } + + fn word_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result> { + self.index.word_pair_proximity_docids.get(self.rtxn, &(proximity, left, right)) + } + + fn word_prefix_pair_proximity_docids( + &self, + left: &str, + prefix: &str, + proximity: u8, + ) -> heed::Result> { + self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &(proximity, left, prefix)) + } + fn prefix_word_pair_proximity_docids( + &self, + prefix: &str, + right: &str, + proximity: u8, + ) -> heed::Result> { + self.index.prefix_word_pair_proximity_docids.get(self.rtxn, &(proximity, prefix, right)) + } + + fn words_fst<'t>(&self) -> &'t fst::Set> { + &self.words_fst + } + + fn in_prefix_cache(&self, word: &str) -> bool { + 
self.words_prefixes_fst.contains(word) + } + + fn docid_words_positions( + &self, + docid: DocumentId, + ) -> heed::Result> { + let mut words_positions = HashMap::new(); + for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? { + let ((_, word), positions) = result?; + words_positions.insert(word.to_string(), positions); + } + Ok(words_positions) + } + + fn word_position_iterator( + &self, + word: &str, + in_prefix_cache: bool, + ) -> heed::Result> + 'c>> + { + let range = { + let left = u32::min_value(); + let right = u32::max_value(); + let left = (word, left); + let right = (word, right); + left..=right + }; + let db = match in_prefix_cache { + true => self.index.word_prefix_position_docids, + false => self.index.word_position_docids, + }; + + Ok(Box::new(db.range(self.rtxn, &range)?)) + } + + fn synonyms(&self, word: &str) -> heed::Result>>> { + self.index.words_synonyms(self.rtxn, &[word]) + } + + fn searchable_fields_ids(&self) -> Result> { + match self.index.searchable_fields_ids(self.rtxn)? 
{ + Some(searchable_fields_ids) => Ok(searchable_fields_ids), + None => Ok(self.index.fields_ids_map(self.rtxn)?.ids().collect()), + } + } + + fn field_id_word_count_docids( + &self, + field_id: FieldId, + word_count: u8, + ) -> heed::Result> { + let key = (field_id, word_count); + self.index.field_id_word_count_docids.get(self.rtxn, &key) + } + + fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result> { + let key = (word, pos); + self.index.word_position_docids.get(self.rtxn, &key) + } +} + +impl<'t> CriteriaBuilder<'t> { + pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> Result { + let words_fst = index.words_fst(rtxn)?; + let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; + Ok(Self { rtxn, index, words_fst, words_prefixes_fst }) + } + + #[allow(clippy::too_many_arguments)] + pub fn build( + &'t self, + query_tree: Option, + primitive_query: Option>, + filtered_candidates: Option, + sort_criteria: Option>, + exhaustive_number_hits: bool, + distinct: Option, + implementation_strategy: CriterionImplementationStrategy, + ) -> Result> { + use crate::criterion::Criterion as Name; + + let primitive_query = primitive_query.unwrap_or_default(); + + let mut criterion = Box::new(Initial::new( + self, + query_tree, + filtered_candidates, + exhaustive_number_hits, + distinct, + )) as Box; + for name in self.index.criteria(self.rtxn)? 
{ + criterion = match name { + Name::Words => Box::new(Words::new(self, criterion)), + Name::Typo => Box::new(Typo::new(self, criterion)), + Name::Sort => match sort_criteria { + Some(ref sort_criteria) => { + for asc_desc in sort_criteria { + criterion = match asc_desc { + AscDescName::Asc(Member::Field(field)) => Box::new(AscDesc::asc( + self.index, + self.rtxn, + criterion, + field.to_string(), + implementation_strategy, + )?), + AscDescName::Desc(Member::Field(field)) => Box::new(AscDesc::desc( + self.index, + self.rtxn, + criterion, + field.to_string(), + implementation_strategy, + )?), + AscDescName::Asc(Member::Geo(point)) => { + Box::new(Geo::asc(self.index, self.rtxn, criterion, *point)?) + } + AscDescName::Desc(Member::Geo(point)) => { + Box::new(Geo::desc(self.index, self.rtxn, criterion, *point)?) + } + }; + } + criterion + } + None => criterion, + }, + Name::Proximity => { + Box::new(Proximity::new(self, criterion, implementation_strategy)) + } + Name::Attribute => { + Box::new(Attribute::new(self, criterion, implementation_strategy)) + } + Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), + Name::Asc(field) => Box::new(AscDesc::asc( + self.index, + self.rtxn, + criterion, + field, + implementation_strategy, + )?), + Name::Desc(field) => Box::new(AscDesc::desc( + self.index, + self.rtxn, + criterion, + field, + implementation_strategy, + )?), + }; + } + + Ok(Final::new(self, criterion)) + } +} + +pub fn resolve_query_tree( + ctx: &dyn Context, + query_tree: &Operation, + wdcache: &mut WordDerivationsCache, +) -> Result { + fn resolve_operation( + ctx: &dyn Context, + query_tree: &Operation, + wdcache: &mut WordDerivationsCache, + ) -> Result { + use Operation::{And, Or, Phrase, Query}; + + match query_tree { + And(ops) => { + let mut ops = ops + .iter() + .map(|op| resolve_operation(ctx, op, wdcache)) + .collect::>>()?; + + ops.sort_unstable_by_key(|cds| cds.len()); + + let mut candidates = RoaringBitmap::new(); + let mut 
first_loop = true; + for docids in ops { + if first_loop { + candidates = docids; + first_loop = false; + } else { + candidates &= &docids; + } + } + Ok(candidates) + } + Phrase(words) => resolve_phrase(ctx, words), + Or(_, ops) => { + let mut candidates = RoaringBitmap::new(); + for op in ops { + let docids = resolve_operation(ctx, op, wdcache)?; + candidates |= docids; + } + Ok(candidates) + } + Query(q) => Ok(query_docids(ctx, q, wdcache)?), + } + } + + resolve_operation(ctx, query_tree, wdcache) +} + +pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result { + let mut candidates = RoaringBitmap::new(); + let mut first_iter = true; + let winsize = phrase.len().min(3); + + if phrase.is_empty() { + return Ok(candidates); + } + + for win in phrase.windows(winsize) { + // Get all the documents with the matching distance for each word pairs. + let mut bitmaps = Vec::with_capacity(winsize.pow(2)); + for (offset, s1) in win + .iter() + .enumerate() + .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) + { + for (dist, s2) in win + .iter() + .skip(offset + 1) + .enumerate() + .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) + { + if dist == 0 { + match ctx.word_pair_proximity_docids(s1, s2, 1)? { + Some(m) => bitmaps.push(m), + // If there are no document for this pair, there will be no + // results for the phrase query. + None => return Ok(RoaringBitmap::new()), + } + } else { + let mut bitmap = RoaringBitmap::new(); + for dist in 0..=dist { + if let Some(m) = ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { + bitmap |= m + } + } + if bitmap.is_empty() { + return Ok(bitmap); + } else { + bitmaps.push(bitmap); + } + } + } + } + + // We sort the bitmaps so that we perform the small intersections first, which is faster. 
+ bitmaps.sort_unstable_by_key(|a| a.len()); + + for bitmap in bitmaps { + if first_iter { + candidates = bitmap; + first_iter = false; + } else { + candidates &= bitmap; + } + // There will be no match, return early + if candidates.is_empty() { + break; + } + } + } + Ok(candidates) +} + +fn all_word_pair_overall_proximity_docids, U: AsRef>( + ctx: &dyn Context, + left_words: &[(T, u8)], + right_words: &[(U, u8)], + proximity: u8, +) -> Result { + let mut docids = RoaringBitmap::new(); + for (left, _l_typo) in left_words { + for (right, _r_typo) in right_words { + let current_docids = + word_pair_overall_proximity_docids(ctx, left.as_ref(), right.as_ref(), proximity)? + .unwrap_or_default(); + docids |= current_docids; + } + } + Ok(docids) +} + +fn query_docids( + ctx: &dyn Context, + query: &Query, + wdcache: &mut WordDerivationsCache, +) -> Result { + match &query.kind { + QueryKind::Exact { word, original_typo } => { + if query.prefix && ctx.in_prefix_cache(word) { + let mut docids = ctx.word_prefix_docids(word)?.unwrap_or_default(); + // only add the exact docids if the word hasn't been derived + if *original_typo == 0 { + docids |= ctx.exact_word_prefix_docids(word)?.unwrap_or_default(); + } + Ok(docids) + } else if query.prefix { + let words = word_derivations(word, true, 0, ctx.words_fst(), wdcache)?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + docids |= ctx.word_docids(word)?.unwrap_or_default(); + // only add the exact docids if the word hasn't been derived + if *original_typo == 0 { + docids |= ctx.exact_word_docids(word)?.unwrap_or_default(); + } + } + Ok(docids) + } else { + let mut docids = ctx.word_docids(word)?.unwrap_or_default(); + // only add the exact docids if the word hasn't been derived + if *original_typo == 0 { + docids |= ctx.exact_word_docids(word)?.unwrap_or_default(); + } + Ok(docids) + } + } + QueryKind::Tolerant { typo, word } => { + let words = word_derivations(word, query.prefix, *typo, ctx.words_fst(), 
wdcache)?; + let mut docids = RoaringBitmap::new(); + for (word, typo) in words { + let mut current_docids = ctx.word_docids(word)?.unwrap_or_default(); + if *typo == 0 { + current_docids |= ctx.exact_word_docids(word)?.unwrap_or_default() + } + docids |= current_docids; + } + Ok(docids) + } + } +} + +fn query_pair_proximity_docids( + ctx: &dyn Context, + left: &Query, + right: &Query, + proximity: u8, + wdcache: &mut WordDerivationsCache, +) -> Result { + if proximity >= 8 { + let mut candidates = query_docids(ctx, left, wdcache)?; + let right_candidates = query_docids(ctx, right, wdcache)?; + candidates &= right_candidates; + return Ok(candidates); + } + + let prefix = right.prefix; + match (&left.kind, &right.kind) { + (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { + if prefix { + // There are three distinct cases which we need to distinguish regarding the prefix `right`: + // + // 1. `right` is not in any prefix cache because it is not the prefix of many words + // (and thus, it doesn't have many word derivations) + // 2. `right` is in the prefix cache but cannot be found in the "word prefix pair proximity" databases either + // because it is too long or because the given proximity is too high. + // 3. `right` is in the prefix cache and can be found in the "word prefix pair proximity" databases + // + // The three cases are handled as follows: + // 1. We manually retrieve all the word derivations of `right` and check the `word_pair_proximity` + // database for each of them. + // 2. It would be too expensive to apply the same strategy as (1), therefore, we "disable" the + // proximity ranking rule for the prefixes of the right word. This is done as follows: + // 1. Only find the documents where left is in proximity to the exact (ie non-prefix) right word + // 2. Otherwise, assume that their proximity in all the documents in which they coexist is >= 8 + // + // 3. Query the prefix proximity databases. 
+ match ( + ctx.in_prefix_cache(right), + right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB + && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, + ) { + // Case 1: not in prefix cache + (false, _) => { + let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; + all_word_pair_overall_proximity_docids( + ctx, + &[(left, 0)], + r_words, + proximity, + ) + } + // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to + // query the prefix proximity databases. + (true, false) => { + // To "save" the relevancy a little bit, we still find the documents where the + // exact (i.e. non-prefix) right word is in the given proximity to the left word. + Ok(word_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default()) + } + // Case 3: in prefix cache, short enough, and proximity is low enough + (true, true) => Ok(word_prefix_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default()), + } + } else { + Ok(word_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default()) + } + } + (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { + let l_words = + word_derivations(left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); + if prefix { + // The logic here is almost identical to the one in the previous match branch. + // The difference is that we fetch the docids for each derivation of the left word. 
+ match ( + ctx.in_prefix_cache(right), + right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB + && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, + ) { + // Case 1: not in prefix cache + (false, _) => { + let mut docids = RoaringBitmap::new(); + let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; + for (left, _) in l_words { + docids |= all_word_pair_overall_proximity_docids( + ctx, + &[(left, 0)], + r_words, + proximity, + )?; + } + Ok(docids) + } + // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to + // query the prefix proximity databases. + (true, false) => { + // To "save" the relevancy a little bit, we still find the documents where the + // exact (i.e. non-prefix) right word is in proximity to any derivation of the left word. + let mut candidates = RoaringBitmap::new(); + for (left, _) in l_words { + candidates |= ctx + .word_pair_proximity_docids(&left, right, proximity)? + .unwrap_or_default(); + } + Ok(candidates) + } + // Case 3: in prefix cache, short enough, and proximity is low enough + (true, true) => { + let mut docids = RoaringBitmap::new(); + for (left, _) in l_words { + docids |= word_prefix_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default(); + } + Ok(docids) + } + } + } else { + all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) + } + } + (QueryKind::Exact { word: left, .. 
}, QueryKind::Tolerant { typo, word: right }) => { + let r_words = word_derivations(right, prefix, *typo, ctx.words_fst(), wdcache)?; + all_word_pair_overall_proximity_docids(ctx, &[(left, 0)], r_words, proximity) + } + ( + QueryKind::Tolerant { typo: l_typo, word: left }, + QueryKind::Tolerant { typo: r_typo, word: right }, + ) => { + let l_words = + word_derivations(left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); + let r_words = word_derivations(right, prefix, *r_typo, ctx.words_fst(), wdcache)?; + all_word_pair_overall_proximity_docids(ctx, &l_words, r_words, proximity) + } + } +} + +#[cfg(test)] +pub mod test { + use std::collections::HashMap; + use std::iter; + + use maplit::hashmap; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; + + use super::*; + + fn s(s: &str) -> String { + s.to_string() + } + pub struct TestContext<'t> { + words_fst: fst::Set>, + word_docids: HashMap, + exact_word_docids: HashMap, + word_prefix_docids: HashMap, + exact_word_prefix_docids: HashMap, + word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + prefix_word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + docid_words: HashMap>, + } + + impl<'c> Context<'c> for TestContext<'c> { + fn documents_ids(&self) -> heed::Result { + Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids)) + } + + fn word_docids(&self, word: &str) -> heed::Result> { + Ok(self.word_docids.get(&word.to_string()).cloned()) + } + + fn exact_word_docids(&self, word: &str) -> heed::Result> { + Ok(self.exact_word_docids.get(&word.to_string()).cloned()) + } + + fn word_prefix_docids(&self, word: &str) -> heed::Result> { + Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) + } + + fn exact_word_prefix_docids(&self, word: &str) -> heed::Result> { + Ok(self.exact_word_prefix_docids.get(&word.to_string()).cloned()) + } + + fn 
word_pair_proximity_docids( + &self, + left: &str, + right: &str, + proximity: u8, + ) -> heed::Result> { + let key = (left.to_string(), right.to_string(), proximity.into()); + Ok(self.word_pair_proximity_docids.get(&key).cloned()) + } + + fn word_prefix_pair_proximity_docids( + &self, + word: &str, + prefix: &str, + proximity: u8, + ) -> heed::Result> { + let key = (word.to_string(), prefix.to_string(), proximity.into()); + Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) + } + fn prefix_word_pair_proximity_docids( + &self, + prefix: &str, + word: &str, + proximity: u8, + ) -> heed::Result> { + let key = (prefix.to_string(), word.to_string(), proximity.into()); + Ok(self.prefix_word_pair_proximity_docids.get(&key).cloned()) + } + + fn words_fst<'t>(&self) -> &'t fst::Set> { + &self.words_fst + } + + fn in_prefix_cache(&self, word: &str) -> bool { + self.word_prefix_docids.contains_key(&word.to_string()) + } + + fn docid_words_positions( + &self, + docid: DocumentId, + ) -> heed::Result> { + if let Some(docid_words) = self.docid_words.get(&docid) { + Ok(docid_words + .iter() + .enumerate() + .map(|(i, w)| { + let bitmap = RoaringBitmap::from_sorted_iter(iter::once(i as u32)).unwrap(); + (w.clone(), bitmap) + }) + .collect()) + } else { + Ok(HashMap::new()) + } + } + + fn word_position_iterator( + &self, + _word: &str, + _in_prefix_cache: bool, + ) -> heed::Result< + Box> + 'c>, + > { + todo!() + } + + fn synonyms(&self, _word: &str) -> heed::Result>>> { + todo!() + } + + fn searchable_fields_ids(&self) -> Result> { + todo!() + } + + fn word_position_docids( + &self, + _word: &str, + _pos: u32, + ) -> heed::Result> { + todo!() + } + + fn field_id_word_count_docids( + &self, + _field_id: FieldId, + _word_count: u8, + ) -> heed::Result> { + todo!() + } + } + + impl<'a> Default for TestContext<'a> { + fn default() -> TestContext<'a> { + let mut rng = StdRng::seed_from_u64(102); + let rng = &mut rng; + + fn random_postings(rng: &mut R, len: usize) -> 
RoaringBitmap { + let mut values = Vec::::with_capacity(len); + while values.len() != len { + values.push(rng.gen()); + } + values.sort_unstable(); + + RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap() + } + + let word_docids = hashmap! { + s("hello") => random_postings(rng, 1500), + s("hi") => random_postings(rng, 4000), + s("word") => random_postings(rng, 2500), + s("split") => random_postings(rng, 400), + s("ngrams") => random_postings(rng, 1400), + s("world") => random_postings(rng, 15_000), + s("earth") => random_postings(rng, 8000), + s("2021") => random_postings(rng, 100), + s("2020") => random_postings(rng, 500), + s("is") => random_postings(rng, 50_000), + s("this") => random_postings(rng, 50_000), + s("good") => random_postings(rng, 1250), + s("morning") => random_postings(rng, 125), + }; + + let exact_word_docids = HashMap::new(); + + let mut docid_words = HashMap::new(); + for (word, docids) in word_docids.iter() { + for docid in docids { + let words = docid_words.entry(docid).or_insert(vec![]); + words.push(word.clone()); + } + } + + let word_prefix_docids = hashmap! 
{ + s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")], + s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")], + s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], + }; + + let exact_word_prefix_docids = HashMap::new(); + + let mut word_pair_proximity_docids = HashMap::new(); + let mut word_prefix_pair_proximity_docids = HashMap::new(); + let mut prefix_word_pair_proximity_docids = HashMap::new(); + + for (lword, lcandidates) in &word_docids { + for (rword, rcandidates) in &word_docids { + if lword == rword { + continue; + } + let candidates = lcandidates & rcandidates; + for candidate in candidates { + if let Some(docid_words) = docid_words.get(&candidate) { + let lposition = docid_words.iter().position(|w| w == lword).unwrap(); + let rposition = docid_words.iter().position(|w| w == rword).unwrap(); + let key = if lposition < rposition { + (s(lword), s(rword), (rposition - lposition) as i32) + } else { + (s(lword), s(rword), (lposition - rposition + 1) as i32) + }; + let docids = word_pair_proximity_docids + .entry(key) + .or_insert(RoaringBitmap::new()); + docids.push(candidate); + } + } + } + for (pword, pcandidates) in &word_prefix_docids { + if lword.starts_with(pword) { + continue; + } + let candidates = lcandidates & pcandidates; + for candidate in candidates { + if let Some(docid_words) = docid_words.get(&candidate) { + let lposition = docid_words.iter().position(|w| w == lword).unwrap(); + let rposition = + docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); + if lposition < rposition { + let key = (s(lword), s(pword), (rposition - lposition) as i32); + let docids = word_prefix_pair_proximity_docids + .entry(key) + .or_insert(RoaringBitmap::new()); + docids.push(candidate); + } else { + let key = (s(lword), s(pword), (lposition - rposition) as i32); + let docids = prefix_word_pair_proximity_docids + .entry(key) + .or_insert(RoaringBitmap::new()); + docids.push(candidate); + }; + } + } + } + } + + let mut keys 
= word_docids.keys().collect::>(); + keys.sort_unstable(); + let words_fst = fst::Set::from_iter(keys).unwrap().map_data(Cow::Owned).unwrap(); + + TestContext { + words_fst, + word_docids, + exact_word_docids, + word_prefix_docids, + exact_word_prefix_docids, + word_pair_proximity_docids, + word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, + docid_words, + } + } + } +} diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs new file mode 100644 index 000000000..66e5c95bf --- /dev/null +++ b/milli/src/search/criteria/proximity.rs @@ -0,0 +1,712 @@ +use std::collections::btree_map::{self, BTreeMap}; +use std::collections::hash_map::HashMap; + +use log::debug; +use roaring::RoaringBitmap; +use slice_group_by::GroupBy; + +use super::{ + query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, + Criterion, CriterionParameters, CriterionResult, +}; +use crate::search::criteria::InitialCandidates; +use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; +use crate::search::{build_dfa, CriterionImplementationStrategy, WordDerivationsCache}; +use crate::{Position, Result}; + +type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; + +/// Threshold on the number of candidates that will make +/// the system choose between one algorithm or another. +const CANDIDATES_THRESHOLD: u64 = 1000; + +/// Threshold on the number of proximity that will make +/// the system choose between one algorithm or another. 
+const PROXIMITY_THRESHOLD: u8 = 0; + +pub struct Proximity<'t> { + ctx: &'t dyn Context<'t>, + /// (max_proximity, query_tree, allowed_candidates) + state: Option<(u8, Operation, RoaringBitmap)>, + proximity: u8, + initial_candidates: InitialCandidates, + parent: Box, + candidates_cache: Cache, + plane_sweep_cache: Option>, + implementation_strategy: CriterionImplementationStrategy, +} + +impl<'t> Proximity<'t> { + pub fn new( + ctx: &'t dyn Context<'t>, + parent: Box, + implementation_strategy: CriterionImplementationStrategy, + ) -> Self { + Proximity { + ctx, + state: None, + proximity: 0, + initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), + parent, + candidates_cache: Cache::new(), + plane_sweep_cache: None, + implementation_strategy, + } + } +} + +impl<'t> Criterion for Proximity<'t> { + #[logging_timer::time("Proximity::{}")] + fn next(&mut self, params: &mut CriterionParameters) -> Result> { + // remove excluded candidates when next is called, instead of doing it in the loop. 
+ if let Some((_, _, allowed_candidates)) = self.state.as_mut() { + *allowed_candidates -= params.excluded_candidates; + } + + loop { + debug!( + "Proximity at iteration {} (max prox {:?}) ({:?})", + self.proximity, + self.state.as_ref().map(|(mp, _, _)| mp), + self.state.as_ref().map(|(_, _, cd)| cd), + ); + + match &mut self.state { + Some((max_prox, _, allowed_candidates)) + if allowed_candidates.is_empty() || self.proximity > *max_prox => + { + self.state = None; // reset state + } + Some((_, query_tree, allowed_candidates)) => { + let mut new_candidates = if matches!( + self.implementation_strategy, + CriterionImplementationStrategy::OnlyIterative + ) || (matches!( + self.implementation_strategy, + CriterionImplementationStrategy::Dynamic + ) && allowed_candidates.len() + <= CANDIDATES_THRESHOLD + && self.proximity > PROXIMITY_THRESHOLD) + { + if let Some(cache) = self.plane_sweep_cache.as_mut() { + match cache.next() { + Some((p, candidates)) => { + self.proximity = p; + candidates + } + None => { + self.state = None; // reset state + continue; + } + } + } else { + let cache = resolve_plane_sweep_candidates( + self.ctx, + query_tree, + allowed_candidates, + )?; + self.plane_sweep_cache = Some(cache.into_iter()); + + continue; + } + } else { + // use set theory based algorithm + resolve_candidates( + self.ctx, + query_tree, + self.proximity, + &mut self.candidates_cache, + params.wdcache, + )? + }; + + new_candidates &= &*allowed_candidates; + *allowed_candidates -= &new_candidates; + self.proximity += 1; + + return Ok(Some(CriterionResult { + query_tree: Some(query_tree.clone()), + candidates: Some(new_candidates), + filtered_candidates: None, + initial_candidates: Some(self.initial_candidates.take()), + })); + } + None => match self.parent.next(params)? 
{ + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + initial_candidates, + }) => { + let mut candidates = match candidates { + Some(candidates) => candidates, + None => { + resolve_query_tree(self.ctx, &query_tree, params.wdcache)? + - params.excluded_candidates + } + }; + + if let Some(filtered_candidates) = filtered_candidates { + candidates &= filtered_candidates; + } + + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), + } + + let maximum_proximity = maximum_proximity(&query_tree); + self.state = Some((maximum_proximity as u8, query_tree, candidates)); + self.proximity = 0; + self.plane_sweep_cache = None; + } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + initial_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + initial_candidates, + })); + } + None => return Ok(None), + }, + } + } + } +} + +fn resolve_candidates<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + proximity: u8, + cache: &mut Cache, + wdcache: &mut WordDerivationsCache, +) -> Result { + fn resolve_operation<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + proximity: u8, + cache: &mut Cache, + wdcache: &mut WordDerivationsCache, + ) -> Result> { + use Operation::{And, Or, Phrase}; + + let result = match query_tree { + And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?, + Phrase(words) => { + if proximity == 0 { + let most_left = words + .iter() + .filter_map(|o| o.as_ref()) + .next() + .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); + let most_right = words + .iter() + .rev() + .filter_map(|o| o.as_ref()) + .next() + .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); + + match (most_left, most_right) { + (Some(l), Some(r)) => vec![(l, r, resolve_phrase(ctx, 
words)?)], + _otherwise => Default::default(), + } + } else { + Default::default() + } + } + Or(_, ops) => { + let mut output = Vec::new(); + for op in ops { + let result = resolve_operation(ctx, op, proximity, cache, wdcache)?; + output.extend(result); + } + output + } + Operation::Query(q) => { + if proximity == 0 { + let candidates = query_docids(ctx, q, wdcache)?; + vec![(q.clone(), q.clone(), candidates)] + } else { + Default::default() + } + } + }; + + Ok(result) + } + + fn mdfs_pair<'t>( + ctx: &'t dyn Context, + left: &Operation, + right: &Operation, + proximity: u8, + cache: &mut Cache, + wdcache: &mut WordDerivationsCache, + ) -> Result> { + fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator { + (0..=mana.min(left_max)).map(move |m| (m, mana - m)) + } + + let pair_max_proximity = 7; + + let mut output = Vec::new(); + + for (pair_p, left_right_p) in pair_combinations(proximity, pair_max_proximity) { + for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) { + let left_key = (left.clone(), left_p); + if !cache.contains_key(&left_key) { + let candidates = resolve_operation(ctx, left, left_p, cache, wdcache)?; + cache.insert(left_key.clone(), candidates); + } + + let right_key = (right.clone(), right_p); + if !cache.contains_key(&right_key) { + let candidates = resolve_operation(ctx, right, right_p, cache, wdcache)?; + cache.insert(right_key.clone(), candidates); + } + + let lefts = cache.get(&left_key).unwrap(); + let rights = cache.get(&right_key).unwrap(); + + for (ll, lr, lcandidates) in lefts { + for (rl, rr, rcandidates) in rights { + let mut candidates = + query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; + if lcandidates.len() < rcandidates.len() { + candidates &= lcandidates; + candidates &= rcandidates; + } else { + candidates &= rcandidates; + candidates &= lcandidates; + } + if !candidates.is_empty() { + output.push((ll.clone(), rr.clone(), candidates)); + } + } + } + } + } + + Ok(output) + } + + fn 
mdfs<'t>( + ctx: &'t dyn Context, + branches: &[Operation], + proximity: u8, + cache: &mut Cache, + wdcache: &mut WordDerivationsCache, + ) -> Result> { + // Extract the first two elements but gives the tail + // that is just after the first element. + let next = + branches.split_first().map(|(h1, t)| (h1, t.split_first().map(|(h2, _)| (h2, t)))); + + match next { + Some((head1, Some((head2, [_])))) => { + mdfs_pair(ctx, head1, head2, proximity, cache, wdcache) + } + Some((head1, Some((head2, tail)))) => { + let mut output = Vec::new(); + for p in 0..=proximity { + for (lhead, _, head_candidates) in + mdfs_pair(ctx, head1, head2, p, cache, wdcache)? + { + if !head_candidates.is_empty() { + for (_, rtail, mut candidates) in + mdfs(ctx, tail, proximity - p, cache, wdcache)? + { + candidates &= &head_candidates; + if !candidates.is_empty() { + output.push((lhead.clone(), rtail, candidates)); + } + } + } + } + } + Ok(output) + } + Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache), + None => Ok(Default::default()), + } + } + + let mut candidates = RoaringBitmap::new(); + for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache, wdcache)? 
{ + candidates |= cds; + } + Ok(candidates) +} + +fn resolve_plane_sweep_candidates( + ctx: &dyn Context, + query_tree: &Operation, + allowed_candidates: &RoaringBitmap, +) -> Result> { + /// FIXME may be buggy with query like "new new york" + fn plane_sweep( + groups_positions: Vec>, + consecutive: bool, + ) -> Result> { + fn compute_groups_proximity( + groups: &[(usize, (Position, u8, Position))], + consecutive: bool, + ) -> Option<(Position, u8, Position)> { + // take the inner proximity of the first group as initial + let (_, (_, mut proximity, _)) = groups.first()?; + let (_, (left_most_pos, _, _)) = groups.first()?; + let (_, (_, _, right_most_pos)) = + groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?; + + for pair in groups.windows(2) { + if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair { + // if two positions are equal, meaning that they share at least a word, we return None + if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 { + return None; + } + + let pair_proximity = { + // if intervals are disjoint [..].(..) + if lpos2 > rpos1 { + lpos2 - rpos1 + } + // if the second interval is a subset of the first [.(..).] + else if rpos2 < rpos1 { + (lpos2 - lpos1).min(rpos1 - rpos2) + } + // if intervals overlaps [.(..].) 
+ else { + (lpos2 - lpos1).min(rpos2 - rpos1) + } + }; + + // if groups are in the right order (query order) we subtract 1 from the proximity + // the proximity is clamped to 7 + let pair_proximity = + if i1 < i2 { (pair_proximity - 1).min(7) } else { pair_proximity.min(7) }; + + proximity += pair_proximity as u8 + prox2; + } + } + + // if groups should be consecutive, we will only accept groups with a proximity of 0 + if !consecutive || proximity == 0 { + Some((*left_most_pos, proximity, *right_most_pos)) + } else { + None + } + } + + let groups_len = groups_positions.len(); + + let mut groups_positions: Vec<_> = + groups_positions.into_iter().map(|pos| pos.into_iter()).collect(); + + // Pop top elements of each list. + let mut current = Vec::with_capacity(groups_len); + for (i, positions) in groups_positions.iter_mut().enumerate() { + match positions.next() { + Some(p) => current.push((i, p)), + // if a group returns None, it means that the document does not contain all the words, + // we return an empty result. + None => return Ok(Vec::new()), + } + } + + // Sort k elements by their positions. + current.sort_unstable_by_key(|(_, p)| *p); + + // Find leftmost and rightmost group and their positions. + let mut leftmost = *current.first().unwrap(); + let mut rightmost = *current.last().unwrap(); + + let mut output = Vec::new(); + loop { + // Find the position p of the next elements of a list of the leftmost group. + // If the list is empty, break the loop. + let p = groups_positions[leftmost.0].next().map(|p| (leftmost.0, p)); + + // let q be the position q of second group of the interval. + let q = current[1]; + + // If p > r, then the interval [l, r] is minimal and + // we insert it into the heap according to its size.
+ if p.map_or(true, |p| p.1 > rightmost.1) { + if let Some(group) = compute_groups_proximity(&current, consecutive) { + output.push(group); + } + } + + let p = match p { + Some(p) => p, + None => break, + }; + + // Replace the leftmost group P in the interval. + current[0] = p; + + if p.1 > rightmost.1 { + // if [l, r] is minimal, let r = p and l = q. + rightmost = p; + leftmost = q; + } else { + // Otherwise, let l = min{p,q}. + leftmost = if p.1 < q.1 { p } else { q }; + } + + // Then update the interval and order of groups_positions in the interval. + current.sort_unstable_by_key(|(_, p)| *p); + } + + // Sort the list according to the size and the positions. + output.sort_unstable(); + + Ok(output) + } + + fn resolve_operation<'a>( + query_tree: &'a Operation, + rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, + words_positions: &HashMap, + ) -> Result> { + use Operation::{And, Or, Phrase}; + + if let Some(result) = rocache.get(query_tree) { + return Ok(result.clone()); + } + + let result = match query_tree { + And(ops) => { + let mut groups_positions = Vec::with_capacity(ops.len()); + for operation in ops { + let positions = resolve_operation(operation, rocache, words_positions)?; + groups_positions.push(positions); + } + plane_sweep(groups_positions, false)? + } + Phrase(words) => { + let mut groups_positions = Vec::with_capacity(words.len()); + + // group stop_words together. + for words in words.linear_group_by_key(Option::is_none) { + // skip if it's a group of stop words. + if matches!(words.first(), None | Some(None)) { + continue; + } + // make a consecutive plane-sweep on the subgroup of words.
+ let mut subgroup = Vec::with_capacity(words.len()); + for word in words.iter().map(|w| w.as_deref().unwrap()) { + match words_positions.get(word) { + Some(positions) => { + subgroup.push(positions.iter().map(|p| (p, 0, p)).collect()) + } + None => return Ok(vec![]), + } + } + match subgroup.len() { + 0 => {} + 1 => groups_positions.push(subgroup.pop().unwrap()), + _ => groups_positions.push(plane_sweep(subgroup, true)?), + } + } + match groups_positions.len() { + 0 => vec![], + 1 => groups_positions.pop().unwrap(), + _ => plane_sweep(groups_positions, false)?, + } + } + Or(_, ops) => { + let mut result = Vec::new(); + for op in ops { + result.extend(resolve_operation(op, rocache, words_positions)?) + } + + result.sort_unstable(); + result + } + Operation::Query(Query { prefix, kind }) => { + let mut result = Vec::new(); + match kind { + QueryKind::Exact { word, .. } => { + if *prefix { + let iter = word_derivations(word, true, 0, words_positions) + .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); + result.extend(iter); + } else if let Some(positions) = words_positions.get(word) { + result.extend(positions.iter().map(|p| (p, 0, p))); + } + } + QueryKind::Tolerant { typo, word } => { + let iter = word_derivations(word, *prefix, *typo, words_positions) + .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); + result.extend(iter); + } + } + + result.sort_unstable(); + result + } + }; + + rocache.insert(query_tree, result.clone()); + Ok(result) + } + + fn word_derivations<'a>( + word: &str, + is_prefix: bool, + max_typo: u8, + words_positions: &'a HashMap, + ) -> impl Iterator { + let dfa = build_dfa(word, max_typo, is_prefix); + words_positions.iter().filter_map(move |(document_word, positions)| { + use levenshtein_automata::Distance; + match dfa.eval(document_word) { + Distance::Exact(_) => Some(positions), + Distance::AtLeast(_) => None, + } + }) + } + + let mut resolve_operation_cache = HashMap::new(); + let mut candidates = BTreeMap::new(); + for 
docid in allowed_candidates { + let words_positions = ctx.docid_words_positions(docid)?; + resolve_operation_cache.clear(); + let positions = + resolve_operation(query_tree, &mut resolve_operation_cache, &words_positions)?; + let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity); + let best_proximity = best_proximity.map(|(_, proximity, _)| proximity).unwrap_or(7); + candidates.entry(best_proximity).or_insert_with(RoaringBitmap::new).insert(docid); + } + + Ok(candidates) +} + +#[cfg(test)] +mod tests { + use std::io::Cursor; + + use big_s::S; + + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; + use crate::index::tests::TempIndex; + use crate::{Criterion, CriterionImplementationStrategy, SearchResult}; + + fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { + let mut documents = Vec::new(); + for prefix in prefixes { + for i in 0..500 { + documents.push( + serde_json::json!({ + "text": format!("{prefix}{i:x}"), + }) + .as_object() + .unwrap() + .clone(), + ) + } + } + documents + } + + #[test] + fn test_proximity_criterion_prefix_handling() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| { + settings.set_primary_key(S("id")); + settings.set_criteria(vec![ + Criterion::Words, + Criterion::Typo, + Criterion::Proximity, + ]); + }) + .unwrap(); + + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + + for doc in [ + // 0 + serde_json::json!({ "text": "zero is exactly the amount of configuration I want" }), + // 1 + serde_json::json!({ "text": "zero bad configuration" }), + // 2 + serde_json::json!({ "text": "zero configuration" }), + // 3 + serde_json::json!({ "text": "zero config" }), + // 4 + serde_json::json!({ "text": "zero conf" }), + // 5 + serde_json::json!({ "text": "zero bad conf" }), + ] { + documents.append_json_object(doc.as_object().unwrap()).unwrap(); + } + for doc in 
documents_with_enough_different_words_for_prefixes(&["conf"]) { + documents.append_json_object(&doc).unwrap(); + } + let documents = + DocumentsBatchReader::from_reader(Cursor::new(documents.into_inner().unwrap())) + .unwrap(); + + index.add_documents(documents).unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = index + .search(&rtxn) + .query("zero c") + .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased) + .execute() + .unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]"); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = index + .search(&rtxn) + .query("zero co") + .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased) + .execute() + .unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]"); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = index + .search(&rtxn) + .query("zero con") + .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased) + .execute() + .unwrap(); + // Here search results are degraded because `con` is in the prefix cache but it is too + // long to be stored in the prefix proximity databases, and we don't want to iterate over + // all of its word derivations + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]"); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = index + .search(&rtxn) + .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased) + .query("zero conf") + .execute() + .unwrap(); + // Here search results are degraded as well, but we can still rank correctly documents + // that contain `conf` exactly, and not as a prefix. 
+ insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 2, 3]"); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = index + .search(&rtxn) + .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased) + .query("zero config") + .execute() + .unwrap(); + // `config` is not a common prefix, so the normal methods are used + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 4, 5]"); + } +} diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs new file mode 100644 index 000000000..20bc718fd --- /dev/null +++ b/milli/src/search/criteria/typo.rs @@ -0,0 +1,498 @@ +use std::borrow::Cow; +use std::collections::HashMap; +use std::mem::take; + +use log::debug; +use roaring::RoaringBitmap; + +use super::{ + query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters, + CriterionResult, +}; +use crate::search::criteria::{resolve_phrase, InitialCandidates}; +use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; +use crate::search::{word_derivations, WordDerivationsCache}; +use crate::Result; + +/// Maximum number of typos for a word of any length. +const MAX_TYPOS_PER_WORD: u8 = 2; + +pub struct Typo<'t> { + ctx: &'t dyn Context<'t>, + /// (max_typos, query_tree, candidates) + state: Option<(u8, Operation, Candidates)>, + typos: u8, + initial_candidates: Option, + parent: Box, + candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, +} + +impl<'t> Typo<'t> { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { + Typo { + ctx, + state: None, + typos: 0, + initial_candidates: None, + parent, + candidates_cache: HashMap::new(), + } + } +} + +impl<'t> Criterion for Typo<'t> { + #[logging_timer::time("Typo::{}")] + fn next(&mut self, params: &mut CriterionParameters) -> Result> { + use Candidates::{Allowed, Forbidden}; + // remove excluded candidates when next is called, instead of doing it in the loop. 
+ match self.state.as_mut() { + Some((_, _, Allowed(candidates))) => *candidates -= params.excluded_candidates, + Some((_, _, Forbidden(candidates))) => *candidates |= params.excluded_candidates, + None => (), + } + + loop { + debug!( + "Typo at iteration {} (max typos {:?}) ({:?})", + self.typos, + self.state.as_ref().map(|(mt, _, _)| mt), + self.state.as_ref().map(|(_, _, cd)| cd), + ); + + match self.state.as_mut() { + Some((max_typos, _, _)) if self.typos > *max_typos => { + self.state = None; // reset state + } + Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => { + self.state = None; // reset state + } + Some((_, query_tree, candidates_authorization)) => { + let fst = self.ctx.words_fst(); + let new_query_tree = match self.typos { + typos if typos < MAX_TYPOS_PER_WORD => alterate_query_tree( + fst, + query_tree.clone(), + self.typos, + params.wdcache, + )?, + MAX_TYPOS_PER_WORD => { + // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, + // we keep the altered query tree + *query_tree = alterate_query_tree( + fst, + query_tree.clone(), + self.typos, + params.wdcache, + )?; + // we compute the allowed candidates + let query_tree_allowed_candidates = + resolve_query_tree(self.ctx, query_tree, params.wdcache)?; + // we assign the allowed candidates to the candidates authorization. 
+ *candidates_authorization = match take(candidates_authorization) { + Allowed(allowed_candidates) => { + Allowed(query_tree_allowed_candidates & allowed_candidates) + } + Forbidden(forbidden_candidates) => { + Allowed(query_tree_allowed_candidates - forbidden_candidates) + } + }; + query_tree.clone() + } + _otherwise => query_tree.clone(), + }; + + let mut candidates = resolve_candidates( + self.ctx, + &new_query_tree, + self.typos, + &mut self.candidates_cache, + params.wdcache, + )?; + + match candidates_authorization { + Allowed(allowed_candidates) => { + candidates &= &*allowed_candidates; + *allowed_candidates -= &candidates; + } + Forbidden(forbidden_candidates) => { + candidates -= &*forbidden_candidates; + *forbidden_candidates |= &candidates; + } + } + + let initial_candidates = match self.initial_candidates.as_mut() { + Some(initial_candidates) => initial_candidates.take(), + None => InitialCandidates::Estimated(candidates.clone()), + }; + + self.typos += 1; + + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: Some(candidates), + filtered_candidates: None, + initial_candidates: Some(initial_candidates), + })); + } + None => match self.parent.next(params)? 
{ + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + initial_candidates, + }) => { + self.initial_candidates = + match (self.initial_candidates.take(), initial_candidates) { + (Some(self_ic), Some(parent_ic)) => Some(self_ic | parent_ic), + (self_ic, parent_ic) => self_ic.or(parent_ic), + }; + + let candidates = match candidates.or(filtered_candidates) { + Some(candidates) => { + Candidates::Allowed(candidates - params.excluded_candidates) + } + None => Candidates::Forbidden(params.excluded_candidates.clone()), + }; + + let maximum_typos = maximum_typo(&query_tree) as u8; + self.state = Some((maximum_typos, query_tree, candidates)); + self.typos = 0; + } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + initial_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + initial_candidates, + })); + } + None => return Ok(None), + }, + } + } + } +} + +/// Modify the query tree by replacing every tolerant query by an Or operation +/// containing all of the corresponding exact words in the words FST. Each tolerant +/// query will only be replaced by exact query with up to `number_typos` maximum typos. +fn alterate_query_tree( + words_fst: &fst::Set>, + mut query_tree: Operation, + number_typos: u8, + wdcache: &mut WordDerivationsCache, +) -> Result { + fn recurse( + words_fst: &fst::Set>, + operation: &mut Operation, + number_typos: u8, + wdcache: &mut WordDerivationsCache, + ) -> Result<()> { + use Operation::{And, Or, Phrase}; + + match operation { + And(ops) | Or(_, ops) => { + ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) + } + // Because Phrases don't allow typos, no alteration can be done. 
+ Phrase(_words) => Ok(()), + Operation::Query(q) => { + if let QueryKind::Tolerant { typo, word } = &q.kind { + // if no typo is allowed we don't call word_derivations function, + // and directly create an Exact query + if number_typos == 0 { + *operation = Operation::Query(Query { + prefix: q.prefix, + kind: QueryKind::Exact { original_typo: 0, word: word.clone() }, + }); + } else { + let typo = *typo.min(&number_typos); + let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?; + let queries = words + .iter() + .map(|(word, typo)| { + Operation::Query(Query { + prefix: false, + kind: QueryKind::Exact { + original_typo: *typo, + word: word.to_string(), + }, + }) + }) + .collect(); + + *operation = Operation::or(false, queries); + } + } + + Ok(()) + } + } + } + + recurse(words_fst, &mut query_tree, number_typos, wdcache)?; + Ok(query_tree) +} + +fn resolve_candidates<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + number_typos: u8, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + wdcache: &mut WordDerivationsCache, +) -> Result { + fn resolve_operation<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + number_typos: u8, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + wdcache: &mut WordDerivationsCache, + ) -> Result { + use Operation::{And, Or, Phrase, Query}; + + match query_tree { + And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache), + Phrase(words) => resolve_phrase(ctx, words), + Or(_, ops) => { + let mut candidates = RoaringBitmap::new(); + for op in ops { + let docids = resolve_operation(ctx, op, number_typos, cache, wdcache)?; + candidates |= docids; + } + Ok(candidates) + } + Query(q) => { + if q.kind.typo() == number_typos { + Ok(query_docids(ctx, q, wdcache)?) 
+ } else { + Ok(RoaringBitmap::new()) + } + } + } + } + + fn mdfs<'t>( + ctx: &'t dyn Context, + branches: &[Operation], + mana: u8, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + wdcache: &mut WordDerivationsCache, + ) -> Result { + match branches.split_first() { + Some((head, [])) => { + let cache_key = (head.clone(), mana); + if let Some(candidates) = cache.get(&cache_key) { + Ok(candidates.clone()) + } else { + let candidates = resolve_operation(ctx, head, mana, cache, wdcache)?; + cache.insert(cache_key, candidates.clone()); + Ok(candidates) + } + } + Some((head, tail)) => { + let mut candidates = RoaringBitmap::new(); + + for m in 0..=mana { + let mut head_candidates = { + let cache_key = (head.clone(), m); + if let Some(candidates) = cache.get(&cache_key) { + candidates.clone() + } else { + let candidates = resolve_operation(ctx, head, m, cache, wdcache)?; + cache.insert(cache_key, candidates.clone()); + candidates + } + }; + if !head_candidates.is_empty() { + let tail_candidates = mdfs(ctx, tail, mana - m, cache, wdcache)?; + head_candidates &= tail_candidates; + candidates |= head_candidates; + } + } + + Ok(candidates) + } + None => Ok(RoaringBitmap::new()), + } + } + + resolve_operation(ctx, query_tree, number_typos, cache, wdcache) +} + +#[cfg(test)] +mod test { + use super::super::initial::Initial; + use super::super::test::TestContext; + use super::*; + use crate::search::NoopDistinct; + + fn display_criteria(mut criteria: Typo, mut parameters: CriterionParameters) -> String { + let mut result = String::new(); + while let Some(criterion) = criteria.next(&mut parameters).unwrap() { + result.push_str(&format!("{criterion:?}\n\n")); + } + result + } + + #[test] + fn initial_placeholder_no_facets() { + let context = TestContext::default(); + let query_tree = None; + let facet_candidates = None; + + let criterion_parameters = CriterionParameters { + wdcache: &mut WordDerivationsCache::new(), + excluded_candidates: &RoaringBitmap::new(), + }; + + 
let parent = + Initial::::new(&context, query_tree, facet_candidates, false, None); + let criteria = Typo::new(&context, Box::new(parent)); + + let result = display_criteria(criteria, criterion_parameters); + insta::assert_snapshot!(result, @r###" + CriterionResult { query_tree: None, candidates: None, filtered_candidates: None, initial_candidates: None } + + "###); + } + + #[test] + fn initial_query_tree_no_facets() { + let context = TestContext::default(); + let query_tree = Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ])], + ); + + let facet_candidates = None; + + let criterion_parameters = CriterionParameters { + wdcache: &mut WordDerivationsCache::new(), + excluded_candidates: &RoaringBitmap::new(), + }; + let parent = + Initial::::new(&context, Some(query_tree), facet_candidates, false, None); + let criteria = Typo::new(&context, Box::new(parent)); + + let result = display_criteria(criteria, criterion_parameters); + insta::assert_snapshot!(result, @r###" + CriterionResult { query_tree: Some(OR + AND + Exact { word: "split" } + Exact { word: "this" } + Exact { word: "world" } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } + + CriterionResult { query_tree: Some(OR + AND + Exact { word: "split" } + Exact { word: "this" } + OR + Exact { word: "word" } + Exact { word: "world" } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } + + "###); + } + + #[test] + fn initial_placeholder_with_facets() { + let context = TestContext::default(); + let query_tree = None; + let facet_candidates = 
context.word_docids("earth").unwrap().unwrap(); + + let criterion_parameters = CriterionParameters { + wdcache: &mut WordDerivationsCache::new(), + excluded_candidates: &RoaringBitmap::new(), + }; + let parent = Initial::::new( + &context, + query_tree, + Some(facet_candidates.clone()), + false, + None, + ); + let criteria = Typo::new(&context, Box::new(parent)); + + let result = display_criteria(criteria, criterion_parameters); + insta::assert_snapshot!(result, @r###" + CriterionResult { query_tree: None, candidates: None, filtered_candidates: Some(RoaringBitmap<8000 values between 986424 and 4294786076>), initial_candidates: None } + + "###); + } + + #[test] + fn initial_query_tree_with_facets() { + let context = TestContext::default(); + let query_tree = Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("this".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ])], + ); + + let facet_candidates = context.word_docids("earth").unwrap().unwrap(); + + let criterion_parameters = CriterionParameters { + wdcache: &mut WordDerivationsCache::new(), + excluded_candidates: &RoaringBitmap::new(), + }; + let parent = Initial::::new( + &context, + Some(query_tree), + Some(facet_candidates.clone()), + false, + None, + ); + let criteria = Typo::new(&context, Box::new(parent)); + + let result = display_criteria(criteria, criterion_parameters); + insta::assert_snapshot!(result, @r###" + CriterionResult { query_tree: Some(OR + AND + Exact { word: "split" } + Exact { word: "this" } + Exact { word: "world" } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } + + CriterionResult { query_tree: Some(OR + AND + Exact { word: "split" } + Exact { word: "this" } + OR + Exact { 
word: "word" } + Exact { word: "world" } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } + + "###); + } +} diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs new file mode 100644 index 000000000..4c5f8b45b --- /dev/null +++ b/milli/src/search/criteria/words.rs @@ -0,0 +1,106 @@ +use log::debug; +use roaring::RoaringBitmap; + +use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; +use crate::search::criteria::InitialCandidates; +use crate::search::query_tree::Operation; +use crate::Result; + +pub struct Words<'t> { + ctx: &'t dyn Context<'t>, + query_trees: Vec, + candidates: Option, + initial_candidates: Option, + filtered_candidates: Option, + parent: Box, +} + +impl<'t> Words<'t> { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { + Words { + ctx, + query_trees: Vec::default(), + candidates: None, + initial_candidates: None, + parent, + filtered_candidates: None, + } + } +} + +impl<'t> Criterion for Words<'t> { + #[logging_timer::time("Words::{}")] + fn next(&mut self, params: &mut CriterionParameters) -> Result> { + // remove excluded candidates when next is called, instead of doing it in the loop. 
+ if let Some(candidates) = self.candidates.as_mut() { + *candidates -= params.excluded_candidates; + } + + loop { + debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); + + match self.query_trees.pop() { + Some(query_tree) => { + let candidates = match self.candidates.as_mut() { + Some(allowed_candidates) => { + let mut candidates = + resolve_query_tree(self.ctx, &query_tree, params.wdcache)?; + candidates &= &*allowed_candidates; + *allowed_candidates -= &candidates; + Some(candidates) + } + None => None, + }; + + let initial_candidates = self.initial_candidates.clone(); + + return Ok(Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates: self.filtered_candidates.clone(), + initial_candidates, + })); + } + None => match self.parent.next(params)? { + Some(CriterionResult { + query_tree: Some(query_tree), + candidates, + filtered_candidates, + initial_candidates, + }) => { + self.query_trees = explode_query_tree(query_tree); + self.candidates = candidates; + self.filtered_candidates = filtered_candidates; + + self.initial_candidates = + match (self.initial_candidates.take(), initial_candidates) { + (Some(self_ic), Some(parent_ic)) => Some(self_ic | parent_ic), + (self_ic, parent_ic) => self_ic.or(parent_ic), + }; + } + Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + initial_candidates, + }) => { + return Ok(Some(CriterionResult { + query_tree: None, + candidates, + filtered_candidates, + initial_candidates, + })); + } + None => return Ok(None), + }, + } + } + } +} + +fn explode_query_tree(query_tree: Operation) -> Vec { + match query_tree { + Operation::Or(true, ops) => ops, + otherwise => vec![otherwise], + } +} diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs new file mode 100644 index 000000000..3ed683823 --- /dev/null +++ b/milli/src/search/distinct/facet_distinct.rs @@ -0,0 +1,218 @@ +use std::mem::size_of; + 
+use concat_arrays::concat_arrays; +use heed::types::{ByteSlice, Str, Unit}; +use roaring::RoaringBitmap; + +use super::{Distinct, DocIter}; +use crate::error::InternalError; +use crate::heed_codec::facet::{FacetGroupKey, *}; +use crate::index::db_name; +use crate::{DocumentId, FieldId, Index, Result}; + +const FID_SIZE: usize = size_of::(); +const DOCID_SIZE: usize = size_of::(); + +/// A distinct implementer that is backed by facets. +/// +/// On each iteration, the facet values for the +/// distinct attribute of the first document are retrieved. The document ids for these facet values +/// are then retrieved and taken out of the candidates and added to the excluded set. We take +/// care to keep the document we are currently on, and remove it from the excluded list. The next +/// iterations will never contain any occurrence of a document with the same distinct value as a +/// document from previous iterations. +#[derive(Clone)] +pub struct FacetDistinct<'a> { + distinct: FieldId, + index: &'a Index, + txn: &'a heed::RoTxn<'a>, +} + +impl<'a> FacetDistinct<'a> { + pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { + Self { distinct, index, txn } + } +} + +pub struct FacetDistinctIter<'a> { + candidates: RoaringBitmap, + distinct: FieldId, + excluded: RoaringBitmap, + index: &'a Index, + iter_offset: usize, + txn: &'a heed::RoTxn<'a>, +} + +impl<'a> FacetDistinctIter<'a> { + fn facet_string_docids(&self, key: &str) -> heed::Result> { + self.index + .facet_id_string_docids + .get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key }) + .map(|opt| opt.map(|v| v.bitmap)) + } + + fn facet_number_docids(&self, key: f64) -> heed::Result> { + // get facet docids on level 0 + self.index + .facet_id_f64_docids + .get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key }) + .map(|opt| opt.map(|v| v.bitmap)) + } + + fn distinct_string(&mut self, id: DocumentId) -> Result<()> { + let iter = 
facet_string_values(id, self.distinct, self.index, self.txn)?; + + for item in iter { + let ((_, _, value), _) = item?; + let facet_docids = + self.facet_string_docids(value)?.ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::FACET_ID_STRING_DOCIDS, + key: None, + })?; + self.excluded |= facet_docids; + } + + self.excluded.remove(id); + + Ok(()) + } + + fn distinct_number(&mut self, id: DocumentId) -> Result<()> { + let iter = facet_number_values(id, self.distinct, self.index, self.txn)?; + + for item in iter { + let ((_, _, value), _) = item?; + let facet_docids = + self.facet_number_docids(value)?.ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::FACET_ID_F64_DOCIDS, + key: None, + })?; + self.excluded |= facet_docids; + } + + self.excluded.remove(id); + + Ok(()) + } + + /// Performs the next iteration of the facet distinct. This is a convenience method that is + /// called by the Iterator::next implementation that transposes the result. It makes error + /// handling easier. + fn next_inner(&mut self) -> Result> { + // The first step is to remove all the excluded documents from our candidates + self.candidates -= &self.excluded; + + let mut candidates_iter = self.candidates.iter().skip(self.iter_offset); + match candidates_iter.next() { + Some(id) => { + // We distinct the document id on its facet strings and facet numbers. + self.distinct_string(id)?; + self.distinct_number(id)?; + + // The first document of each iteration is kept, since the next call to + // `difference_with` will filter out all the documents for that facet value. By + // increasing the offset we make sure to get the first valid value for the next + // distinct document to keep. + self.iter_offset += 1; + + Ok(Some(id)) + } + // no more candidate at this offset, return. 
+ None => Ok(None), + } + } +} + +#[allow(clippy::drop_non_drop)] +fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] { + concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) +} + +fn facet_number_values<'a>( + id: DocumentId, + distinct: FieldId, + index: &Index, + txn: &'a heed::RoTxn, +) -> Result> { + let key = facet_values_prefix_key(distinct, id); + + let iter = index + .field_id_docid_facet_f64s + .remap_key_type::() + .prefix_iter(txn, &key)? + .remap_key_type::(); + + Ok(iter) +} + +fn facet_string_values<'a>( + id: DocumentId, + distinct: FieldId, + index: &Index, + txn: &'a heed::RoTxn, +) -> Result> { + let key = facet_values_prefix_key(distinct, id); + + let iter = index + .field_id_docid_facet_strings + .remap_key_type::() + .prefix_iter(txn, &key)? + .remap_types::(); + + Ok(iter) +} + +impl Iterator for FacetDistinctIter<'_> { + type Item = Result; + + fn next(&mut self) -> Option { + self.next_inner().transpose() + } +} + +impl DocIter for FacetDistinctIter<'_> { + fn into_excluded(self) -> RoaringBitmap { + self.excluded + } +} + +impl<'a> Distinct for FacetDistinct<'a> { + type Iter = FacetDistinctIter<'a>; + + fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { + FacetDistinctIter { + candidates, + distinct: self.distinct, + excluded, + index: self.index, + iter_offset: 0, + txn: self.txn, + } + } +} + +#[cfg(test)] +mod test { + use super::super::test::{generate_index, validate_distinct_candidates}; + use super::*; + + macro_rules! 
test_facet_distinct { + ($name:ident, $distinct:literal) => { + #[test] + fn $name() { + let (index, fid, candidates) = generate_index($distinct); + let txn = index.read_txn().unwrap(); + let mut map_distinct = FacetDistinct::new(fid, &index, &txn); + let excluded = RoaringBitmap::new(); + let mut iter = map_distinct.distinct(candidates.clone(), excluded); + let count = validate_distinct_candidates(iter.by_ref(), fid, &index); + let excluded = iter.into_excluded(); + assert_eq!(count as u64 + excluded.len(), candidates.len()); + } + }; + } + + test_facet_distinct!(test_string, "txt"); + test_facet_distinct!(test_strings, "txts"); + test_facet_distinct!(test_number, "cat-int"); +} diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs new file mode 100644 index 000000000..3a46bb469 --- /dev/null +++ b/milli/src/search/distinct/mod.rs @@ -0,0 +1,155 @@ +mod facet_distinct; +mod noop_distinct; + +pub use facet_distinct::FacetDistinct; +pub use noop_distinct::NoopDistinct; +use roaring::RoaringBitmap; + +use crate::{DocumentId, Result}; + +/// A trait implemented by document iterators that are returned by calls to `Distinct::distinct`. +/// It provides a way to get back ownership of the excluded set. +pub trait DocIter: Iterator> { + /// Returns ownership of the internal excluded set. + fn into_excluded(self) -> RoaringBitmap; +} + +/// A trait that is implemented by structs that perform a distinct on `candidates`. Calling distinct +/// must return an iterator containing only distinct documents, and add the discarded documents to +/// the excluded set. The excluded set can later be retrieved by calling `DocIter::into_excluded` on the +/// returned iterator. 
+pub trait Distinct { + type Iter: DocIter; + + fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter; +} + +#[cfg(test)] +mod test { + use std::collections::HashSet; + use std::io::Cursor; + + use once_cell::sync::Lazy; + use rand::seq::SliceRandom; + use rand::Rng; + use roaring::RoaringBitmap; + use serde_json::{json, Value}; + + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; + use crate::index::tests::TempIndex; + use crate::index::Index; + use crate::update::{ + IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, + }; + use crate::{DocumentId, FieldId, BEU32}; + + static JSON: Lazy> = Lazy::new(|| { + let mut rng = rand::thread_rng(); + let num_docs = rng.gen_range(10..30); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + let txts = ["Toto", "Titi", "Tata"]; + let cats = (1..10).map(|i| i.to_string()).collect::>(); + let cat_ints = (1..10).collect::>(); + + for i in 0..num_docs { + let txt = txts.choose(&mut rng).unwrap(); + let mut sample_txts = cats.clone(); + sample_txts.shuffle(&mut rng); + + let mut sample_ints = cat_ints.clone(); + sample_ints.shuffle(&mut rng); + + let json = json!({ + "id": i, + "txt": txt, + "cat-int": rng.gen_range(0..3), + "txts": sample_txts[..(rng.gen_range(0..3))], + "cat-ints": sample_ints[..(rng.gen_range(0..3))], + }); + + let object = match json { + Value::Object(object) => object, + _ => panic!(), + }; + + builder.append_json_object(&object).unwrap(); + } + + builder.into_inner().unwrap() + }); + + /// Returns a temporary index populated with random test documents, the FieldId for the + /// distinct attribute, and the RoaringBitmap with the document ids. + pub(crate) fn generate_index(distinct: &str) -> (TempIndex, FieldId, RoaringBitmap) { + let index = TempIndex::new(); + let mut txn = index.write_txn().unwrap(); + + // set distinct and faceted attributes for the index. 
+ let config = IndexerConfig::default(); + let mut update = Settings::new(&mut txn, &index, &config); + update.set_distinct_field(distinct.to_string()); + update.execute(|_| (), || false).unwrap(); + + // add documents to the index + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig { + update_method: IndexDocumentsMethod::ReplaceDocuments, + ..Default::default() + }; + let addition = + IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| (), || false) + .unwrap(); + + let reader = + crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())) + .unwrap(); + + let (addition, user_error) = addition.add_documents(reader).unwrap(); + user_error.unwrap(); + addition.execute().unwrap(); + + let fields_map = index.fields_ids_map(&txn).unwrap(); + let fid = fields_map.id(distinct).unwrap(); + + let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap(); + let map = (0..documents.documents_count() as u32).collect(); + + txn.commit().unwrap(); + + (index, fid, map) + } + + /// Checks that all the candidates are distinct, and returns the candidates number. 
+ pub(crate) fn validate_distinct_candidates( + candidates: impl Iterator>, + distinct: FieldId, + index: &Index, + ) -> usize { + fn test(seen: &mut HashSet, value: &Value) { + match value { + Value::Null | Value::Object(_) | Value::Bool(_) => (), + Value::Number(_) | Value::String(_) => { + let s = value.to_string(); + assert!(seen.insert(s)); + } + Value::Array(values) => values.iter().for_each(|value| test(seen, value)), + } + } + + let mut seen = HashSet::::new(); + + let txn = index.read_txn().unwrap(); + let mut count = 0; + for candidate in candidates { + count += 1; + let candidate = candidate.unwrap(); + let id = BEU32::new(candidate); + let document = index.documents.get(&txn, &id).unwrap().unwrap(); + let value = document.get(distinct).unwrap(); + let value = serde_json::from_slice(value).unwrap(); + test(&mut seen, &value); + } + count + } +} diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs new file mode 100644 index 000000000..96a1f7d5d --- /dev/null +++ b/milli/src/search/distinct/noop_distinct.rs @@ -0,0 +1,55 @@ +use roaring::bitmap::IntoIter; +use roaring::RoaringBitmap; + +use super::{Distinct, DocIter}; +use crate::{DocumentId, Result}; + +/// A distinct implementer that does not perform any distinct, +/// and simply returns an iterator to the candidates. 
+pub struct NoopDistinct; + +pub struct NoopDistinctIter { + candidates: IntoIter, + excluded: RoaringBitmap, +} + +impl Iterator for NoopDistinctIter { + type Item = Result; + + fn next(&mut self) -> Option { + self.candidates.next().map(Ok) + } +} + +impl DocIter for NoopDistinctIter { + fn into_excluded(self) -> RoaringBitmap { + self.excluded + } +} + +impl Distinct for NoopDistinct { + type Iter = NoopDistinctIter; + + fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { + NoopDistinctIter { candidates: candidates.into_iter(), excluded } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_noop() { + let candidates = (1..10).collect(); + let excluded = RoaringBitmap::new(); + let mut iter = NoopDistinct.distinct(candidates, excluded); + assert_eq!( + iter.by_ref().map(Result::unwrap).collect::>(), + (1..10).collect::>() + ); + + let excluded = iter.into_excluded(); + assert!(excluded.is_empty()); + } +} diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs new file mode 100644 index 000000000..43367abbb --- /dev/null +++ b/milli/src/search/facet/facet_distribution.rs @@ -0,0 +1,539 @@ +use std::collections::{BTreeMap, HashSet}; +use std::ops::ControlFlow; +use std::{fmt, mem}; + +use heed::types::ByteSlice; +use heed::BytesDecode; +use roaring::RoaringBitmap; + +use crate::error::UserError; +use crate::facet::FacetType; +use crate::heed_codec::facet::{ + FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, + OrderedF64Codec, +}; +use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec}; +use crate::search::facet::facet_distribution_iter; +use crate::{FieldId, Index, Result}; + +/// The default number of values by facets that will +/// be fetched from the key-value store. 
+pub const DEFAULT_VALUES_PER_FACET: usize = 100; + +/// Threshold on the number of candidates that will make +/// the system to choose between one algorithm or another. +const CANDIDATES_THRESHOLD: u64 = 3000; + +pub struct FacetDistribution<'a> { + facets: Option>, + candidates: Option, + max_values_per_facet: usize, + rtxn: &'a heed::RoTxn<'a>, + index: &'a Index, +} + +impl<'a> FacetDistribution<'a> { + pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> FacetDistribution<'a> { + FacetDistribution { + facets: None, + candidates: None, + max_values_per_facet: DEFAULT_VALUES_PER_FACET, + rtxn, + index, + } + } + + pub fn facets, A: AsRef>(&mut self, names: I) -> &mut Self { + self.facets = Some(names.into_iter().map(|s| s.as_ref().to_string()).collect()); + self + } + + pub fn max_values_per_facet(&mut self, max: usize) -> &mut Self { + self.max_values_per_facet = max; + self + } + + pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self { + self.candidates = Some(candidates); + self + } + + /// There is a small amount of candidates OR we ask for facet string values so we + /// decide to iterate over the facet values of each one of them, one by one. + fn facet_distribution_from_documents( + &self, + field_id: FieldId, + facet_type: FacetType, + candidates: &RoaringBitmap, + distribution: &mut BTreeMap, + ) -> heed::Result<()> { + match facet_type { + FacetType::Number => { + let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec(); + + let distribution_prelength = distribution.len(); + let db = self.index.field_id_docid_facet_f64s; + for docid in candidates.into_iter() { + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = db + .remap_key_type::() + .prefix_iter(self.rtxn, &key_buffer)? 
+ .remap_key_type::(); + + for result in iter { + let ((_, _, value), ()) = result?; + *distribution.entry(value.to_string()).or_insert(0) += 1; + + if distribution.len() - distribution_prelength == self.max_values_per_facet + { + break; + } + } + } + } + FacetType::String => { + let mut normalized_distribution = BTreeMap::new(); + let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec(); + + let db = self.index.field_id_docid_facet_strings; + 'outer: for docid in candidates.into_iter() { + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = db + .remap_key_type::() + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + + for result in iter { + let ((_, _, normalized_value), original_value) = result?; + let (_, count) = normalized_distribution + .entry(normalized_value) + .or_insert_with(|| (original_value, 0)); + *count += 1; + + if normalized_distribution.len() == self.max_values_per_facet { + break 'outer; + } + } + } + + let iter = normalized_distribution + .into_iter() + .map(|(_normalized, (original, count))| (original.to_string(), count)); + distribution.extend(iter); + } + } + + Ok(()) + } + + /// There is too much documents, we use the facet levels to move throught + /// the facet values, to find the candidates and values associated. 
+    fn facet_numbers_distribution_from_facet_levels(
+        &self,
+        field_id: FieldId,
+        candidates: &RoaringBitmap,
+        distribution: &mut BTreeMap<String, u64>,
+    ) -> heed::Result<()> {
+        facet_distribution_iter::iterate_over_facet_distribution(
+            self.rtxn,
+            self.index
+                .facet_id_f64_docids
+                .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
+            field_id,
+            candidates,
+            |facet_key, nbr_docids, _| {
+                let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap();
+                distribution.insert(facet_key.to_string(), nbr_docids);
+                // Stop the iteration once the limit of values is reached.
+                if distribution.len() == self.max_values_per_facet {
+                    Ok(ControlFlow::Break(()))
+                } else {
+                    Ok(ControlFlow::Continue(()))
+                }
+            },
+        )
+    }
+
+    /// String counterpart of `facet_numbers_distribution_from_facet_levels`.
+    ///
+    /// The facet database is keyed by the facet string handed to the callback, so each
+    /// value is reported under the string stored in `field_id_docid_facet_strings` for
+    /// the document id handed to the callback along with the facet value.
+    fn facet_strings_distribution_from_facet_levels(
+        &self,
+        field_id: FieldId,
+        candidates: &RoaringBitmap,
+        distribution: &mut BTreeMap<String, u64>,
+    ) -> heed::Result<()> {
+        facet_distribution_iter::iterate_over_facet_distribution(
+            self.rtxn,
+            self.index
+                .facet_id_string_docids
+                .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
+            field_id,
+            candidates,
+            |facet_key, nbr_docids, any_docid| {
+                let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap();
+
+                let key: (FieldId, _, &str) = (field_id, any_docid, facet_key);
+                let original_string = self
+                    .index
+                    .field_id_docid_facet_strings
+                    .get(self.rtxn, &key)?
+                    .unwrap()
+                    .to_owned();
+
+                distribution.insert(original_string, nbr_docids);
+                // Stop the iteration once the limit of values is reached.
+                if distribution.len() == self.max_values_per_facet {
+                    Ok(ControlFlow::Break(()))
+                } else {
+                    Ok(ControlFlow::Continue(()))
+                }
+            },
+        )
+    }
+
+    /// Placeholder search, a.k.a. no candidates were specified. We iterate through the
+    /// facet values one by one and iterate on the facet level 0 for numbers.
+    fn facet_values_from_raw_facet_database(
+        &self,
+        field_id: FieldId,
+    ) -> heed::Result<BTreeMap<String, u64>> {
+        let mut distribution = BTreeMap::new();
+
+        let db = self.index.facet_id_f64_docids;
+        let mut prefix = vec![];
+        prefix.extend_from_slice(&field_id.to_be_bytes());
+        prefix.push(0); // read values from level 0 only
+
+        let iter = db
+            .as_polymorph()
+            .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())?
+            .remap_types::<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>();
+
+        for result in iter {
+            let (key, value) = result?;
+            distribution.insert(key.left_bound.to_string(), value.bitmap.len());
+            if distribution.len() == self.max_values_per_facet {
+                break;
+            }
+        }
+
+        // The same `field_id` + level 0 prefix is reused on the string facet database.
+        let iter = self
+            .index
+            .facet_id_string_docids
+            .as_polymorph()
+            .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())?
+            .remap_types::<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>();
+
+        for result in iter {
+            let (key, value) = result?;
+
+            // Use the first document of the group to fetch the string
+            // stored for this facet value.
+            let docid = value.bitmap.iter().next().unwrap();
+            let key: (FieldId, _, &'a str) = (field_id, docid, key.left_bound);
+            let original_string =
+                self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?.unwrap().to_owned();
+
+            distribution.insert(original_string, value.bitmap.len());
+            if distribution.len() == self.max_values_per_facet {
+                break;
+            }
+        }
+
+        Ok(distribution)
+    }
+
+    /// Computes the distribution of one field, picking the algorithm according to
+    /// whether candidates were specified and, if so, to how many there are.
+    fn facet_values(&self, field_id: FieldId) -> heed::Result<BTreeMap<String, u64>> {
+        use FacetType::{Number, String};
+
+        match self.candidates {
+            Some(ref candidates) => {
+                // Classic search, candidates were specified, we must return facet values only related
+                // to those candidates. We also enter here for facet strings for performance reasons.
+                let mut distribution = BTreeMap::new();
+                if candidates.len() <= CANDIDATES_THRESHOLD {
+                    self.facet_distribution_from_documents(
+                        field_id,
+                        Number,
+                        candidates,
+                        &mut distribution,
+                    )?;
+                    self.facet_distribution_from_documents(
+                        field_id,
+                        String,
+                        candidates,
+                        &mut distribution,
+                    )?;
+                } else {
+                    self.facet_numbers_distribution_from_facet_levels(
+                        field_id,
+                        candidates,
+                        &mut distribution,
+                    )?;
+                    self.facet_strings_distribution_from_facet_levels(
+                        field_id,
+                        candidates,
+                        &mut distribution,
+                    )?;
+                }
+                Ok(distribution)
+            }
+            None => self.facet_values_from_raw_facet_database(field_id),
+        }
+    }
+
+    /// Computes the distribution of every requested facet, keyed by facet name.
+    ///
+    /// When no facet was requested, all the filterable fields are used. Returns
+    /// `UserError::InvalidFacetsDistribution` if a requested facet is not filterable.
+    pub fn execute(&self) -> Result<BTreeMap<String, BTreeMap<String, u64>>> {
+        let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
+        let filterable_fields = self.index.filterable_fields(self.rtxn)?;
+
+        let fields = match self.facets {
+            Some(ref facets) => {
+                let invalid_fields: HashSet<_> = facets
+                    .iter()
+                    .filter(|facet| !crate::is_faceted(facet, &filterable_fields))
+                    .collect();
+                if !invalid_fields.is_empty() {
+                    return Err(UserError::InvalidFacetsDistribution {
+                        invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
+                    }
+                    .into());
+                } else {
+                    facets.clone()
+                }
+            }
+            None => filterable_fields,
+        };
+
+        let mut distribution = BTreeMap::new();
+        for (fid, name) in fields_ids_map.iter() {
+            if crate::is_faceted(name, &fields) {
+                let values = self.facet_values(fid)?;
+                distribution.insert(name.to_string(), values);
+            }
+        }
+
+        Ok(distribution)
+    }
+}
+
+impl fmt::Debug for FacetDistribution<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        // The transaction and the index are deliberately left out of the output.
+        let FacetDistribution { facets, candidates, max_values_per_facet, rtxn: _, index: _ } =
+            self;
+
+        f.debug_struct("FacetDistribution")
+            .field("facets", facets)
+            .field("candidates", candidates)
+            .field("max_values_per_facet", max_values_per_facet)
+            .finish()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use big_s::S;
+    use maplit::hashset;
+
+    use crate::documents::documents_batch_reader_from_objects;
+    use crate::index::tests::TempIndex;
+    use crate::{milli_snap, FacetDistribution};
+
+    #[test]
+    fn few_candidates_few_facet_values() {
+        // All the tests here avoid using the code in `facet_distribution_iter` because there aren't
+        // enough candidates.
+
+        let mut index = TempIndex::new();
+        index.index_documents_config.autogenerate_docids = true;
+
+        index
+            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
+            .unwrap();
+
+        let documents = documents!([
+            { "colour": "Blue" },
+            { "colour": " blue" },
+            { "colour": "RED" }
+        ]);
+
+        index.add_documents(documents).unwrap();
+
+        let txn = index.read_txn().unwrap();
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###);
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .candidates([0, 1, 2].iter().copied().collect())
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###);
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .candidates([1, 2].iter().copied().collect())
+            .execute()
+            .unwrap();
+
+        // I think it would be fine if " blue" was "Blue" instead.
+        // We just need to get any non-normalised string I think, even if it's not in
+        // the candidates
+        milli_snap!(format!("{map:?}"), @r###"{"colour": {" blue": 1, "RED": 1}}"###);
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .candidates([2].iter().copied().collect())
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), @r###"{"colour": {"RED": 1}}"###);
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .candidates([0, 1, 2].iter().copied().collect())
+            .max_values_per_facet(1)
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 1}}"###);
+    }
+
+    #[test]
+    fn many_candidates_few_facet_values() {
+        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
+        index.index_documents_config.autogenerate_docids = true;
+
+        index
+            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
+            .unwrap();
+
+        let facet_values = ["Red", "RED", " red ", "Blue", "BLUE"];
+
+        let mut documents = vec![];
+        for i in 0..10_000 {
+            let document = serde_json::json!({
+                "colour": facet_values[i % 5],
+            })
+            .as_object()
+            .unwrap()
+            .clone();
+            documents.push(document);
+        }
+
+        let documents = documents_batch_reader_from_objects(documents);
+
+        index.add_documents(documents).unwrap();
+
+        let txn = index.read_txn().unwrap();
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###);
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .max_values_per_facet(1)
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000}}"###);
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .candidates((0..10_000).into_iter().collect())
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###);
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .candidates((0..5_000).into_iter().collect())
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###);
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .candidates((0..5_000).into_iter().collect())
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###);
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .candidates((0..5_000).into_iter().collect())
+            .max_values_per_facet(1)
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000}}"###);
+    }
+
+    #[test]
+    fn many_candidates_many_facet_values() {
+        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
+        index.index_documents_config.autogenerate_docids = true;
+
+        index
+            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
+            .unwrap();
+
+        let facet_values = (0..1000).into_iter().map(|x| format!("{x:x}")).collect::<Vec<_>>();
+
+        let mut documents = vec![];
+        for i in 0..10_000 {
+            let document = serde_json::json!({
+                "colour": facet_values[i % 1000],
+            })
+            .as_object()
+            .unwrap()
+            .clone();
+            documents.push(document);
+        }
+
+        let documents = documents_batch_reader_from_objects(documents);
+
+        index.add_documents(documents).unwrap();
+
+        let txn = index.read_txn().unwrap();
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), "no_candidates", @"ac9229ed5964d893af96a7076e2f8af5");
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .max_values_per_facet(2)
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), "no_candidates_with_max_2", @r###"{"colour": {"0": 10, "1": 10}}"###);
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .candidates((0..10_000).into_iter().collect())
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), "candidates_0_10_000", @"ac9229ed5964d893af96a7076e2f8af5");
+
+        let map = FacetDistribution::new(&txn, &index)
+            .facets(std::iter::once("colour"))
+            .candidates((0..5_000).into_iter().collect())
+            .execute()
+            .unwrap();
+
+        milli_snap!(format!("{map:?}"), "candidates_0_5_000", @"825f23a4090d05756f46176987b7d992");
+    }
+}
diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs
new file mode 100644
index 000000000..6e209c7aa
--- /dev/null
+++ b/milli/src/search/facet/facet_distribution_iter.rs
@@ -0,0 +1,195 @@
+use std::ops::ControlFlow;
+
+use heed::Result;
+use roaring::RoaringBitmap;
+
+use super::{get_first_facet_value, get_highest_level};
+use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
+use crate::heed_codec::ByteSliceRefCodec;
+use crate::DocumentId;
+
+/// Call the given closure on the facet distribution of the candidate documents. +/// +/// The arguments to the closure are: +/// - the facet value, as a byte slice +/// - the number of documents among the candidates that contain this facet value +/// - the id of a document which contains the facet value. Note that this document +/// is not necessarily from the list of candidates, it is simply *any* document which +/// contains this facet value. +/// +/// The return value of the closure is a `ControlFlow<()>` which indicates whether we should +/// keep iterating over the different facet values or stop. +pub fn iterate_over_facet_distribution<'t, CB>( + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: &RoaringBitmap, + callback: CB, +) -> Result<()> +where + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, +{ + let mut fd = FacetDistribution { rtxn, db, field_id, callback }; + let highest_level = get_highest_level( + rtxn, + db.remap_key_type::>(), + field_id, + )?; + + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ + fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; + Ok(()) + } else { + Ok(()) + } +} + +struct FacetDistribution<'t, CB> +where + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, +{ + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + callback: CB, +} + +impl<'t, CB> FacetDistribution<'t, CB> +where + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, +{ + fn iterate_level_0( + &mut self, + candidates: &RoaringBitmap, + starting_bound: &'t [u8], + group_size: usize, + ) -> Result> { + let starting_key = + FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_bound }; + let iter = self.db.range(self.rtxn, &(starting_key..))?.take(group_size); + for el in iter { + let (key, value) = el?; + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if key.field_id != self.field_id { + return Ok(ControlFlow::Break(())); + } + let docids_in_common = value.bitmap & candidates; + if !docids_in_common.is_empty() { + let any_docid_in_common = docids_in_common.min().unwrap(); + match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)? 
+ { + ControlFlow::Continue(_) => (), + ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), + } + } + } + Ok(ControlFlow::Continue(())) + } + fn iterate( + &mut self, + candidates: &RoaringBitmap, + level: u8, + starting_bound: &'t [u8], + group_size: usize, + ) -> Result> { + if level == 0 { + return self.iterate_level_0(candidates, starting_bound, group_size); + } + let starting_key = + FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound }; + let iter = self.db.range(self.rtxn, &(&starting_key..)).unwrap().take(group_size); + + for el in iter { + let (key, value) = el.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if key.field_id != self.field_id { + return Ok(ControlFlow::Break(())); + } + let docids_in_common = value.bitmap & candidates; + if !docids_in_common.is_empty() { + let cf = self.iterate( + &docids_in_common, + level - 1, + key.left_bound, + value.size as usize, + )?; + match cf { + ControlFlow::Continue(_) => {} + ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), + } + } + } + Ok(ControlFlow::Continue(())) + } +} + +#[cfg(test)] +mod tests { + use std::ops::ControlFlow; + + use heed::BytesDecode; + use roaring::RoaringBitmap; + + use super::iterate_over_facet_distribution; + use crate::heed_codec::facet::OrderedF64Codec; + use crate::milli_snap; + use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; + + #[test] + fn filter_distribution_all() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (0..=255).into_iter().collect::(); + let mut results = String::new(); + iterate_over_facet_distribution( + &txn, + index.content, + 0, + &candidates, + |facet, count, _| { + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); + 
results.push_str(&format!("{facet}: {count}\n")); + Ok(ControlFlow::Continue(())) + }, + ) + .unwrap(); + milli_snap!(results, i); + + txn.commit().unwrap(); + } + } + #[test] + fn filter_distribution_all_stop_early() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (0..=255).into_iter().collect::(); + let mut results = String::new(); + let mut nbr_facets = 0; + iterate_over_facet_distribution( + &txn, + index.content, + 0, + &candidates, + |facet, count, _| { + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); + if nbr_facets == 100 { + return Ok(ControlFlow::Break(())); + } else { + nbr_facets += 1; + results.push_str(&format!("{facet}: {count}\n")); + + Ok(ControlFlow::Continue(())) + } + }, + ) + .unwrap(); + milli_snap!(results, i); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs new file mode 100644 index 000000000..b1ab6f71f --- /dev/null +++ b/milli/src/search/facet/facet_range_search.rs @@ -0,0 +1,652 @@ +use std::ops::{Bound, RangeBounds}; + +use heed::BytesEncode; +use roaring::RoaringBitmap; + +use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; +use crate::Result; + +/// Find all the document ids for which the given field contains a value contained within +/// the two bounds. 
+pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + left: &'t Bound<>::EItem>, + right: &'t Bound<>::EItem>, + docids: &mut RoaringBitmap, +) -> Result<()> +where + BoundCodec: for<'a> BytesEncode<'a>, + for<'a> >::EItem: Sized, +{ + let inner; + let left = match left { + Bound::Included(left) => { + inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?; + Bound::Included(inner.as_ref()) + } + Bound::Excluded(left) => { + inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?; + Bound::Excluded(inner.as_ref()) + } + Bound::Unbounded => Bound::Unbounded, + }; + let inner; + let right = match right { + Bound::Included(right) => { + inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?; + Bound::Included(inner.as_ref()) + } + Bound::Excluded(right) => { + inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?; + Bound::Excluded(inner.as_ref()) + } + Bound::Unbounded => Bound::Unbounded, + }; + let db = db.remap_key_type::>(); + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; + let highest_level = get_highest_level(rtxn, db, field_id)?; + + if let Some(starting_left_bound) = + get_first_facet_value::(rtxn, db, field_id)? 
+ { + let rightmost_bound = Bound::Included( + get_last_facet_value::(rtxn, db, field_id)?.unwrap(), + ); // will not fail because get_first_facet_value succeeded + let group_size = usize::MAX; + f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; + Ok(()) + } else { + Ok(()) + } +} + +/// Fetch the document ids that have a facet with a value between the two given bounds +struct FacetRangeSearch<'t, 'b, 'bitmap> { + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + left: Bound<&'b [u8]>, + right: Bound<&'b [u8]>, + docids: &'bitmap mut RoaringBitmap, +} +impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { + fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { + let left_key = + FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound }; + let iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size); + for el in iter { + let (key, value) = el?; + // the right side of the iter range is unbounded, so we need to make sure that we are not iterating + // on the next field id + if key.field_id != self.field_id { + return Ok(()); + } + let should_skip = { + match self.left { + Bound::Included(left) => left > key.left_bound, + Bound::Excluded(left) => left >= key.left_bound, + Bound::Unbounded => false, + } + }; + if should_skip { + continue; + } + let should_stop = { + match self.right { + Bound::Included(right) => right < key.left_bound, + Bound::Excluded(right) => right <= key.left_bound, + Bound::Unbounded => false, + } + }; + if should_stop { + break; + } + + if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { + *self.docids |= value.bitmap; + } + } + Ok(()) + } + + /// Recursive part of the algorithm for level > 0. + /// + /// It works by visiting a slice of a level and checking whether the range asscociated + /// with each visited element is contained within the bounds. + /// + /// 1. 
So long as the element's range is less than the left bound, we do nothing and keep iterating + /// 2. If the element's range is fully contained by the bounds, then all of its docids are added to + /// the roaring bitmap. + /// 3. If the element's range merely intersects the bounds, then we call the algorithm recursively + /// on the children of the element from the level below. + /// 4. If the element's range is greater than the right bound, we do nothing and stop iterating. + /// Note that the right bound is found through either the `left_bound` of the *next* element, + /// or from the `rightmost_bound` argument + /// + /// ## Arguments + /// - `level`: the level being visited + /// - `starting_left_bound`: the left_bound of the first element to visit + /// - `rightmost_bound`: the right bound of the last element that should be visited + /// - `group_size`: the number of elements that should be visited + fn run( + &mut self, + level: u8, + starting_left_bound: &'t [u8], + rightmost_bound: Bound<&'t [u8]>, + group_size: usize, + ) -> Result<()> { + if level == 0 { + return self.run_level_0(starting_left_bound, group_size); + } + + let left_key = + FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound }; + let mut iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size); + + // We iterate over the range while keeping in memory the previous value + let (mut previous_key, mut previous_value) = iter.next().unwrap()?; + for el in iter { + let (next_key, next_value) = el?; + // the right of the iter range is potentially unbounded (e.g. if `group_size` is usize::MAX), + // so we need to make sure that we are not iterating on the next field id + if next_key.field_id != self.field_id { + break; + } + // now, do we skip, stop, or visit? 
+ let should_skip = { + match self.left { + Bound::Included(left) => left >= next_key.left_bound, + Bound::Excluded(left) => left >= next_key.left_bound, + Bound::Unbounded => false, + } + }; + if should_skip { + previous_key = next_key; + previous_value = next_value; + continue; + } + + // should we stop? + // We should if the the search range doesn't include any + // element from the previous key or its successors + let should_stop = { + match self.right { + Bound::Included(right) => right < previous_key.left_bound, + Bound::Excluded(right) => right <= previous_key.left_bound, + Bound::Unbounded => false, + } + }; + if should_stop { + return Ok(()); + } + // should we take the whole thing, without recursing down? + let should_take_whole_group = { + let left_condition = match self.left { + Bound::Included(left) => previous_key.left_bound >= left, + Bound::Excluded(left) => previous_key.left_bound > left, + Bound::Unbounded => true, + }; + let right_condition = match self.right { + Bound::Included(right) => next_key.left_bound <= right, + Bound::Excluded(right) => next_key.left_bound <= right, + Bound::Unbounded => true, + }; + left_condition && right_condition + }; + if should_take_whole_group { + *self.docids |= &previous_value.bitmap; + previous_key = next_key; + previous_value = next_value; + continue; + } + // from here, we should visit the children of the previous element and + // call the function recursively + + let level = level - 1; + let starting_left_bound = previous_key.left_bound; + let rightmost_bound = Bound::Excluded(next_key.left_bound); + let group_size = previous_value.size as usize; + + self.run(level, starting_left_bound, rightmost_bound, group_size)?; + + previous_key = next_key; + previous_value = next_value; + } + // previous_key/previous_value are the last element's key/value + + // now, do we skip, stop, or visit? 
+ let should_skip = { + match (self.left, rightmost_bound) { + (Bound::Included(left), Bound::Included(right)) => left > right, + (Bound::Included(left), Bound::Excluded(right)) => left >= right, + (Bound::Excluded(left), Bound::Included(right) | Bound::Excluded(right)) => { + left >= right + } + (Bound::Unbounded, _) => false, + (_, Bound::Unbounded) => false, // should never run? + } + }; + if should_skip { + return Ok(()); + } + + // should we stop? + // We should if the the search range doesn't include any + // element from the previous key or its successors + let should_stop = { + match self.right { + Bound::Included(right) => right < previous_key.left_bound, + Bound::Excluded(right) => right <= previous_key.left_bound, + Bound::Unbounded => false, + } + }; + if should_stop { + return Ok(()); + } + // should we take the whole thing, without recursing down? + let should_take_whole_group = { + let left_condition = match self.left { + Bound::Included(left) => previous_key.left_bound >= left, + Bound::Excluded(left) => previous_key.left_bound > left, + Bound::Unbounded => true, + }; + let right_condition = match (self.right, rightmost_bound) { + (Bound::Included(right), Bound::Included(rightmost)) => { + // we need to stay within the bound ..=right + // the element's range goes to ..=righmost + // so the element fits entirely within the bound if rightmost <= right + rightmost <= right + } + (Bound::Included(right), Bound::Excluded(rightmost)) => { + // we need to stay within the bound ..=right + // the element's range goes to ..righmost + // so the element fits entirely within the bound if rightmost <= right + rightmost <= right + } + (Bound::Excluded(right), Bound::Included(rightmost)) => { + // we need to stay within the bound ..right + // the element's range goes to ..=righmost + // so the element fits entirely within the bound if rightmost < right + rightmost < right + } + (Bound::Excluded(right), Bound::Excluded(rightmost)) => { + // we need to stay within 
the bound ..right + // the element's range goes to ..righmost + // so the element fits entirely within the bound if rightmost <= right + rightmost <= right + } + (Bound::Unbounded, _) => { + // we need to stay within the bound ..inf + // so the element always fits entirely within the bound + true + } + (_, Bound::Unbounded) => { + // we need to stay within a finite bound + // but the element's range goes to ..inf + // so the element never fits entirely within the bound + false + } + }; + left_condition && right_condition + }; + if should_take_whole_group { + *self.docids |= &previous_value.bitmap; + } else { + let level = level - 1; + let starting_left_bound = previous_key.left_bound; + let group_size = previous_value.size as usize; + + self.run(level, starting_left_bound, rightmost_bound, group_size)?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::ops::Bound; + + use roaring::RoaringBitmap; + + use super::find_docids_of_facet_within_bounds; + use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; + use crate::milli_snap; + use crate::search::facet::tests::{ + get_random_looking_index, get_random_looking_index_with_multiple_field_ids, + get_simple_index, get_simple_index_with_multiple_field_ids, + }; + use crate::snapshot_tests::display_bitmap; + + #[test] + fn random_looking_index_snap() { + let index = get_random_looking_index(); + milli_snap!(format!("{index}"), @"3256c76a7c1b768a013e78d5fa6e9ff9"); + } + + #[test] + fn random_looking_index_with_multiple_field_ids_snap() { + let index = get_random_looking_index_with_multiple_field_ids(); + milli_snap!(format!("{index}"), @"c3e5fe06a8f1c404ed4935b32c90a89b"); + } + + #[test] + fn simple_index_snap() { + let index = get_simple_index(); + milli_snap!(format!("{index}"), @"5dbfa134cc44abeb3ab6242fc182e48e"); + } + + #[test] + fn simple_index_with_multiple_field_ids_snap() { + let index = get_simple_index_with_multiple_field_ids(); + milli_snap!(format!("{index}"), 
@"a4893298218f682bc76357f46777448c"); + } + + #[test] + fn filter_range_increasing() { + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let mut results = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Included(0.); + let end = Bound::Included(i); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + #[allow(clippy::format_push_string)] + results.push_str(&format!("0 <= . <= {i} : {}\n", display_bitmap(&docids))); + } + milli_snap!(results, format!("included_{i}")); + let mut results = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Excluded(0.); + let end = Bound::Excluded(i); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + #[allow(clippy::format_push_string)] + results.push_str(&format!("0 < . 
< {i} : {}\n", display_bitmap(&docids))); + } + milli_snap!(results, format!("excluded_{i}")); + txn.commit().unwrap(); + } + } + #[test] + fn filter_range_decreasing() { + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + + let mut results = String::new(); + + for i in (0..=255).into_iter().rev() { + let i = i as f64; + let start = Bound::Included(i); + let end = Bound::Included(255.); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + results.push_str(&format!("{i} <= . <= 255 : {}\n", display_bitmap(&docids))); + } + + milli_snap!(results, format!("included_{i}")); + + let mut results = String::new(); + + for i in (0..=255).into_iter().rev() { + let i = i as f64; + let start = Bound::Excluded(i); + let end = Bound::Excluded(255.); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + results.push_str(&format!("{i} < . < 255 : {}\n", display_bitmap(&docids))); + } + + milli_snap!(results, format!("excluded_{i}")); + + txn.commit().unwrap(); + } + } + #[test] + fn filter_range_pinch() { + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + + let mut results = String::new(); + + for i in (0..=128).into_iter().rev() { + let i = i as f64; + let start = Bound::Included(i); + let end = Bound::Included(255. 
- i); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + results.push_str(&format!( + "{i} <= . <= {r} : {docids}\n", + r = 255. - i, + docids = display_bitmap(&docids) + )); + } + + milli_snap!(results, format!("included_{i}")); + + let mut results = String::new(); + + for i in (0..=128).into_iter().rev() { + let i = i as f64; + let start = Bound::Excluded(i); + let end = Bound::Excluded(255. - i); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + results.push_str(&format!( + "{i} < . < {r} {docids}\n", + r = 255. - i, + docids = display_bitmap(&docids) + )); + } + + milli_snap!(results, format!("excluded_{i}")); + + txn.commit().unwrap(); + } + } + + #[test] + fn filter_range_unbounded() { + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let mut results = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Included(i); + let end = Bound::Unbounded; + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + #[allow(clippy::format_push_string)] + results.push_str(&format!(">= {i}: {}\n", display_bitmap(&docids))); + } + milli_snap!(results, format!("start_from_included_{i}")); + let mut results = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Unbounded; + let end = Bound::Included(i); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, 
+ &start, + &end, + &mut docids, + ) + .unwrap(); + #[allow(clippy::format_push_string)] + results.push_str(&format!("<= {i}: {}\n", display_bitmap(&docids))); + } + milli_snap!(results, format!("end_at_included_{i}")); + + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &Bound::Unbounded, + &Bound::Unbounded, + &mut docids, + ) + .unwrap(); + milli_snap!( + &format!("all field_id 0: {}\n", display_bitmap(&docids)), + format!("unbounded_field_id_0_{i}") + ); + + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 1, + &Bound::Unbounded, + &Bound::Unbounded, + &mut docids, + ) + .unwrap(); + milli_snap!( + &format!("all field_id 1: {}\n", display_bitmap(&docids)), + format!("unbounded_field_id_1_{i}") + ); + + drop(txn); + } + } + + #[test] + fn filter_range_exact() { + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let mut results_0 = String::new(); + let mut results_1 = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Included(i); + let end = Bound::Included(i); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + #[allow(clippy::format_push_string)] + results_0.push_str(&format!("{i}: {}\n", display_bitmap(&docids))); + + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 1, + &start, + &end, + &mut docids, + ) + .unwrap(); + #[allow(clippy::format_push_string)] + results_1.push_str(&format!("{i}: {}\n", display_bitmap(&docids))); + } + milli_snap!(results_0, 
format!("field_id_0_exact_{i}")); + milli_snap!(results_1, format!("field_id_1_exact_{i}")); + + drop(txn); + } + } +} diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs new file mode 100644 index 000000000..32cf5c355 --- /dev/null +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -0,0 +1,226 @@ +use heed::Result; +use roaring::RoaringBitmap; + +use super::{get_first_facet_value, get_highest_level}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, +}; +use crate::heed_codec::ByteSliceRefCodec; + +/// Return an iterator which iterates over the given candidate documents in +/// ascending order of their facet value for the given field id. +/// +/// The documents returned by the iterator are grouped by the facet values that +/// determined their rank. For example, given the documents: +/// +/// ```ignore +/// 0: { "colour": ["blue", "green"] } +/// 1: { "colour": ["blue", "red"] } +/// 2: { "colour": ["orange", "red"] } +/// 3: { "colour": ["green", "red"] } +/// 4: { "colour": ["blue", "orange", "red"] } +/// ``` +/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator +/// over the following elements: +/// ```ignore +/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue" +/// [3] // same for "green" +/// [2] // same for "orange" +/// END +/// ``` +/// Note that once a document id is returned by the iterator, it is never returned again. +pub fn ascending_facet_sort<'t>( + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: RoaringBitmap, +) -> Result> + 't>> { + let highest_level = get_highest_level(rtxn, db, field_id)?; + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ + let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; + let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); + + Ok(Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] })) + } else { + Ok(Box::new(std::iter::empty())) + } +} + +struct AscendingFacetSort<'t, 'e> { + rtxn: &'t heed::RoTxn<'e>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + #[allow(clippy::type_complexity)] + stack: Vec<( + RoaringBitmap, + std::iter::Take< + heed::RoRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, + >, + )>, +} + +impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { + type Item = Result; + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, deepest_iter) = self.stack.last_mut()?; + for result in deepest_iter { + let ( + FacetGroupKey { level, left_bound, field_id }, + FacetGroupValue { size: group_size, mut bitmap }, + ) = result.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if field_id != self.field_id { + return None; + } + + // If the last iterator found an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. 
+ if documents_ids.is_empty() { + // break our of the for loop into the end of the 'outer loop, which + // pops the stack + break; + } + + bitmap &= &*documents_ids; + if !bitmap.is_empty() { + *documents_ids -= &bitmap; + + if level == 0 { + return Some(Ok(bitmap)); + } + let starting_key_below = + FacetGroupKey { field_id: self.field_id, level: level - 1, left_bound }; + let iter = match self.db.range(self.rtxn, &(starting_key_below..)) { + Ok(iter) => iter, + Err(e) => return Some(Err(e)), + } + .take(group_size as usize); + + self.stack.push((bitmap, iter)); + continue 'outer; + } + } + self.stack.pop(); + } + } +} + +#[cfg(test)] +mod tests { + use roaring::RoaringBitmap; + + use crate::milli_snap; + use crate::search::facet::facet_sort_ascending::ascending_facet_sort; + use crate::search::facet::tests::{ + get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids, + get_simple_index, get_simple_string_index_with_multiple_field_ids, + }; + use crate::snapshot_tests::display_bitmap; + + #[test] + fn filter_sort_ascending() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::(); + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 0, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, i); + + txn.commit().unwrap(); + } + } + + #[test] + fn filter_sort_ascending_multiple_field_ids() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::(); + let mut results = String::new(); + let iter = 
ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, format!("{i}-0")); + + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, format!("{i}-1")); + + txn.commit().unwrap(); + } + } + + #[test] + fn filter_sort_ascending_with_no_candidates() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (_i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = RoaringBitmap::new(); + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + txn.commit().unwrap(); + } + } + + #[test] + fn filter_sort_ascending_with_inexisting_field_id() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (_i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = RoaringBitmap::new(); + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + 
results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs new file mode 100644 index 000000000..4d1fdd1e7 --- /dev/null +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -0,0 +1,243 @@ +use std::ops::Bound; + +use heed::Result; +use roaring::RoaringBitmap; + +use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, +}; +use crate::heed_codec::ByteSliceRefCodec; + +/// See documentationg for [`ascending_facet_sort`](super::ascending_facet_sort). +/// +/// This function does the same thing, but in the opposite order. +pub fn descending_facet_sort<'t>( + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: RoaringBitmap, +) -> Result> + 't>> { + let highest_level = get_highest_level(rtxn, db, field_id)?; + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ + let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; + let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); + Ok(Box::new(DescendingFacetSort { + rtxn, + db, + field_id, + stack: vec![(candidates, iter, Bound::Included(last_bound))], + })) + } else { + Ok(Box::new(std::iter::empty())) + } +} + +struct DescendingFacetSort<'t> { + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + #[allow(clippy::type_complexity)] + stack: Vec<( + RoaringBitmap, + std::iter::Take< + heed::RoRevRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, + >, + Bound<&'t [u8]>, + )>, +} + +impl<'t> Iterator for DescendingFacetSort<'t> { + type Item = Result; + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, deepest_iter, right_bound) = self.stack.last_mut()?; + for result in deepest_iter.by_ref() { + let ( + FacetGroupKey { level, left_bound, field_id }, + FacetGroupValue { size: group_size, mut bitmap }, + ) = result.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if field_id != self.field_id { + return None; + } + // If the last iterator found an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. 
+ if documents_ids.is_empty() { + break; + } + + bitmap &= &*documents_ids; + if !bitmap.is_empty() { + *documents_ids -= &bitmap; + + if level == 0 { + return Some(Ok(bitmap)); + } + let starting_key_below = + FacetGroupKey { field_id, level: level - 1, left_bound }; + + let end_key_kelow = match *right_bound { + Bound::Included(right) => Bound::Included(FacetGroupKey { + field_id, + level: level - 1, + left_bound: right, + }), + Bound::Excluded(right) => Bound::Excluded(FacetGroupKey { + field_id, + level: level - 1, + left_bound: right, + }), + Bound::Unbounded => Bound::Unbounded, + }; + let prev_right_bound = *right_bound; + *right_bound = Bound::Excluded(left_bound); + let iter = match self + .db + .remap_key_type::>() + .rev_range(self.rtxn, &(Bound::Included(starting_key_below), end_key_kelow)) + { + Ok(iter) => iter, + Err(e) => return Some(Err(e)), + } + .take(group_size as usize); + + self.stack.push((bitmap, iter, prev_right_bound)); + continue 'outer; + } + *right_bound = Bound::Excluded(left_bound); + } + self.stack.pop(); + } + } +} + +#[cfg(test)] +mod tests { + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::FacetGroupKeyCodec; + use crate::heed_codec::ByteSliceRefCodec; + use crate::milli_snap; + use crate::search::facet::facet_sort_descending::descending_facet_sort; + use crate::search::facet::tests::{ + get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids, + get_simple_index, get_simple_index_with_multiple_field_ids, + get_simple_string_index_with_multiple_field_ids, + }; + use crate::snapshot_tests::display_bitmap; + + #[test] + fn filter_sort_descending() { + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::(); + let mut results = String::new(); + let db = index.content.remap_key_type::>(); 
+ let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, i); + + txn.commit().unwrap(); + } + } + + #[test] + fn filter_sort_descending_multiple_field_ids() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::(); + let mut results = String::new(); + let db = index.content.remap_key_type::>(); + let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, format!("{i}-0")); + + let mut results = String::new(); + + let iter = descending_facet_sort(&txn, db, 1, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, format!("{i}-1")); + + txn.commit().unwrap(); + } + } + #[test] + fn filter_sort_ascending_with_no_candidates() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (_i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = RoaringBitmap::new(); + let mut results = String::new(); + let iter = descending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + let mut results = String::new(); + let iter = descending_facet_sort(&txn, index.content, 1, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + 
results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + txn.commit().unwrap(); + } + } + + #[test] + fn filter_sort_ascending_with_inexisting_field_id() { + let indexes = [ + get_simple_string_index_with_multiple_field_ids(), + get_random_looking_string_index_with_multiple_field_ids(), + ]; + for (_i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = RoaringBitmap::new(); + let mut results = String::new(); + let iter = descending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + assert!(results.is_empty()); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs new file mode 100644 index 000000000..23cbb280c --- /dev/null +++ b/milli/src/search/facet/filter.rs @@ -0,0 +1,838 @@ +use std::collections::HashSet; +use std::fmt::{Debug, Display}; +use std::ops::Bound::{self, Excluded, Included}; + +use either::Either; +pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; +use roaring::RoaringBitmap; + +use super::facet_range_search; +use crate::error::{Error, UserError}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, +}; +use crate::{distance_between_two_points, lat_lng_to_xyz, FieldId, Index, Result}; + +/// The maximum number of filters the filter AST can process. 
+const MAX_FILTER_DEPTH: usize = 2000; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Filter<'a> { + condition: FilterCondition<'a>, +} + +#[derive(Debug)] +enum FilterError<'a> { + AttributeNotFilterable { attribute: &'a str, filterable_fields: HashSet }, + BadGeo(&'a str), + BadGeoLat(f64), + BadGeoLng(f64), + Reserved(&'a str), + TooDeep, +} +impl<'a> std::error::Error for FilterError<'a> {} + +impl<'a> Display for FilterError<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::AttributeNotFilterable { attribute, filterable_fields } => { + if filterable_fields.is_empty() { + write!( + f, + "Attribute `{}` is not filterable. This index does not have configured filterable attributes.", + attribute, + ) + } else { + let filterables_list = filterable_fields.iter().map(AsRef::as_ref).collect::>().join(" "); + + write!( + f, + "Attribute `{}` is not filterable. Available filterable attributes are: `{}`.", + attribute, + filterables_list, + ) + } + }, + Self::TooDeep => write!(f, + "Too many filter conditions, can't process more than {} filters.", + MAX_FILTER_DEPTH + ), + Self::Reserved(keyword) => write!( + f, + "`{}` is a reserved keyword and thus can't be used as a filter expression.", + keyword + ), + Self::BadGeo(keyword) => write!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.", keyword), + Self::BadGeoLat(lat) => write!(f, "Bad latitude `{}`. Latitude must be contained between -90 and 90 degrees. ", lat), + Self::BadGeoLng(lng) => write!(f, "Bad longitude `{}`. Longitude must be contained between -180 and 180 degrees. 
", lng), + } + } +} + +impl<'a> From> for Error { + fn from(error: FPError<'a>) -> Self { + Self::UserError(UserError::InvalidFilter(error.to_string())) + } +} + +impl<'a> From> for FilterCondition<'a> { + fn from(f: Filter<'a>) -> Self { + f.condition + } +} + +impl<'a> Filter<'a> { + pub fn from_array(array: I) -> Result> + where + I: IntoIterator>, + J: IntoIterator, + { + let mut ands = vec![]; + + for either in array { + match either { + Either::Left(array) => { + let mut ors = vec![]; + for rule in array { + if let Some(filter) = Self::from_str(rule)? { + ors.push(filter.condition); + } + } + + match ors.len() { + 0 => (), + 1 => ands.push(ors.pop().unwrap()), + _ => ands.push(FilterCondition::Or(ors)), + } + } + Either::Right(rule) => { + if let Some(filter) = Self::from_str(rule)? { + ands.push(filter.condition); + } + } + } + } + let and = if ands.is_empty() { + return Ok(None); + } else if ands.len() == 1 { + ands.pop().unwrap() + } else { + FilterCondition::And(ands) + }; + + if let Some(token) = and.token_at_depth(MAX_FILTER_DEPTH) { + return Err(token.as_external_error(FilterError::TooDeep).into()); + } + + Ok(Some(Self { condition: and })) + } + + #[allow(clippy::should_implement_trait)] + pub fn from_str(expression: &'a str) -> Result> { + let condition = match FilterCondition::parse(expression) { + Ok(Some(fc)) => Ok(fc), + Ok(None) => return Ok(None), + Err(e) => Err(Error::UserError(UserError::InvalidFilter(e.to_string()))), + }?; + + if let Some(token) = condition.token_at_depth(MAX_FILTER_DEPTH) { + return Err(token.as_external_error(FilterError::TooDeep).into()); + } + + Ok(Some(Self { condition })) + } +} + +impl<'a> Filter<'a> { + pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { + // to avoid doing this for each recursive call we're going to do it ONCE ahead of time + let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; + let filterable_fields = index.filterable_fields(rtxn)?; + + // and finally we delete 
all the soft_deleted_documents, again, only once at the very end + self.inner_evaluate(rtxn, index, &filterable_fields) + .map(|result| result - soft_deleted_documents) + } + + fn evaluate_operator( + rtxn: &heed::RoTxn, + index: &Index, + field_id: FieldId, + operator: &Condition<'a>, + ) -> Result { + let numbers_db = index.facet_id_f64_docids; + let strings_db = index.facet_id_string_docids; + + // Make sure we always bound the ranges with the field id and the level, + // as the facets values are all in the same database and prefixed by the + // field id and the level. + + let (left, right) = match operator { + Condition::GreaterThan(val) => { + (Excluded(val.parse_finite_float()?), Included(f64::MAX)) + } + Condition::GreaterThanOrEqual(val) => { + (Included(val.parse_finite_float()?), Included(f64::MAX)) + } + Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse_finite_float()?)), + Condition::LowerThanOrEqual(val) => { + (Included(f64::MIN), Included(val.parse_finite_float()?)) + } + Condition::Between { from, to } => { + (Included(from.parse_finite_float()?), Included(to.parse_finite_float()?)) + } + Condition::Exists => { + let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; + return Ok(exist); + } + Condition::Equal(val) => { + let string_docids = strings_db + .get( + rtxn, + &FacetGroupKey { + field_id, + level: 0, + left_bound: &val.value().to_lowercase(), + }, + )? + .map(|v| v.bitmap) + .unwrap_or_default(); + let number = val.parse_finite_float().ok(); + let number_docids = match number { + Some(n) => numbers_db + .get(rtxn, &FacetGroupKey { field_id, level: 0, left_bound: n })? 
+ .map(|v| v.bitmap) + .unwrap_or_default(), + None => RoaringBitmap::new(), + }; + return Ok(string_docids | number_docids); + } + Condition::NotEqual(val) => { + let operator = Condition::Equal(val.clone()); + let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?; + let all_ids = index.documents_ids(rtxn)?; + return Ok(all_ids - docids); + } + }; + + let mut output = RoaringBitmap::new(); + Self::explore_facet_number_levels(rtxn, numbers_db, field_id, left, right, &mut output)?; + Ok(output) + } + + /// Aggregates the documents ids that are part of the specified range automatically + /// going deeper through the levels. + fn explore_facet_number_levels( + rtxn: &heed::RoTxn, + db: heed::Database, FacetGroupValueCodec>, + field_id: FieldId, + left: Bound, + right: Bound, + output: &mut RoaringBitmap, + ) -> Result<()> { + match (left, right) { + // lower TO upper when lower > upper must return no result + (Included(l), Included(r)) if l > r => return Ok(()), + (Included(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Included(r)) if l >= r => return Ok(()), + (_, _) => (), + } + facet_range_search::find_docids_of_facet_within_bounds::( + rtxn, db, field_id, &left, &right, output, + )?; + + Ok(()) + } + + fn inner_evaluate( + &self, + rtxn: &heed::RoTxn, + index: &Index, + filterable_fields: &HashSet, + ) -> Result { + match &self.condition { + FilterCondition::Not(f) => { + let all_ids = index.documents_ids(rtxn)?; + let selected = Self::inner_evaluate( + &(f.as_ref().clone()).into(), + rtxn, + index, + filterable_fields, + )?; + Ok(all_ids - selected) + } + FilterCondition::In { fid, els } => { + if crate::is_faceted(fid.value(), filterable_fields) { + let field_ids_map = index.fields_ids_map(rtxn)?; + + if let Some(fid) = field_ids_map.id(fid.value()) { + let mut bitmap = RoaringBitmap::new(); + + for el in els { + let op = Condition::Equal(el.clone()); + let el_bitmap = 
Self::evaluate_operator(rtxn, index, fid, &op)?; + bitmap |= el_bitmap; + } + Ok(bitmap) + } else { + Ok(RoaringBitmap::new()) + } + } else { + Err(fid.as_external_error(FilterError::AttributeNotFilterable { + attribute: fid.value(), + filterable_fields: filterable_fields.clone(), + }))? + } + } + FilterCondition::Condition { fid, op } => { + if crate::is_faceted(fid.value(), filterable_fields) { + let field_ids_map = index.fields_ids_map(rtxn)?; + if let Some(fid) = field_ids_map.id(fid.value()) { + Self::evaluate_operator(rtxn, index, fid, op) + } else { + Ok(RoaringBitmap::new()) + } + } else { + match fid.lexeme() { + attribute @ "_geo" => { + Err(fid.as_external_error(FilterError::BadGeo(attribute)))? + } + attribute if attribute.starts_with("_geoPoint(") => { + Err(fid.as_external_error(FilterError::BadGeo("_geoPoint")))? + } + attribute @ "_geoDistance" => { + Err(fid.as_external_error(FilterError::Reserved(attribute)))? + } + attribute => { + Err(fid.as_external_error(FilterError::AttributeNotFilterable { + attribute, + filterable_fields: filterable_fields.clone(), + }))? 
+ } + } + } + } + FilterCondition::Or(subfilters) => { + let mut bitmap = RoaringBitmap::new(); + for f in subfilters { + bitmap |= + Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?; + } + Ok(bitmap) + } + FilterCondition::And(subfilters) => { + let mut subfilters_iter = subfilters.iter(); + if let Some(first_subfilter) = subfilters_iter.next() { + let mut bitmap = Self::inner_evaluate( + &(first_subfilter.clone()).into(), + rtxn, + index, + filterable_fields, + )?; + for f in subfilters_iter { + if bitmap.is_empty() { + return Ok(bitmap); + } + bitmap &= Self::inner_evaluate( + &(f.clone()).into(), + rtxn, + index, + filterable_fields, + )?; + } + Ok(bitmap) + } else { + Ok(RoaringBitmap::new()) + } + } + FilterCondition::GeoLowerThan { point, radius } => { + if filterable_fields.contains("_geo") { + let base_point: [f64; 2] = + [point[0].parse_finite_float()?, point[1].parse_finite_float()?]; + if !(-90.0..=90.0).contains(&base_point[0]) { + return Err( + point[0].as_external_error(FilterError::BadGeoLat(base_point[0])) + )?; + } + if !(-180.0..=180.0).contains(&base_point[1]) { + return Err( + point[1].as_external_error(FilterError::BadGeoLng(base_point[1])) + )?; + } + let radius = radius.parse_finite_float()?; + let rtree = match index.geo_rtree(rtxn)? { + Some(rtree) => rtree, + None => return Ok(RoaringBitmap::new()), + }; + + let xyz_base_point = lat_lng_to_xyz(&base_point); + + let result = rtree + .nearest_neighbor_iter(&xyz_base_point) + .take_while(|point| { + distance_between_two_points(&base_point, &point.data.1) + <= radius + f64::EPSILON + }) + .map(|point| point.data.0) + .collect(); + + Ok(result) + } else { + Err(point[0].as_external_error(FilterError::AttributeNotFilterable { + attribute: "_geo", + filterable_fields: filterable_fields.clone(), + }))? 
+ } + } + } + } +} + +impl<'a> From> for Filter<'a> { + fn from(fc: FilterCondition<'a>) -> Self { + Self { condition: fc } + } +} + +#[cfg(test)] +mod tests { + use std::fmt::Write; + use std::iter::FromIterator; + + use big_s::S; + use either::Either; + use maplit::hashset; + use roaring::RoaringBitmap; + + use crate::index::tests::TempIndex; + use crate::Filter; + + #[test] + fn empty_db() { + let index = TempIndex::new(); + //Set the filterable fields to be the channel. + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset! { S("PrIcE") }); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let filter = Filter::from_str("PrIcE < 1000").unwrap().unwrap(); + let bitmap = filter.evaluate(&rtxn, &index).unwrap(); + assert!(bitmap.is_empty()); + + let filter = Filter::from_str("NOT PrIcE >= 1000").unwrap().unwrap(); + let bitmap = filter.evaluate(&rtxn, &index).unwrap(); + assert!(bitmap.is_empty()); + } + + #[test] + fn from_array() { + // Simple array with Left + let condition = Filter::from_array(vec![Either::Left(["channel = mv"])]).unwrap().unwrap(); + let expected = Filter::from_str("channel = mv").unwrap().unwrap(); + assert_eq!(condition, expected); + + // Simple array with Right + let condition = Filter::from_array::<_, Option<&str>>(vec![Either::Right("channel = mv")]) + .unwrap() + .unwrap(); + let expected = Filter::from_str("channel = mv").unwrap().unwrap(); + assert_eq!(condition, expected); + + // Array with Left and escaped quote + let condition = + Filter::from_array(vec![Either::Left(["channel = \"Mister Mv\""])]).unwrap().unwrap(); + let expected = Filter::from_str("channel = \"Mister Mv\"").unwrap().unwrap(); + assert_eq!(condition, expected); + + // Array with Right and escaped quote + let condition = + Filter::from_array::<_, Option<&str>>(vec![Either::Right("channel = \"Mister Mv\"")]) + .unwrap() + .unwrap(); + let expected = Filter::from_str("channel = \"Mister Mv\"").unwrap().unwrap(); + 
assert_eq!(condition, expected); + + // Array with Left and escaped simple quote + let condition = + Filter::from_array(vec![Either::Left(["channel = 'Mister Mv'"])]).unwrap().unwrap(); + let expected = Filter::from_str("channel = 'Mister Mv'").unwrap().unwrap(); + assert_eq!(condition, expected); + + // Array with Right and escaped simple quote + let condition = + Filter::from_array::<_, Option<&str>>(vec![Either::Right("channel = 'Mister Mv'")]) + .unwrap() + .unwrap(); + let expected = Filter::from_str("channel = 'Mister Mv'").unwrap().unwrap(); + assert_eq!(condition, expected); + + // Simple with parenthesis + let condition = + Filter::from_array(vec![Either::Left(["(channel = mv)"])]).unwrap().unwrap(); + let expected = Filter::from_str("(channel = mv)").unwrap().unwrap(); + assert_eq!(condition, expected); + + // Test that the facet condition is correctly generated. + let condition = Filter::from_array(vec![ + Either::Right("channel = gotaga"), + Either::Left(vec!["timestamp = 44", "channel != ponce"]), + ]) + .unwrap() + .unwrap(); + let expected = + Filter::from_str("channel = gotaga AND (timestamp = 44 OR channel != ponce)") + .unwrap() + .unwrap(); + assert_eq!(condition, expected); + } + + #[test] + fn not_filterable() { + let index = TempIndex::new(); + + let rtxn = index.read_txn().unwrap(); + let filter = Filter::from_str("_geoRadius(42, 150, 10)").unwrap().unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!(error.to_string().starts_with( + "Attribute `_geo` is not filterable. This index does not have configured filterable attributes." + )); + + let filter = Filter::from_str("dog = \"bernese mountain\"").unwrap().unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!(error.to_string().starts_with( + "Attribute `dog` is not filterable. This index does not have configured filterable attributes." 
+ )); + drop(rtxn); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("title")]); + settings.set_filterable_fields(hashset! { S("title") }); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let filter = Filter::from_str("_geoRadius(-100, 150, 10)").unwrap().unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!(error.to_string().starts_with( + "Attribute `_geo` is not filterable. Available filterable attributes are: `title`." + )); + + let filter = Filter::from_str("name = 12").unwrap().unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!(error.to_string().starts_with( + "Attribute `name` is not filterable. Available filterable attributes are: `title`." + )); + } + + #[test] + fn escaped_quote_in_filter_value_2380() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { + "id": "test_1", + "monitor_diagonal": "27' to 30'" + }, + { + "id": "test_2", + "monitor_diagonal": "27\" to 30\"" + }, + { + "id": "test_3", + "monitor_diagonal": "27\" to 30'" + }, + ])) + .unwrap(); + + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset!(S("monitor_diagonal"))); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let mut search = crate::Search::new(&rtxn, &index); + // this filter is copy pasted from #2380 with the exact same espace sequence + search.filter(Filter::from_str("monitor_diagonal = '27\" to 30\\''").unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![2]); + + search.filter(Filter::from_str(r#"monitor_diagonal = "27' to 30'" "#).unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![0]); + + search.filter(Filter::from_str(r#"monitor_diagonal = "27\" to 30\"" "#).unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1]); + + search.filter(Filter::from_str(r#"monitor_diagonal = "27\" to 30'" "#).unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![2]); + } + + #[test] + fn zero_radius() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset! { S("_geo") }); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 1, + "name": "Nàpiz' Milano", + "address": "Viale Vittorio Veneto, 30, 20124, Milan, Italy", + "type": "pizza", + "rating": 9, + "_geo": { + "lat": 45.4777599, + "lng": 9.1967508 + } + }, + { + "id": 2, + "name": "Artico Gelateria Tradizionale", + "address": "Via Dogana, 1, 20123 Milan, Italy", + "type": "ice cream", + "rating": 10, + "_geo": { + "lat": 45.4632046, + "lng": 9.1719421 + } + }, + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let mut search = crate::Search::new(&rtxn, &index); + + search.filter(Filter::from_str("_geoRadius(45.4777599, 9.1967508, 0)").unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![0]); + } + + #[test] + fn geo_radius_error() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order + settings.set_filterable_fields(hashset! { S("_geo"), S("price") }); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + // georadius have a bad latitude + let filter = Filter::from_str("_geoRadius(-100, 150, 10)").unwrap().unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!( + error.to_string().starts_with( + "Bad latitude `-100`. Latitude must be contained between -90 and 90 degrees." 
+ ), + "{}", + error.to_string() + ); + + // georadius have a bad latitude + let filter = Filter::from_str("_geoRadius(-90.0000001, 150, 10)").unwrap().unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!(error.to_string().contains( + "Bad latitude `-90.0000001`. Latitude must be contained between -90 and 90 degrees." + )); + + // georadius have a bad longitude + let filter = Filter::from_str("_geoRadius(-10, 250, 10)").unwrap().unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!( + error.to_string().contains( + "Bad longitude `250`. Longitude must be contained between -180 and 180 degrees." + ), + "{}", + error.to_string(), + ); + + // georadius have a bad longitude + let filter = Filter::from_str("_geoRadius(-10, 180.000001, 10)").unwrap().unwrap(); + let error = filter.evaluate(&rtxn, &index).unwrap_err(); + assert!(error.to_string().contains( + "Bad longitude `180.000001`. Longitude must be contained between -180 and 180 degrees." + )); + } + + #[test] + fn filter_depth() { + // generates a big (2 MiB) filter with too much of ORs. + let tipic_filter = "account_ids=14361 OR "; + let mut filter_string = String::with_capacity(tipic_filter.len() * 14360); + for i in 1..=14361 { + let _ = write!(&mut filter_string, "account_ids={}", i); + if i != 14361 { + let _ = write!(&mut filter_string, " OR "); + } + } + + // Note: the filter used to be rejected for being too deep, but that is + // no longer the case + let filter = Filter::from_str(&filter_string).unwrap(); + assert!(filter.is_some()); + } + + #[test] + fn empty_filter() { + let option = Filter::from_str(" ").unwrap(); + assert_eq!(option, None); + } + + #[test] + fn non_finite_float() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("price")]); // to keep the fields order + settings.set_filterable_fields(hashset! 
{ S("price") }); + }) + .unwrap(); + index + .add_documents(documents!([ + { + "id": "test_1", + "price": "inf" + }, + { + "id": "test_2", + "price": "2000" + }, + { + "id": "test_3", + "price": "infinity" + }, + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let filter = Filter::from_str("price = inf").unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert!(result.contains(0)); + let filter = Filter::from_str("price < inf").unwrap().unwrap(); + assert!(matches!( + filter.evaluate(&rtxn, &index), + Err(crate::Error::UserError(crate::error::UserError::InvalidFilter(_))) + )); + + let filter = Filter::from_str("price = NaN").unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert!(result.is_empty()); + let filter = Filter::from_str("price < NaN").unwrap().unwrap(); + assert!(matches!( + filter.evaluate(&rtxn, &index), + Err(crate::Error::UserError(crate::error::UserError::InvalidFilter(_))) + )); + + let filter = Filter::from_str("price = infinity").unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert!(result.contains(2)); + let filter = Filter::from_str("price < infinity").unwrap().unwrap(); + assert!(matches!( + filter.evaluate(&rtxn, &index), + Err(crate::Error::UserError(crate::error::UserError::InvalidFilter(_))) + )); + } + + #[test] + fn filter_number() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! 
{ S("id"), S("one"), S("two") }); + }) + .unwrap(); + + let mut docs = vec![]; + for i in 0..100 { + docs.push(serde_json::json!({ "id": i, "two": i % 10 })); + } + + index.add_documents(documents!(docs)).unwrap(); + + let rtxn = index.read_txn().unwrap(); + for i in 0..100 { + let filter_str = format!("id = {i}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter([i])); + } + for i in 0..100 { + let filter_str = format!("id > {i}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter((i + 1)..100)); + } + for i in 0..100 { + let filter_str = format!("id < {i}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter(0..i)); + } + for i in 0..100 { + let filter_str = format!("id <= {i}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter(0..=i)); + } + for i in 0..100 { + let filter_str = format!("id >= {i}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter(i..100)); + } + for i in 0..100 { + for j in i..100 { + let filter_str = format!("id {i} TO {j}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter(i..=j)); + } + } + let filter = Filter::from_str("one >= 0 OR one <= 0").unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::default()); + + let filter = Filter::from_str("one = 0").unwrap().unwrap(); + let result = 
filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::default()); + + for i in 0..10 { + for j in i..10 { + let filter_str = format!("two {i} TO {j}"); + let filter = Filter::from_str(&filter_str).unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!( + result, + RoaringBitmap::from_iter((0..100).filter(|x| (i..=j).contains(&(x % 10)))) + ); + } + } + let filter = Filter::from_str("two != 0").unwrap().unwrap(); + let result = filter.evaluate(&rtxn, &index).unwrap(); + assert_eq!(result, RoaringBitmap::from_iter((0..100).filter(|x| x % 10 != 0))); + } +} diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs new file mode 100644 index 000000000..73054b84a --- /dev/null +++ b/milli/src/search/facet/mod.rs @@ -0,0 +1,190 @@ +pub use facet_sort_ascending::ascending_facet_sort; +pub use facet_sort_descending::descending_facet_sort; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesDecode, RoTxn}; + +pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; +pub use self::filter::Filter; +use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; +mod facet_distribution; +mod facet_distribution_iter; +mod facet_range_search; +mod facet_sort_ascending; +mod facet_sort_descending; +mod filter; + +/// Get the first facet value in the facet database +pub(crate) fn get_first_facet_value<'t, BoundCodec>( + txn: &'t RoTxn, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> heed::Result> +where + BoundCodec: BytesDecode<'t>, +{ + let mut level0prefix = vec![]; + level0prefix.extend_from_slice(&field_id.to_be_bytes()); + level0prefix.push(0); + let mut level0_iter_forward = db + .as_polymorph() + .prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, level0prefix.as_slice())?; + if let Some(first) = level0_iter_forward.next() { + let (first_key, _) = first?; + let first_key = 
FacetGroupKeyCodec::::bytes_decode(first_key) + .ok_or(heed::Error::Encoding)?; + Ok(Some(first_key.left_bound)) + } else { + Ok(None) + } +} + +/// Get the last facet value in the facet database +pub(crate) fn get_last_facet_value<'t, BoundCodec>( + txn: &'t RoTxn, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> heed::Result> +where + BoundCodec: BytesDecode<'t>, +{ + let mut level0prefix = vec![]; + level0prefix.extend_from_slice(&field_id.to_be_bytes()); + level0prefix.push(0); + let mut level0_iter_backward = db + .as_polymorph() + .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, level0prefix.as_slice())?; + if let Some(last) = level0_iter_backward.next() { + let (last_key, _) = last?; + let last_key = FacetGroupKeyCodec::::bytes_decode(last_key) + .ok_or(heed::Error::Encoding)?; + Ok(Some(last_key.left_bound)) + } else { + Ok(None) + } +} + +/// Get the height of the highest level in the facet database +pub(crate) fn get_highest_level<'t>( + txn: &'t RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> heed::Result { + let field_id_prefix = &field_id.to_be_bytes(); + Ok(db + .as_polymorph() + .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, field_id_prefix)? 
+ .next() + .map(|el| { + let (key, _) = el.unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(key).unwrap(); + key.level + }) + .unwrap_or(0)) +} + +#[cfg(test)] +pub(crate) mod tests { + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::StrRefCodec; + use crate::update::facet::test_helpers::FacetIndex; + + pub fn get_simple_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + } + txn.commit().unwrap(); + index + } + pub fn get_random_looking_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + + for (_i, key) in std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + txn.commit().unwrap(); + index + } + pub fn get_simple_index_with_multiple_field_ids() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for fid in 0..2 { + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, fid, &(i as f64), &bitmap); + } + } + txn.commit().unwrap(); + index + } + pub fn get_random_looking_index_with_multiple_field_ids() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); + for fid in 0..2 { + for (_i, &key) in keys.iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + 
bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, fid, &(key as f64), &bitmap); + } + } + txn.commit().unwrap(); + index + } + pub fn get_simple_string_index_with_multiple_field_ids() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for fid in 0..2 { + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + if i % 2 == 0 { + index.insert(&mut txn, fid, &format!("{i}").as_str(), &bitmap); + } else { + index.insert(&mut txn, fid, &"", &bitmap); + } + } + } + txn.commit().unwrap(); + index + } + pub fn get_random_looking_string_index_with_multiple_field_ids() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); + for fid in 0..2 { + for (_i, &key) in keys.iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + if key % 2 == 0 { + index.insert(&mut txn, fid, &format!("{key}").as_str(), &bitmap); + } else { + index.insert(&mut txn, fid, &"", &bitmap); + } + } + } + txn.commit().unwrap(); + index + } +} diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap new file mode 100644 index 000000000..2b6123289 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap @@ -0,0 +1,260 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +0: 1 +1: 1 +2: 1 +3: 1 +4: 1 +5: 1 +6: 1 +7: 1 +8: 1 +9: 1 +10: 1 +11: 1 +12: 1 +13: 1 +14: 1 +15: 1 +16: 1 +17: 1 +18: 1 +19: 1 +20: 1 +21: 1 +22: 1 +23: 1 +24: 1 +25: 1 +26: 1 +27: 1 +28: 1 +29: 1 +30: 1 +31: 1 +32: 1 +33: 1 +34: 1 +35: 1 +36: 1 +37: 1 +38: 1 +39: 
1 +40: 1 +41: 1 +42: 1 +43: 1 +44: 1 +45: 1 +46: 1 +47: 1 +48: 1 +49: 1 +50: 1 +51: 1 +52: 1 +53: 1 +54: 1 +55: 1 +56: 1 +57: 1 +58: 1 +59: 1 +60: 1 +61: 1 +62: 1 +63: 1 +64: 1 +65: 1 +66: 1 +67: 1 +68: 1 +69: 1 +70: 1 +71: 1 +72: 1 +73: 1 +74: 1 +75: 1 +76: 1 +77: 1 +78: 1 +79: 1 +80: 1 +81: 1 +82: 1 +83: 1 +84: 1 +85: 1 +86: 1 +87: 1 +88: 1 +89: 1 +90: 1 +91: 1 +92: 1 +93: 1 +94: 1 +95: 1 +96: 1 +97: 1 +98: 1 +99: 1 +100: 1 +101: 1 +102: 1 +103: 1 +104: 1 +105: 1 +106: 1 +107: 1 +108: 1 +109: 1 +110: 1 +111: 1 +112: 1 +113: 1 +114: 1 +115: 1 +116: 1 +117: 1 +118: 1 +119: 1 +120: 1 +121: 1 +122: 1 +123: 1 +124: 1 +125: 1 +126: 1 +127: 1 +128: 1 +129: 1 +130: 1 +131: 1 +132: 1 +133: 1 +134: 1 +135: 1 +136: 1 +137: 1 +138: 1 +139: 1 +140: 1 +141: 1 +142: 1 +143: 1 +144: 1 +145: 1 +146: 1 +147: 1 +148: 1 +149: 1 +150: 1 +151: 1 +152: 1 +153: 1 +154: 1 +155: 1 +156: 1 +157: 1 +158: 1 +159: 1 +160: 1 +161: 1 +162: 1 +163: 1 +164: 1 +165: 1 +166: 1 +167: 1 +168: 1 +169: 1 +170: 1 +171: 1 +172: 1 +173: 1 +174: 1 +175: 1 +176: 1 +177: 1 +178: 1 +179: 1 +180: 1 +181: 1 +182: 1 +183: 1 +184: 1 +185: 1 +186: 1 +187: 1 +188: 1 +189: 1 +190: 1 +191: 1 +192: 1 +193: 1 +194: 1 +195: 1 +196: 1 +197: 1 +198: 1 +199: 1 +200: 1 +201: 1 +202: 1 +203: 1 +204: 1 +205: 1 +206: 1 +207: 1 +208: 1 +209: 1 +210: 1 +211: 1 +212: 1 +213: 1 +214: 1 +215: 1 +216: 1 +217: 1 +218: 1 +219: 1 +220: 1 +221: 1 +222: 1 +223: 1 +224: 1 +225: 1 +226: 1 +227: 1 +228: 1 +229: 1 +230: 1 +231: 1 +232: 1 +233: 1 +234: 1 +235: 1 +236: 1 +237: 1 +238: 1 +239: 1 +240: 1 +241: 1 +242: 1 +243: 1 +244: 1 +245: 1 +246: 1 +247: 1 +248: 1 +249: 1 +250: 1 +251: 1 +252: 1 +253: 1 +254: 1 +255: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap new file mode 100644 index 000000000..d0c0dd98d --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap @@ -0,0 +1,105 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +3: 2 +5: 2 +6: 2 +9: 2 +10: 2 +11: 2 +14: 2 +18: 2 +19: 2 +24: 2 +26: 2 +28: 2 +29: 2 +32: 2 +33: 2 +35: 2 +36: 2 +37: 2 +38: 2 +39: 2 +41: 2 +46: 2 +47: 2 +49: 2 +52: 2 +53: 2 +55: 2 +59: 2 +61: 2 +64: 2 +68: 2 +71: 2 +74: 2 +75: 2 +76: 2 +81: 2 +83: 2 +85: 2 +86: 2 +88: 2 +90: 2 +91: 2 +92: 2 +98: 2 +99: 2 +101: 2 +102: 2 +103: 2 +107: 2 +111: 2 +115: 2 +119: 2 +123: 2 +124: 2 +130: 2 +131: 2 +133: 2 +135: 2 +136: 2 +137: 2 +139: 2 +141: 2 +143: 2 +144: 2 +147: 2 +150: 2 +156: 1 +158: 1 +160: 1 +162: 1 +163: 1 +164: 1 +167: 1 +169: 1 +173: 1 +177: 1 +178: 1 +179: 1 +181: 1 +182: 1 +186: 1 +189: 1 +192: 1 +193: 1 +195: 1 +197: 1 +205: 1 +206: 1 +207: 1 +208: 1 +209: 1 +210: 1 +216: 1 +219: 1 +220: 1 +223: 1 +226: 1 +235: 1 +236: 1 +238: 1 +243: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap new file mode 100644 index 000000000..7170dab89 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap @@ -0,0 +1,104 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +0: 1 +1: 1 +2: 1 +3: 1 +4: 1 +5: 1 +6: 1 +7: 1 +8: 1 +9: 1 +10: 1 +11: 1 +12: 1 +13: 1 +14: 1 +15: 1 +16: 1 +17: 1 +18: 1 +19: 1 +20: 1 +21: 1 +22: 1 +23: 1 +24: 1 +25: 1 +26: 1 +27: 1 +28: 1 +29: 1 +30: 1 +31: 1 +32: 1 +33: 1 +34: 1 +35: 1 +36: 1 +37: 1 +38: 1 +39: 1 +40: 1 +41: 1 +42: 1 +43: 1 +44: 1 +45: 1 +46: 1 +47: 1 +48: 1 +49: 1 +50: 1 +51: 1 +52: 1 +53: 1 +54: 1 +55: 1 +56: 1 +57: 1 +58: 1 +59: 1 +60: 1 +61: 1 +62: 1 +63: 1 +64: 1 +65: 1 +66: 1 +67: 1 +68: 1 +69: 1 +70: 1 +71: 1 +72: 1 +73: 1 +74: 1 +75: 1 +76: 1 +77: 1 +78: 1 +79: 1 +80: 1 +81: 1 +82: 1 +83: 1 +84: 
1 +85: 1 +86: 1 +87: 1 +88: 1 +89: 1 +90: 1 +91: 1 +92: 1 +93: 1 +94: 1 +95: 1 +96: 1 +97: 1 +98: 1 +99: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap new file mode 100644 index 000000000..95c719bb0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap @@ -0,0 +1,104 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +3: 2 +5: 2 +6: 2 +9: 2 +10: 2 +11: 2 +14: 2 +18: 2 +19: 2 +24: 2 +26: 2 +28: 2 +29: 2 +32: 2 +33: 2 +35: 2 +36: 2 +37: 2 +38: 2 +39: 2 +41: 2 +46: 2 +47: 2 +49: 2 +52: 2 +53: 2 +55: 2 +59: 2 +61: 2 +64: 2 +68: 2 +71: 2 +74: 2 +75: 2 +76: 2 +81: 2 +83: 2 +85: 2 +86: 2 +88: 2 +90: 2 +91: 2 +92: 2 +98: 2 +99: 2 +101: 2 +102: 2 +103: 2 +107: 2 +111: 2 +115: 2 +119: 2 +123: 2 +124: 2 +130: 2 +131: 2 +133: 2 +135: 2 +136: 2 +137: 2 +139: 2 +141: 2 +143: 2 +144: 2 +147: 2 +150: 2 +156: 1 +158: 1 +160: 1 +162: 1 +163: 1 +164: 1 +167: 1 +169: 1 +173: 1 +177: 1 +178: 1 +179: 1 +181: 1 +182: 1 +186: 1 +189: 1 +192: 1 +193: 1 +195: 1 +197: 1 +205: 1 +206: 1 +207: 1 +208: 1 +209: 1 +210: 1 +216: 1 +219: 1 +220: 1 +223: 1 +226: 1 +235: 1 +236: 1 +238: 1 + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap new file mode 100644 index 000000000..e14520141 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +adf484f467a31ee9460dec539621938a diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap new file mode 100644 index 000000000..f3743e045 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +c9939aa4977fcd4bfd35852e102dbc82 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap new file mode 100644 index 000000000..e14520141 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +adf484f467a31ee9460dec539621938a diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap new file mode 100644 index 000000000..f3743e045 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +c9939aa4977fcd4bfd35852e102dbc82 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap new file mode 100644 index 000000000..25347579e --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +618738d28ff1386b6e93d171a5acb08f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap new file mode 100644 index 000000000..ec14f2cf6 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +ffb62ab3eef55c2254c13dc0f4099849 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap new file mode 100644 index 000000000..25347579e --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +618738d28ff1386b6e93d171a5acb08f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap new file mode 100644 index 000000000..ec14f2cf6 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +ffb62ab3eef55c2254c13dc0f4099849 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_0.hash.snap new file mode 100644 index 000000000..67965fcd4 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +9c25261cec7275cb5cfd85835904d023 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_1.hash.snap 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_1.hash.snap new file mode 100644 index 000000000..c43ba2152 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +2f97f18c15e915853e4df879be6e1f63 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_2.hash.snap new file mode 100644 index 000000000..67965fcd4 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +9c25261cec7275cb5cfd85835904d023 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_3.hash.snap new file mode 100644 index 000000000..c43ba2152 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_0_exact_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +2f97f18c15e915853e4df879be6e1f63 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_0.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_0.snap new file mode 100644 index 000000000..6cf7aa46c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_0.snap @@ -0,0 +1,260 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +0: [] +1: [] +2: [] +3: [] +4: [] +5: [] +6: [] +7: [] +8: [] +9: [] +10: [] +11: [] +12: [] +13: [] +14: [] +15: [] +16: [] +17: [] +18: 
[] +19: [] +20: [] +21: [] +22: [] +23: [] +24: [] +25: [] +26: [] +27: [] +28: [] +29: [] +30: [] +31: [] +32: [] +33: [] +34: [] +35: [] +36: [] +37: [] +38: [] +39: [] +40: [] +41: [] +42: [] +43: [] +44: [] +45: [] +46: [] +47: [] +48: [] +49: [] +50: [] +51: [] +52: [] +53: [] +54: [] +55: [] +56: [] +57: [] +58: [] +59: [] +60: [] +61: [] +62: [] +63: [] +64: [] +65: [] +66: [] +67: [] +68: [] +69: [] +70: [] +71: [] +72: [] +73: [] +74: [] +75: [] +76: [] +77: [] +78: [] +79: [] +80: [] +81: [] +82: [] +83: [] +84: [] +85: [] +86: [] +87: [] +88: [] +89: [] +90: [] +91: [] +92: [] +93: [] +94: [] +95: [] +96: [] +97: [] +98: [] +99: [] +100: [] +101: [] +102: [] +103: [] +104: [] +105: [] +106: [] +107: [] +108: [] +109: [] +110: [] +111: [] +112: [] +113: [] +114: [] +115: [] +116: [] +117: [] +118: [] +119: [] +120: [] +121: [] +122: [] +123: [] +124: [] +125: [] +126: [] +127: [] +128: [] +129: [] +130: [] +131: [] +132: [] +133: [] +134: [] +135: [] +136: [] +137: [] +138: [] +139: [] +140: [] +141: [] +142: [] +143: [] +144: [] +145: [] +146: [] +147: [] +148: [] +149: [] +150: [] +151: [] +152: [] +153: [] +154: [] +155: [] +156: [] +157: [] +158: [] +159: [] +160: [] +161: [] +162: [] +163: [] +164: [] +165: [] +166: [] +167: [] +168: [] +169: [] +170: [] +171: [] +172: [] +173: [] +174: [] +175: [] +176: [] +177: [] +178: [] +179: [] +180: [] +181: [] +182: [] +183: [] +184: [] +185: [] +186: [] +187: [] +188: [] +189: [] +190: [] +191: [] +192: [] +193: [] +194: [] +195: [] +196: [] +197: [] +198: [] +199: [] +200: [] +201: [] +202: [] +203: [] +204: [] +205: [] +206: [] +207: [] +208: [] +209: [] +210: [] +211: [] +212: [] +213: [] +214: [] +215: [] +216: [] +217: [] +218: [] +219: [] +220: [] +221: [] +222: [] +223: [] +224: [] +225: [] +226: [] +227: [] +228: [] +229: [] +230: [] +231: [] +232: [] +233: [] +234: [] +235: [] +236: [] +237: [] +238: [] +239: [] +240: [] +241: [] +242: [] +243: [] +244: [] +245: [] +246: [] +247: [] +248: [] +249: 
[] +250: [] +251: [] +252: [] +253: [] +254: [] +255: [] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_1.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_1.snap new file mode 100644 index 000000000..6cf7aa46c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_1.snap @@ -0,0 +1,260 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +0: [] +1: [] +2: [] +3: [] +4: [] +5: [] +6: [] +7: [] +8: [] +9: [] +10: [] +11: [] +12: [] +13: [] +14: [] +15: [] +16: [] +17: [] +18: [] +19: [] +20: [] +21: [] +22: [] +23: [] +24: [] +25: [] +26: [] +27: [] +28: [] +29: [] +30: [] +31: [] +32: [] +33: [] +34: [] +35: [] +36: [] +37: [] +38: [] +39: [] +40: [] +41: [] +42: [] +43: [] +44: [] +45: [] +46: [] +47: [] +48: [] +49: [] +50: [] +51: [] +52: [] +53: [] +54: [] +55: [] +56: [] +57: [] +58: [] +59: [] +60: [] +61: [] +62: [] +63: [] +64: [] +65: [] +66: [] +67: [] +68: [] +69: [] +70: [] +71: [] +72: [] +73: [] +74: [] +75: [] +76: [] +77: [] +78: [] +79: [] +80: [] +81: [] +82: [] +83: [] +84: [] +85: [] +86: [] +87: [] +88: [] +89: [] +90: [] +91: [] +92: [] +93: [] +94: [] +95: [] +96: [] +97: [] +98: [] +99: [] +100: [] +101: [] +102: [] +103: [] +104: [] +105: [] +106: [] +107: [] +108: [] +109: [] +110: [] +111: [] +112: [] +113: [] +114: [] +115: [] +116: [] +117: [] +118: [] +119: [] +120: [] +121: [] +122: [] +123: [] +124: [] +125: [] +126: [] +127: [] +128: [] +129: [] +130: [] +131: [] +132: [] +133: [] +134: [] +135: [] +136: [] +137: [] +138: [] +139: [] +140: [] +141: [] +142: [] +143: [] +144: [] +145: [] +146: [] +147: [] +148: [] +149: [] +150: [] +151: [] +152: [] +153: [] +154: [] +155: [] +156: [] +157: [] +158: [] +159: [] +160: [] +161: [] +162: [] +163: [] +164: [] +165: [] +166: [] +167: [] +168: [] +169: [] +170: [] +171: [] +172: [] +173: [] +174: [] +175: [] +176: 
[] +177: [] +178: [] +179: [] +180: [] +181: [] +182: [] +183: [] +184: [] +185: [] +186: [] +187: [] +188: [] +189: [] +190: [] +191: [] +192: [] +193: [] +194: [] +195: [] +196: [] +197: [] +198: [] +199: [] +200: [] +201: [] +202: [] +203: [] +204: [] +205: [] +206: [] +207: [] +208: [] +209: [] +210: [] +211: [] +212: [] +213: [] +214: [] +215: [] +216: [] +217: [] +218: [] +219: [] +220: [] +221: [] +222: [] +223: [] +224: [] +225: [] +226: [] +227: [] +228: [] +229: [] +230: [] +231: [] +232: [] +233: [] +234: [] +235: [] +236: [] +237: [] +238: [] +239: [] +240: [] +241: [] +242: [] +243: [] +244: [] +245: [] +246: [] +247: [] +248: [] +249: [] +250: [] +251: [] +252: [] +253: [] +254: [] +255: [] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_2.hash.snap new file mode 100644 index 000000000..67965fcd4 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +9c25261cec7275cb5cfd85835904d023 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_3.hash.snap new file mode 100644 index 000000000..c43ba2152 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_exact/field_id_1_exact_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +2f97f18c15e915853e4df879be6e1f63 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap new file mode 100644 index 000000000..b7b156c65 --- 
/dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +e849066b0e43d5c456f086c552372afc diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap new file mode 100644 index 000000000..92bba2433 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +8cc5e82995b0443b660f419bb9ea2e85 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap new file mode 100644 index 000000000..b7b156c65 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +e849066b0e43d5c456f086c552372afc diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap new file mode 100644 index 000000000..92bba2433 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +8cc5e82995b0443b660f419bb9ea2e85 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap new file mode 100644 index 000000000..fc48b6ddd --- /dev/null 
+++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +73b48005dc57b04f0939bbf21a68dab6 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap new file mode 100644 index 000000000..a16d93d8d --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3c23d35627667dcee98468bfdecf09d3 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap new file mode 100644 index 000000000..fc48b6ddd --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +73b48005dc57b04f0939bbf21a68dab6 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap new file mode 100644 index 000000000..a16d93d8d --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3c23d35627667dcee98468bfdecf09d3 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap new file mode 100644 index 000000000..b6a9b6bfa --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +c3f8b0b858a4820a508b25b42328cedd diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap new file mode 100644 index 000000000..76a0589f1 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +38a42f5dc25e99d7a5312a63ce94ed30 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap new file mode 100644 index 000000000..b6a9b6bfa --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +c3f8b0b858a4820a508b25b42328cedd diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap new file mode 100644 index 000000000..76a0589f1 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +38a42f5dc25e99d7a5312a63ce94ed30 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap new file mode 100644 index 000000000..558740f8a --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +2049930204498b323885c91de88e44ca diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap new file mode 100644 index 000000000..48eb244c8 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +7f0ca8c0fc6494f3dba46e8eb9699045 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap new file mode 100644 index 000000000..558740f8a --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +2049930204498b323885c91de88e44ca diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap new file mode 100644 index 000000000..48eb244c8 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +7f0ca8c0fc6494f3dba46e8eb9699045 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_0.hash.snap new file mode 100644 index 000000000..c75a7aafc --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +ad8fc873747aaf1d3590e7ccab735985 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_1.hash.snap new file mode 100644 index 000000000..440494b37 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +7c6cc88697da835d33877b2df41fa1cb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_2.hash.snap new file mode 100644 index 000000000..c75a7aafc --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +ad8fc873747aaf1d3590e7ccab735985 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_3.hash.snap new file mode 100644 index 000000000..440494b37 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/end_at_included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +7c6cc88697da835d33877b2df41fa1cb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_0.hash.snap 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_0.hash.snap new file mode 100644 index 000000000..f9becb30d --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +9a8c7343b4735d37704748cabcd51ff2 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_1.hash.snap new file mode 100644 index 000000000..e495229a3 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +898a7dc25a1441bc3e7e2a8a62d99090 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_2.hash.snap new file mode 100644 index 000000000..f9becb30d --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +9a8c7343b4735d37704748cabcd51ff2 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_3.hash.snap new file mode 100644 index 000000000..e495229a3 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/start_from_included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +898a7dc25a1441bc3e7e2a8a62d99090 diff --git 
a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_0.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_0.snap new file mode 100644 index 000000000..b86eebd09 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_0.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_1.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_1.snap new file mode 100644 index 000000000..778a5b488 --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_1.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 0: [3, 5, 6, 9, 10, 11, 14, 18, 19, 24, 26, 28, 29, 32, 33, 35, 36, 37, 38, 39, 41, 46, 47, 49, 52, 53, 55, 59, 61, 64, 68, 71, 74, 75, 76, 81, 83, 85, 86, 88, 90, 91, 92, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 114, 115, 118, 119, 123, 124, 126, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 141, 143, 144, 146, 147, 149, 150, 152, 153, 155, 156, 158, 159, 160, 161, 162, 163, 164, 167, 168, 169, 171, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 185, 186, 188, 189, 190, 191, 192, 193, 195, 197, 198, 199, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 215, 216, 219, 220, 223, 224, 226, 230, 231, 233, 235, 236, 237, 238, 239, 241, 243, 244, 247, 250, 256, 258, 260, 262, 263, 264, 267, 269, 273, 277, 278, 279, 281, 282, 286, 289, 292, 293, 295, 297, 305, 306, 307, 308, 309, 310, 316, 319, 320, 323, 326, 335, 336, 338, 343, ] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_2.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_2.snap new file mode 100644 index 000000000..b86eebd09 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_2.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 
109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_3.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_3.snap new file mode 100644 index 000000000..778a5b488 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_0_3.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 0: [3, 5, 6, 9, 10, 11, 14, 18, 19, 24, 26, 28, 29, 32, 33, 35, 36, 37, 38, 39, 41, 46, 47, 49, 52, 53, 55, 59, 61, 64, 68, 71, 74, 75, 76, 81, 83, 85, 86, 88, 90, 91, 92, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 114, 115, 118, 119, 123, 124, 126, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 141, 143, 144, 146, 147, 149, 150, 152, 153, 155, 156, 158, 159, 160, 161, 162, 163, 164, 167, 168, 169, 171, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 185, 186, 188, 189, 190, 191, 192, 193, 195, 197, 198, 199, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 215, 216, 219, 220, 223, 224, 226, 230, 231, 233, 235, 236, 237, 238, 239, 241, 243, 244, 247, 250, 256, 258, 260, 262, 263, 264, 267, 269, 273, 277, 278, 279, 281, 282, 286, 289, 
292, 293, 295, 297, 305, 306, 307, 308, 309, 310, 316, 319, 320, 323, 326, 335, 336, 338, 343, ] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_0.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_0.snap new file mode 100644 index 000000000..1773ad84b --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_0.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 1: [] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_1.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_1.snap new file mode 100644 index 000000000..1773ad84b --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_1.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 1: [] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_2.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_2.snap new file mode 100644 index 000000000..02c2c6318 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_2.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 
101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ] + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_3.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_3.snap new file mode 100644 index 000000000..d9c9cb699 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_unbounded/unbounded_field_id_1_3.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +all field_id 1: [3, 5, 6, 9, 10, 11, 14, 18, 19, 24, 26, 28, 29, 32, 33, 35, 36, 37, 38, 39, 41, 46, 47, 49, 52, 53, 55, 59, 61, 64, 68, 71, 74, 75, 76, 81, 83, 85, 86, 88, 90, 91, 92, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 114, 115, 118, 119, 123, 124, 126, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 141, 143, 144, 146, 147, 149, 150, 152, 153, 155, 156, 158, 159, 160, 161, 162, 163, 164, 167, 168, 169, 171, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 185, 186, 188, 189, 190, 191, 192, 193, 195, 197, 198, 199, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 215, 216, 219, 220, 223, 224, 226, 230, 231, 233, 235, 236, 237, 238, 239, 241, 243, 244, 247, 250, 256, 258, 260, 262, 263, 264, 267, 269, 
273, 277, 278, 279, 281, 282, 286, 289, 292, 293, 295, 297, 305, 306, 307, 308, 309, 310, 316, 319, 320, 323, 326, 335, 336, 338, 343, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/0.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/0.snap new file mode 100644 index 000000000..2d0f6e213 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/0.snap @@ -0,0 +1,60 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[200, ] +[201, ] +[202, ] +[203, ] +[204, ] +[205, ] +[206, ] +[207, ] +[208, ] +[209, ] +[210, ] +[211, ] +[212, ] +[213, ] +[214, ] +[215, ] +[216, ] +[217, ] +[218, ] +[219, ] +[220, ] +[221, ] +[222, ] +[223, ] +[224, ] +[225, ] +[226, ] +[227, ] +[228, ] +[229, ] +[230, ] +[231, ] +[232, ] +[233, ] +[234, ] +[235, ] +[236, ] +[237, ] +[238, ] +[239, ] +[240, ] +[241, ] +[242, ] +[243, ] +[244, ] +[245, ] +[246, ] +[247, ] +[248, ] +[249, ] +[250, ] +[251, ] +[252, ] +[253, ] +[254, ] +[255, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/1.snap new file mode 100644 index 000000000..20d666494 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending/1.snap @@ -0,0 +1,54 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, ] +[202, ] +[203, ] +[207, ] +[211, ] +[215, ] +[219, ] +[223, ] +[224, ] +[230, ] +[231, ] +[233, ] +[235, ] +[236, ] +[237, ] +[239, ] +[241, ] +[243, ] +[244, ] +[247, ] +[250, ] +[256, ] +[258, ] +[260, ] +[262, ] +[263, ] +[264, ] +[267, ] +[269, ] +[273, ] +[277, ] +[278, ] +[279, ] +[281, ] +[282, ] +[286, ] +[289, ] +[292, ] +[293, ] +[295, ] +[297, ] +[205, ] +[206, ] +[208, ] +[209, ] +[210, ] +[216, ] +[220, ] +[226, ] +[238, ] + diff --git 
a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-0.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-0.snap new file mode 100644 index 000000000..ef207f888 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-0.snap @@ -0,0 +1,33 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ] +[200, ] +[202, ] +[204, ] +[206, ] +[208, ] +[210, ] +[212, ] +[214, ] +[216, ] +[218, ] +[220, ] +[222, ] +[224, ] +[226, ] +[228, ] +[230, ] +[232, ] +[234, ] +[236, ] +[238, ] +[240, ] +[242, ] +[244, ] +[246, ] +[248, ] +[250, ] +[252, ] +[254, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-1.snap new file mode 100644 index 000000000..ef207f888 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/0-1.snap @@ -0,0 +1,33 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ] +[200, ] +[202, ] +[204, ] +[206, ] +[208, ] +[210, ] +[212, ] +[214, ] +[216, ] +[218, ] +[220, ] +[222, ] +[224, ] +[226, ] +[228, ] +[230, ] +[232, ] +[234, ] +[236, ] +[238, ] +[240, ] +[242, ] +[244, ] +[246, ] +[248, ] +[250, ] +[252, ] +[254, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-0.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-0.snap new file 
mode 100644 index 000000000..52d3d0de0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-0.snap @@ -0,0 +1,27 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ] +[202, ] +[224, ] +[230, ] +[236, ] +[244, ] +[250, ] +[256, ] +[258, ] +[260, ] +[262, ] +[264, ] +[278, ] +[282, ] +[286, ] +[292, ] +[206, ] +[208, ] +[210, ] +[216, ] +[220, ] +[226, ] +[238, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-1.snap new file mode 100644 index 000000000..52d3d0de0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort_ascending_multiple_field_ids/1-1.snap @@ -0,0 +1,27 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ] +[202, ] +[224, ] +[230, ] +[236, ] +[244, ] +[250, ] +[256, ] +[258, ] +[260, ] +[262, ] +[264, ] +[278, ] +[282, ] +[286, ] +[292, ] +[206, ] +[208, ] +[210, ] +[216, ] +[220, ] +[226, ] +[238, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap new file mode 100644 index 000000000..032763c74 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap @@ -0,0 +1,60 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[255, ] +[254, ] +[253, ] +[252, ] +[251, ] +[250, ] +[249, ] +[248, ] +[247, ] +[246, ] +[245, ] +[244, ] +[243, ] +[242, ] +[241, ] +[240, ] 
+[239, ] +[238, ] +[237, ] +[236, ] +[235, ] +[234, ] +[233, ] +[232, ] +[231, ] +[230, ] +[229, ] +[228, ] +[227, ] +[226, ] +[225, ] +[224, ] +[223, ] +[222, ] +[221, ] +[220, ] +[219, ] +[218, ] +[217, ] +[216, ] +[215, ] +[214, ] +[213, ] +[212, ] +[211, ] +[210, ] +[209, ] +[208, ] +[207, ] +[206, ] +[205, ] +[204, ] +[203, ] +[202, ] +[201, ] +[200, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap new file mode 100644 index 000000000..4c62cfee4 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap @@ -0,0 +1,54 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[243, ] +[238, ] +[236, ] +[235, ] +[226, ] +[223, ] +[220, ] +[219, ] +[216, ] +[210, ] +[209, ] +[208, ] +[207, ] +[206, ] +[205, ] +[297, ] +[295, ] +[293, ] +[292, ] +[289, ] +[286, ] +[282, ] +[281, ] +[279, ] +[278, ] +[277, ] +[273, ] +[269, ] +[267, ] +[264, ] +[263, ] +[262, ] +[260, ] +[258, ] +[256, ] +[250, ] +[247, ] +[244, ] +[241, ] +[239, ] +[237, ] +[233, ] +[231, ] +[230, ] +[224, ] +[215, ] +[211, ] +[203, ] +[202, ] +[201, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/2.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/2.snap new file mode 100644 index 000000000..032763c74 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/2.snap @@ -0,0 +1,60 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[255, ] +[254, ] +[253, ] +[252, ] +[251, ] +[250, ] +[249, ] +[248, ] +[247, ] +[246, ] +[245, ] +[244, ] +[243, ] +[242, ] +[241, ] +[240, ] +[239, ] +[238, ] +[237, ] +[236, ] +[235, ] +[234, ] +[233, ] +[232, ] +[231, ] +[230, ] +[229, ] +[228, ] +[227, ] +[226, ] +[225, ] +[224, ] +[223, ] +[222, ] +[221, ] +[220, ] 
+[219, ] +[218, ] +[217, ] +[216, ] +[215, ] +[214, ] +[213, ] +[212, ] +[211, ] +[210, ] +[209, ] +[208, ] +[207, ] +[206, ] +[205, ] +[204, ] +[203, ] +[202, ] +[201, ] +[200, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-0.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-0.snap new file mode 100644 index 000000000..b833cae97 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-0.snap @@ -0,0 +1,33 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[254, ] +[252, ] +[250, ] +[248, ] +[246, ] +[244, ] +[242, ] +[240, ] +[238, ] +[236, ] +[234, ] +[232, ] +[230, ] +[228, ] +[226, ] +[224, ] +[222, ] +[220, ] +[218, ] +[216, ] +[214, ] +[212, ] +[210, ] +[208, ] +[206, ] +[204, ] +[202, ] +[200, ] +[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-1.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-1.snap new file mode 100644 index 000000000..b833cae97 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/0-1.snap @@ -0,0 +1,33 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[254, ] +[252, ] +[250, ] +[248, ] +[246, ] +[244, ] +[242, ] +[240, ] +[238, ] +[236, ] +[234, ] +[232, ] +[230, ] +[228, ] +[226, ] +[224, ] +[222, ] +[220, ] +[218, ] +[216, ] +[214, ] +[212, ] +[210, ] +[208, ] +[206, ] +[204, ] +[202, ] +[200, ] +[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ] + diff --git 
a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-0.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-0.snap new file mode 100644 index 000000000..2623a8807 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-0.snap @@ -0,0 +1,27 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[238, ] +[236, ] +[226, ] +[220, ] +[216, ] +[210, ] +[208, ] +[206, ] +[292, ] +[286, ] +[282, ] +[278, ] +[264, ] +[262, ] +[260, ] +[258, ] +[256, ] +[250, ] +[244, ] +[230, ] +[224, ] +[202, ] +[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-1.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-1.snap new file mode 100644 index 000000000..2623a8807 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending_multiple_field_ids/1-1.snap @@ -0,0 +1,27 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[238, ] +[236, ] +[226, ] +[220, ] +[216, ] +[210, ] +[208, ] +[206, ] +[292, ] +[286, ] +[282, ] +[278, ] +[264, ] +[262, ] +[260, ] +[258, ] +[256, ] +[250, ] +[244, ] +[230, ] +[224, ] +[202, ] +[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ] + diff --git a/milli/src/search/fst_utils.rs b/milli/src/search/fst_utils.rs new file mode 100644 index 000000000..b488e6c19 --- /dev/null +++ b/milli/src/search/fst_utils.rs @@ -0,0 +1,187 @@ +/// This mod is necessary until https://github.com/BurntSushi/fst/pull/137 gets merged. +/// All credits for this code go to BurntSushi. 
+use fst::Automaton; + +pub struct StartsWith(pub A); + +/// The `Automaton` state for `StartsWith`. +pub struct StartsWithState(pub StartsWithStateKind); + +impl Clone for StartsWithState +where + A::State: Clone, +{ + fn clone(&self) -> Self { + Self(self.0.clone()) + } +} + +/// The inner state of a `StartsWithState`. +pub enum StartsWithStateKind { + /// Sink state that is reached when the automaton has matched the prefix. + Done, + /// State in which the automaton is while it hasn't matched the prefix. + Running(A::State), +} + +impl Clone for StartsWithStateKind +where + A::State: Clone, +{ + fn clone(&self) -> Self { + match self { + StartsWithStateKind::Done => StartsWithStateKind::Done, + StartsWithStateKind::Running(inner) => StartsWithStateKind::Running(inner.clone()), + } + } +} + +impl Automaton for StartsWith { + type State = StartsWithState; + + fn start(&self) -> StartsWithState { + StartsWithState({ + let inner = self.0.start(); + if self.0.is_match(&inner) { + StartsWithStateKind::Done + } else { + StartsWithStateKind::Running(inner) + } + }) + } + fn is_match(&self, state: &StartsWithState) -> bool { + match state.0 { + StartsWithStateKind::Done => true, + StartsWithStateKind::Running(_) => false, + } + } + fn can_match(&self, state: &StartsWithState) -> bool { + match state.0 { + StartsWithStateKind::Done => true, + StartsWithStateKind::Running(ref inner) => self.0.can_match(inner), + } + } + fn will_always_match(&self, state: &StartsWithState) -> bool { + match state.0 { + StartsWithStateKind::Done => true, + StartsWithStateKind::Running(_) => false, + } + } + fn accept(&self, state: &StartsWithState, byte: u8) -> StartsWithState { + StartsWithState(match state.0 { + StartsWithStateKind::Done => StartsWithStateKind::Done, + StartsWithStateKind::Running(ref inner) => { + let next_inner = self.0.accept(inner, byte); + if self.0.is_match(&next_inner) { + StartsWithStateKind::Done + } else { + StartsWithStateKind::Running(next_inner) + } + } + }) + 
} +} +/// An automaton that matches when one of its component automata match. +#[derive(Clone, Debug)] +pub struct Union(pub A, pub B); + +/// The `Automaton` state for `Union`. +pub struct UnionState(pub A::State, pub B::State); + +impl Clone for UnionState +where + A::State: Clone, + B::State: Clone, +{ + fn clone(&self) -> Self { + Self(self.0.clone(), self.1.clone()) + } +} + +impl Automaton for Union { + type State = UnionState; + fn start(&self) -> UnionState { + UnionState(self.0.start(), self.1.start()) + } + fn is_match(&self, state: &UnionState) -> bool { + self.0.is_match(&state.0) || self.1.is_match(&state.1) + } + fn can_match(&self, state: &UnionState) -> bool { + self.0.can_match(&state.0) || self.1.can_match(&state.1) + } + fn will_always_match(&self, state: &UnionState) -> bool { + self.0.will_always_match(&state.0) || self.1.will_always_match(&state.1) + } + fn accept(&self, state: &UnionState, byte: u8) -> UnionState { + UnionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte)) + } +} +/// An automaton that matches when both of its component automata match. +#[derive(Clone, Debug)] +pub struct Intersection(pub A, pub B); + +/// The `Automaton` state for `Intersection`. 
+pub struct IntersectionState(pub A::State, pub B::State); + +impl Clone for IntersectionState +where + A::State: Clone, + B::State: Clone, +{ + fn clone(&self) -> Self { + Self(self.0.clone(), self.1.clone()) + } +} + +impl Automaton for Intersection { + type State = IntersectionState; + fn start(&self) -> IntersectionState { + IntersectionState(self.0.start(), self.1.start()) + } + fn is_match(&self, state: &IntersectionState) -> bool { + self.0.is_match(&state.0) && self.1.is_match(&state.1) + } + fn can_match(&self, state: &IntersectionState) -> bool { + self.0.can_match(&state.0) && self.1.can_match(&state.1) + } + fn will_always_match(&self, state: &IntersectionState) -> bool { + self.0.will_always_match(&state.0) && self.1.will_always_match(&state.1) + } + fn accept(&self, state: &IntersectionState, byte: u8) -> IntersectionState { + IntersectionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte)) + } +} +/// An automaton that matches exactly when the automaton it wraps does not. +#[derive(Clone, Debug)] +pub struct Complement(pub A); + +/// The `Automaton` state for `Complement`. 
+pub struct ComplementState(pub A::State); + +impl Clone for ComplementState +where + A::State: Clone, +{ + fn clone(&self) -> Self { + Self(self.0.clone()) + } +} + +impl Automaton for Complement { + type State = ComplementState; + fn start(&self) -> ComplementState { + ComplementState(self.0.start()) + } + fn is_match(&self, state: &ComplementState) -> bool { + !self.0.is_match(&state.0) + } + fn can_match(&self, state: &ComplementState) -> bool { + !self.0.will_always_match(&state.0) + } + fn will_always_match(&self, state: &ComplementState) -> bool { + !self.0.can_match(&state.0) + } + fn accept(&self, state: &ComplementState, byte: u8) -> ComplementState { + ComplementState(self.0.accept(&state.0, byte)) + } +} diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs new file mode 100644 index 000000000..22ba973b5 --- /dev/null +++ b/milli/src/search/matches/matching_words.rs @@ -0,0 +1,450 @@ +use std::cmp::{min, Reverse}; +use std::collections::BTreeMap; +use std::fmt; +use std::ops::{Index, IndexMut}; +use std::rc::Rc; + +use charabia::Token; +use levenshtein_automata::{Distance, DFA}; + +use crate::search::build_dfa; +use crate::MAX_WORD_LENGTH; + +type IsPrefix = bool; + +/// Structure created from a query tree +/// referencing words that match the given query tree. +#[derive(Default)] +pub struct MatchingWords { + inner: Vec<(Vec>, Vec)>, +} + +impl fmt::Debug for MatchingWords { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "[")?; + for (matching_words, primitive_word_id) in self.inner.iter() { + writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?; + } + writeln!(f, "]")?; + Ok(()) + } +} + +impl MatchingWords { + pub fn new(mut matching_words: Vec<(Vec>, Vec)>) -> Self { + // Sort word by len in DESC order prioritizing the longuest matches, + // in order to highlight the longuest part of the matched word. 
+ matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len()))); + + Self { inner: matching_words } + } + + /// Returns an iterator over terms that match or partially match the given token. + pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> { + MatchesIter { inner: Box::new(self.inner.iter()), token } + } +} + +/// Iterator over terms that match the given token, +/// This allow to lazily evaluate matches. +pub struct MatchesIter<'a, 'b> { + #[allow(clippy::type_complexity)] + inner: Box>, Vec)> + 'a>, + token: &'b Token<'b>, +} + +impl<'a> Iterator for MatchesIter<'a, '_> { + type Item = MatchType<'a>; + + fn next(&mut self) -> Option { + match self.inner.next() { + Some((matching_words, ids)) => match matching_words[0].match_token(self.token) { + Some(char_len) => { + if matching_words.len() > 1 { + Some(MatchType::Partial(PartialMatch { + matching_words: &matching_words[1..], + ids, + char_len, + })) + } else { + Some(MatchType::Full { char_len, ids }) + } + } + None => self.next(), + }, + None => None, + } + } +} + +/// Id of a matching term corespounding to a word written by the end user. +pub type PrimitiveWordId = u8; + +/// Structure used to match a specific term. 
+pub struct MatchingWord { + pub dfa: DFA, + pub word: String, + pub typo: u8, + pub prefix: IsPrefix, +} + +impl fmt::Debug for MatchingWord { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("MatchingWord") + .field("word", &self.word) + .field("typo", &self.typo) + .field("prefix", &self.prefix) + .finish() + } +} + +impl PartialEq for MatchingWord { + fn eq(&self, other: &Self) -> bool { + self.prefix == other.prefix && self.typo == other.typo && self.word == other.word + } +} + +impl MatchingWord { + pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option { + if word.len() > MAX_WORD_LENGTH { + return None; + } + let dfa = build_dfa(&word, typo, prefix); + + Some(Self { dfa, word, typo, prefix }) + } + + /// Returns the lenght in chars of the match in case of the token matches the term. + pub fn match_token(&self, token: &Token) -> Option { + match self.dfa.eval(token.lemma()) { + Distance::Exact(t) if t <= self.typo => { + if self.prefix { + let len = bytes_to_highlight(token.lemma(), &self.word); + Some(token.original_lengths(len).0) + } else { + Some(token.original_lengths(token.lemma().len()).0) + } + } + _otherwise => None, + } + } +} + +/// A given token can partially match a query word for several reasons: +/// - split words +/// - multi-word synonyms +/// In these cases we need to match consecutively several tokens to consider that the match is full. +#[derive(Debug, PartialEq)] +pub enum MatchType<'a> { + Full { char_len: usize, ids: &'a [PrimitiveWordId] }, + Partial(PartialMatch<'a>), +} + +/// Structure helper to match several tokens in a row in order to complete a partial match. 
+#[derive(Debug, PartialEq)] +pub struct PartialMatch<'a> { + matching_words: &'a [Rc], + ids: &'a [PrimitiveWordId], + char_len: usize, +} + +impl<'a> PartialMatch<'a> { + /// Returns: + /// - None if the given token breaks the partial match + /// - Partial if the given token matches the partial match but doesn't complete it + /// - Full if the given token completes the partial match + pub fn match_token(self, token: &Token) -> Option> { + self.matching_words[0].match_token(token).map(|char_len| { + if self.matching_words.len() > 1 { + MatchType::Partial(PartialMatch { + matching_words: &self.matching_words[1..], + ids: self.ids, + char_len, + }) + } else { + MatchType::Full { char_len, ids: self.ids } + } + }) + } + + pub fn char_len(&self) -> usize { + self.char_len + } +} + +// A simple wrapper around vec so we can get contiguous but index it like it's 2D array. +struct N2Array { + y_size: usize, + buf: Vec, +} + +impl N2Array { + fn new(x: usize, y: usize, value: T) -> N2Array { + N2Array { y_size: y, buf: vec![value; x * y] } + } +} + +impl Index<(usize, usize)> for N2Array { + type Output = T; + + #[inline] + fn index(&self, (x, y): (usize, usize)) -> &T { + &self.buf[(x * self.y_size) + y] + } +} + +impl IndexMut<(usize, usize)> for N2Array { + #[inline] + fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T { + &mut self.buf[(x * self.y_size) + y] + } +} + +/// Returns the number of **bytes** we want to highlight in the `source` word. 
+/// Basically we want to highlight as much characters as possible in the source until it has too much +/// typos (= 2) +/// The algorithm is a modified +/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) +fn bytes_to_highlight(source: &str, target: &str) -> usize { + let n = source.chars().count(); + let m = target.chars().count(); + + if n == 0 { + return 0; + } + // since we allow two typos we can send two characters even if it's completely wrong + if m < 3 { + return source.chars().take(m).map(|c| c.len_utf8()).sum(); + } + if n == m && source == target { + return source.len(); + } + + let inf = n + m; + let mut matrix = N2Array::new(n + 2, m + 2, 0); + + matrix[(0, 0)] = inf; + for i in 0..=n { + matrix[(i + 1, 0)] = inf; + matrix[(i + 1, 1)] = i; + } + for j in 0..=m { + matrix[(0, j + 1)] = inf; + matrix[(1, j + 1)] = j; + } + + let mut last_row = BTreeMap::new(); + + for (row, char_s) in source.chars().enumerate() { + let mut last_match_col = 0; + let row = row + 1; + + for (col, char_t) in target.chars().enumerate() { + let col = col + 1; + let last_match_row = *last_row.get(&char_t).unwrap_or(&0); + let cost = usize::from(char_s != char_t); + + let dist_add = matrix[(row, col + 1)] + 1; + let dist_del = matrix[(row + 1, col)] + 1; + let dist_sub = matrix[(row, col)] + cost; + let dist_trans = matrix[(last_match_row, last_match_col)] + + (row - last_match_row - 1) + + 1 + + (col - last_match_col - 1); + let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans)); + matrix[(row + 1, col + 1)] = dist; + + if cost == 0 { + last_match_col = col; + } + } + + last_row.insert(char_s, row); + } + + let mut minimum = (u32::max_value(), 0); + for x in 0..=m { + let dist = matrix[(n + 1, x + 1)] as u32; + if dist < minimum.0 { + minimum = (dist, x); + } + } + + // everything was done characters wise and now we want to returns a number of bytes + source.chars().take(minimum.1).map(|c| c.len_utf8()).sum() +} + 
+#[cfg(test)] +mod tests { + use std::borrow::Cow; + use std::str::from_utf8; + + use charabia::TokenKind; + + use super::*; + use crate::MatchingWords; + + #[test] + fn test_bytes_to_highlight() { + struct TestBytesToHighlight { + query: &'static str, + text: &'static str, + length: usize, + } + let tests = [ + TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() }, + TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() }, + TestBytesToHighlight { + query: "Levenshtein", + text: "Levenshtein", + length: "Levenshtein".len(), + }, + // we get to the end of our word with only one typo + TestBytesToHighlight { + query: "Levenste", + text: "Levenshtein", + length: "Levenste".len(), + }, + // we get our third and last authorized typo right on the last character + TestBytesToHighlight { + query: "Levenstein", + text: "Levenshte", + length: "Levenste".len(), + }, + // we get to the end of our word with only two typos at the beginning + TestBytesToHighlight { + query: "Bavenshtein", + text: "Levenshtein", + length: "Bavenshtein".len(), + }, + TestBytesToHighlight { + query: "Альфа", text: "Альфой", length: "Альф".len() + }, + TestBytesToHighlight { + query: "Go💼", text: "Go💼od luck.", length: "Go💼".len() + }, + TestBytesToHighlight { + query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len() + }, + TestBytesToHighlight { + query: "chäräcters", + text: "chäräcters", + length: "chäräcters".len(), + }, + TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() }, + TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() }, + ]; + + for test in &tests { + let length = bytes_to_highlight(test.text, test.query); + assert_eq!(length, test.length, r#"lenght between: "{}" "{}""#, test.query, test.text); + assert!( + from_utf8(&test.query.as_bytes()[..length]).is_ok(), + r#"converting {}[..{}] to an utf8 str failed"#, + test.query, + length + ); + } + } + + #[test] + fn matching_words() { + let all = vec![ 
+ Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()), + Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), + ]; + let matching_words = vec![ + (vec![all[0].clone()], vec![0]), + (vec![all[1].clone()], vec![1]), + (vec![all[2].clone()], vec![2]), + ]; + + let matching_words = MatchingWords::new(matching_words); + + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("word"), + char_end: "word".chars().count(), + byte_end: "word".len(), + ..Default::default() + }) + .next(), + Some(MatchType::Full { char_len: 3, ids: &[2] }) + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("nyc"), + char_end: "nyc".chars().count(), + byte_end: "nyc".len(), + ..Default::default() + }) + .next(), + None + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("world"), + char_end: "world".chars().count(), + byte_end: "world".len(), + ..Default::default() + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &[2] }) + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("splitted"), + char_end: "splitted".chars().count(), + byte_end: "splitted".len(), + ..Default::default() + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &[0] }) + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("thisnew"), + char_end: "thisnew".chars().count(), + byte_end: "thisnew".len(), + ..Default::default() + }) + .next(), + None + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("borld"), + char_end: "borld".chars().count(), + byte_end: "borld".len(), + ..Default::default() + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &[2] }) + ); + assert_eq!( + matching_words + 
.match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("wordsplit"), + char_end: "wordsplit".chars().count(), + byte_end: "wordsplit".len(), + ..Default::default() + }) + .next(), + Some(MatchType::Full { char_len: 4, ids: &[2] }) + ); + } +} diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs new file mode 100644 index 000000000..6ac5123a8 --- /dev/null +++ b/milli/src/search/matches/mod.rs @@ -0,0 +1,865 @@ +use std::borrow::Cow; + +use charabia::{SeparatorKind, Token, Tokenizer}; +use matching_words::{MatchType, PartialMatch, PrimitiveWordId}; +pub use matching_words::{MatchingWord, MatchingWords}; +use serde::Serialize; + +pub mod matching_words; + +const DEFAULT_CROP_MARKER: &str = "…"; +const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; +const DEFAULT_HIGHLIGHT_SUFFIX: &str = ""; + +/// Structure used to build a Matcher allowing to customize formating tags. +pub struct MatcherBuilder<'a, A> { + matching_words: MatchingWords, + tokenizer: Tokenizer<'a, 'a, A>, + crop_marker: Option, + highlight_prefix: Option, + highlight_suffix: Option, +} + +impl<'a, A> MatcherBuilder<'a, A> { + pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self { + Self { + matching_words, + tokenizer, + crop_marker: None, + highlight_prefix: None, + highlight_suffix: None, + } + } + + pub fn crop_marker(&mut self, marker: String) -> &Self { + self.crop_marker = Some(marker); + self + } + + pub fn highlight_prefix(&mut self, prefix: String) -> &Self { + self.highlight_prefix = Some(prefix); + self + } + + pub fn highlight_suffix(&mut self, suffix: String) -> &Self { + self.highlight_suffix = Some(suffix); + self + } + + pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { + let crop_marker = match &self.crop_marker { + Some(marker) => marker.as_str(), + None => DEFAULT_CROP_MARKER, + }; + + let highlight_prefix = match &self.highlight_prefix { + Some(marker) => marker.as_str(), + None => 
DEFAULT_HIGHLIGHT_PREFIX, + }; + let highlight_suffix = match &self.highlight_suffix { + Some(marker) => marker.as_str(), + None => DEFAULT_HIGHLIGHT_SUFFIX, + }; + Matcher { + text, + matching_words: &self.matching_words, + tokenizer: &self.tokenizer, + crop_marker, + highlight_prefix, + highlight_suffix, + matches: None, + } + } +} + +#[derive(Copy, Clone, Default)] +pub struct FormatOptions { + pub highlight: bool, + pub crop: Option, +} + +impl FormatOptions { + pub fn merge(self, other: Self) -> Self { + Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) } + } +} + +#[derive(Clone, Debug)] +pub struct Match { + match_len: usize, + // ids of the query words that matches. + ids: Vec, + // position of the word in the whole text. + word_position: usize, + // position of the token in the whole text. + token_position: usize, +} + +#[derive(Serialize, Debug, Clone, PartialEq, Eq)] +pub struct MatchBounds { + pub start: usize, + pub length: usize, +} + +/// Structure used to analize a string, compute words that match, +/// and format the source string, returning a highlighted and cropped sub-string. +pub struct Matcher<'t, 'm, A> { + text: &'t str, + matching_words: &'m MatchingWords, + tokenizer: &'m Tokenizer<'m, 'm, A>, + crop_marker: &'m str, + highlight_prefix: &'m str, + highlight_suffix: &'m str, + matches: Option<(Vec>, Vec)>, +} + +impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { + /// Iterates over tokens and save any of them that matches the query. + fn compute_matches(&mut self) -> &mut Self { + /// some words are counted as matches only if they are close together and in the good order, + /// compute_partial_match peek into next words to validate if the match is complete. 
+ fn compute_partial_match<'a>( + mut partial: PartialMatch, + token_position: usize, + word_position: usize, + words_positions: &mut impl Iterator)>, + matches: &mut Vec, + ) -> bool { + let mut potential_matches = vec![(token_position, word_position, partial.char_len())]; + + for (token_position, word_position, word) in words_positions { + partial = match partial.match_token(word) { + // token matches the partial match, but the match is not full, + // we temporarly save the current token then we try to match the next one. + Some(MatchType::Partial(partial)) => { + potential_matches.push((token_position, word_position, partial.char_len())); + partial + } + // partial match is now full, we keep this matches and we advance positions + Some(MatchType::Full { char_len, ids }) => { + // save previously matched tokens as matches. + let iter = potential_matches.into_iter().map( + |(token_position, word_position, match_len)| Match { + match_len, + ids: ids.to_vec(), + word_position, + token_position, + }, + ); + matches.extend(iter); + + // save the token that closes the partial match as a match. + matches.push(Match { + match_len: char_len, + ids: ids.to_vec(), + word_position, + token_position, + }); + + // the match is complete, we return true. + return true; + } + // no match, continue to next match. + None => break, + }; + } + + // the match is not complete, we return false. 
+ false + } + + let tokens: Vec<_> = self.tokenizer.tokenize(self.text).collect(); + let mut matches = Vec::new(); + + let mut words_positions = tokens + .iter() + .scan((0, 0), |(token_position, word_position), token| { + let current_token_position = *token_position; + let current_word_position = *word_position; + *token_position += 1; + if !token.is_separator() { + *word_position += 1; + } + + Some((current_token_position, current_word_position, token)) + }) + .filter(|(_, _, token)| !token.is_separator()); + + while let Some((token_position, word_position, word)) = words_positions.next() { + for match_type in self.matching_words.match_token(word) { + match match_type { + // we match, we save the current token as a match, + // then we continue the rest of the tokens. + MatchType::Full { char_len, ids } => { + matches.push(Match { + match_len: char_len, + ids: ids.to_vec(), + word_position, + token_position, + }); + break; + } + // we match partially, iterate over next tokens to check if we can complete the match. + MatchType::Partial(partial) => { + // if match is completed, we break the matching loop over the current token, + // then we continue the rest of the tokens. + let mut wp = words_positions.clone(); + if compute_partial_match( + partial, + token_position, + word_position, + &mut wp, + &mut matches, + ) { + words_positions = wp; + break; + } + } + } + } + } + + self.matches = Some((tokens, matches)); + self + } + + /// Returns boundaries of the words that match the query. + pub fn matches(&mut self) -> Vec { + match &self.matches { + None => self.compute_matches().matches(), + Some((tokens, matches)) => matches + .iter() + .map(|m| MatchBounds { + start: tokens[m.token_position].byte_start, + length: m.match_len, + }) + .collect(), + } + } + + /// Returns the bounds in byte index of the crop window. 
+ fn crop_bounds(&self, tokens: &[Token], matches: &[Match], crop_size: usize) -> (usize, usize) { + // if there is no match, we start from the beginning of the string by default. + let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); + let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); + let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); + let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); + + // matches needs to be counted in the crop len. + let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; + + // create the initial state of the crop window: 2 iterators starting from the matches positions, + // a reverse iterator starting from the first match token position and going towards the beginning of the text, + let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable(); + // an iterator starting from the last match token position and going towards the end of the text. + let mut after_tokens = tokens[last_match_token_position..].iter().peekable(); + + // grows the crop window peeking in both directions + // until the window contains the good number of words: + while remaining_words > 0 { + let before_token = before_tokens.peek().map(|t| t.separator_kind()); + let after_token = after_tokens.peek().map(|t| t.separator_kind()); + + match (before_token, after_token) { + // we can expand both sides. + (Some(before_token), Some(after_token)) => { + match (before_token, after_token) { + // if they are both separators and are the same kind then advance both, + // or expand in the soft separator separator side. + (Some(before_token_kind), Some(after_token_kind)) => { + if before_token_kind == after_token_kind { + before_tokens.next(); + + // this avoid having an ending separator before crop marker. 
+ if remaining_words > 1 { + after_tokens.next(); + } + } else if before_token_kind == SeparatorKind::Hard { + after_tokens.next(); + } else { + before_tokens.next(); + } + } + // if one of the tokens is a word, we expend in the side of the word. + // left is a word, advance left. + (None, Some(_)) => { + before_tokens.next(); + remaining_words -= 1; + } + // right is a word, advance right. + (Some(_), None) => { + after_tokens.next(); + remaining_words -= 1; + } + // both are words, advance left then right if remaining_word > 0. + (None, None) => { + before_tokens.next(); + remaining_words -= 1; + + if remaining_words > 0 { + after_tokens.next(); + remaining_words -= 1; + } + } + } + } + // the end of the text is reached, advance left. + (Some(before_token), None) => { + before_tokens.next(); + if before_token.is_none() { + remaining_words -= 1; + } + } + // the start of the text is reached, advance right. + (None, Some(after_token)) => { + after_tokens.next(); + if after_token.is_none() { + remaining_words -= 1; + } + } + // no more token to add. + (None, None) => break, + } + } + + // finally, keep the byte index of each bound of the crop window. 
+ let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); + let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); + + (crop_byte_start, crop_byte_end) + } + + /// Compute the score of a match interval: + /// 1) count unique matches + /// 2) calculate distance between matches + /// 3) count ordered matches + fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { + let mut ids: Vec = Vec::with_capacity(matches.len()); + let mut order_score = 0; + let mut distance_score = 0; + + let mut iter = matches.iter().peekable(); + while let Some(m) = iter.next() { + if let Some(next_match) = iter.peek() { + // if matches are ordered + if next_match.ids.iter().min() > m.ids.iter().min() { + order_score += 1; + } + + // compute distance between matches + distance_score -= (next_match.word_position - m.word_position).min(7) as i16; + } + + ids.extend(m.ids.iter()); + } + + ids.sort_unstable(); + ids.dedup(); + let uniq_score = ids.len() as i16; + + // rank by unique match count, then by distance between matches, then by ordered match count. + (uniq_score, distance_score, order_score) + } + + /// Returns the matches interval where the score computed by match_interval_score is the best. + fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { + // we compute the matches interval if we have at least 2 matches. + if matches.len() > 1 { + // positions of the first and the last match of the best matches interval in `matches`. + let mut best_interval = (0, 0); + let mut best_interval_score = self.match_interval_score(&matches[0..=0]); + // current interval positions. + let mut interval_first = 0; + let mut interval_last = 0; + for (index, next_match) in matches.iter().enumerate().skip(1) { + // if next match would make interval gross more than crop_size, + // we compare the current interval with the best one, + // then we increase `interval_first` until next match can be added. 
+ if next_match.word_position - matches[interval_first].word_position >= crop_size { + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); + + // keep interval if it's the best + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + best_interval_score = interval_score; + } + + // advance start of the interval while interval is longer than crop_size. + while next_match.word_position - matches[interval_first].word_position + >= crop_size + { + interval_first += 1; + } + } + interval_last = index; + } + + // compute the last interval score and compare it to the best one. + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + } + + &matches[best_interval.0..=best_interval.1] + } else { + matches + } + } + + // Returns the formatted version of the original text. + pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> { + if !format_options.highlight && format_options.crop.is_none() { + // compute matches is not needed if no highlight nor crop is requested. + Cow::Borrowed(self.text) + } else { + match &self.matches { + Some((tokens, matches)) => { + // If the text has to be cropped, + // compute the best interval to crop around. + let matches = match format_options.crop { + Some(crop_size) if crop_size > 0 => { + self.find_best_match_interval(matches, crop_size) + } + _ => matches, + }; + + // If the text has to be cropped, + // crop around the best interval. + let (byte_start, byte_end) = match format_options.crop { + Some(crop_size) if crop_size > 0 => { + self.crop_bounds(tokens, matches, crop_size) + } + _ => (0, self.text.len()), + }; + + let mut formatted = Vec::new(); + + // push crop marker if it's not the start of the text. 
+ if byte_start > 0 && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + let mut byte_index = byte_start; + + if format_options.highlight { + // insert highlight markers around matches. + for m in matches { + let token = &tokens[m.token_position]; + + if byte_index < token.byte_start { + formatted.push(&self.text[byte_index..token.byte_start]); + } + + let highlight_byte_index = self.text[token.byte_start..] + .char_indices() + .enumerate() + .find(|(i, _)| *i == m.match_len) + .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); + formatted.push(self.highlight_prefix); + formatted.push(&self.text[token.byte_start..highlight_byte_index]); + formatted.push(self.highlight_suffix); + // if it's a prefix highlight, we put the end of the word after the highlight marker. + if highlight_byte_index < token.byte_end { + formatted.push(&self.text[highlight_byte_index..token.byte_end]); + } + + byte_index = token.byte_end; + } + } + + // push the rest of the text between last match and the end of crop. + if byte_index < byte_end { + formatted.push(&self.text[byte_index..byte_end]); + } + + // push crop marker if it's not the end of the text. + if byte_end < self.text.len() && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + if formatted.len() == 1 { + // avoid concatenating if there is already 1 slice. 
+ Cow::Borrowed(&self.text[byte_start..byte_end]) + } else { + Cow::Owned(formatted.concat()) + } + } + None => self.compute_matches().format(format_options), + } + } + } +} + +#[cfg(test)] +mod tests { + use std::rc::Rc; + + use charabia::TokenizerBuilder; + + use super::*; + use crate::search::matches::matching_words::MatchingWord; + + fn matching_words() -> MatchingWords { + let all = vec![ + Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), + ]; + let matching_words = vec![ + (vec![all[0].clone()], vec![0]), + (vec![all[1].clone()], vec![1]), + (vec![all[2].clone()], vec![2]), + ]; + + MatchingWords::new(matching_words) + } + + impl MatcherBuilder<'_, Vec> { + pub fn from_matching_words(matching_words: MatchingWords) -> Self { + Self::new(matching_words, TokenizerBuilder::default().build()) + } + } + + #[test] + fn format_identity() { + let matching_words = matching_words(); + + let builder = MatcherBuilder::from_matching_words(matching_words); + + let format_options = FormatOptions { highlight: false, crop: None }; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let mut matcher = builder.build(text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let mut matcher = builder.build(text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let mut matcher = builder.build(text); + // no crop and no highlight should return complete text. 
+ assert_eq!(&matcher.format(format_options), &text); + } + + #[test] + fn format_highlight() { + let matching_words = matching_words(); + + let builder = MatcherBuilder::from_matching_words(matching_words); + + let format_options = FormatOptions { highlight: true, crop: None }; + + // empty text. + let text = ""; + let mut matcher = builder.build(text); + assert_eq!(&matcher.format(format_options), ""); + + // text containing only separators. + let text = ":-)"; + let mut matcher = builder.build(text); + assert_eq!(&matcher.format(format_options), ":-)"); + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let mut matcher = builder.build(text); + // no crop should return complete text, because there is no matches. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World." + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Natalie risk her future to build a world with the boy she loves." 
+ ); + } + + #[test] + fn highlight_unicode() { + let all = vec![ + Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()), + Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), + ]; + let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])]; + + let matching_words = MatchingWords::new(matching_words); + + let builder = MatcherBuilder::from_matching_words(matching_words); + + let format_options = FormatOptions { highlight: true, crop: None }; + + // Text containing prefix match. + let text = "Ŵôřlḑôle"; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Ŵôřlḑôle" + ); + + // Text containing unicode match. + let text = "Ŵôřlḑ"; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Ŵôřlḑ" + ); + + // Text containing unicode match. + let text = "Westfália"; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Westfália" + ); + } + + #[test] + fn format_crop() { + let matching_words = matching_words(); + + let builder = MatcherBuilder::from_matching_words(matching_words); + + let format_options = FormatOptions { highlight: false, crop: Some(10) }; + + // empty text. + let text = ""; + let mut matcher = builder.build(text); + insta::assert_snapshot!( + matcher.format(format_options), + @"" + ); + + // text containing only separators. + let text = ":-)"; + let mut matcher = builder.build(text); + insta::assert_snapshot!( + matcher.format(format_options), + @":-)" + ); + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? 
Brr, it is cold!"; + let mut matcher = builder.build(text); + // no highlight should return 10 first words with a marker at the end. + insta::assert_snapshot!( + matcher.format(format_options), + @"A quick brown fox can not jump 32 feet, right…" + ); + + // Text without any match starting by a separator. + let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; + let mut matcher = builder.build(text); + // no highlight should return 10 first words with a marker at the end. + insta::assert_snapshot!( + matcher.format(format_options), + @"(A quick brown fox can not jump 32 feet, right…" + ); + + // Test phrase propagation + let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; + let mut matcher = builder.build(text); + // should crop the phrase instead of croping around the match. + insta::assert_snapshot!( + matcher.format(format_options), + @"… Split The World is a book written by Emily Henry…" + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let mut matcher = builder.build(text); + // no highlight should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…future to build a world with the boy she loves…" + ); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let mut matcher = builder.build(text); + // no highlight should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…she loves. Emily Henry: The Love That Split The World." + ); + + // Text containing a match unordered and a match ordered. 
+ let text = "The world split void void void void void void void void void split the world void void"; + let mut matcher = builder.build(text); + // crop should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" + ); + + // Text containing matches with diferent density. + let text = "split void the void void world void void void void void void void void void void split the world void void"; + let mut matcher = builder.build(text); + // crop should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" + ); + + // Text containing matches with same word. + let text = "split split split split split split void void void void void void void void void void split the world void void"; + let mut matcher = builder.build(text); + // crop should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" + ); + } + + #[test] + fn format_highlight_crop() { + let matching_words = matching_words(); + + let builder = MatcherBuilder::from_matching_words(matching_words); + + let format_options = FormatOptions { highlight: true, crop: Some(10) }; + + // empty text. + let text = ""; + let mut matcher = builder.build(text); + insta::assert_snapshot!( + matcher.format(format_options), + @"" + ); + + // text containing only separators. + let text = ":-)"; + let mut matcher = builder.build(text); + insta::assert_snapshot!( + matcher.format(format_options), + @":-)" + ); + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let mut matcher = builder.build(text); + // both should return 10 first words with a marker at the end. 
+ insta::assert_snapshot!( + matcher.format(format_options), + @"A quick brown fox can not jump 32 feet, right…" + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let mut matcher = builder.build(text); + // both should return 10 last words with a marker at the start and highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"…future to build a world with the boy she loves…" + ); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let mut matcher = builder.build(text); + // both should return 10 last words with a marker at the start and highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"…she loves. Emily Henry: The Love That Split The World." + ); + + // Text containing a match unordered and a match ordered. + let text = "The world split void void void void void void void void void split the world void void"; + let mut matcher = builder.build(text); + // crop should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" + ); + } + + #[test] + fn smaller_crop_size() { + //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 + let matching_words = matching_words(); + + let builder = MatcherBuilder::from_matching_words(matching_words); + + let text = "void void split the world void void."; + + // set a smaller crop size + let format_options = FormatOptions { highlight: false, crop: Some(2) }; + let mut matcher = builder.build(text); + // because crop size < query size, partially format matches. 
+ insta::assert_snapshot!( + matcher.format(format_options), + @"…split the…" + ); + + // set a smaller crop size + let format_options = FormatOptions { highlight: false, crop: Some(1) }; + let mut matcher = builder.build(text); + // because crop size < query size, partially format matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"…split…" + ); + + // set crop size to 0 + let format_options = FormatOptions { highlight: false, crop: Some(0) }; + let mut matcher = builder.build(text); + // because crop size is 0, crop is ignored. + insta::assert_snapshot!( + matcher.format(format_options), + @"void void split the world void void." + ); + } + + #[test] + fn partial_matches() { + let all = vec![ + Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), + Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()), + ]; + let matching_words = vec![ + (vec![all[0].clone()], vec![0]), + (vec![all[1].clone(), all[2].clone()], vec![0]), + (vec![all[3].clone()], vec![1]), + (vec![all[4].clone(), all[5].clone()], vec![1]), + (vec![all[4].clone()], vec![2]), + ]; + + let matching_words = MatchingWords::new(matching_words); + + let mut builder = MatcherBuilder::from_matching_words(matching_words); + builder.highlight_prefix("_".to_string()); + builder.highlight_suffix("_".to_string()); + + let format_options = FormatOptions { highlight: true, crop: None }; + + let text = "the do or die can't be he do and or isn't he"; + let mut matcher = builder.build(text); + insta::assert_snapshot!( + matcher.format(format_options), + @"_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_" + ); + } +} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs new file mode 100644 
index 000000000..df59634bb --- /dev/null +++ b/milli/src/search/mod.rs @@ -0,0 +1,536 @@ +use std::borrow::Cow; +use std::collections::hash_map::{Entry, HashMap}; +use std::fmt; +use std::mem::take; +use std::result::Result as StdResult; +use std::str::Utf8Error; +use std::time::Instant; + +use charabia::TokenizerBuilder; +use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; +use fst::automaton::Str; +use fst::{Automaton, IntoStreamer, Streamer}; +use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; +use log::debug; +use once_cell::sync::Lazy; +use roaring::bitmap::RoaringBitmap; + +pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET}; +use self::fst_utils::{Complement, Intersection, StartsWith, Union}; +pub use self::matches::{ + FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, +}; +use self::query_tree::QueryTreeBuilder; +use crate::error::UserError; +use crate::search::criteria::r#final::{Final, FinalResult}; +use crate::search::criteria::InitialCandidates; +use crate::{AscDesc, Criterion, DocumentId, Index, Member, Result}; + +// Building these factories is not free. 
+static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); +static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); +static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); + +mod criteria; +mod distinct; +pub mod facet; +mod fst_utils; +mod matches; +mod query_tree; + +pub struct Search<'a> { + query: Option, + // this should be linked to the String in the query + filter: Option>, + offset: usize, + limit: usize, + sort_criteria: Option>, + terms_matching_strategy: TermsMatchingStrategy, + authorize_typos: bool, + words_limit: usize, + exhaustive_number_hits: bool, + criterion_implementation_strategy: CriterionImplementationStrategy, + rtxn: &'a heed::RoTxn<'a>, + index: &'a Index, +} + +impl<'a> Search<'a> { + pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> { + Search { + query: None, + filter: None, + offset: 0, + limit: 20, + sort_criteria: None, + terms_matching_strategy: TermsMatchingStrategy::default(), + authorize_typos: true, + exhaustive_number_hits: false, + words_limit: 10, + criterion_implementation_strategy: CriterionImplementationStrategy::default(), + rtxn, + index, + } + } + + pub fn query(&mut self, query: impl Into) -> &mut Search<'a> { + self.query = Some(query.into()); + self + } + + pub fn offset(&mut self, offset: usize) -> &mut Search<'a> { + self.offset = offset; + self + } + + pub fn limit(&mut self, limit: usize) -> &mut Search<'a> { + self.limit = limit; + self + } + + pub fn sort_criteria(&mut self, criteria: Vec) -> &mut Search<'a> { + self.sort_criteria = Some(criteria); + self + } + + pub fn terms_matching_strategy(&mut self, value: TermsMatchingStrategy) -> &mut Search<'a> { + self.terms_matching_strategy = value; + self + } + + pub fn authorize_typos(&mut self, value: bool) -> &mut Search<'a> { + self.authorize_typos = value; + self + } + + pub fn words_limit(&mut self, value: usize) -> &mut Search<'a> { + self.words_limit = value; + self + } + + pub fn filter(&mut self, condition: 
Filter<'a>) -> &mut Search<'a> { + self.filter = Some(condition); + self + } + + /// Force the search to exhastivelly compute the number of candidates, + /// this will increase the search time but allows finite pagination. + pub fn exhaustive_number_hits(&mut self, exhaustive_number_hits: bool) -> &mut Search<'a> { + self.exhaustive_number_hits = exhaustive_number_hits; + self + } + + pub fn criterion_implementation_strategy( + &mut self, + strategy: CriterionImplementationStrategy, + ) -> &mut Search<'a> { + self.criterion_implementation_strategy = strategy; + self + } + + fn is_typo_authorized(&self) -> Result { + let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?; + // only authorize typos if both the index and the query allow it. + Ok(self.authorize_typos && index_authorizes_typos) + } + + pub fn execute(&self) -> Result { + // We create the query tree by spliting the query into tokens. + let before = Instant::now(); + let (query_tree, primitive_query, matching_words) = match self.query.as_ref() { + Some(query) => { + let mut builder = QueryTreeBuilder::new(self.rtxn, self.index)?; + builder.terms_matching_strategy(self.terms_matching_strategy); + + builder.authorize_typos(self.is_typo_authorized()?); + + builder.words_limit(self.words_limit); + // We make sure that the analyzer is aware of the stop words + // this ensures that the query builder is able to properly remove them. + let mut tokbuilder = TokenizerBuilder::new(); + let stop_words = self.index.stop_words(self.rtxn)?; + if let Some(ref stop_words) = stop_words { + tokbuilder.stop_words(stop_words); + } + + let tokenizer = tokbuilder.build(); + let tokens = tokenizer.tokenize(query); + builder + .build(tokens)? + .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw))) + } + None => (None, None, None), + }; + + debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed()); + + // We create the original candidates with the facet conditions results. 
+ let before = Instant::now(); + let filtered_candidates = match &self.filter { + Some(condition) => Some(condition.evaluate(self.rtxn, self.index)?), + None => None, + }; + + debug!("facet candidates: {:?} took {:.02?}", filtered_candidates, before.elapsed()); + + // We check that we are allowed to use the sort criteria, we check + // that they are declared in the sortable fields. + if let Some(sort_criteria) = &self.sort_criteria { + let sortable_fields = self.index.sortable_fields(self.rtxn)?; + for asc_desc in sort_criteria { + match asc_desc.member() { + Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => { + return Err(UserError::InvalidSortableAttribute { + field: field.to_string(), + valid_fields: sortable_fields.into_iter().collect(), + })? + } + Member::Geo(_) if !sortable_fields.contains("_geo") => { + return Err(UserError::InvalidSortableAttribute { + field: "_geo".to_string(), + valid_fields: sortable_fields.into_iter().collect(), + })? + } + _ => (), + } + } + } + + // We check that the sort ranking rule exists and throw an + // error if we try to use it and that it doesn't. + let sort_ranking_rule_missing = !self.index.criteria(self.rtxn)?.contains(&Criterion::Sort); + let empty_sort_criteria = self.sort_criteria.as_ref().map_or(true, |s| s.is_empty()); + if sort_ranking_rule_missing && !empty_sort_criteria { + return Err(UserError::SortRankingRuleMissing.into()); + } + + let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; + + match self.index.distinct_field(self.rtxn)? 
{ + None => { + let criteria = criteria_builder.build::( + query_tree, + primitive_query, + filtered_candidates, + self.sort_criteria.clone(), + self.exhaustive_number_hits, + None, + self.criterion_implementation_strategy, + )?; + self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria) + } + Some(name) => { + let field_ids_map = self.index.fields_ids_map(self.rtxn)?; + match field_ids_map.id(name) { + Some(fid) => { + let distinct = FacetDistinct::new(fid, self.index, self.rtxn); + + let criteria = criteria_builder.build( + query_tree, + primitive_query, + filtered_candidates, + self.sort_criteria.clone(), + self.exhaustive_number_hits, + Some(distinct.clone()), + self.criterion_implementation_strategy, + )?; + self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria) + } + None => Ok(SearchResult::default()), + } + } + } + } + + fn perform_sort( + &self, + mut distinct: D, + matching_words: MatchingWords, + mut criteria: Final, + ) -> Result { + let mut offset = self.offset; + let mut initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new()); + let mut excluded_candidates = self.index.soft_deleted_documents_ids(self.rtxn)?; + let mut documents_ids = Vec::new(); + + while let Some(FinalResult { candidates, initial_candidates: ic, .. }) = + criteria.next(&excluded_candidates)? 
+ { + debug!("Number of candidates found {}", candidates.len()); + + let excluded = take(&mut excluded_candidates); + let mut candidates = distinct.distinct(candidates, excluded); + + initial_candidates |= ic; + + if offset != 0 { + let discarded = candidates.by_ref().take(offset).count(); + offset = offset.saturating_sub(discarded); + } + + for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) { + documents_ids.push(candidate?); + } + + excluded_candidates |= candidates.into_excluded(); + + if documents_ids.len() == self.limit { + break; + } + } + + initial_candidates.map_inplace(|c| c - excluded_candidates); + + Ok(SearchResult { + matching_words, + candidates: initial_candidates.into_inner(), + documents_ids, + }) + } +} + +impl fmt::Debug for Search<'_> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let Search { + query, + filter, + offset, + limit, + sort_criteria, + terms_matching_strategy, + authorize_typos, + words_limit, + exhaustive_number_hits, + criterion_implementation_strategy, + rtxn: _, + index: _, + } = self; + f.debug_struct("Search") + .field("query", query) + .field("filter", filter) + .field("offset", offset) + .field("limit", limit) + .field("sort_criteria", sort_criteria) + .field("terms_matching_strategy", terms_matching_strategy) + .field("authorize_typos", authorize_typos) + .field("exhaustive_number_hits", exhaustive_number_hits) + .field("criterion_implementation_strategy", criterion_implementation_strategy) + .field("words_limit", words_limit) + .finish() + } +} + +#[derive(Default)] +pub struct SearchResult { + pub matching_words: MatchingWords, + pub candidates: RoaringBitmap, + // TODO those documents ids should be associated with their criteria scores. 
+ pub documents_ids: Vec, +} + +#[derive(Debug, Default, Clone, Copy)] +pub enum CriterionImplementationStrategy { + OnlyIterative, + OnlySetBased, + #[default] + Dynamic, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum TermsMatchingStrategy { + // remove last word first + Last, + // remove first word first + First, + // remove more frequent word first + Frequency, + // remove smallest word first + Size, + // only one of the word is mandatory + Any, + // all words are mandatory + All, +} + +impl Default for TermsMatchingStrategy { + fn default() -> Self { + Self::Last + } +} + +pub type WordDerivationsCache = HashMap<(String, bool, u8), Vec<(String, u8)>>; + +pub fn word_derivations<'c>( + word: &str, + is_prefix: bool, + max_typo: u8, + fst: &fst::Set>, + cache: &'c mut WordDerivationsCache, +) -> StdResult<&'c [(String, u8)], Utf8Error> { + match cache.entry((word.to_string(), is_prefix, max_typo)) { + Entry::Occupied(entry) => Ok(entry.into_mut()), + Entry::Vacant(entry) => { + let mut derived_words = Vec::new(); + if max_typo == 0 { + if is_prefix { + let prefix = Str::new(word).starts_with(); + let mut stream = fst.search(prefix).into_stream(); + + while let Some(word) = stream.next() { + let word = std::str::from_utf8(word)?; + derived_words.push((word.to_string(), 0)); + } + } else if fst.contains(word) { + derived_words.push((word.to_string(), 0)); + } + } else if max_typo == 1 { + let dfa = build_dfa(word, 1, is_prefix); + let starts = StartsWith(Str::new(get_first(word))); + let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); + + while let Some((word, state)) = stream.next() { + let word = std::str::from_utf8(word)?; + let d = dfa.distance(state.1); + derived_words.push((word.to_string(), d.to_u8())); + } + } else { + let starts = StartsWith(Str::new(get_first(word))); + let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); + let second_dfa = build_dfa(word, 2, is_prefix); + let second = 
Intersection(&second_dfa, &starts); + let automaton = Union(first, &second); + + let mut stream = fst.search_with_state(automaton).into_stream(); + + while let Some((found_word, state)) = stream.next() { + let found_word = std::str::from_utf8(found_word)?; + // in the case the typo is on the first letter, we know the number of typo + // is two + if get_first(found_word) != get_first(word) { + derived_words.push((found_word.to_string(), 2)); + } else { + // Else, we know that it is the second dfa that matched and compute the + // correct distance + let d = second_dfa.distance((state.1).0); + derived_words.push((found_word.to_string(), d.to_u8())); + } + } + } + Ok(entry.insert(derived_words)) + } + } +} + +fn get_first(s: &str) -> &str { + match s.chars().next() { + Some(c) => &s[..c.len_utf8()], + None => panic!("unexpected empty query"), + } +} + +pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA { + let lev = match typos { + 0 => &LEVDIST0, + 1 => &LEVDIST1, + _ => &LEVDIST2, + }; + + if is_prefix { + lev.build_prefix_dfa(word) + } else { + lev.build_dfa(word) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::index::tests::TempIndex; + + #[test] + fn test_is_authorized_typos() { + let index = TempIndex::new(); + let mut txn = index.write_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + + // default is authorized + assert!(search.is_typo_authorized().unwrap()); + + search.authorize_typos(false); + assert!(!search.is_typo_authorized().unwrap()); + + index.put_authorize_typos(&mut txn, false).unwrap(); + txn.commit().unwrap(); + + let txn = index.read_txn().unwrap(); + let mut search = Search::new(&txn, &index); + + assert!(!search.is_typo_authorized().unwrap()); + + search.authorize_typos(true); + assert!(!search.is_typo_authorized().unwrap()); + } + + #[test] + fn test_one_typos_tolerance() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + 
let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap(); + + assert_eq!(found, &[("zealand".to_string(), 1)]); + } + + #[test] + fn test_one_typos_first_letter() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap(); + + assert_eq!(found, &[]); + } + + #[test] + fn test_two_typos_tolerance() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap(); + + assert_eq!(found, &[("zealand".to_string(), 2)]); + } + + #[test] + fn test_two_typos_first_letter() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap(); + + assert_eq!(found, &[("zealand".to_string(), 2)]); + } + + #[test] + fn test_prefix() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap(); + + assert_eq!(found, &[("zealand".to_string(), 0)]); + } + + #[test] + fn test_bad_prefix() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap(); + + assert_eq!(found, &[]); + } + + #[test] + fn test_prefix_with_typo() { + let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + let mut cache = HashMap::new(); + let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap(); + + assert_eq!(found, &[("zealand".to_string(), 1)]); + } +} diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs new file 
mode 100755 index 000000000..00c85aaba --- /dev/null +++ b/milli/src/search/query_tree.rs @@ -0,0 +1,1471 @@ +use std::borrow::Cow; +use std::cmp::max; +use std::collections::hash_map::Entry; +use std::collections::HashMap; +use std::hash::Hash; +use std::rc::Rc; +use std::{fmt, mem}; + +use charabia::normalizer::NormalizedTokenIter; +use charabia::{SeparatorKind, TokenKind}; +use roaring::RoaringBitmap; +use slice_group_by::GroupBy; + +use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId}; +use crate::search::TermsMatchingStrategy; +use crate::{CboRoaringBitmapLenCodec, Index, MatchingWords, Result}; + +type IsOptionalWord = bool; +type IsPrefix = bool; + +#[derive(Clone, PartialEq, Eq, Hash)] +pub enum Operation { + And(Vec), + // series of consecutive non prefix and exact words + // `None` means a stop word. + Phrase(Vec>), + Or(IsOptionalWord, Vec), + Query(Query), +} + +impl fmt::Debug for Operation { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn pprint_tree(f: &mut fmt::Formatter<'_>, op: &Operation, depth: usize) -> fmt::Result { + match op { + Operation::And(children) => { + writeln!(f, "{:1$}AND", "", depth * 2)?; + children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) + } + Operation::Phrase(children) => { + writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2) + } + Operation::Or(true, children) => { + writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?; + children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) + } + Operation::Or(false, children) => { + writeln!(f, "{:1$}OR", "", depth * 2)?; + children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) + } + Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2), + } + } + + pprint_tree(f, self, 0) + } +} + +impl Operation { + fn and(mut ops: Vec) -> Self { + if ops.len() == 1 { + ops.pop().unwrap() + } else { + Self::And(ops) + } + } + + pub fn or(word_branch: IsOptionalWord, mut ops: Vec) -> Self { + if ops.len() == 1 { + 
ops.pop().unwrap() + } else { + let ops = ops + .into_iter() + .flat_map(|o| match o { + Operation::Or(wb, children) if wb == word_branch => children, + op => vec![op], + }) + .collect(); + Self::Or(word_branch, ops) + } + } + + fn phrase(mut words: Vec>) -> Self { + if words.len() == 1 { + if let Some(word) = words.pop().unwrap() { + Self::Query(Query { prefix: false, kind: QueryKind::exact(word) }) + } else { + Self::Phrase(words) + } + } else { + Self::Phrase(words) + } + } + + pub fn query(&self) -> Option<&Query> { + match self { + Operation::Query(query) => Some(query), + _ => None, + } + } +} + +#[derive(Clone, Eq, PartialEq, Hash)] +pub struct Query { + pub prefix: IsPrefix, + pub kind: QueryKind, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum QueryKind { + Tolerant { typo: u8, word: String }, + Exact { original_typo: u8, word: String }, +} + +impl QueryKind { + pub fn exact(word: String) -> Self { + QueryKind::Exact { original_typo: 0, word } + } + + pub fn tolerant(typo: u8, word: String) -> Self { + QueryKind::Tolerant { typo, word } + } + + pub fn typo(&self) -> u8 { + match self { + QueryKind::Tolerant { typo, .. } => *typo, + QueryKind::Exact { original_typo, .. } => *original_typo, + } + } + + pub fn word(&self) -> &str { + match self { + QueryKind::Tolerant { word, .. } => word, + QueryKind::Exact { word, .. } => word, + } + } +} + +impl fmt::Debug for Query { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Query { prefix, kind } = self; + let prefix = if *prefix { String::from("Prefix") } else { String::default() }; + match kind { + QueryKind::Exact { word, .. 
} => { + f.debug_struct(&(prefix + "Exact")).field("word", &word).finish() + } + QueryKind::Tolerant { typo, word } => f + .debug_struct(&(prefix + "Tolerant")) + .field("word", &word) + .field("max typo", &typo) + .finish(), + } + } +} + +trait Context { + fn word_docids(&self, word: &str) -> heed::Result>; + fn synonyms>(&self, words: &[S]) -> heed::Result>>>; + fn word_documents_count(&self, word: &str) -> heed::Result> { + match self.word_docids(word)? { + Some(rb) => Ok(Some(rb.len())), + None => Ok(None), + } + } + /// Returns the minimum word len for 1 and 2 typos. + fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; + fn exact_words(&self) -> Option<&fst::Set>>; + fn word_pair_frequency( + &self, + left_word: &str, + right_word: &str, + proximity: u8, + ) -> heed::Result>; +} + +/// The query tree builder is the interface to build a query tree. +pub struct QueryTreeBuilder<'a> { + rtxn: &'a heed::RoTxn<'a>, + index: &'a Index, + terms_matching_strategy: TermsMatchingStrategy, + authorize_typos: bool, + words_limit: Option, + exact_words: Option>>, +} + +impl<'a> Context for QueryTreeBuilder<'a> { + fn word_docids(&self, word: &str) -> heed::Result> { + self.index.word_docids.get(self.rtxn, word) + } + + fn synonyms>(&self, words: &[S]) -> heed::Result>>> { + self.index.words_synonyms(self.rtxn, words) + } + + fn word_documents_count(&self, word: &str) -> heed::Result> { + self.index.word_documents_count(self.rtxn, word) + } + + fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { + let one = self.index.min_word_len_one_typo(self.rtxn)?; + let two = self.index.min_word_len_two_typos(self.rtxn)?; + Ok((one, two)) + } + + fn exact_words(&self) -> Option<&fst::Set>> { + self.exact_words.as_ref() + } + + fn word_pair_frequency( + &self, + left_word: &str, + right_word: &str, + proximity: u8, + ) -> heed::Result> { + let key = (proximity, left_word, right_word); + self.index + .word_pair_proximity_docids + .remap_data_type::() + .get(self.rtxn, 
&key) + } +} + +impl<'a> QueryTreeBuilder<'a> { + /// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn` + /// and an Index `index`. + pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Result { + Ok(Self { + rtxn, + index, + terms_matching_strategy: TermsMatchingStrategy::default(), + authorize_typos: true, + words_limit: None, + exact_words: index.exact_words(rtxn)?, + }) + } + + /// if `terms_matching_strategy` is set to `All` the query tree will be + /// generated forcing all query words to be present in each matching documents + /// (the criterion `words` will be ignored). + /// default value if not called: `Last` + pub fn terms_matching_strategy( + &mut self, + terms_matching_strategy: TermsMatchingStrategy, + ) -> &mut Self { + self.terms_matching_strategy = terms_matching_strategy; + self + } + + /// if `authorize_typos` is set to `false` the query tree will be generated + /// forcing all query words to match documents without any typo + /// (the criterion `typo` will be ignored). + /// default value if not called: `true` + pub fn authorize_typos(&mut self, authorize_typos: bool) -> &mut Self { + self.authorize_typos = authorize_typos; + self + } + + /// Limit words and phrases that will be taken for query building. + /// Any beyond `words_limit` will be ignored. 
+ pub fn words_limit(&mut self, words_limit: usize) -> &mut Self { + self.words_limit = Some(words_limit); + self + } + + /// Build the query tree: + /// - if `terms_matching_strategy` is set to `All` the query tree will be + /// generated forcing all query words to be present in each matching documents + /// (the criterion `words` will be ignored) + /// - if `authorize_typos` is set to `false` the query tree will be generated + /// forcing all query words to match documents without any typo + /// (the criterion `typo` will be ignored) + pub fn build>( + &self, + query: NormalizedTokenIter, + ) -> Result> { + let primitive_query = create_primitive_query(query, self.words_limit); + if !primitive_query.is_empty() { + let qt = create_query_tree( + self, + self.terms_matching_strategy, + self.authorize_typos, + &primitive_query, + )?; + let matching_words = + create_matching_words(self, self.authorize_typos, &primitive_query)?; + Ok(Some((qt, primitive_query, matching_words))) + } else { + Ok(None) + } + } +} + +/// Split the word depending on the frequency of pairs near together in the database documents. +fn split_best_frequency<'a>( + ctx: &impl Context, + word: &'a str, +) -> heed::Result> { + let chars = word.char_indices().skip(1); + let mut best = None; + + for (i, _) in chars { + let (left, right) = word.split_at(i); + + let pair_freq = ctx.word_pair_frequency(left, right, 1)?.unwrap_or(0); + + if pair_freq != 0 && best.map_or(true, |(old, _, _)| pair_freq > old) { + best = Some((pair_freq, left, right)); + } + } + + Ok(best.map(|(_, left, right)| (left, right))) +} + +#[derive(Clone)] +pub struct TypoConfig<'a> { + pub max_typos: u8, + pub word_len_one_typo: u8, + pub word_len_two_typo: u8, + pub exact_words: Option<&'a fst::Set>>, +} + +/// Return the `QueryKind` of a word depending on `authorize_typos` +/// and the provided word length. 
+fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind { + if authorize_typos && !config.exact_words.map_or(false, |s| s.contains(&word)) { + let count = word.chars().count().min(u8::MAX as usize) as u8; + if count < config.word_len_one_typo { + QueryKind::exact(word) + } else if count < config.word_len_two_typo { + QueryKind::tolerant(1.min(config.max_typos), word) + } else { + QueryKind::tolerant(2.min(config.max_typos), word) + } + } else { + QueryKind::exact(word) + } +} + +/// Fetch synonyms from the `Context` for the provided word +/// and create the list of operations for the query tree +fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result>> { + let synonyms = ctx.synonyms(word)?; + Ok(synonyms.map(|synonyms| { + synonyms + .into_iter() + .map(|synonym| { + if synonym.len() == 1 { + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact(synonym[0].clone()), + }) + } else { + Operation::Phrase(synonym.into_iter().map(Some).collect()) + } + }) + .collect() + })) +} + +/// Main function that creates the final query tree from the primitive query. +fn create_query_tree( + ctx: &impl Context, + terms_matching_strategy: TermsMatchingStrategy, + authorize_typos: bool, + query: &[PrimitiveQueryPart], +) -> Result { + /// Matches on the `PrimitiveQueryPart` and create an operation from it. + fn resolve_primitive_part( + ctx: &impl Context, + authorize_typos: bool, + part: PrimitiveQueryPart, + ) -> Result { + match part { + // 1. try to split word in 2 + // 2. try to fetch synonyms + // 3. create an operation containing the word + // 4. wrap all in an OR operation + PrimitiveQueryPart::Word(word, prefix) => { + let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); + if let Some((left, right)) = split_best_frequency(ctx, &word)? 
{ + children.push(Operation::Phrase(vec![ + Some(left.to_string()), + Some(right.to_string()), + ])); + } + let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; + let exact_words = ctx.exact_words(); + let config = + TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; + children.push(Operation::Query(Query { + prefix, + kind: typos(word, authorize_typos, config), + })); + Ok(Operation::or(false, children)) + } + // create a CONSECUTIVE operation wrapping all word in the phrase + PrimitiveQueryPart::Phrase(words) => Ok(Operation::phrase(words)), + } + } + + /// Create all ngrams 1..=3 generating query tree branches. + fn ngrams( + ctx: &impl Context, + authorize_typos: bool, + query: &[PrimitiveQueryPart], + any_words: bool, + ) -> Result { + const MAX_NGRAM: usize = 3; + let mut op_children = Vec::new(); + + for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) { + let mut or_op_children = Vec::new(); + + for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { + if let Some(group) = sub_query.get(..ngram) { + let mut and_op_children = Vec::new(); + let tail = &sub_query[ngram..]; + let is_last = tail.is_empty(); + + match group { + [part] => { + let operation = + resolve_primitive_part(ctx, authorize_typos, part.clone())?; + and_op_children.push(operation); + } + words => { + let is_prefix = words.last().map_or(false, |part| part.is_prefix()); + let words: Vec<_> = words + .iter() + .filter_map(|part| { + if let PrimitiveQueryPart::Word(word, _) = part { + Some(word.as_str()) + } else { + None + } + }) + .collect(); + let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); + let concat = words.concat(); + let (word_len_one_typo, word_len_two_typo) = + ctx.min_word_len_for_typo()?; + let exact_words = ctx.exact_words(); + let config = TypoConfig { + max_typos: 1, + word_len_one_typo, + word_len_two_typo, + exact_words, + }; + let query = Query { + prefix: is_prefix, + kind: typos(concat, 
authorize_typos, config), + }; + operations.push(Operation::Query(query)); + and_op_children.push(Operation::or(false, operations)); + } + } + + if !is_last { + let ngrams = ngrams(ctx, authorize_typos, tail, any_words)?; + and_op_children.push(ngrams); + } + + if any_words { + or_op_children.push(Operation::or(false, and_op_children)); + } else { + or_op_children.push(Operation::and(and_op_children)); + } + } + } + op_children.push(Operation::or(false, or_op_children)); + } + + if any_words { + Ok(Operation::or(false, op_children)) + } else { + Ok(Operation::and(op_children)) + } + } + + let number_phrases = query.iter().filter(|p| p.is_phrase()).count(); + let remove_count = query.len() - max(number_phrases, 1); + if remove_count == 0 { + return ngrams(ctx, authorize_typos, query, false); + } + + let mut operation_children = Vec::new(); + let mut query = query.to_vec(); + for _ in 0..=remove_count { + let pos = match terms_matching_strategy { + TermsMatchingStrategy::All => return ngrams(ctx, authorize_typos, &query, false), + TermsMatchingStrategy::Any => { + let operation = Operation::Or( + true, + vec![ + // branch allowing matching documents to contains any query word. + ngrams(ctx, authorize_typos, &query, true)?, + // branch forcing matching documents to contains all the query words, + // keeping this documents of the top of the resulted list. 
+ ngrams(ctx, authorize_typos, &query, false)?, + ], + ); + + return Ok(operation); + } + TermsMatchingStrategy::Last => query + .iter() + .enumerate() + .filter(|(_, part)| !part.is_phrase()) + .last() + .map(|(pos, _)| pos), + TermsMatchingStrategy::First => { + query.iter().enumerate().find(|(_, part)| !part.is_phrase()).map(|(pos, _)| pos) + } + TermsMatchingStrategy::Size => query + .iter() + .enumerate() + .filter(|(_, part)| !part.is_phrase()) + .min_by_key(|(_, part)| match part { + PrimitiveQueryPart::Word(s, _) => s.len(), + _ => unreachable!(), + }) + .map(|(pos, _)| pos), + TermsMatchingStrategy::Frequency => query + .iter() + .enumerate() + .filter(|(_, part)| !part.is_phrase()) + .max_by_key(|(_, part)| match part { + PrimitiveQueryPart::Word(s, _) => { + ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value()) + } + _ => unreachable!(), + }) + .map(|(pos, _)| pos), + }; + + // compute and push the current branch on the front + operation_children.insert(0, ngrams(ctx, authorize_typos, &query, false)?); + // remove word from query before creating an new branch + match pos { + Some(pos) => query.remove(pos), + None => break, + }; + } + + Ok(Operation::or(true, operation_children)) +} + +#[derive(Default, Debug)] +struct MatchingWordCache { + all: Vec>, + map: HashMap<(String, u8, bool), Rc>, +} +impl MatchingWordCache { + fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Option> { + match self.map.entry((word.clone(), typo, prefix)) { + Entry::Occupied(idx) => Some(idx.get().clone()), + Entry::Vacant(vacant) => { + let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?); + self.all.push(matching_word.clone()); + vacant.insert(matching_word.clone()); + Some(matching_word) + } + } + // To deactivate the cache, for testing purposes, use the following instead: + // let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?); + // self.all.push(matching_word.clone()); + // Some(matching_word) + } +} + +/// 
Main function that matchings words used for crop and highlight. +fn create_matching_words( + ctx: &impl Context, + authorize_typos: bool, + query: &[PrimitiveQueryPart], +) -> Result { + /// Matches on the `PrimitiveQueryPart` and create matchings words from it. + fn resolve_primitive_part( + ctx: &impl Context, + authorize_typos: bool, + part: PrimitiveQueryPart, + matching_words: &mut Vec<(Vec>, Vec)>, + matching_word_cache: &mut MatchingWordCache, + id: PrimitiveWordId, + ) -> Result<()> { + match part { + // 1. try to split word in 2 + // 2. try to fetch synonyms + PrimitiveQueryPart::Word(word, prefix) => { + if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? { + for synonym in synonyms { + // Require that all words of the synonym have a corresponding MatchingWord + // before adding any of its words to the matching_words result. + if let Some(synonym_matching_words) = synonym + .into_iter() + .map(|word| matching_word_cache.insert(word, 0, false)) + .collect() + { + matching_words.push((synonym_matching_words, vec![id])); + } + } + } + + if let Some((left, right)) = split_best_frequency(ctx, &word)? { + // Require that both left and right words have a corresponding MatchingWord + // before adding them to the matching_words result + if let Some(left) = matching_word_cache.insert(left.to_string(), 0, false) { + if let Some(right) = matching_word_cache.insert(right.to_string(), 0, false) + { + matching_words.push((vec![left, right], vec![id])); + } + } + } + + let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; + let exact_words = ctx.exact_words(); + let config = + TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; + + let matching_word = match typos(word, authorize_typos, config) { + QueryKind::Exact { word, .. 
} => matching_word_cache.insert(word, 0, prefix), + QueryKind::Tolerant { typo, word } => { + matching_word_cache.insert(word, typo, prefix) + } + }; + if let Some(matching_word) = matching_word { + matching_words.push((vec![matching_word], vec![id])); + } + } + // create a CONSECUTIVE matchings words wrapping all word in the phrase + PrimitiveQueryPart::Phrase(words) => { + let ids: Vec<_> = + (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); + // Require that all words of the phrase have a corresponding MatchingWord + // before adding any of them to the matching_words result + if let Some(phrase_matching_words) = words + .into_iter() + .flatten() + .map(|w| matching_word_cache.insert(w, 0, false)) + .collect() + { + matching_words.push((phrase_matching_words, ids)); + } + } + } + + Ok(()) + } + + /// Create all ngrams 1..=3 generating query tree branches. + fn ngrams( + ctx: &impl Context, + authorize_typos: bool, + query: &[PrimitiveQueryPart], + matching_words: &mut Vec<(Vec>, Vec)>, + matching_word_cache: &mut MatchingWordCache, + mut id: PrimitiveWordId, + ) -> Result<()> { + const MAX_NGRAM: usize = 3; + + for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) { + for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { + if let Some(group) = sub_query.get(..ngram) { + let tail = &sub_query[ngram..]; + let is_last = tail.is_empty(); + + match group { + [part] => { + resolve_primitive_part( + ctx, + authorize_typos, + part.clone(), + matching_words, + matching_word_cache, + id, + )?; + } + words => { + let is_prefix = words.last().map_or(false, |part| part.is_prefix()); + let words: Vec<_> = words + .iter() + .filter_map(|part| { + if let PrimitiveQueryPart::Word(word, _) = part { + Some(word.as_str()) + } else { + None + } + }) + .collect(); + let ids: Vec<_> = (0..words.len()) + .into_iter() + .map(|i| id + i as PrimitiveWordId) + .collect(); + + if let Some(synonyms) = ctx.synonyms(&words)? 
{ + for synonym in synonyms { + if let Some(synonym) = synonym + .into_iter() + .map(|syn| matching_word_cache.insert(syn, 0, false)) + .collect() + { + matching_words.push((synonym, ids.clone())); + } + } + } + let word = words.concat(); + let (word_len_one_typo, word_len_two_typo) = + ctx.min_word_len_for_typo()?; + let exact_words = ctx.exact_words(); + let config = TypoConfig { + max_typos: 1, + word_len_one_typo, + word_len_two_typo, + exact_words, + }; + let matching_word = match typos(word, authorize_typos, config) { + QueryKind::Exact { word, .. } => { + matching_word_cache.insert(word, 0, is_prefix) + } + QueryKind::Tolerant { typo, word } => { + matching_word_cache.insert(word, typo, is_prefix) + } + }; + if let Some(matching_word) = matching_word { + matching_words.push((vec![matching_word], ids)); + } + } + } + + if !is_last { + ngrams( + ctx, + authorize_typos, + tail, + matching_words, + matching_word_cache, + id + 1, + )?; + } + } + } + id += sub_query.iter().map(|x| x.len() as PrimitiveWordId).sum::(); + } + + Ok(()) + } + + let mut matching_word_cache = MatchingWordCache::default(); + let mut matching_words = Vec::new(); + ngrams(ctx, authorize_typos, query, &mut matching_words, &mut matching_word_cache, 0)?; + Ok(MatchingWords::new(matching_words)) +} + +pub type PrimitiveQuery = Vec; + +#[derive(Debug, Clone)] +pub enum PrimitiveQueryPart { + Phrase(Vec>), + Word(String, IsPrefix), +} + +impl PrimitiveQueryPart { + fn is_phrase(&self) -> bool { + matches!(self, Self::Phrase(_)) + } + + fn is_prefix(&self) -> bool { + matches!(self, Self::Word(_, is_prefix) if *is_prefix) + } + + fn len(&self) -> usize { + match self { + Self::Phrase(words) => words.len(), + Self::Word(_, _) => 1, + } + } +} + +/// Create primitive query from tokenized query string, +/// the primitive query is an intermediate state to build the query tree. 
+fn create_primitive_query( + query: NormalizedTokenIter, + words_limit: Option, +) -> PrimitiveQuery +where + A: AsRef<[u8]>, +{ + let mut primitive_query = Vec::new(); + let mut phrase = Vec::new(); + let mut quoted = false; + + let parts_limit = words_limit.unwrap_or(usize::MAX); + + let mut peekable = query.peekable(); + while let Some(token) = peekable.next() { + // early return if word limit is exceeded + if primitive_query.len() >= parts_limit { + return primitive_query; + } + + match token.kind { + TokenKind::Word | TokenKind::StopWord => { + // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, + // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, + // 3. if the word is the last token of the query we push it as a prefix word. + if quoted { + if let TokenKind::StopWord = token.kind { + phrase.push(None) + } else { + phrase.push(Some(token.lemma().to_string())); + } + } else if peekable.peek().is_some() { + if let TokenKind::StopWord = token.kind { + } else { + primitive_query + .push(PrimitiveQueryPart::Word(token.lemma().to_string(), false)); + } + } else { + primitive_query.push(PrimitiveQueryPart::Word(token.lemma().to_string(), true)); + } + } + TokenKind::Separator(separator_kind) => { + let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); + // swap quoted state if we encounter a double quote + if quote_count % 2 != 0 { + quoted = !quoted; + } + // if there is a quote or a hard separator we close the phrase. + if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) + { + primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase))); + } + } + _ => (), + } + } + + // If a quote is never closed, we consider all of the end of the query as a phrase. 
+ if !phrase.is_empty() { + primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase))); + } + + primitive_query +} + +/// Returns the maximum number of typos that this Operation allows. +pub fn maximum_typo(operation: &Operation) -> usize { + use Operation::{And, Or, Phrase, Query}; + match operation { + Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0), + And(ops) => ops.iter().map(maximum_typo).sum::(), + Query(q) => q.kind.typo() as usize, + // no typo allowed in phrases + Phrase(_) => 0, + } +} + +/// Returns the maximum proximity that this Operation allows. +pub fn maximum_proximity(operation: &Operation) -> usize { + use Operation::{And, Or, Phrase, Query}; + match operation { + Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0), + And(ops) => { + ops.iter().map(maximum_proximity).sum::() + ops.len().saturating_sub(1) * 7 + } + Query(_) | Phrase(_) => 0, + } +} + +#[cfg(test)] +mod test { + use std::collections::HashMap; + + use charabia::Tokenize; + use maplit::hashmap; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; + + use super::*; + use crate::index::tests::TempIndex; + use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; + + #[derive(Debug)] + struct TestContext { + synonyms: HashMap, Vec>>, + postings: HashMap, + exact_words: Option>>, + } + + impl TestContext { + fn build>( + &self, + terms_matching_strategy: TermsMatchingStrategy, + authorize_typos: bool, + words_limit: Option, + query: NormalizedTokenIter, + ) -> Result> { + let primitive_query = create_primitive_query(query, words_limit); + if !primitive_query.is_empty() { + let qt = create_query_tree( + self, + terms_matching_strategy, + authorize_typos, + &primitive_query, + )?; + Ok(Some((qt, primitive_query))) + } else { + Ok(None) + } + } + } + + impl Context for TestContext { + fn word_docids(&self, word: &str) -> heed::Result> { + Ok(self.postings.get(word).cloned()) + } + + fn synonyms>(&self, words: &[S]) -> 
heed::Result>>> { + let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); + Ok(self.synonyms.get(&words).cloned()) + } + + fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { + Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) + } + + fn exact_words(&self) -> Option<&fst::Set>> { + self.exact_words.as_ref() + } + + fn word_pair_frequency( + &self, + left_word: &str, + right_word: &str, + _proximity: u8, + ) -> heed::Result> { + match self.word_docids(&format!("{} {}", left_word, right_word))? { + Some(rb) => Ok(Some(rb.len())), + None => Ok(None), + } + } + } + + impl Default for TestContext { + fn default() -> TestContext { + let mut rng = StdRng::seed_from_u64(102); + let rng = &mut rng; + + fn random_postings(rng: &mut R, len: usize) -> RoaringBitmap { + let mut values = Vec::::with_capacity(len); + while values.len() != len { + values.push(rng.gen()); + } + values.sort_unstable(); + RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap() + } + + let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap(); + let exact_words = + Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap()); + + TestContext { + synonyms: hashmap! 
{ + vec![String::from("hello")] => vec![ + vec![String::from("hi")], + vec![String::from("good"), String::from("morning")], + ], + vec![String::from("world")] => vec![ + vec![String::from("earth")], + vec![String::from("nature")], + ], + // new york city + vec![String::from("nyc")] => vec![ + vec![String::from("new"), String::from("york")], + vec![String::from("new"), String::from("york"), String::from("city")], + ], + vec![String::from("new"), String::from("york")] => vec![ + vec![String::from("nyc")], + vec![String::from("new"), String::from("york"), String::from("city")], + ], + vec![String::from("new"), String::from("york"), String::from("city")] => vec![ + vec![String::from("nyc")], + vec![String::from("new"), String::from("york")], + ], + }, + postings: hashmap! { + String::from("hello") => random_postings(rng, 1500), + String::from("hi") => random_postings(rng, 4000), + String::from("word") => random_postings(rng, 2500), + String::from("split") => random_postings(rng, 400), + String::from("ngrams") => random_postings(rng, 1400), + String::from("world") => random_postings(rng, 15_000), + String::from("earth") => random_postings(rng, 8000), + String::from("2021") => random_postings(rng, 100), + String::from("2020") => random_postings(rng, 500), + String::from("is") => random_postings(rng, 50_000), + String::from("this") => random_postings(rng, 50_000), + String::from("good") => random_postings(rng, 1250), + String::from("morning") => random_postings(rng, 125), + String::from("word split") => random_postings(rng, 5000), + String::from("quick brownfox") => random_postings(rng, 7000), + String::from("quickbrown fox") => random_postings(rng, 8000), + }, + exact_words, + } + } + } + + #[test] + fn prefix() { + let query = "hey friends"; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + 
Exact { word: "hey" } + PrefixTolerant { word: "friends", max typo: 1 } + PrefixTolerant { word: "heyfriends", max typo: 1 } + "###); + } + + #[test] + fn no_prefix() { + let query = "hey friends "; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "hey" } + Tolerant { word: "friends", max typo: 1 } + Tolerant { word: "heyfriends", max typo: 1 } + "###); + } + + #[test] + fn synonyms() { + let query = "hello world "; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + OR + Exact { word: "hi" } + PHRASE [Some("good"), Some("morning")] + Tolerant { word: "hello", max typo: 1 } + OR + Exact { word: "earth" } + Exact { word: "nature" } + Tolerant { word: "world", max typo: 1 } + Tolerant { word: "helloworld", max typo: 1 } + "###); + } + + #[test] + fn simple_synonyms() { + let query = "nyc"; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::Last, true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR + PHRASE [Some("new"), Some("york")] + PHRASE [Some("new"), Some("york"), Some("city")] + PrefixExact { word: "nyc" } + "###); + } + + #[test] + fn complex_synonyms() { + let query = "new york city "; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "new" } + OR + AND + Exact { word: "york" } + Exact { word: "city" } + Tolerant { word: "yorkcity", max typo: 1 } + AND + OR + Exact { word: "nyc" } + PHRASE 
[Some("new"), Some("york"), Some("city")] + Tolerant { word: "newyork", max typo: 1 } + Exact { word: "city" } + Exact { word: "nyc" } + PHRASE [Some("new"), Some("york")] + Tolerant { word: "newyorkcity", max typo: 1 } + "###); + } + + #[test] + fn ngrams() { + let query = "n grams "; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "n" } + Tolerant { word: "grams", max typo: 1 } + Tolerant { word: "ngrams", max typo: 1 } + "###); + } + + #[test] + fn word_split() { + let query = "wordsplit fish "; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + OR + PHRASE [Some("word"), Some("split")] + Tolerant { word: "wordsplit", max typo: 2 } + Exact { word: "fish" } + Tolerant { word: "wordsplitfish", max typo: 1 } + "###); + } + + #[test] + fn word_split_choose_pair_with_max_freq() { + let query = "quickbrownfox"; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR + PHRASE [Some("quickbrown"), Some("fox")] + PrefixTolerant { word: "quickbrownfox", max typo: 2 } + "###); + } + + #[test] + fn phrase() { + let query = "\"hey friends\" \" \" \"wooop"; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + AND + PHRASE [Some("hey"), Some("friends")] + Exact { word: "wooop" } + "###); + } + + #[test] + fn phrase_2() { + // https://github.com/meilisearch/meilisearch/issues/2722 + 
let query = "coco \"harry\""; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::default(), true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR(WORD) + Exact { word: "harry" } + AND + Exact { word: "coco" } + Exact { word: "harry" } + "###); + } + + #[test] + fn phrase_with_hard_separator() { + let query = "\"hey friends. wooop wooop\""; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + AND + PHRASE [Some("hey"), Some("friends")] + PHRASE [Some("wooop"), Some("wooop")] + "###); + } + + #[test] + fn optional_word() { + let query = "hey my friend "; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::default(), true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR(WORD) + Exact { word: "hey" } + OR + AND + Exact { word: "hey" } + Exact { word: "my" } + Tolerant { word: "heymy", max typo: 1 } + OR + AND + Exact { word: "hey" } + OR + AND + Exact { word: "my" } + Tolerant { word: "friend", max typo: 1 } + Tolerant { word: "myfriend", max typo: 1 } + AND + Tolerant { word: "heymy", max typo: 1 } + Tolerant { word: "friend", max typo: 1 } + Tolerant { word: "heymyfriend", max typo: 1 } + "###); + } + + #[test] + fn optional_word_phrase() { + let query = "\"hey my\""; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::default(), true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + PHRASE [Some("hey"), Some("my")] + "###); + } + + #[test] + fn optional_word_multiple_phrases() { + let query = r#""hey" my good "friend""#; + let tokens = query.tokenize(); + + let (query_tree, _) = 
TestContext::default() + .build(TermsMatchingStrategy::default(), true, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR(WORD) + AND + Exact { word: "hey" } + Exact { word: "friend" } + AND + Exact { word: "hey" } + Exact { word: "my" } + Exact { word: "friend" } + AND + Exact { word: "hey" } + OR + AND + Exact { word: "my" } + Exact { word: "good" } + Tolerant { word: "mygood", max typo: 1 } + Exact { word: "friend" } + "###); + } + + #[test] + fn no_typo() { + let query = "hey friends "; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, false, None, tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "hey" } + Exact { word: "friends" } + Exact { word: "heyfriends" } + "###); + } + + #[test] + fn words_limit() { + let query = "\"hey my\" good friend"; + let tokens = query.tokenize(); + + let (query_tree, _) = TestContext::default() + .build(TermsMatchingStrategy::All, false, Some(2), tokens) + .unwrap() + .unwrap(); + + insta::assert_debug_snapshot!(query_tree, @r###" + AND + PHRASE [Some("hey"), Some("my")] + Exact { word: "good" } + "###); + } + + #[test] + fn test_min_word_len_typo() { + let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap(); + let config = TypoConfig { + max_typos: 2, + word_len_one_typo: 5, + word_len_two_typo: 7, + exact_words: Some(&exact_words), + }; + + assert_eq!( + typos("hello".to_string(), true, config.clone()), + QueryKind::Tolerant { typo: 1, word: "hello".to_string() } + ); + + assert_eq!( + typos("hell".to_string(), true, config.clone()), + QueryKind::exact("hell".to_string()) + ); + + assert_eq!( + typos("verylongword".to_string(), true, config.clone()), + QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() } + ); + } + + #[test] + fn test_dont_create_matching_word_for_long_words() { + let index = TempIndex::new(); 
+ let rtxn = index.read_txn().unwrap(); + let query = "what a supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious house"; + let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); + builder.words_limit(10); + let (_, _, matching_words) = builder.build(query.tokenize()).unwrap().unwrap(); + insta::assert_snapshot!(format!("{matching_words:?}"), @r###" + [ + ([MatchingWord { word: "house", typo: 1, prefix: true }], [3]) + ([MatchingWord { word: "house", typo: 1, prefix: true }], [2]) + ([MatchingWord { word: "whata", typo: 1, prefix: false }], [0, 1]) + ([MatchingWord { word: "house", typo: 1, prefix: true }], [2]) + ([MatchingWord { word: "house", typo: 1, prefix: true }], [1]) + ([MatchingWord { word: "what", typo: 0, prefix: false }], [0]) + ([MatchingWord { word: "a", typo: 0, prefix: false }], [1]) + ] + "###); + } + + #[test] + fn disable_typo_on_word() { + let query = "goodbye"; + let tokens = query.tokenize(); + + let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner(); + let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap()); + let context = TestContext { exact_words, ..Default::default() }; + let (query_tree, _) = + context.build(TermsMatchingStrategy::All, true, Some(2), tokens).unwrap().unwrap(); + + assert!(matches!( + query_tree, + Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } }) + )); + } + + // The memory usage test below is disabled because `cargo test` runs multiple tests in parallel, + // which invalidates the measurements of memory usage. Nevertheless, it is a useful test to run + // manually from time to time, so I kept it here, commented-out. 
+ + // use std::alloc::{GlobalAlloc, System}; + // use std::sync::atomic::{self, AtomicI64}; + // + // #[global_allocator] + // static ALLOC: CountingAlloc = + // CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) }; + // + // pub struct CountingAlloc { + // pub resident: AtomicI64, + // pub allocated: AtomicI64, + // } + // unsafe impl GlobalAlloc for CountingAlloc { + // unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { + // self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); + // self.resident.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); + // + // System.alloc(layout) + // } + // + // unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { + // self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed); + // System.dealloc(ptr, layout) + // } + // } + // + // #[test] + // fn memory_usage_of_ten_word_query() { + // let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst); + // let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst); + // + // let index = TempIndex::new(); + // let rtxn = index.read_txn().unwrap(); + // let query = "a beautiful summer house by the beach overlooking what seems"; + // let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); + // builder.words_limit(10); + // let x = builder.build(query.tokenize()).unwrap().unwrap(); + // let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst); + // let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst); + // + // // Weak check on the memory usage + // // Don't keep more than 5MB. (Arguably 5MB is already too high) + // assert!(resident_after - resident_before < 5_000_000); + // // Don't allocate more than 10MB. + // assert!(allocated_after - allocated_before < 10_000_000); + // + // // Use these snapshots to measure the exact memory usage. + // // The values below were correct at the time I wrote them. 
+ // // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950"); + // // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502"); + // + // // Note, with the matching word cache deactivated, the memory usage was: + // // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697"); + // // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588"); + // // or about 20x more resident memory (90MB vs 4.5MB) + // + // // Use x + // let _x = x; + // } +} diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs new file mode 100644 index 000000000..9ad5fe425 --- /dev/null +++ b/milli/src/snapshot_tests.rs @@ -0,0 +1,586 @@ +use std::borrow::Cow; +use std::fmt::Write; +use std::path::Path; + +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; +use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; + +#[track_caller] +pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) { + let mut settings = insta::Settings::clone_current(); + settings.set_prepend_module_to_snapshot(false); + let path = Path::new(std::panic::Location::caller().file()); + let filename = path.file_name().unwrap().to_str().unwrap(); + settings.set_omit_expression(true); + let test_name = std::thread::current().name().unwrap().rsplit("::").next().unwrap().to_owned(); + + if let Some(name) = name { + settings + .set_snapshot_path(Path::new("snapshots").join(filename).join(&test_name).join(name)); + } else { + settings.set_snapshot_path(Path::new("snapshots").join(filename).join(&test_name)); + } + + (settings, test_name) +} +#[macro_export] +macro_rules! 
milli_snap { + ($value:expr, $name:expr) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", $name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($value:expr) => { + let (settings, test_name) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", test_name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($value:expr, @$inline:literal) => { + let (settings, test_name) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", test_name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); + } else { + insta::assert_snapshot!(name, snap); + } + } + }); + }; + ($value:expr, $name:expr, @$inline:literal) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", $name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); + } else { + insta::assert_snapshot!(name, snap); + } + } + }); + }; +} + +/** +Create a snapshot test of the given database. + +## Arguments +1. The identifier for the `Index` +2. The content of the index to snapshot. 
Available options are: + - `settings` + - `word_docids` + - `exact_word_docids` + - `word_prefix_docids` + - `exact_word_prefix_docids` + - `docid_word_positions` + - `word_pair_proximity_docids` + - `word_prefix_pair_proximity_docids` + - `word_position_docids` + - `field_id_word_count_docids` + - `word_prefix_position_docids` + - `facet_id_f64_docids` + - `facet_id_string_docids` + - `documents_ids` + - `stop_words` + - `soft_deleted_documents_ids` + - `field_distribution` + - `fields_ids_map` + - `geo_faceted_documents_ids` + - `external_documents_ids` + - `number_faceted_documents_ids` + - `string_faceted_documents_ids` + - `words_fst` + - `words_prefixes_fst` + +3. The identifier for the snapshot test (optional) +4. `@""` to write the snapshot inline (optional) + +## Behaviour +The content of the database will be printed either inline or to the file system +at `test_directory/test_file.rs/test_name/db_name.snap`. + +If the database is too large, then only the hash of the database will be saved, with +the name `db_name.hash.snap`. To *also* save the full content of the database anyway, +set the `MILLI_TEST_FULL_SNAPS` environment variable to `true`. The full snapshot will +be saved with the name `db_name.full.snap` but will not be saved to the git repository. + +Running `cargo test` will check whether the old snapshot is identical to the +current one. If they are equal, the test passes. Otherwise, the test fails. + +Use the command line `cargo insta` to approve or reject new snapshots. + +## Example +```ignore +let index = TempIndex::new(); + +// basic usages +db_snap!(index, word_docids); + +// named snapshot to avoid conflicts +db_snap!(index, word_docids, "some_identifier"); + +// write the snapshot inline +db_snap!(index, word_docids, @""); // will be autocompleted by running `cargo insta review` + +// give a name to the inline snapshot +db_snap!(index, word_docids, "some_identifier", @""); +``` +*/ +#[macro_export] +macro_rules! 
db_snap { + ($index:ident, $db_name:ident, $name:expr) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some( + &format!("{}", $name), + )); + settings.bind(|| { + let snap = $crate::full_snap_of_db!($index, $db_name); + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($index:ident, $db_name:ident) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $crate::full_snap_of_db!($index, $db_name); + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($index:ident, $db_name:ident, @$inline:literal) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $crate::full_snap_of_db!($index, $db_name); + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); + } else { + insta::assert_snapshot!(name, snap); + } + } + }); + }; + ($index:ident, $db_name:ident, $name:expr, @$inline:literal) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(&format!("{}", $name))); + settings.bind(|| { + let snap = $crate::full_snap_of_db!($index, $db_name); + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); + } else { + insta::assert_snapshot!(name, snap); + } + } + }); + }; +} + +pub fn snap_word_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, 
word_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_exact_word_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, exact_word_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_prefix_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_prefix_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_exact_word_prefix_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, exact_word_prefix_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_docid_word_positions(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, docid_word_positions, |((idx, s), b)| { + &format!("{idx:<6} {s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_pair_proximity_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_pair_proximity_docids, |( + (proximity, word1, word2), + b, + )| { + &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |( + (proximity, word1, prefix), + b, + )| { + &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |( + (proximity, prefix, word2), + b, + )| { + &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_position_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| { + &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) + }); + snap +} +pub fn 
snap_field_id_word_count_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, field_id_word_count_docids, |( + (field_id, word_count), + b, + )| { + &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_prefix_position_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_prefix_position_docids, |( + (word_prefix, position), + b, + )| { + &format!("{word_prefix:<4} {position:<6} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_facet_id_f64_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( + FacetGroupKey { field_id, level, left_bound }, + FacetGroupValue { size, bitmap }, + )| { + &format!("{field_id:<3} {level:<2} {left_bound:<6} {size:<2} {}", display_bitmap(&bitmap)) + }); + snap +} +pub fn snap_facet_id_exists_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, facet_id_exists_docids, |(facet_id, docids)| { + &format!("{facet_id:<3} {}", display_bitmap(&docids)) + }); + snap +} +pub fn snap_facet_id_string_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, facet_id_string_docids, |( + FacetGroupKey { field_id, level, left_bound }, + FacetGroupValue { size, bitmap }, + )| { + &format!("{field_id:<3} {level:<2} {left_bound:<12} {size:<2} {}", display_bitmap(&bitmap)) + }); + snap +} +pub fn snap_field_id_docid_facet_strings(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, field_id_docid_facet_strings, |( + (field_id, doc_id, string), + other_string, + )| { + &format!("{field_id:<3} {doc_id:<4} {string:<12} {other_string}") + }); + snap +} +pub fn snap_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let documents_ids = index.documents_ids(&rtxn).unwrap(); + + display_bitmap(&documents_ids) +} +pub fn snap_stop_words(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let stop_words = 
index.stop_words(&rtxn).unwrap(); + let snap = format!("{stop_words:?}"); + snap +} +pub fn snap_soft_deleted_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); + + display_bitmap(&soft_deleted_documents_ids) +} +pub fn snap_field_distributions(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let mut snap = String::new(); + for (field, count) in index.field_distribution(&rtxn).unwrap() { + writeln!(&mut snap, "{field:<16} {count:<6}").unwrap(); + } + snap +} +pub fn snap_fields_ids_map(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let name = fields_ids_map.name(field_id).unwrap(); + writeln!(&mut snap, "{field_id:<3} {name:<16}").unwrap(); + } + snap +} +pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); + + display_bitmap(&geo_faceted_documents_ids) +} +pub fn snap_external_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let ExternalDocumentsIds { soft, hard, .. 
} = index.external_documents_ids(&rtxn).unwrap(); + + let mut snap = String::new(); + + writeln!(&mut snap, "soft:").unwrap(); + let stream_soft = soft.stream(); + let soft_external_ids = stream_soft.into_str_vec().unwrap(); + for (key, id) in soft_external_ids { + writeln!(&mut snap, "{key:<24} {id}").unwrap(); + } + writeln!(&mut snap, "hard:").unwrap(); + let stream_hard = hard.stream(); + let hard_external_ids = stream_hard.into_str_vec().unwrap(); + for (key, id) in hard_external_ids { + writeln!(&mut snap, "{key:<24} {id}").unwrap(); + } + + snap +} +pub fn snap_number_faceted_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let number_faceted_documents_ids = + index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap(); + writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids)) + .unwrap(); + } + snap +} +pub fn snap_string_faceted_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let string_faceted_documents_ids = + index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap(); + writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids)) + .unwrap(); + } + snap +} +pub fn snap_words_fst(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let words_fst = index.words_fst(&rtxn).unwrap(); + let bytes = words_fst.into_fst().as_bytes().to_owned(); + let mut snap = String::new(); + for byte in bytes { + write!(&mut snap, "{:x}", byte).unwrap(); + } + snap +} +pub fn snap_words_prefixes_fst(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let words_prefixes_fst = index.words_prefixes_fst(&rtxn).unwrap(); + let bytes 
= words_prefixes_fst.into_fst().as_bytes().to_owned(); + let mut snap = String::new(); + for byte in bytes { + write!(&mut snap, "{:x}", byte).unwrap(); + } + snap +} + +pub fn snap_settings(index: &Index) -> String { + let mut snap = String::new(); + let rtxn = index.read_txn().unwrap(); + + macro_rules! write_setting_to_snap { + ($name:ident) => { + let $name = index.$name(&rtxn).unwrap(); + writeln!(&mut snap, "{}: {:?}", stringify!($name), $name).unwrap(); + }; + } + + write_setting_to_snap!(primary_key); + write_setting_to_snap!(criteria); + write_setting_to_snap!(displayed_fields); + write_setting_to_snap!(distinct_field); + write_setting_to_snap!(filterable_fields); + write_setting_to_snap!(sortable_fields); + write_setting_to_snap!(synonyms); + write_setting_to_snap!(authorize_typos); + write_setting_to_snap!(min_word_len_one_typo); + write_setting_to_snap!(min_word_len_two_typos); + write_setting_to_snap!(exact_words); + write_setting_to_snap!(exact_attributes); + write_setting_to_snap!(max_values_per_facet); + write_setting_to_snap!(pagination_max_total_hits); + write_setting_to_snap!(searchable_fields); + write_setting_to_snap!(user_defined_searchable_fields); + + snap +} + +#[macro_export] +macro_rules! 
full_snap_of_db { + ($index:ident, settings) => {{ + $crate::snapshot_tests::snap_settings(&$index) + }}; + ($index:ident, word_docids) => {{ + $crate::snapshot_tests::snap_word_docids(&$index) + }}; + ($index:ident, exact_word_docids) => {{ + $crate::snapshot_tests::snap_exact_word_docids(&$index) + }}; + ($index:ident, word_prefix_docids) => {{ + $crate::snapshot_tests::snap_word_prefix_docids(&$index) + }}; + ($index:ident, exact_word_prefix_docids) => {{ + $crate::snapshot_tests::snap_exact_word_prefix_docids(&$index) + }}; + ($index:ident, docid_word_positions) => {{ + $crate::snapshot_tests::snap_docid_word_positions(&$index) + }}; + ($index:ident, word_pair_proximity_docids) => {{ + $crate::snapshot_tests::snap_word_pair_proximity_docids(&$index) + }}; + ($index:ident, word_prefix_pair_proximity_docids) => {{ + $crate::snapshot_tests::snap_word_prefix_pair_proximity_docids(&$index) + }}; + ($index:ident, prefix_word_pair_proximity_docids) => {{ + $crate::snapshot_tests::snap_prefix_word_pair_proximity_docids(&$index) + }}; + ($index:ident, word_position_docids) => {{ + $crate::snapshot_tests::snap_word_position_docids(&$index) + }}; + ($index:ident, field_id_word_count_docids) => {{ + $crate::snapshot_tests::snap_field_id_word_count_docids(&$index) + }}; + ($index:ident, word_prefix_position_docids) => {{ + $crate::snapshot_tests::snap_word_prefix_position_docids(&$index) + }}; + ($index:ident, facet_id_f64_docids) => {{ + $crate::snapshot_tests::snap_facet_id_f64_docids(&$index) + }}; + ($index:ident, facet_id_string_docids) => {{ + $crate::snapshot_tests::snap_facet_id_string_docids(&$index) + }}; + ($index:ident, field_id_docid_facet_strings) => {{ + $crate::snapshot_tests::snap_field_id_docid_facet_strings(&$index) + }}; + ($index:ident, facet_id_exists_docids) => {{ + $crate::snapshot_tests::snap_facet_id_exists_docids(&$index) + }}; + ($index:ident, documents_ids) => {{ + $crate::snapshot_tests::snap_documents_ids(&$index) + }}; + ($index:ident, 
stop_words) => {{ + $crate::snapshot_tests::snap_stop_words(&$index) + }}; + ($index:ident, soft_deleted_documents_ids) => {{ + $crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index) + }}; + ($index:ident, field_distribution) => {{ + $crate::snapshot_tests::snap_field_distributions(&$index) + }}; + ($index:ident, fields_ids_map) => {{ + $crate::snapshot_tests::snap_fields_ids_map(&$index) + }}; + ($index:ident, geo_faceted_documents_ids) => {{ + $crate::snapshot_tests::snap_geo_faceted_documents_ids(&$index) + }}; + ($index:ident, external_documents_ids) => {{ + $crate::snapshot_tests::snap_external_documents_ids(&$index) + }}; + ($index:ident, number_faceted_documents_ids) => {{ + $crate::snapshot_tests::snap_number_faceted_documents_ids(&$index) + }}; + ($index:ident, string_faceted_documents_ids) => {{ + $crate::snapshot_tests::snap_string_faceted_documents_ids(&$index) + }}; + ($index:ident, words_fst) => {{ + $crate::snapshot_tests::snap_words_fst(&$index) + }}; + ($index:ident, words_prefixes_fst) => {{ + $crate::snapshot_tests::snap_words_prefixes_fst(&$index) + }}; +} + +pub fn convert_snap_to_hash_if_needed<'snap>( + name: &str, + snap: &'snap str, + inline: bool, +) -> Vec<(String, Cow<'snap, str>)> { + let store_whole_snapshot = std::env::var("MILLI_TEST_FULL_SNAPS").unwrap_or("false".to_owned()); + let store_whole_snapshot: bool = store_whole_snapshot.parse().unwrap(); + + let max_len = if inline { 256 } else { 2048 }; + + if snap.len() < max_len { + vec![(name.to_owned(), Cow::Borrowed(snap))] + } else { + let mut r = vec![]; + if store_whole_snapshot { + r.push((format!("{name}.full"), Cow::Borrowed(snap))); + } + let hash = md5::compute(snap.as_bytes()); + let hash_str = format!("{hash:x}"); + r.push((format!("{name}.hash"), Cow::Owned(hash_str))); + r + } +} + +#[macro_export] +macro_rules! 
make_db_snap_from_iter { + ($index:ident, $name:ident, |$vars:pat| $push:block) => {{ + let rtxn = $index.read_txn().unwrap(); + let iter = $index.$name.iter(&rtxn).unwrap(); + let mut snap = String::new(); + for x in iter { + let $vars = x.unwrap(); + snap.push_str($push); + snap.push('\n'); + } + snap + }}; +} + +pub fn display_bitmap(b: &RoaringBitmap) -> String { + let mut s = String::new(); + s.push('['); + for x in b.into_iter() { + write!(&mut s, "{x}, ").unwrap(); + } + s.push(']'); + s +} diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap new file mode 100644 index 000000000..9b074fb59 --- /dev/null +++ b/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/index.rs +--- +age 1 +id 2 +name 2 + diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap new file mode 100644 index 000000000..9b074fb59 --- /dev/null +++ b/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/index.rs +--- +age 1 +id 2 +name 2 + diff --git a/milli/src/update/available_documents_ids.rs b/milli/src/update/available_documents_ids.rs new file mode 100644 index 000000000..784bee5a7 --- /dev/null +++ b/milli/src/update/available_documents_ids.rs @@ -0,0 +1,90 @@ +use std::iter::{Chain, FromIterator}; +use std::ops::RangeInclusive; + +use roaring::bitmap::{IntoIter, RoaringBitmap}; + +pub struct AvailableDocumentsIds { + iter: Chain>, +} + +impl AvailableDocumentsIds { + pub fn from_documents_ids( + docids: &RoaringBitmap, + soft_deleted_docids: &RoaringBitmap, + ) -> AvailableDocumentsIds { + let used_docids = docids | soft_deleted_docids; + + match used_docids.max() { + Some(last_id) => { + let mut available = 
RoaringBitmap::from_iter(0..last_id); + available -= used_docids; + + let iter = match last_id.checked_add(1) { + Some(id) => id..=u32::max_value(), + #[allow(clippy::reversed_empty_ranges)] + None => 1..=0, // empty range iterator + }; + + AvailableDocumentsIds { iter: available.into_iter().chain(iter) } + } + None => { + let empty = RoaringBitmap::new().into_iter(); + AvailableDocumentsIds { iter: empty.chain(0..=u32::max_value()) } + } + } + } +} + +impl Iterator for AvailableDocumentsIds { + type Item = u32; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty() { + let base = RoaringBitmap::new(); + let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); + let right = 0..=u32::max_value(); + left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); + } + + #[test] + fn scattered() { + let mut base = RoaringBitmap::new(); + base.insert(0); + base.insert(10); + base.insert(100); + base.insert(405); + + let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); + let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405); + left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); + } + + #[test] + fn soft_deleted() { + let mut base = RoaringBitmap::new(); + base.insert(0); + base.insert(10); + base.insert(100); + base.insert(405); + + let mut soft_deleted = RoaringBitmap::new(); + soft_deleted.insert(1); + soft_deleted.insert(11); + soft_deleted.insert(101); + soft_deleted.insert(406); + + let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted); + let right = + (0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n)); + left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); + } +} diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs new file mode 100644 index 000000000..adeea11fa --- /dev/null +++ 
b/milli/src/update/clear_documents.rs @@ -0,0 +1,144 @@ +use roaring::RoaringBitmap; +use time::OffsetDateTime; + +use crate::facet::FacetType; +use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; + +pub struct ClearDocuments<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, +} + +impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> ClearDocuments<'t, 'u, 'i> { + ClearDocuments { wtxn, index } + } + + pub fn execute(self) -> Result { + self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; + let Index { + env: _env, + main: _main, + word_docids, + exact_word_docids, + word_prefix_docids, + exact_word_prefix_docids, + docid_word_positions, + word_pair_proximity_docids, + word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, + word_position_docids, + field_id_word_count_docids, + word_prefix_position_docids, + facet_id_f64_docids, + facet_id_string_docids, + facet_id_exists_docids, + field_id_docid_facet_f64s, + field_id_docid_facet_strings, + documents, + } = self.index; + + let empty_roaring = RoaringBitmap::default(); + + // We retrieve the number of documents ids that we are deleting. + let number_of_documents = self.index.number_of_documents(self.wtxn)?; + let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; + + // We clean some of the main engine datastructures. 
+ self.index.put_words_fst(self.wtxn, &fst::Set::default())?; + self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; + self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; + self.index.put_documents_ids(self.wtxn, &empty_roaring)?; + self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?; + self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; + self.index.delete_geo_rtree(self.wtxn)?; + self.index.delete_geo_faceted_documents_ids(self.wtxn)?; + + // We clean all the faceted documents ids. + for field_id in faceted_fields { + self.index.put_faceted_documents_ids( + self.wtxn, + field_id, + FacetType::Number, + &empty_roaring, + )?; + self.index.put_faceted_documents_ids( + self.wtxn, + field_id, + FacetType::String, + &empty_roaring, + )?; + } + + // Clear the other databases. + word_docids.clear(self.wtxn)?; + exact_word_docids.clear(self.wtxn)?; + word_prefix_docids.clear(self.wtxn)?; + exact_word_prefix_docids.clear(self.wtxn)?; + docid_word_positions.clear(self.wtxn)?; + word_pair_proximity_docids.clear(self.wtxn)?; + word_prefix_pair_proximity_docids.clear(self.wtxn)?; + prefix_word_pair_proximity_docids.clear(self.wtxn)?; + word_position_docids.clear(self.wtxn)?; + field_id_word_count_docids.clear(self.wtxn)?; + word_prefix_position_docids.clear(self.wtxn)?; + facet_id_f64_docids.clear(self.wtxn)?; + facet_id_exists_docids.clear(self.wtxn)?; + facet_id_string_docids.clear(self.wtxn)?; + field_id_docid_facet_f64s.clear(self.wtxn)?; + field_id_docid_facet_strings.clear(self.wtxn)?; + documents.clear(self.wtxn)?; + + Ok(number_of_documents) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::index::tests::TempIndex; + + #[test] + fn clear_documents() { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn(&mut wtxn, documents!([ + { "id": 0, "name": "kevin", "age": 20 }, + { "id": 1, "name": "kevina" }, 
+ { "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } } + ])) + .unwrap(); + + // Clear all documents from the database. + let builder = ClearDocuments::new(&mut wtxn, &index); + assert_eq!(builder.execute().unwrap(), 3); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + // the value is 7 because there is `[id, name, age, country, _geo, _geo.lng, _geo.lat]` + assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 7); + + assert!(index.words_fst(&rtxn).unwrap().is_empty()); + assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); + assert!(index.external_documents_ids(&rtxn).unwrap().is_empty()); + assert!(index.documents_ids(&rtxn).unwrap().is_empty()); + assert!(index.field_distribution(&rtxn).unwrap().is_empty()); + assert!(index.geo_rtree(&rtxn).unwrap().is_none()); + assert!(index.geo_faceted_documents_ids(&rtxn).unwrap().is_empty()); + + assert!(index.word_docids.is_empty(&rtxn).unwrap()); + assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); + assert!(index.docid_word_positions.is_empty(&rtxn).unwrap()); + assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); + assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); + assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); + assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); + assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); + assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); + assert!(index.field_id_docid_facet_strings.is_empty(&rtxn).unwrap()); + assert!(index.documents.is_empty(&rtxn).unwrap()); + } +} diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs new file mode 100644 index 000000000..635ce85be --- /dev/null +++ b/milli/src/update/delete_documents.rs @@ -0,0 +1,1167 @@ +use std::collections::btree_map::Entry; +use std::collections::{HashMap, HashSet}; + +use fst::IntoStreamer; +use heed::types::{ByteSlice, 
DecodeIgnore, Str}; +use heed::Database; +use roaring::RoaringBitmap; +use serde::{Deserialize, Serialize}; +use time::OffsetDateTime; + +use super::facet::delete::FacetsDelete; +use super::ClearDocuments; +use crate::error::InternalError; +use crate::facet::FacetType; +use crate::heed_codec::facet::FieldDocIdFacetCodec; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::{ + ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, + SmallString32, BEU32, +}; + +pub struct DeleteDocuments<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + external_documents_ids: ExternalDocumentsIds<'static>, + to_delete_docids: RoaringBitmap, + strategy: DeletionStrategy, +} + +/// Result of a [`DeleteDocuments`] operation. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DocumentDeletionResult { + pub deleted_documents: u64, + pub remaining_documents: u64, +} + +/// Strategy for deleting documents. +/// +/// - Soft-deleted documents are simply marked as deleted without being actually removed from DB. +/// - Hard-deleted documents are definitely suppressed from the DB. +/// +/// Soft-deleted documents trade disk space for runtime performance. +/// +/// Note that any of these variants can be used at any given moment for any indexation in a database. +/// For instance, you can use an [`AlwaysSoft`] followed by an [`AlwaysHard`] option without issue. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] +pub enum DeletionStrategy { + #[default] + /// Definitely suppress documents according to the number or size of soft-deleted documents + Dynamic, + /// Never definitely suppress documents + AlwaysSoft, + /// Always definitely suppress documents + AlwaysHard, +} + +impl std::fmt::Display for DeletionStrategy { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + DeletionStrategy::Dynamic => write!(f, "dynamic"), + DeletionStrategy::AlwaysSoft => write!(f, "always_soft"), + DeletionStrategy::AlwaysHard => write!(f, "always_hard"), + } + } +} + +/// Result of a [`DeleteDocuments`] operation, used for internal purposes. +/// +/// It is a superset of the [`DocumentDeletionResult`] structure, giving +/// additional information about the algorithm used to delete the documents. +#[derive(Debug)] +pub(crate) struct DetailedDocumentDeletionResult { + pub deleted_documents: u64, + pub remaining_documents: u64, + pub soft_deletion_used: bool, +} + +impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> Result> { + let external_documents_ids = index.external_documents_ids(wtxn)?.into_static(); + + Ok(DeleteDocuments { + wtxn, + index, + external_documents_ids, + to_delete_docids: RoaringBitmap::new(), + strategy: Default::default(), + }) + } + + pub fn strategy(&mut self, strategy: DeletionStrategy) { + self.strategy = strategy; + } + + pub fn delete_document(&mut self, docid: u32) { + self.to_delete_docids.insert(docid); + } + + pub fn delete_documents(&mut self, docids: &RoaringBitmap) { + self.to_delete_docids |= docids; + } + + pub fn delete_external_id(&mut self, external_id: &str) -> Option { + let docid = self.external_documents_ids.get(external_id)?; + self.delete_document(docid); + Some(docid) + } + pub fn execute(self) -> Result { + let DetailedDocumentDeletionResult { + deleted_documents, + 
remaining_documents, + soft_deletion_used: _, + } = self.execute_inner()?; + + Ok(DocumentDeletionResult { deleted_documents, remaining_documents }) + } + pub(crate) fn execute_inner(mut self) -> Result { + self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; + + // We retrieve the current documents ids that are in the database. + let mut documents_ids = self.index.documents_ids(self.wtxn)?; + let mut soft_deleted_docids = self.index.soft_deleted_documents_ids(self.wtxn)?; + let current_documents_ids_len = documents_ids.len(); + + // We can and must stop removing documents in a database that is empty. + if documents_ids.is_empty() { + // but if there was still documents to delete we clear the database entirely + if !soft_deleted_docids.is_empty() { + ClearDocuments::new(self.wtxn, self.index).execute()?; + } + return Ok(DetailedDocumentDeletionResult { + deleted_documents: 0, + remaining_documents: 0, + soft_deletion_used: false, + }); + } + + // We remove the documents ids that we want to delete + // from the documents in the database and write them back. + documents_ids -= &self.to_delete_docids; + self.index.put_documents_ids(self.wtxn, &documents_ids)?; + + // We can execute a ClearDocuments operation when the number of documents + // to delete is exactly the number of documents in the database. 
+ if current_documents_ids_len == self.to_delete_docids.len() { + let remaining_documents = ClearDocuments::new(self.wtxn, self.index).execute()?; + return Ok(DetailedDocumentDeletionResult { + deleted_documents: current_documents_ids_len, + remaining_documents, + soft_deletion_used: false, + }); + } + + let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + let mut field_distribution = self.index.field_distribution(self.wtxn)?; + + // we update the field distribution + for docid in self.to_delete_docids.iter() { + let key = BEU32::new(docid); + let document = + self.index.documents.get(self.wtxn, &key)?.ok_or( + InternalError::DatabaseMissingEntry { db_name: "documents", key: None }, + )?; + for (fid, _value) in document.iter() { + let field_name = + fields_ids_map.name(fid).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: fid, + process: "delete documents", + })?; + if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) + { + match entry.get().checked_sub(1) { + Some(0) | None => entry.remove(), + Some(count) => entry.insert(count), + }; + } + } + } + + self.index.put_field_distribution(self.wtxn, &field_distribution)?; + + soft_deleted_docids |= &self.to_delete_docids; + + // We always soft-delete the documents, even if they will be permanently + // deleted immediately after. + self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; + + // decide for a hard or soft deletion depending on the strategy + let soft_deletion = match self.strategy { + DeletionStrategy::Dynamic => { + // decide to keep the soft deleted in the DB for now if they meet 2 criteria: + // 1. There is less than a fixed rate of 50% of soft-deleted to actual documents, *and* + // 2. 
Soft-deleted occupy an average of less than a fixed size on disk + + let size_used = self.index.used_size()?; + let nb_documents = self.index.number_of_documents(self.wtxn)?; + let nb_soft_deleted = soft_deleted_docids.len(); + + (nb_soft_deleted < nb_documents) && { + const SOFT_DELETED_SIZE_BYTE_THRESHOLD: u64 = 1_073_741_824; // 1GiB + + // nb_documents + nb_soft_deleted !=0 because if nb_documents is 0 we short-circuit earlier, and then we moved the documents to delete + // from the documents_docids to the soft_deleted_docids. + let estimated_document_size = size_used / (nb_documents + nb_soft_deleted); + let estimated_size_used_by_soft_deleted = + estimated_document_size * nb_soft_deleted; + estimated_size_used_by_soft_deleted < SOFT_DELETED_SIZE_BYTE_THRESHOLD + } + } + DeletionStrategy::AlwaysSoft => true, + DeletionStrategy::AlwaysHard => false, + }; + + if soft_deletion { + // Keep the soft-deleted in the DB + return Ok(DetailedDocumentDeletionResult { + deleted_documents: self.to_delete_docids.len(), + remaining_documents: documents_ids.len(), + soft_deletion_used: true, + }); + } + + self.to_delete_docids = soft_deleted_docids; + + let Index { + env: _env, + main: _main, + word_docids, + exact_word_docids, + word_prefix_docids, + exact_word_prefix_docids, + docid_word_positions, + word_pair_proximity_docids, + field_id_word_count_docids, + word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, + word_position_docids, + word_prefix_position_docids, + facet_id_f64_docids: _, + facet_id_string_docids: _, + field_id_docid_facet_f64s: _, + field_id_docid_facet_strings: _, + facet_id_exists_docids, + documents, + } = self.index; + + // Retrieve the words contained in the documents. + let mut words = Vec::new(); + for docid in &self.to_delete_docids { + documents.delete(self.wtxn, &BEU32::new(docid))?; + + // We iterate through the words positions of the document id, retrieve the word and delete the positions. 
+ // We create an iterator to be able to get the content and delete the key-value itself. + // It's faster to acquire a cursor to get and delete, as we avoid traversing the LMDB B-Tree two times but only once. + let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?; + while let Some(result) = iter.next() { + let ((_docid, word), _positions) = result?; + // This boolean will indicate if we must remove this word from the words FST. + words.push((SmallString32::from(word), false)); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } + } + // We acquire the current external documents ids map... + // Note that its soft-deleted document ids field will be equal to the `to_delete_docids` + let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?; + // We then remove the soft-deleted docids from it + new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; + // and write it back to the main database. + let new_external_documents_ids = new_external_documents_ids.into_static(); + self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; + + // Maybe we can improve the get performance of the words + // if we sort the words first, keeping the LMDB pages in cache. + words.sort_unstable(); + + // We iterate over the words and delete the documents ids + // from the word docids database. + for (word, must_remove) in &mut words { + remove_from_word_docids( + self.wtxn, + word_docids, + word.as_str(), + must_remove, + &self.to_delete_docids, + )?; + + remove_from_word_docids( + self.wtxn, + exact_word_docids, + word.as_str(), + must_remove, + &self.to_delete_docids, + )?; + } + + // We construct an FST set that contains the words to delete from the words FST. 
+ let words_to_delete = + words.iter().filter_map( + |(word, must_remove)| { + if *must_remove { + Some(word.as_str()) + } else { + None + } + }, + ); + let words_to_delete = fst::Set::from_iter(words_to_delete)?; + + let new_words_fst = { + // We retrieve the current words FST from the database. + let words_fst = self.index.words_fst(self.wtxn)?; + let difference = words_fst.op().add(&words_to_delete).difference(); + + // We stream the new external ids that does no more contains the to-delete external ids. + let mut new_words_fst_builder = fst::SetBuilder::memory(); + new_words_fst_builder.extend_stream(difference.into_stream())?; + + // We create an words FST set from the above builder. + new_words_fst_builder.into_set() + }; + + // We write the new words FST into the main database. + self.index.put_words_fst(self.wtxn, &new_words_fst)?; + + let prefixes_to_delete = + remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.to_delete_docids)?; + + let exact_prefix_to_delete = remove_from_word_prefix_docids( + self.wtxn, + exact_word_prefix_docids, + &self.to_delete_docids, + )?; + + let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union(); + + // We compute the new prefix FST and write it only if there is a change. + if !prefixes_to_delete.is_empty() || !exact_prefix_to_delete.is_empty() { + let new_words_prefixes_fst = { + // We retrieve the current words prefixes FST from the database. + let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?; + let difference = + words_prefixes_fst.op().add(all_prefixes_to_delete.into_stream()).difference(); + + // We stream the new external ids that does no more contains the to-delete external ids. + let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory(); + new_words_prefixes_fst_builder.extend_stream(difference.into_stream())?; + + // We create an words FST set from the above builder. 
+ new_words_prefixes_fst_builder.into_set() + }; + + // We write the new words prefixes FST into the main database. + self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?; + } + + for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] { + // We delete the documents ids from the word prefix pair proximity database docids + // and remove the empty pairs too. + let db = db.remap_key_type::(); + let mut iter = db.iter_mut(self.wtxn)?; + while let Some(result) = iter.next() { + let (key, mut docids) = result?; + let previous_len = docids.len(); + docids -= &self.to_delete_docids; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let key = key.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; + } + } + } + + // We delete the documents ids that are under the pairs of words, + // it is faster and use no memory to iterate over all the words pairs than + // to compute the cartesian product of every words of the deleted documents. + let mut iter = + word_pair_proximity_docids.remap_key_type::().iter_mut(self.wtxn)?; + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids -= &self.to_delete_docids; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let bytes = bytes.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &docids)? }; + } + } + + drop(iter); + + // We delete the documents ids that are under the word level position docids. 
+ let mut iter = word_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids -= &self.to_delete_docids; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let bytes = bytes.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &docids)? }; + } + } + + drop(iter); + + // We delete the documents ids that are under the word prefix level position docids. + let mut iter = + word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids -= &self.to_delete_docids; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let bytes = bytes.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &docids)? }; + } + } + + drop(iter); + + // Remove the documents ids from the field id word count database. + let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?; + while let Some((key, mut docids)) = iter.next().transpose()? { + let previous_len = docids.len(); + docids -= &self.to_delete_docids; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let key = key.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; + } + } + + drop(iter); + + if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? 
{ + let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?; + + let (points_to_remove, docids_to_remove): (Vec<_>, RoaringBitmap) = rtree + .iter() + .filter(|&point| self.to_delete_docids.contains(point.data.0)) + .cloned() + .map(|point| (point, point.data.0)) + .unzip(); + points_to_remove.iter().for_each(|point| { + rtree.remove(point); + }); + geo_faceted_doc_ids -= docids_to_remove; + + self.index.put_geo_rtree(self.wtxn, &rtree)?; + self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; + } + + for facet_type in [FacetType::Number, FacetType::String] { + let mut affected_facet_values = HashMap::new(); + for field_id in self.index.faceted_fields_ids(self.wtxn)? { + // Remove docids from the number faceted documents ids + let mut docids = + self.index.faceted_documents_ids(self.wtxn, field_id, facet_type)?; + docids -= &self.to_delete_docids; + self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?; + + let facet_values = remove_docids_from_field_id_docid_facet_value( + self.index, + self.wtxn, + facet_type, + field_id, + &self.to_delete_docids, + )?; + if !facet_values.is_empty() { + affected_facet_values.insert(field_id, facet_values); + } + } + FacetsDelete::new( + self.index, + facet_type, + affected_facet_values, + &self.to_delete_docids, + ) + .execute(self.wtxn)?; + } + + // We delete the documents ids that are under the facet field id values. 
+ remove_docids_from_facet_id_exists_docids( + self.wtxn, + facet_id_exists_docids, + &self.to_delete_docids, + )?; + + self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; + + Ok(DetailedDocumentDeletionResult { + deleted_documents: self.to_delete_docids.len(), + remaining_documents: documents_ids.len(), + soft_deletion_used: false, + }) + } +} + +fn remove_from_word_prefix_docids( + txn: &mut heed::RwTxn, + db: &Database, + to_remove: &RoaringBitmap, +) -> Result>> { + let mut prefixes_to_delete = fst::SetBuilder::memory(); + + // We iterate over the word prefix docids database and remove the deleted documents ids + // from every docids lists. We register the empty prefixes in an fst Set for futur deletion. + let mut iter = db.iter_mut(txn)?; + while let Some(result) = iter.next() { + let (prefix, mut docids) = result?; + let prefix = prefix.to_owned(); + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + prefixes_to_delete.insert(prefix)?; + } else if docids.len() != previous_len { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&prefix, &docids)? }; + } + } + + Ok(prefixes_to_delete.into_set()) +} + +fn remove_from_word_docids( + txn: &mut heed::RwTxn, + db: &heed::Database, + word: &str, + must_remove: &mut bool, + to_remove: &RoaringBitmap, +) -> Result<()> { + // We create an iterator to be able to get the content and delete the word docids. + // It's faster to acquire a cursor to get and delete or put, as we avoid traversing + // the LMDB B-Tree two times but only once. + let mut iter = db.prefix_iter_mut(txn, word)?; + if let Some((key, mut docids)) = iter.next().transpose()? { + if key == word { + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. 
+ unsafe { iter.del_current()? }; + *must_remove = true; + } else if docids.len() != previous_len { + let key = key.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; + } + } + } + + Ok(()) +} + +fn remove_docids_from_field_id_docid_facet_value<'i, 'a>( + index: &'i Index, + wtxn: &'a mut heed::RwTxn, + facet_type: FacetType, + field_id: FieldId, + to_remove: &RoaringBitmap, +) -> heed::Result>> { + let db = match facet_type { + FacetType::String => { + index.field_id_docid_facet_strings.remap_types::() + } + FacetType::Number => { + index.field_id_docid_facet_f64s.remap_types::() + } + }; + let mut all_affected_facet_values = HashSet::default(); + let mut iter = db + .prefix_iter_mut(wtxn, &field_id.to_be_bytes())? + .remap_key_type::>(); + + while let Some(result) = iter.next() { + let ((_, docid, facet_value), _) = result?; + if to_remove.contains(docid) { + if !all_affected_facet_values.contains(facet_value) { + all_affected_facet_values.insert(facet_value.to_owned()); + } + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } + } + + Ok(all_affected_facet_values) +} + +fn remove_docids_from_facet_id_exists_docids<'a, C>( + wtxn: &'a mut heed::RwTxn, + db: &heed::Database, + to_remove: &RoaringBitmap, +) -> heed::Result<()> +where + C: heed::BytesDecode<'a> + heed::BytesEncode<'a>, +{ + let mut iter = db.remap_key_type::().iter_mut(wtxn)?; + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let bytes = bytes.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &docids)? 
}; + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use big_s::S; + use heed::RwTxn; + use maplit::hashset; + + use super::*; + use crate::index::tests::TempIndex; + use crate::{db_snap, Filter}; + + fn delete_documents<'t>( + wtxn: &mut RwTxn<'t, '_>, + index: &'t Index, + external_ids: &[&str], + strategy: DeletionStrategy, + ) -> Vec { + let external_document_ids = index.external_documents_ids(wtxn).unwrap(); + let ids_to_delete: Vec = external_ids + .iter() + .map(|id| external_document_ids.get(id.as_bytes()).unwrap()) + .collect(); + + // Delete some documents. + let mut builder = DeleteDocuments::new(wtxn, index).unwrap(); + builder.strategy(strategy); + external_ids.iter().for_each(|id| drop(builder.delete_external_id(id))); + builder.execute().unwrap(); + + ids_to_delete + } + + fn delete_documents_with_numbers_as_primary_key_(deletion_strategy: DeletionStrategy) { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, + { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, + { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } + ]), + ) + .unwrap(); + + // delete those documents, ids are synchronous therefore 0, 1, and 2. 
+ let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.delete_document(0); + builder.delete_document(1); + builder.delete_document(2); + builder.strategy(deletion_strategy); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + // All these snapshots should be empty since the database was cleared + db_snap!(index, documents_ids, deletion_strategy); + db_snap!(index, word_docids, deletion_strategy); + db_snap!(index, word_pair_proximity_docids, deletion_strategy); + db_snap!(index, facet_id_exists_docids, deletion_strategy); + db_snap!(index, soft_deleted_documents_ids, deletion_strategy); + + let rtxn = index.read_txn().unwrap(); + + assert!(index.field_distribution(&rtxn).unwrap().is_empty()); + } + + #[test] + fn delete_documents_with_numbers_as_primary_key() { + delete_documents_with_numbers_as_primary_key_(DeletionStrategy::AlwaysHard); + delete_documents_with_numbers_as_primary_key_(DeletionStrategy::AlwaysSoft); + } + + fn delete_documents_with_strange_primary_key_(strategy: DeletionStrategy) { + let index = TempIndex::new(); + + index + .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()])) + .unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "mysuperid": 0, "name": "kevin" }, + { "mysuperid": 1, "name": "kevina" }, + { "mysuperid": 2, "name": "benoit" } + ]), + ) + .unwrap(); + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + + // Delete not all of the documents but some of them. 
+ let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.delete_external_id("0"); + builder.delete_external_id("1"); + builder.strategy(strategy); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, strategy); + db_snap!(index, word_docids, strategy); + db_snap!(index, word_pair_proximity_docids, strategy); + db_snap!(index, soft_deleted_documents_ids, strategy); + } + + #[test] + fn delete_documents_with_strange_primary_key() { + delete_documents_with_strange_primary_key_(DeletionStrategy::AlwaysHard); + delete_documents_with_strange_primary_key_(DeletionStrategy::AlwaysSoft); + } + + fn filtered_placeholder_search_should_not_return_deleted_documents_( + deletion_strategy: DeletionStrategy, + ) { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + settings.set_filterable_fields(hashset! { S("label"), S("label2") }); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { 
"docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"], deletion_strategy); + + // Placeholder search with filter + let filter = Filter::from_str("label = sign").unwrap().unwrap(); + let results = index.search(&wtxn).filter(filter).execute().unwrap(); + assert!(results.documents_ids.is_empty()); + + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, deletion_strategy); + db_snap!(index, word_docids, deletion_strategy); + db_snap!(index, facet_id_f64_docids, deletion_strategy); + db_snap!(index, word_pair_proximity_docids, deletion_strategy); + db_snap!(index, facet_id_exists_docids, deletion_strategy); + db_snap!(index, facet_id_string_docids, deletion_strategy); + } + + #[test] + fn filtered_placeholder_search_should_not_return_deleted_documents() { + filtered_placeholder_search_should_not_return_deleted_documents_( + DeletionStrategy::AlwaysHard, + ); + filtered_placeholder_search_should_not_return_deleted_documents_( + DeletionStrategy::AlwaysSoft, + ); + } + + fn placeholder_search_should_not_return_deleted_documents_( + deletion_strategy: DeletionStrategy, + ) { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { 
"docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"], deletion_strategy); + + // Placeholder search + let results = index.search(&wtxn).execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + } + + #[test] + fn placeholder_search_should_not_return_deleted_documents() { + placeholder_search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); + placeholder_search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); + } + + fn search_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + 
.unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &["1_7", "1_52"], deletion_strategy); + + // search for abstract + let results = index.search(&wtxn).query("abstract").execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, deletion_strategy); + } + + #[test] + fn search_should_not_return_deleted_documents() { + 
search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); + search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); + } + + fn geo_filtered_placeholder_search_should_not_return_deleted_documents_( + deletion_strategy: DeletionStrategy, + ) { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("id")); + settings.set_filterable_fields(hashset!(S("_geo"))); + settings.set_sortable_fields(hashset!(S("_geo"))); + }) + .unwrap(); + + index.add_documents_using_wtxn(&mut wtxn, documents!([ + { "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } }, + { "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } }, + { "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } }, + { "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } }, + { "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } }, + { "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } }, + { "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } }, + { "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } }, + { "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } }, + { "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } }, + { "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } }, + { "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } }, + { "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } }, + { "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } }, + { "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } }, + { "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } }, + { "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 
2.8547 } }, + { "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } }, + { "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } }, + { "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } } + ])).unwrap(); + + let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &external_ids_to_delete, deletion_strategy); + + // Placeholder search with geo filter + let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap(); + let results = index.search(&wtxn).filter(filter).execute().unwrap(); + assert!(!results.documents_ids.is_empty()); + for id in results.documents_ids.iter() { + assert!( + !deleted_internal_ids.contains(id), + "The document {} was supposed to be deleted", + id + ); + } + + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, deletion_strategy); + db_snap!(index, facet_id_f64_docids, deletion_strategy); + db_snap!(index, facet_id_string_docids, deletion_strategy); + } + + #[test] + fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { + geo_filtered_placeholder_search_should_not_return_deleted_documents_( + DeletionStrategy::AlwaysHard, + ); + geo_filtered_placeholder_search_should_not_return_deleted_documents_( + DeletionStrategy::AlwaysSoft, + ); + } + + fn get_documents_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": 
["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } + ]), + ) + .unwrap(); + + let deleted_external_ids = ["1_7", "1_52"]; + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &deleted_external_ids, deletion_strategy); + + // list all documents + let results = index.all_documents(&wtxn).unwrap(); + for result in results { + let (id, _) = result.unwrap(); + assert!( + !deleted_internal_ids.contains(&id), + "The document {} was supposed to be deleted", + id + ); + } + + // list internal document ids + let results = index.documents_ids(&wtxn).unwrap(); + for id in results { + assert!( + !deleted_internal_ids.contains(&id), + "The document {} was supposed to be deleted", + id + ); + } + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + // get internal docids from deleted external document ids + let results = index.external_documents_ids(&rtxn).unwrap(); + for id in deleted_external_ids { + assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id); + } + drop(rtxn); + + 
db_snap!(index, soft_deleted_documents_ids, deletion_strategy); + } + + #[test] + fn get_documents_should_not_return_deleted_documents() { + get_documents_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); + get_documents_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); + } + + fn stats_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) { + let index = TempIndex::new(); + + let mut wtxn = index.write_txn().unwrap(); + + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("docid")); + }) + .unwrap(); + + index.add_documents_using_wtxn(&mut wtxn, documents!([ + { "docid": "1_4", "label": ["sign"]}, + { "docid": "1_5", "label": ["letter"]}, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"}, + { "docid": "1_36", "label": ["drawing","painting","pattern"]}, + { "docid": "1_37", "label": ["art","drawing","outdoor"]}, + { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"}, + { "docid": "1_39", "label": ["abstract"]}, + { "docid": "1_40", "label": ["cartoon"]}, + { "docid": "1_41", "label": ["art","drawing"]}, + { "docid": "1_42", "label": ["art","pattern"]}, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32}, + { "docid": "1_44", "label": ["drawing"], "number": 44i32}, + { "docid": "1_45", "label": ["art"]}, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]}, + { "docid": "1_47", "label": ["abstract","pattern"]}, + { "docid": "1_52", "label": ["abstract","cartoon"]}, + { "docid": "1_57", "label": ["abstract","drawing","pattern"]}, + { "docid": "1_58", "label": ["abstract","art","cartoon"]}, + { "docid": "1_68", "label": ["design"]}, + { "docid": "1_69", "label": ["geometry"]} + ])).unwrap(); + + delete_documents(&mut wtxn, &index, &["1_7", "1_52"], deletion_strategy); + + // count internal documents + let results = index.number_of_documents(&wtxn).unwrap(); 
+ assert_eq!(18, results); + + // count field distribution + let results = index.field_distribution(&wtxn).unwrap(); + assert_eq!(Some(&18), results.get("label")); + assert_eq!(Some(&1), results.get("title")); + assert_eq!(Some(&2), results.get("number")); + + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, deletion_strategy); + } + + #[test] + fn stats_should_not_return_deleted_documents() { + stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); + stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); + } +} diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs new file mode 100644 index 000000000..30f15ebab --- /dev/null +++ b/milli/src/update/facet/bulk.rs @@ -0,0 +1,528 @@ +use std::borrow::Cow; +use std::fs::File; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use roaring::RoaringBitmap; + +use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; +use crate::facet::FacetType; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, +}; +use crate::heed_codec::ByteSliceRefCodec; +use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; +use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; + +/// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases +/// by rebuilding the database "from scratch". +/// +/// First, the new elements are inserted into the level 0 of the database. Then, the +/// higher levels are cleared and recomputed from the content of level 0. +/// +/// Finally, the `faceted_documents_ids` value in the main database of `Index` +/// is updated to contain the new set of faceted documents.
+pub struct FacetsUpdateBulk<'i> { + index: &'i Index, + group_size: u8, + min_level_size: u8, + facet_type: FacetType, + field_ids: Vec, + // None if level 0 does not need to be updated + new_data: Option>, +} + +impl<'i> FacetsUpdateBulk<'i> { + pub fn new( + index: &'i Index, + field_ids: Vec, + facet_type: FacetType, + new_data: grenad::Reader, + group_size: u8, + min_level_size: u8, + ) -> FacetsUpdateBulk<'i> { + FacetsUpdateBulk { + index, + field_ids, + group_size, + min_level_size, + facet_type, + new_data: Some(new_data), + } + } + + pub fn new_not_updating_level_0( + index: &'i Index, + field_ids: Vec, + facet_type: FacetType, + ) -> FacetsUpdateBulk<'i> { + FacetsUpdateBulk { + index, + field_ids, + group_size: FACET_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, + facet_type, + new_data: None, + } + } + + #[logging_timer::time("FacetsUpdateBulk::{}")] + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self; + + let db = match facet_type { + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; + + let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; + + inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { + index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; + Ok(()) + })?; + + Ok(()) + } +} + +/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type +pub(crate) struct FacetsUpdateBulkInner { + pub db: heed::Database, FacetGroupValueCodec>, + pub new_data: Option>, + pub group_size: u8, + pub min_level_size: u8, +} +impl FacetsUpdateBulkInner { + pub fn update( + mut self, + wtxn: &mut RwTxn, + field_ids: &[u16], + mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>, + ) -> Result<()> { + self.update_level0(wtxn)?; + for &field_id in 
field_ids.iter() { + self.clear_levels(wtxn, field_id)?; + } + + for &field_id in field_ids.iter() { + let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?; + + handle_all_docids(wtxn, field_id, all_docids)?; + + for level_reader in level_readers { + let mut cursor = level_reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + self.db.remap_types::().put(wtxn, k, v)?; + } + } + } + Ok(()) + } + + fn clear_levels(&self, wtxn: &mut heed::RwTxn, field_id: FieldId) -> Result<()> { + let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; + let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; + let range = left..=right; + self.db.delete_range(wtxn, &range).map(drop)?; + Ok(()) + } + fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { + let new_data = match self.new_data.take() { + Some(x) => x, + None => return Ok(()), + }; + if self.db.is_empty(wtxn)? { + let mut buffer = Vec::new(); + let mut database = self.db.iter_mut(wtxn)?.remap_types::(); + let mut cursor = new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + if !valid_lmdb_key(key) { + continue; + } + buffer.clear(); + // the group size for level 0 + buffer.push(1); + // then we extend the buffer with the docids bitmap + buffer.extend_from_slice(value); + unsafe { database.append(key, &buffer)? }; + } + } else { + let mut buffer = Vec::new(); + let database = self.db.remap_types::(); + + let mut cursor = new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + if !valid_lmdb_key(key) { + continue; + } + // the value is a CboRoaringBitmap, but I still need to prepend the + // group size for level 0 (= 1) to it + buffer.clear(); + buffer.push(1); + // then we extend the buffer with the docids bitmap + match database.get(wtxn, key)? 
{ + Some(prev_value) => { + let old_bitmap = &prev_value[1..]; + CboRoaringBitmapCodec::merge_into( + &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)], + &mut buffer, + )?; + } + None => { + buffer.extend_from_slice(value); + } + }; + database.put(wtxn, key, &buffer)?; + } + } + Ok(()) + } + fn compute_levels_for_field_id( + &self, + field_id: FieldId, + txn: &RoTxn, + ) -> Result<(Vec>, RoaringBitmap)> { + let mut all_docids = RoaringBitmap::new(); + let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| { + for bitmap in bitmaps { + all_docids |= bitmap; + } + Ok(()) + })?; + + Ok((subwriters, all_docids)) + } + #[allow(clippy::type_complexity)] + fn read_level_0<'t>( + &self, + rtxn: &'t RoTxn, + field_id: u16, + handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, + ) -> Result<()> { + // we read the elements one by one and + // 1. keep track of the left bound + // 2. fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read + let mut bitmaps = vec![]; + + let mut level_0_prefix = vec![]; + level_0_prefix.extend_from_slice(&field_id.to_be_bytes()); + level_0_prefix.push(0); + + let level_0_iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, level_0_prefix.as_slice())? 
+ .remap_types::, FacetGroupValueCodec>(); + + let mut left_bound: &[u8] = &[]; + let mut first_iteration_for_new_group = true; + for el in level_0_iter { + let (key, value) = el?; + let bound = key.left_bound; + let docids = value.bitmap; + + if first_iteration_for_new_group { + left_bound = bound; + first_iteration_for_new_group = false; + } + bitmaps.push(docids); + + if bitmaps.len() == self.group_size as usize { + handle_group(&bitmaps, left_bound)?; + first_iteration_for_new_group = true; + bitmaps.clear(); + } + } + // don't forget to give the leftover bitmaps as well + if !bitmaps.is_empty() { + handle_group(&bitmaps, left_bound)?; + bitmaps.clear(); + } + Ok(()) + } + + /// Compute the content of the database levels from its level 0 for the given field id. + /// + /// ## Returns: + /// A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` + /// that must be inserted into the database. + #[allow(clippy::type_complexity)] + fn compute_higher_levels<'t>( + &self, + rtxn: &'t RoTxn, + field_id: u16, + level: u8, + handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, + ) -> Result>> { + if level == 0 { + self.read_level_0(rtxn, field_id, handle_group)?; + // Level 0 is already in the database + return Ok(vec![]); + } + // level >= 1 + // we compute each element of this level based on the elements of the level below it + // once we have computed `level_group_size` elements, we give the left bound + // of those elements, and their bitmaps, to the level above + + let mut cur_writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); + let mut cur_writer_len: usize = 0; + + let mut group_sizes = vec![]; + let mut left_bounds = vec![]; + let mut bitmaps = vec![]; + + // compute the levels below + // in the callback, we fill `cur_writer` with the correct elements for this level + let mut sub_writers = self.compute_higher_levels( + rtxn, + field_id, + level - 1, + &mut |sub_bitmaps, 
left_bound| { + let mut combined_bitmap = RoaringBitmap::default(); + for bitmap in sub_bitmaps { + combined_bitmap |= bitmap; + } + // The conversion of sub_bitmaps.len() to a u8 will always be correct + // since its length is bounded by max_group_size, which is a u8. + group_sizes.push(sub_bitmaps.len() as u8); + left_bounds.push(left_bound); + + bitmaps.push(combined_bitmap); + if bitmaps.len() != self.group_size as usize { + return Ok(()); + } + let left_bound = left_bounds.first().unwrap(); + handle_group(&bitmaps, left_bound)?; + + for ((bitmap, left_bound), group_size) in + bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) + { + let key = FacetGroupKey { field_id, level, left_bound }; + let key = FacetGroupKeyCodec::::bytes_encode(&key) + .ok_or(Error::Encoding)?; + let value = FacetGroupValue { size: group_size, bitmap }; + let value = + FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; + cur_writer.insert(key, value)?; + cur_writer_len += 1; + } + Ok(()) + }, + )?; + // don't forget to insert the leftover elements into the writer as well + + // but only do so if the current number of elements to be inserted into this + // level could grow to the minimum level size + + if !bitmaps.is_empty() && (cur_writer_len >= self.min_level_size as usize - 1) { + // the length of bitmaps is between 0 and group_size + assert!(bitmaps.len() < self.group_size as usize); + assert!(cur_writer_len > 0); + + let left_bound = left_bounds.first().unwrap(); + handle_group(&bitmaps, left_bound)?; + + // Note: how many bitmaps are there here?
+ for ((bitmap, left_bound), group_size) in + bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) + { + let key = FacetGroupKey { field_id, level, left_bound }; + let key = FacetGroupKeyCodec::::bytes_encode(&key) + .ok_or(Error::Encoding)?; + let value = FacetGroupValue { size: group_size, bitmap }; + let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; + cur_writer.insert(key, value)?; + cur_writer_len += 1; + } + } + // if we inserted enough elements to reach the minimum level size, then we push the writer + if cur_writer_len >= self.min_level_size as usize { + sub_writers.push(writer_into_reader(cur_writer)?); + } else { + // otherwise, if there are still leftover elements, we give them to the level above + // this is necessary in order to get the union of all docids + if !bitmaps.is_empty() { + handle_group(&bitmaps, left_bounds.first().unwrap())?; + } + } + Ok(sub_writers) + } +} + +#[cfg(test)] +mod tests { + use std::iter::once; + + use big_s::S; + use maplit::hashset; + use roaring::RoaringBitmap; + + use crate::documents::documents_batch_reader_from_objects; + use crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::StrRefCodec; + use crate::index::tests::TempIndex; + use crate::update::facet::test_helpers::{ordered_string, FacetIndex}; + use crate::{db_snap, milli_snap}; + + #[test] + fn insert() { + let test = |name: &str, group_size: u8, min_level_size: u8| { + let index = + FacetIndex::::new(group_size, 0 /*NA*/, min_level_size); + + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..1_000u32 { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); + } + for i in 0..100u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); + } + let mut wtxn = index.env.write_txn().unwrap(); + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); + + 
index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + + wtxn.commit().unwrap(); + + milli_snap!(format!("{index}"), name); + }; + + test("default", 4, 5); + test("small_group_small_min_level", 2, 2); + test("small_group_large_min_level", 2, 128); + test("large_group_small_min_level", 16, 2); + test("odd_group_odd_min_level", 7, 3); + } + #[test] + fn insert_delete_field_insert() { + let test = |name: &str, group_size: u8, min_level_size: u8| { + let index = + FacetIndex::::new(group_size, 0 /*NA*/, min_level_size); + let mut wtxn = index.env.write_txn().unwrap(); + + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..100u32 { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); + } + for i in 0..100u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); + } + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); + + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + // delete all the elements for the facet id 0 + for i in 0..100u32 { + index.delete_single_docid(&mut wtxn, 0, &(i as f64), i); + } + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + // then add some elements again for the facet id 1 + for i in 0..110u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); + } + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); + + wtxn.commit().unwrap(); + + milli_snap!(format!("{index}"), name); + }; + + test("default", 4, 5); + test("small_group_small_min_level", 2, 2); + test("small_group_large_min_level", 2, 128); + test("large_group_small_min_level", 16, 2); + test("odd_group_odd_min_level", 7, 3); + } + + #[test] + fn 
bug_3165() { + // Indexing a number of facet values that falls within certain ranges (e.g. 22_540 qualifies) + // would lead to a facet DB which was missing some levels. + // That was because before writing a level into the database, we would + // check that its size was higher than the minimum level size using + // a lossy integer conversion: `level_size as u8 >= min_level_size`. + // + // This missing level in the facet DBs would make the incremental indexer + // (and other search algorithms) crash. + // + // https://github.com/meilisearch/meilisearch/issues/3165 + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! { S("id") }); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..=22_540 { + documents.push( + serde_json::json! { + { + "id": i as u64, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a"); + db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521"); + } + + #[test] + fn insert_string() { + let test = |name: &str, group_size: u8, min_level_size: u8| { + let index = FacetIndex::::new(group_size, 0 /*NA*/, min_level_size); + + let strings = (0..1_000).map(|i| ordered_string(i as usize)).collect::>(); + let mut elements = Vec::<((u16, &str), RoaringBitmap)>::new(); + for i in 0..1_000u32 { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, &strings[i as usize]), once(i).collect())); + } + for i in 0..100u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, &strings[i as usize]), once(i).collect())); + } + let mut wtxn = index.env.write_txn().unwrap(); + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); + +
index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + + wtxn.commit().unwrap(); + + milli_snap!(format!("{index}"), name); + }; + + test("default", 4, 5); + test("small_group_small_min_level", 2, 2); + test("small_group_large_min_level", 2, 128); + test("large_group_small_min_level", 16, 2); + test("odd_group_odd_min_level", 7, 3); + } +} diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs new file mode 100644 index 000000000..883abc8ca --- /dev/null +++ b/milli/src/update/facet/delete.rs @@ -0,0 +1,360 @@ +use std::collections::{HashMap, HashSet}; + +use heed::RwTxn; +use log::debug; +use roaring::RoaringBitmap; +use time::OffsetDateTime; + +use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; +use crate::facet::FacetType; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; +use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}; +use crate::{FieldId, Index, Result}; + +/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases. +/// +/// Depending on the number of removed elements and the existing size of the database, we use either +/// a bulk delete method or an incremental delete method. 
+pub struct FacetsDelete<'i, 'b> { + index: &'i Index, + database: heed::Database, FacetGroupValueCodec>, + facet_type: FacetType, + affected_facet_values: HashMap>>, + docids_to_delete: &'b RoaringBitmap, + group_size: u8, + max_group_size: u8, + min_level_size: u8, +} +impl<'i, 'b> FacetsDelete<'i, 'b> { + pub fn new( + index: &'i Index, + facet_type: FacetType, + affected_facet_values: HashMap>>, + docids_to_delete: &'b RoaringBitmap, + ) -> Self { + let database = match facet_type { + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; + Self { + index, + database, + facet_type, + affected_facet_values, + docids_to_delete, + group_size: FACET_GROUP_SIZE, + max_group_size: FACET_MAX_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, + } + } + + pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> { + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + + for (field_id, affected_facet_values) in self.affected_facet_values { + // This is an incorrect condition, since we assume that the length of the database is equal + // to the number of facet values for the given field_id. It means that in some cases, we might + // wrongly choose the incremental indexer over the bulk indexer. But the only case where that could + // really be a performance problem is when we fully delete a large ratio of all facet values for + // each field id. This would almost never happen. Still, to be overly cautious, I have added a + // 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance + // penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead. + if affected_facet_values.len() >= (self.database.len(wtxn)? 
/ 150) { + // Bulk delete + let mut modified = false; + + for facet_value in affected_facet_values { + let key = + FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() }; + let mut old = self.database.get(wtxn, &key)?.unwrap(); + let previous_len = old.bitmap.len(); + old.bitmap -= self.docids_to_delete; + if old.bitmap.is_empty() { + modified = true; + self.database.delete(wtxn, &key)?; + } else if old.bitmap.len() != previous_len { + modified = true; + self.database.put(wtxn, &key, &old)?; + } + } + if modified { + let builder = FacetsUpdateBulk::new_not_updating_level_0( + self.index, + vec![field_id], + self.facet_type, + ); + builder.execute(wtxn)?; + } + } else { + // Incremental + let inc = FacetsUpdateIncrementalInner { + db: self.database, + group_size: self.group_size, + min_level_size: self.min_level_size, + max_group_size: self.max_group_size, + }; + for facet_value in affected_facet_values { + inc.delete(wtxn, field_id, facet_value.as_slice(), self.docids_to_delete)?; + } + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::iter::FromIterator; + + use big_s::S; + use maplit::hashset; + use rand::seq::SliceRandom; + use rand::SeedableRng; + use roaring::RoaringBitmap; + + use crate::db_snap; + use crate::documents::documents_batch_reader_from_objects; + use crate::index::tests::TempIndex; + use crate::update::facet::test_helpers::ordered_string; + use crate::update::{DeleteDocuments, DeletionStrategy}; + + #[test] + fn delete_mixed_incremental_and_bulk() { + // The point of this test is to create an index populated with documents + // containing different filterable attributes. Then, we delete a bunch of documents + // such that a mix of the incremental and bulk indexer is used (depending on the field id) + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_filterable_fields( + hashset! 
{ S("id"), S("label"), S("timestamp"), S("colour") }, + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push( + serde_json::json! { + { + "id": i, + "label": i / 10, + "colour": i / 100, + "timestamp": i / 2, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576"); + db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf"); + + let mut wtxn = index.env.write_txn().unwrap(); + + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.strategy(DeletionStrategy::AlwaysHard); + builder.delete_documents(&RoaringBitmap::from_iter(0..100)); + // by deleting the first 100 documents, we expect that: + // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) + // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 + // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 + // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 + // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[]"); + db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6"); + db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56"); + } + + // Same test as above but working with string values for the facets + #[test] + fn delete_mixed_incremental_and_bulk_string() { + // The point of this test is to create an index populated with documents + // containing different filterable attributes. 
Then, we delete a bunch of documents + // such that a mix of the incremental and bulk indexer is used (depending on the field id) + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_filterable_fields( + hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push( + serde_json::json! { + { + "id": i, + "label": ordered_string(i / 10), + "colour": ordered_string(i / 100), + "timestamp": ordered_string(i / 2), + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) + db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); + db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); + + let mut wtxn = index.env.write_txn().unwrap(); + + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.strategy(DeletionStrategy::AlwaysHard); + builder.delete_documents(&RoaringBitmap::from_iter(0..100)); + // by deleting the first 100 documents, we expect that: + // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) + // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 + // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 + // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 + // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[]"); + 
db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc"); + db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f"); + } + + #[test] + fn delete_almost_all_incrementally_string() { + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_filterable_fields( + hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push( + serde_json::json! { + { + "id": i, + "label": ordered_string(i / 10), + "colour": ordered_string(i / 100), + "timestamp": ordered_string(i / 2), + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) + db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); + db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); + + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + + let mut docids_to_delete = (0..1000).collect::>(); + docids_to_delete.shuffle(&mut rng); + for docid in docids_to_delete.into_iter().take(990) { + let mut wtxn = index.env.write_txn().unwrap(); + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.strategy(DeletionStrategy::AlwaysHard); + builder.delete_documents(&RoaringBitmap::from_iter([docid])); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + } + + db_snap!(index, soft_deleted_documents_ids, @"[]"); + db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d"); + db_snap!(index, string_faceted_documents_ids, 2, @r###" + 0 [] + 1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] + 2 [292, 324, 358, 381, 493, 839, 852, ] + 3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] + 
"###); + } +} + +#[allow(unused)] +#[cfg(test)] +mod comparison_bench { + use std::iter::once; + + use rand::Rng; + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::OrderedF64Codec; + use crate::update::facet::test_helpers::FacetIndex; + + // This is a simple test to get an intuition on the relative speed + // of the incremental vs. bulk indexer. + // + // The benchmark shows the worst-case scenario for the incremental indexer, since + // each facet value contains only one document ID. + // + // In that scenario, it appears that the incremental indexer is about 70 times slower than the + // bulk indexer. + // #[test] + fn benchmark_facet_indexing_delete() { + let mut r = rand::thread_rng(); + + for i in 1..=20 { + let size = 50_000 * i; + let index = FacetIndex::::new(4, 8, 5); + + let mut txn = index.env.write_txn().unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..size { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); + } + let timer = std::time::Instant::now(); + index.bulk_insert(&mut txn, &[0], elements.iter()); + let time_spent = timer.elapsed().as_millis(); + println!("bulk {size} : {time_spent}ms"); + + txn.commit().unwrap(); + + for nbr_doc in [1, 100, 1000, 10_000] { + let mut txn = index.env.write_txn().unwrap(); + let timer = std::time::Instant::now(); + // + // delete one document + // + for _ in 0..nbr_doc { + let deleted_u32 = r.gen::() % size; + let deleted_f64 = deleted_u32 as f64; + index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32) + } + let time_spent = timer.elapsed().as_millis(); + println!(" delete {nbr_doc} : {time_spent}ms"); + txn.abort().unwrap(); + } + } + } +} diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs new file mode 100644 index 000000000..cffce5525 --- /dev/null +++ b/milli/src/update/facet/incremental.rs @@ -0,0 +1,1218 @@ +use std::collections::HashMap; +use 
std::fs::File; + +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesDecode, Error, RoTxn, RwTxn}; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, +}; +use crate::heed_codec::ByteSliceRefCodec; +use crate::search::facet::get_highest_level; +use crate::update::index_documents::valid_lmdb_key; +use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; + +enum InsertionResult { + InPlace, + Expand, + Insert, +} +enum DeletionResult { + InPlace, + Reduce { next: Option> }, + Remove { next: Option> }, +} + +/// Algorithm to incrementally insert and delete elememts into the +/// `facet_id_(string/f64)_docids` databases. +/// +/// Rhe `faceted_documents_ids` value in the main database of `Index` +/// is also updated to contain the new set of faceted documents. +pub struct FacetsUpdateIncremental<'i> { + index: &'i Index, + inner: FacetsUpdateIncrementalInner, + facet_type: FacetType, + new_data: grenad::Reader, +} + +impl<'i> FacetsUpdateIncremental<'i> { + pub fn new( + index: &'i Index, + facet_type: FacetType, + new_data: grenad::Reader, + group_size: u8, + min_level_size: u8, + max_group_size: u8, + ) -> Self { + FacetsUpdateIncremental { + index, + inner: FacetsUpdateIncrementalInner { + db: match facet_type { + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), + FacetType::Number => index + .facet_id_f64_docids + .remap_key_type::>(), + }, + group_size, + max_group_size, + min_level_size, + }, + facet_type, + new_data, + } + } + + pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { + let mut new_faceted_docids = HashMap::::default(); + + let mut cursor = self.new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + if !valid_lmdb_key(key) { + continue; + } + let key = FacetGroupKeyCodec::::bytes_decode(key) + .ok_or(heed::Error::Encoding)?; + let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; + self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?; + *new_faceted_docids.entry(key.field_id).or_default() |= docids; + } + + for (field_id, new_docids) in new_faceted_docids { + let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; + docids |= new_docids; + self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; + } + Ok(()) + } +} + +/// Implementation of `FacetsUpdateIncremental` that is independent of milli's `Index` type +pub struct FacetsUpdateIncrementalInner { + pub db: heed::Database, FacetGroupValueCodec>, + pub group_size: u8, + pub min_level_size: u8, + pub max_group_size: u8, +} +impl FacetsUpdateIncrementalInner { + /// Find the `FacetGroupKey`/`FacetGroupValue` in the database that + /// should be used to insert the new `facet_value` for the given `field_id` and `level` + /// where `level` must be strictly greater than 0. + /// + /// For example, when inserting the facet value `4`, there are two possibilities: + /// + /// 1. We find a key whose lower bound is 3 followed by a key whose lower bound is 6. Therefore, + /// we know that the implicit range of the first key is 3..6, which contains 4. + /// So the new facet value belongs in that first key/value pair. + /// + /// 2. The first key of the level has a lower bound of `5`. We return this key/value pair + /// but will need to change the lowerbound of this key to `4` in order to insert this facet value. + fn find_insertion_key_value( + &self, + field_id: u16, + level: u8, + facet_value: &[u8], + txn: &RoTxn, + ) -> Result<(FacetGroupKey>, FacetGroupValue)> { + assert!(level > 0); + match self.db.get_lower_than_or_equal_to( + txn, + &FacetGroupKey { field_id, level, left_bound: facet_value }, + )? 
{ + Some((key, value)) => { + if key.level != level { + let mut prefix = vec![]; + prefix.extend_from_slice(&field_id.to_be_bytes()); + prefix.push(level); + + let mut iter = + self.db.as_polymorph().prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + txn, + prefix.as_slice(), + )?; + let (key_bytes, value) = iter.next().unwrap()?; + Ok(( + FacetGroupKeyCodec::::bytes_decode(key_bytes) + .ok_or(Error::Encoding)? + .into_owned(), + value, + )) + } else { + Ok((key.into_owned(), value)) + } + } + None => { + // We checked that the level is > 0 + // Since all keys of level 1 are greater than those of level 0, + // we are guaranteed that db.get_lower_than_or_equal_to(key) exists + panic!() + } + } + } + + /// Insert the given facet value and corresponding document ids in the level 0 of the database + /// + /// ## Return + /// See documentation of `insert_in_level` + fn insert_in_level_0<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + facet_value: &[u8], + docids: &RoaringBitmap, + ) -> Result { + let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; + let value = FacetGroupValue { bitmap: docids.clone(), size: 1 }; + + let mut level0_prefix = vec![]; + level0_prefix.extend_from_slice(&field_id.to_be_bytes()); + level0_prefix.push(0); + + let mut iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, &level0_prefix)?; + + if iter.next().is_none() { + drop(iter); + self.db.put(txn, &key, &value)?; + Ok(InsertionResult::Insert) + } else { + drop(iter); + let old_value = self.db.get(txn, &key)?; + match old_value { + Some(mut updated_value) => { + // now merge the two + updated_value.bitmap |= value.bitmap; + self.db.put(txn, &key, &updated_value)?; + Ok(InsertionResult::InPlace) + } + None => { + self.db.put(txn, &key, &value)?; + Ok(InsertionResult::Insert) + } + } + } + } + + /// Insert the given facet value and corresponding document ids in all the levels of the database up to the given `level`. 
+ /// This function works recursively. + /// + /// ## Return + /// Returns the effect of adding the facet value to the database on the given `level`. + /// + /// - `InsertionResult::InPlace` means that inserting the `facet_value` into the `level` did not have + /// an effect on the number of keys in that level. Therefore, it did not increase the number of children + /// of the parent node. + /// + /// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted + /// in the addition of a new key in that level, and that therefore the number of children + /// of the parent node should be incremented. + fn insert_in_level<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + level: u8, + facet_value: &[u8], + docids: &RoaringBitmap, + ) -> Result { + if level == 0 { + return self.insert_in_level_0(txn, field_id, facet_value, docids); + } + + let max_group_size = self.max_group_size; + + let result = self.insert_in_level(txn, field_id, level - 1, facet_value, docids)?; + // level below inserted an element + + let (insertion_key, insertion_value) = + self.find_insertion_key_value(field_id, level, facet_value, txn)?; + + match result { + // because we know that we inserted in place, the facet_value is not a new one + // thus it doesn't extend a group, and thus the insertion key computed above is + // still correct + InsertionResult::InPlace => { + let mut updated_value = insertion_value; + updated_value.bitmap |= docids; + self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; + + return Ok(InsertionResult::InPlace); + } + InsertionResult::Expand => {} + InsertionResult::Insert => {} + } + + // Here we know that inserting the facet value in the level below resulted in the creation + // of a new key. Therefore, it may be the case that we need to modify the left bound of the + // insertion key (see documentation of `find_insertion_key_value` for an example of when that + // could happen). 
+ let (insertion_key, insertion_key_was_modified) = { + let mut new_insertion_key = insertion_key.clone(); + let mut key_should_be_modified = false; + + if facet_value < insertion_key.left_bound.as_slice() { + new_insertion_key.left_bound = facet_value.to_vec(); + key_should_be_modified = true; + } + if key_should_be_modified { + let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; + assert!(is_deleted); + self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; + } + (new_insertion_key, key_should_be_modified) + }; + // Now we know that the insertion key contains the `facet_value`. + + // We still need to update the insertion value by: + // 1. Incrementing the number of children (since the recursive call returned `InsertionResult::Insert`) + // 2. Merge the previous docids with the new one + let mut updated_value = insertion_value; + + if matches!(result, InsertionResult::Insert) { + updated_value.size += 1; + } + + if updated_value.size < max_group_size { + updated_value.bitmap |= docids; + self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; + if insertion_key_was_modified { + return Ok(InsertionResult::Expand); + } else { + return Ok(InsertionResult::InPlace); + } + } + + // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size` + // Therefore it must be split into two nodes. 
+ + let size_left = updated_value.size / 2; + let size_right = updated_value.size - size_left; + + let level_below = level - 1; + + let start_key = FacetGroupKey { + field_id, + level: level_below, + left_bound: insertion_key.left_bound.as_slice(), + }; + + let mut iter = + self.db.range(txn, &(start_key..))?.take((size_left as usize) + (size_right as usize)); + + let group_left = { + let mut values_left = RoaringBitmap::new(); + + let mut i = 0; + for next in iter.by_ref() { + let (_key, value) = next?; + i += 1; + values_left |= &value.bitmap; + if i == size_left { + break; + } + } + + let key = + FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; + let value = FacetGroupValue { size: size_left, bitmap: values_left }; + (key, value) + }; + + let group_right = { + let ( + FacetGroupKey { left_bound: right_left_bound, .. }, + FacetGroupValue { bitmap: mut values_right, .. }, + ) = iter.next().unwrap()?; + + for next in iter.by_ref() { + let (_, value) = next?; + values_right |= &value.bitmap; + } + + let key = FacetGroupKey { field_id, level, left_bound: right_left_bound.to_vec() }; + let value = FacetGroupValue { size: size_right, bitmap: values_right }; + (key, value) + }; + drop(iter); + + let _ = self.db.delete(txn, &insertion_key.as_ref())?; + + self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?; + self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?; + + Ok(InsertionResult::Insert) + } + + /// Insert the given facet value and corresponding document ids in the database. 
+ pub fn insert<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + facet_value: &[u8], + docids: &RoaringBitmap, + ) -> Result<()> { + if docids.is_empty() { + return Ok(()); + } + let group_size = self.group_size; + + let highest_level = get_highest_level(txn, self.db, field_id)?; + + let result = self.insert_in_level(txn, field_id, highest_level, facet_value, docids)?; + match result { + InsertionResult::InPlace => return Ok(()), + InsertionResult::Expand => return Ok(()), + InsertionResult::Insert => {} + } + + // Here we check whether the highest level has exceeded `min_level_size` * `self.group_size`. + // If it has, we must build an addition level above it. + + let mut highest_level_prefix = vec![]; + highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); + highest_level_prefix.push(highest_level); + + let size_highest_level = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)? + .count(); + + if size_highest_level < self.group_size as usize * self.min_level_size as usize { + return Ok(()); + } + + let mut groups_iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, &highest_level_prefix)?; + + let nbr_new_groups = size_highest_level / self.group_size as usize; + let nbr_leftover_elements = size_highest_level % self.group_size as usize; + + let mut to_add = vec![]; + for _ in 0..nbr_new_groups { + let mut first_key = None; + let mut values = RoaringBitmap::new(); + for _ in 0..group_size { + let (key_bytes, value_i) = groups_iter.next().unwrap()?; + let key_i = FacetGroupKeyCodec::::bytes_decode(key_bytes) + .ok_or(Error::Encoding)?; + + if first_key.is_none() { + first_key = Some(key_i); + } + values |= value_i.bitmap; + } + let key = FacetGroupKey { + field_id, + level: highest_level + 1, + left_bound: first_key.unwrap().left_bound, + }; + let value = FacetGroupValue { size: group_size, bitmap: values }; + to_add.push((key.into_owned(), value)); + } + // 
now we add the rest of the level, in case its size is > group_size * min_level_size + // this can indeed happen if the min_level_size parameter changes between two calls to `insert` + if nbr_leftover_elements > 0 { + let mut first_key = None; + let mut values = RoaringBitmap::new(); + for _ in 0..nbr_leftover_elements { + let (key_bytes, value_i) = groups_iter.next().unwrap()?; + let key_i = FacetGroupKeyCodec::::bytes_decode(key_bytes) + .ok_or(Error::Encoding)?; + + if first_key.is_none() { + first_key = Some(key_i); + } + values |= value_i.bitmap; + } + let key = FacetGroupKey { + field_id, + level: highest_level + 1, + left_bound: first_key.unwrap().left_bound, + }; + // Note: nbr_leftover_elements can be casted to a u8 since it is bounded by `max_group_size` + // when it is created above. + let value = FacetGroupValue { size: nbr_leftover_elements as u8, bitmap: values }; + to_add.push((key.into_owned(), value)); + } + + drop(groups_iter); + for (key, value) in to_add { + self.db.put(txn, &key.as_ref(), &value)?; + } + Ok(()) + } + + /// Delete the given document id from the given facet value in the database, from level 0 to the + /// the given level. + /// + /// ## Return + /// Returns the effect of removing the document id from the database on the given `level`. + /// + /// - `DeletionResult::InPlace` means that deleting the document id did not have + /// an effect on the keys in that level. + /// + /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the + /// number of keys in the level. For example, removing a document id from the facet value `3` could + /// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted + /// entirely. In that case, `DeletionResult::Remove` is returned. The parent of the deleted key must + /// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. 
+ /// + /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the + /// bounds of the keys of the level. For example, removing a document id from the facet value + /// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, + /// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). + /// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust + /// its left bound as well. + fn delete_in_level<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + level: u8, + facet_value: &[u8], + docids: &RoaringBitmap, + ) -> Result { + if level == 0 { + return self.delete_in_level_0(txn, field_id, facet_value, docids); + } + let (deletion_key, mut bitmap) = + self.find_insertion_key_value(field_id, level, facet_value, txn)?; + + let result = self.delete_in_level(txn, field_id, level - 1, facet_value, docids)?; + + let mut decrease_size = false; + let next_key = match result { + DeletionResult::InPlace => { + bitmap.bitmap -= docids; + self.db.put(txn, &deletion_key.as_ref(), &bitmap)?; + return Ok(DeletionResult::InPlace); + } + DeletionResult::Reduce { next } => next, + DeletionResult::Remove { next } => { + decrease_size = true; + next + } + }; + // If either DeletionResult::Reduce or DeletionResult::Remove was returned, + // then we may need to adjust the left_bound of the deletion key. + + // If DeletionResult::Remove was returned, then we need to decrease the group + // size of the deletion key. 
+ let mut updated_value = bitmap; + if decrease_size { + updated_value.size -= 1; + } + + if updated_value.size == 0 { + self.db.delete(txn, &deletion_key.as_ref())?; + Ok(DeletionResult::Remove { next: next_key }) + } else { + let mut updated_deletion_key = deletion_key.clone(); + let reduced_range = facet_value == deletion_key.left_bound; + if reduced_range { + updated_deletion_key.left_bound = next_key.clone().unwrap(); + } + updated_value.bitmap -= docids; + let _ = self.db.delete(txn, &deletion_key.as_ref())?; + self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?; + if reduced_range { + Ok(DeletionResult::Reduce { next: next_key }) + } else { + Ok(DeletionResult::InPlace) + } + } + } + + fn delete_in_level_0<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + facet_value: &[u8], + docids: &RoaringBitmap, + ) -> Result { + let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; + let mut bitmap = self.db.get(txn, &key)?.unwrap().bitmap; + bitmap -= docids; + + if bitmap.is_empty() { + let mut next_key = None; + if let Some((next, _)) = + self.db.remap_data_type::().get_greater_than(txn, &key)? + { + if next.field_id == field_id && next.level == 0 { + next_key = Some(next.left_bound.to_vec()); + } + } + self.db.delete(txn, &key)?; + Ok(DeletionResult::Remove { next: next_key }) + } else { + self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?; + Ok(DeletionResult::InPlace) + } + } + + pub fn delete<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + facet_value: &[u8], + docids: &RoaringBitmap, + ) -> Result<()> { + if self + .db + .remap_data_type::() + .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })? + .is_none() + { + return Ok(()); + } + let highest_level = get_highest_level(txn, self.db, field_id)?; + + let result = self.delete_in_level(txn, field_id, highest_level, facet_value, docids)?; + match result { + DeletionResult::InPlace => return Ok(()), + DeletionResult::Reduce { .. 
} => return Ok(()), + DeletionResult::Remove { .. } => {} + } + + // if we either removed a key from the highest level, its size may have fallen + // below `min_level_size`, in which case we need to remove the entire level + + let mut highest_level_prefix = vec![]; + highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); + highest_level_prefix.push(highest_level); + + if highest_level == 0 + || self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)? + .count() + >= self.min_level_size as usize + { + return Ok(()); + } + let mut to_delete = vec![]; + let mut iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)?; + for el in iter.by_ref() { + let (k, _) = el?; + to_delete.push( + FacetGroupKeyCodec::::bytes_decode(k) + .ok_or(Error::Encoding)? + .into_owned(), + ); + } + drop(iter); + for k in to_delete { + self.db.delete(txn, &k.as_ref())?; + } + Ok(()) + } +} + +impl<'a> FacetGroupKey<&'a [u8]> { + pub fn into_owned(self) -> FacetGroupKey> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.to_vec(), + } + } +} + +impl FacetGroupKey> { + pub fn as_ref(&self) -> FacetGroupKey<&[u8]> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.as_slice(), + } + } +} + +#[cfg(test)] +mod tests { + use rand::seq::SliceRandom; + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::StrRefCodec; + use crate::milli_snap; + use crate::update::facet::test_helpers::FacetIndex; + + #[test] + fn append() { + let index = FacetIndex::::new(4, 8, 5); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + let txn = index.env.read_txn().unwrap(); + 
index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}")); + } + #[test] + fn many_field_ids_append() { + let index = FacetIndex::::new(4, 8, 5); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 2, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 1, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + let txn = index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + index.verify_structure_validity(&txn, 1); + index.verify_structure_validity(&txn, 2); + txn.commit().unwrap(); + milli_snap!(format!("{index}")); + } + #[test] + fn many_field_ids_prepend() { + let index = FacetIndex::::new(4, 8, 5); + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 2, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 1, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + let txn = index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + 
index.verify_structure_validity(&txn, 1); + index.verify_structure_validity(&txn, 2); + txn.commit().unwrap(); + milli_snap!(format!("{index}")); + } + + #[test] + fn prepend() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + } + + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}")); + } + + #[test] + fn shuffled() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut keys = (0..256).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}")); + } + + #[test] + fn merge_values() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut keys = (0..256).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(rng.gen_range(256..512)); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}")); + } + + #[test] + fn delete_from_end() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &(i as f64), 
&bitmap); + } + + for i in (200..256).into_iter().rev() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 200); + let mut txn = index.env.write_txn().unwrap(); + + for i in (150..200).into_iter().rev() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 150); + let mut txn = index.env.write_txn().unwrap(); + for i in (100..150).into_iter().rev() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 100); + let mut txn = index.env.write_txn().unwrap(); + for i in (17..100).into_iter().rev() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 17); + let mut txn = index.env.write_txn().unwrap(); + for i in (15..17).into_iter().rev() { + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 15); + let mut txn = index.env.write_txn().unwrap(); + for i in (0..15).into_iter().rev() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 0); + } + + #[test] + fn delete_from_start() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + for i in 0..256 { + let mut bitmap = RoaringBitmap::new(); + 
bitmap.insert(i); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + } + + for i in 0..128 { + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 127); + let mut txn = index.env.write_txn().unwrap(); + for i in 128..216 { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 215); + let mut txn = index.env.write_txn().unwrap(); + for i in 216..256 { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 255); + } + + #[test] + fn delete_shuffled() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + } + + let mut keys = (0..256).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for i in 0..128 { + let key = keys[i]; + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(key as f64), key as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 127); + let mut txn = index.env.write_txn().unwrap(); + for i in 128..216 { + let key = keys[i]; + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(key as f64), key as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + let mut txn = index.env.write_txn().unwrap(); + milli_snap!(format!("{index}"), 215); + for i in 
216..256 { + let key = keys[i]; + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(key as f64), key as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 255); + } + + #[test] + fn in_place_level0_insert() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut keys = (0..16).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + for i in 0..4 { + for &key in keys.iter() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(rng.gen_range(i * 256..(i + 1) * 256)); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}")); + } + + #[test] + fn in_place_level0_delete() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut keys = (0..64).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for &key in keys.iter() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.verify_structure_validity(&txn, 0); + + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), "before_delete"); + + let mut txn = index.env.write_txn().unwrap(); + + for &key in keys.iter() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(key as f64), key + 100); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), "after_delete"); + } + + #[test] + fn shuffle_merge_string_and_delete() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut keys = 
(1000..1064).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for &key in keys.iter() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &format!("{key:x}").as_str(), &bitmap); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), "before_delete"); + + let mut txn = index.env.write_txn().unwrap(); + + for &key in keys.iter() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &format!("{key:x}").as_str(), key + 100); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), "after_delete"); + } +} + +// fuzz tests +#[cfg(all(test, fuzzing))] +/** +Fuzz test for the incremental indexer. + +The fuzz test uses fuzzcheck, a coverage-guided fuzzer. +See https://github.com/loiclec/fuzzcheck-rs and https://fuzzcheck.neocities.org +for more information. + +It is only run when using the `cargo fuzzcheck` command line tool, which can be installed with: +```sh +cargo install cargo-fuzzcheck +``` +To start the fuzz test, run (from the base folder or from milli/): +```sh +cargo fuzzcheck update::facet::incremental::fuzz::fuzz +``` +and wait a couple of minutes to make sure the code was thoroughly tested, then +hit `Ctrl-C` to stop the fuzzer. The corpus generated by the fuzzer is located in milli/fuzz. + +To work on this module with rust-analyzer working properly, add the following to your .cargo/config.toml file: +```toml +[build] +rustflags = ["--cfg", "fuzzing"] +``` + +The fuzz test generates sequences of additions and deletions to the facet database and +ensures that: +1. its structure is still internally valid +2. 
its content is the same as a trivially correct implementation of the same database +*/ +mod fuzz { + use std::collections::{BTreeMap, HashMap}; + use std::iter::FromIterator; + use std::rc::Rc; + + use fuzzcheck::mutators::integer::U8Mutator; + use fuzzcheck::mutators::integer_within_range::{U16WithinRangeMutator, U8WithinRangeMutator}; + use fuzzcheck::mutators::vector::VecMutator; + use fuzzcheck::DefaultMutator; + use roaring::RoaringBitmap; + use tempfile::TempDir; + + use super::*; + use crate::update::facet::test_helpers::FacetIndex; + #[derive(Default)] + pub struct TrivialDatabase { + pub elements: BTreeMap>, + } + impl TrivialDatabase + where + T: Ord + Clone + Eq + std::fmt::Debug, + { + #[no_coverage] + pub fn insert(&mut self, field_id: u16, new_key: &T, new_values: &RoaringBitmap) { + if new_values.is_empty() { + return; + } + let values_field_id = self.elements.entry(field_id).or_default(); + let values = values_field_id.entry(new_key.clone()).or_default(); + *values |= new_values; + } + #[no_coverage] + pub fn delete(&mut self, field_id: u16, key: &T, values_to_remove: &RoaringBitmap) { + if let Some(values_field_id) = self.elements.get_mut(&field_id) { + if let Some(values) = values_field_id.get_mut(&key) { + *values -= values_to_remove; + if values.is_empty() { + values_field_id.remove(&key); + } + } + if values_field_id.is_empty() { + self.elements.remove(&field_id); + } + } + } + } + #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] + struct Operation { + #[field_mutator(VecMutator = { VecMutator::new(u8::default_mutator(), 0 ..= 5) })] + key: Vec, + #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] + group_size: u8, + #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] + max_group_size: u8, + #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] + min_level_size: u8, + #[field_mutator(U16WithinRangeMutator = { U16WithinRangeMutator::new(..=3) })] + 
field_id: u16, + kind: OperationKind, + } + #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] + enum OperationKind { + Insert( + #[field_mutator(VecMutator = { VecMutator::new(U8Mutator::default(), 0 ..= 10) })] + Vec, + ), + Delete( + #[field_mutator(VecMutator = { VecMutator::new(U8Mutator::default(), 0 ..= 10) })] + Vec, + ), + } + + #[no_coverage] + fn compare_with_trivial_database(tempdir: Rc, operations: &[Operation]) { + let index = FacetIndex::::open_from_tempdir(tempdir, 4, 8, 5); // dummy params, they'll be overwritten + let mut txn = index.env.write_txn().unwrap(); + + let mut trivial_db = TrivialDatabase::>::default(); + let mut value_to_keys = HashMap::>>::new(); + for Operation { key, group_size, max_group_size, min_level_size, field_id, kind } in + operations + { + index.set_group_size(*group_size); + index.set_max_group_size(*max_group_size); + index.set_min_level_size(*min_level_size); + match kind { + OperationKind::Insert(values) => { + let mut bitmap = RoaringBitmap::new(); + for value in values { + bitmap.insert(*value as u32); + value_to_keys.entry(*value).or_default().push(key.clone()); + } + index.insert(&mut txn, *field_id, &key.as_slice(), &bitmap); + trivial_db.insert(*field_id, &key, &bitmap); + } + OperationKind::Delete(values) => { + let values = RoaringBitmap::from_iter(values.iter().copied().map(|x| x as u32)); + let mut values_per_key = HashMap::new(); + + for value in values { + if let Some(keys) = value_to_keys.get(&(value as u8)) { + for key in keys { + let values: &mut RoaringBitmap = + values_per_key.entry(key).or_default(); + values.insert(value); + } + } + } + for (key, values) in values_per_key { + index.delete(&mut txn, *field_id, &key.as_slice(), &values); + trivial_db.delete(*field_id, &key, &values); + } + } + } + } + + for (field_id, values_field_id) in trivial_db.elements.iter() { + let level0iter = index + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + &mut txn, 
+ &field_id.to_be_bytes(), + ) + .unwrap(); + + for ((key, values), group) in values_field_id.iter().zip(level0iter) { + let (group_key, group_values) = group.unwrap(); + let group_key = + FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + assert_eq!(key, &group_key.left_bound); + assert_eq!(values, &group_values.bitmap); + } + } + + for (field_id, values_field_id) in trivial_db.elements.iter() { + let level0iter = index + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &field_id.to_be_bytes()) + .unwrap(); + + for ((key, values), group) in values_field_id.iter().zip(level0iter) { + let (group_key, group_values) = group.unwrap(); + let group_key = + FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + assert_eq!(key, &group_key.left_bound); + assert_eq!(values, &group_values.bitmap); + } + index.verify_structure_validity(&txn, *field_id); + } + txn.abort().unwrap(); + } + + #[test] + #[no_coverage] + fn fuzz() { + let tempdir = Rc::new(TempDir::new().unwrap()); + let tempdir_cloned = tempdir.clone(); + let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| { + compare_with_trivial_database(tempdir_cloned.clone(), operations) + }) + .default_mutator() + .serde_serializer() + .default_sensor_and_pool_with_custom_filter(|file, function| { + file == std::path::Path::new("milli/src/update/facet/incremental.rs") + && !function.contains("serde") + && !function.contains("tests::") + && !function.contains("fuzz::") + && !function.contains("display_bitmap") + }) + .arguments_from_cargo_fuzzcheck() + .launch(); + assert!(!result.found_test_failure); + } +} diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs new file mode 100644 index 000000000..e2895919f --- /dev/null +++ b/milli/src/update/facet/mod.rs @@ -0,0 +1,611 @@ +/*! +This module implements two different algorithms for updating the `facet_id_string_docids` +and `facet_id_f64_docids` databases. 
The first algorithm is a "bulk" algorithm, meaning that +it recreates the database from scratch when new elements are added to it. The second algorithm +is incremental: it modifies the database as little as possible. + +The databases must be able to return results for queries such as: +1. Filter : find all the document ids that have a facet value greater than X and/or smaller than Y +2. Min/Max : find the minimum/maximum facet value among these document ids +3. Sort : sort these document ids by increasing/decreasing facet values +4. Distribution : given some document ids, make a list of each facet value + found in these documents along with the number of documents that contain it + +The algorithms that implement these queries are found in the `src/search/facet` folder. + +To make these queries fast to compute, the database adopts a tree structure: +```ignore + ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ +┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │ +│Level 2│ │ │ │ │ +└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ + ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ +┌───────┐ │ "ab" (2) │ "ba" (2) │ "gaf" (2) │ "form" (2) │ "woz" (2) │ +│Level 1│ │ │ │ │ │ │ +└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ + ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ +┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │ +│Level 0│ │ │ │ │ │ │ │ │ │ │ │ +└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ + └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ +``` +In the diagram above, each cell corresponds to a node in the tree. The first line of the cell +contains the left bound of the range of facet values as well as the number of children of the node. 
+The second line contains the document ids which have a facet value within the range of the node. +The nodes at level 0 are the leaf nodes. They have 0 children and a single facet value in their range. + +In the diagram above, the first cell of level 2 is `ab (2)`. Its range is `ab .. gaf` (because +`gaf` is the left bound of the next node) and it has two children. Its document ids are `[a,b,d,f,z]`. +These documents all contain a facet value that is contained within `ab .. gaf`. + +In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a +[`FacetGroupValue`], which have the following format: + +```ignore +FacetGroupKey: +- field id : u16 +- level : u8 +- left bound: [u8] // the facet value encoded using either OrderedF64Codec or Str + +FacetGroupValue: +- #children : u8 +- docids : RoaringBitmap +``` + +When the database is first created using the "bulk" method, each node has a fixed number of children +(except for possibly the last one) given by the `group_size` parameter (defaults to `FACET_GROUP_SIZE`). +The tree is also built such that the highest level has more than `min_level_size` +(defaults to `FACET_MIN_LEVEL_SIZE`) elements in it. + +When the database is incrementally updated, the number of children of a node can vary between +1 and `max_group_size`. This is done so that most incremental operations do not need to change +the structure of the tree. When the number of children of a node reaches `max_group_size`, +we split the node in two and update the number of children of its parent. + +When adding documents to the databases, it is important to determine which method to use to +minimise indexing time. The incremental method is faster when adding few new facet values, but the +bulk method is faster when a large part of the database is modified. Empirically, it seems that +it takes 50x more time to incrementally add N facet values to an existing database than it does to +construct a database of N facet values. 
This is the heuristic that is used to choose between the +two methods. + +Related PR: https://github.com/meilisearch/milli/pull/619 +*/ + +pub const FACET_MAX_GROUP_SIZE: u8 = 8; +pub const FACET_GROUP_SIZE: u8 = 4; +pub const FACET_MIN_LEVEL_SIZE: u8 = 5; + +use std::fs::File; + +use log::debug; +use time::OffsetDateTime; + +use self::incremental::FacetsUpdateIncremental; +use super::FacetsUpdateBulk; +use crate::facet::FacetType; +use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; +use crate::{Index, Result}; + +pub mod bulk; +pub mod delete; +pub mod incremental; + +/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases. +/// +/// Depending on the number of new elements and the existing size of the database, we use either +/// a bulk update method or an incremental update method. +pub struct FacetsUpdate<'i> { + index: &'i Index, + database: heed::Database, FacetGroupValueCodec>, + facet_type: FacetType, + new_data: grenad::Reader, + group_size: u8, + max_group_size: u8, + min_level_size: u8, +} +impl<'i> FacetsUpdate<'i> { + pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { + let database = match facet_type { + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; + Self { + index, + database, + group_size: FACET_GROUP_SIZE, + max_group_size: FACET_MAX_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, + facet_type, + new_data, + } + } + + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + if self.new_data.is_empty() { + return Ok(()); + } + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + + // See self::comparison_bench::benchmark_facet_indexing + if self.new_data.len() >= 
(self.database.len(wtxn)? as u64 / 50) { + let field_ids = + self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); + let bulk_update = FacetsUpdateBulk::new( + self.index, + field_ids, + self.facet_type, + self.new_data, + self.group_size, + self.min_level_size, + ); + bulk_update.execute(wtxn)?; + } else { + let incremental_update = FacetsUpdateIncremental::new( + self.index, + self.facet_type, + self.new_data, + self.group_size, + self.min_level_size, + self.max_group_size, + ); + incremental_update.execute(wtxn)?; + } + Ok(()) + } +} + +#[cfg(test)] +pub(crate) mod test_helpers { + use std::cell::Cell; + use std::fmt::Display; + use std::iter::FromIterator; + use std::marker::PhantomData; + use std::rc::Rc; + + use heed::types::ByteSlice; + use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; + use roaring::RoaringBitmap; + + use super::bulk::FacetsUpdateBulkInner; + use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + }; + use crate::heed_codec::ByteSliceRefCodec; + use crate::search::facet::get_highest_level; + use crate::snapshot_tests::display_bitmap; + use crate::update::FacetsUpdateIncrementalInner; + use crate::CboRoaringBitmapCodec; + + /// Utility function to generate a string whose position in a lexicographically + /// ordered list is `i`. 
+ pub fn ordered_string(mut i: usize) -> String { + // The first string is empty + if i == 0 { + return String::new(); + } + // The others are 5 char long, each between 'a' and 'z' + let mut s = String::new(); + for _ in 0..5 { + let (digit, next) = (i % 26, i / 26); + s.insert(0, char::from_u32('a' as u32 + digit as u32).unwrap()); + i = next; + } + s + } + + /// A dummy index that only contains the facet database, used for testing + pub struct FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + pub env: Env, + pub content: heed::Database, FacetGroupValueCodec>, + pub group_size: Cell, + pub min_level_size: Cell, + pub max_group_size: Cell, + _tempdir: Rc, + _phantom: PhantomData, + } + + impl FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + #[cfg(all(test, fuzzing))] + pub fn open_from_tempdir( + tempdir: Rc, + group_size: u8, + max_group_size: u8, + min_level_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(16, std::cmp::max(group_size, 2)); // 2 <= x <= 16 + let max_group_size = std::cmp::min(16, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 16 + let min_level_size = std::cmp::min(17, std::cmp::max(1, min_level_size)); // 1 <= x <= 17 + + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 10 * 1000); + unsafe { + options.flag(heed::flags::Flags::MdbAlwaysFreePages); + } + let env = options.open(tempdir.path()).unwrap(); + let content = env.open_database(None).unwrap().unwrap(); + + FacetIndex { + content, + group_size: Cell::new(group_size), + max_group_size: Cell::new(max_group_size), + min_level_size: Cell::new(min_level_size), + _tempdir: tempdir, + env, + _phantom: PhantomData, + } + } + pub fn new( + group_size: u8, + max_group_size: u8, + min_level_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let 
max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 1000 * 100); + let tempdir = tempfile::TempDir::new().unwrap(); + let env = options.open(tempdir.path()).unwrap(); + let content = env.create_database(None).unwrap(); + + FacetIndex { + content, + group_size: Cell::new(group_size), + max_group_size: Cell::new(max_group_size), + min_level_size: Cell::new(min_level_size), + _tempdir: Rc::new(tempdir), + env, + _phantom: PhantomData, + } + } + + #[cfg(all(test, fuzzing))] + pub fn set_group_size(&self, group_size: u8) { + // 2 <= x <= 64 + self.group_size.set(std::cmp::min(64, std::cmp::max(group_size, 2))); + } + #[cfg(all(test, fuzzing))] + pub fn set_max_group_size(&self, max_group_size: u8) { + // 2*group_size <= x <= 128 + let max_group_size = std::cmp::max(4, std::cmp::min(128, max_group_size)); + self.max_group_size.set(max_group_size); + if self.group_size.get() < max_group_size / 2 { + self.group_size.set(max_group_size / 2); + } + } + #[cfg(all(test, fuzzing))] + pub fn set_min_level_size(&self, min_level_size: u8) { + // 1 <= x <= inf + self.min_level_size.set(std::cmp::max(1, min_level_size)); + } + + pub fn insert<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + docids: &RoaringBitmap, + ) { + let update = FacetsUpdateIncrementalInner { + db: self.content, + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), + max_group_size: self.max_group_size.get(), + }; + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.insert(wtxn, field_id, &key_bytes, docids).unwrap(); + } + pub fn delete_single_docid<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + docid: u32, + ) { + self.delete(wtxn, field_id, key, 
&RoaringBitmap::from_iter(std::iter::once(docid))) + } + + pub fn delete<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + docids: &RoaringBitmap, + ) { + let update = FacetsUpdateIncrementalInner { + db: self.content, + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), + max_group_size: self.max_group_size.get(), + }; + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.delete(wtxn, field_id, &key_bytes, docids).unwrap(); + } + + pub fn bulk_insert<'a, 'b>( + &self, + wtxn: &'a mut RwTxn, + field_ids: &[u16], + els: impl IntoIterator< + Item = &'a ((u16, >::EItem), RoaringBitmap), + >, + ) where + for<'c> >::EItem: Sized, + { + let mut new_data = vec![]; + let mut writer = grenad::Writer::new(&mut new_data); + for ((field_id, left_bound), docids) in els { + let left_bound_bytes = BoundCodec::bytes_encode(left_bound).unwrap().into_owned(); + let key: FacetGroupKey<&[u8]> = + FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes }; + let key = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + let value = CboRoaringBitmapCodec::bytes_encode(&docids).unwrap(); + writer.insert(&key, &value).unwrap(); + } + writer.finish().unwrap(); + let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); + + let update = FacetsUpdateBulkInner { + db: self.content, + new_data: Some(reader), + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), + }; + + update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap(); + } + + pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) { + let mut field_id_prefix = vec![]; + field_id_prefix.extend_from_slice(&field_id.to_be_bytes()); + + let highest_level = get_highest_level(txn, self.content, field_id).unwrap(); + + for level_no in (1..=highest_level).rev() { + let mut level_no_prefix = vec![]; + level_no_prefix.extend_from_slice(&field_id.to_be_bytes()); + level_no_prefix.push(level_no); + + 
let mut iter = self + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, &level_no_prefix) + .unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); + + let mut prefix_start_below = vec![]; + prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); + prefix_start_below.push(level_no - 1); + prefix_start_below.extend_from_slice(&key.left_bound); + + let start_below = { + let mut start_below_iter = self + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + txn, + &prefix_start_below, + ) + .unwrap(); + let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); + FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() + }; + + assert!(value.size > 0); + + let mut actual_size = 0; + let mut values_below = RoaringBitmap::new(); + let mut iter_below = self + .content + .range(txn, &(start_below..)) + .unwrap() + .take(value.size as usize); + while let Some(el) = iter_below.next() { + let (_, value) = el.unwrap(); + actual_size += 1; + values_below |= value.bitmap; + } + assert_eq!(actual_size, value.size, "{key:?} start_below: {start_below:?}"); + + assert_eq!(value.bitmap, values_below); + } + } + } + } + + impl Display for FacetIndex + where + for<'a> >::EItem: Sized + Display, + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let txn = self.env.read_txn().unwrap(); + let mut iter = self.content.iter(&txn).unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let FacetGroupKey { field_id, level, left_bound: bound } = key; + let bound = BoundCodec::bytes_decode(bound).unwrap(); + let FacetGroupValue { size, bitmap } = value; + writeln!( + f, + "{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}", + values = display_bitmap(&bitmap) + )?; + } + Ok(()) + } + } +} + 
+#[cfg(test)] +mod tests { + use big_s::S; + use maplit::hashset; + + use crate::db_snap; + use crate::documents::documents_batch_reader_from_objects; + use crate::index::tests::TempIndex; + use crate::update::DeletionStrategy; + + #[test] + fn replace_all_identical_soft_deletion_then_hard_deletion() { + let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_filterable_fields(hashset! { S("size") }); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push( + serde_json::json! { + { + "id": i, + "size": i % 250, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b"); + db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9"); + db_snap!(index, soft_deleted_documents_ids, "initial", @"[]"); + + let mut documents = vec![]; + for i in 0..999 { + documents.push( + serde_json::json! 
{ + { + "id": i, + "size": i % 250, + "other": 0, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f"); + db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06"); + db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123"); + + // Then replace the last document while disabling soft_deletion + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; + let mut documents = vec![]; + for i in 999..1000 { + documents.push( + serde_json::json! { + { + "id": i, + "size": i % 250, + "other": 0, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6"); + db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028"); + db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]"); + } +} + +#[allow(unused)] +#[cfg(test)] +mod comparison_bench { + use std::iter::once; + + use rand::Rng; + use roaring::RoaringBitmap; + + use super::test_helpers::FacetIndex; + use crate::heed_codec::facet::OrderedF64Codec; + + // This is a simple test to get an intuition on the relative speed + // of the incremental vs. bulk indexer. + // + // The benchmark shows the worst-case scenario for the incremental indexer, since + // each facet value contains only one document ID. + // + // In that scenario, it appears that the incremental indexer is about 50 times slower than the + // bulk indexer. 
+ // #[test] + fn benchmark_facet_indexing() { + let mut facet_value = 0; + + let mut r = rand::thread_rng(); + + for i in 1..=20 { + let size = 50_000 * i; + let index = FacetIndex::::new(4, 8, 5); + + let mut txn = index.env.write_txn().unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..size { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, facet_value as f64), once(i).collect())); + facet_value += 1; + } + let timer = std::time::Instant::now(); + index.bulk_insert(&mut txn, &[0], elements.iter()); + let time_spent = timer.elapsed().as_millis(); + println!("bulk {size} : {time_spent}ms"); + + txn.commit().unwrap(); + + for nbr_doc in [1, 100, 1000, 10_000] { + let mut txn = index.env.write_txn().unwrap(); + let timer = std::time::Instant::now(); + // + // insert one document + // + for _ in 0..nbr_doc { + index.insert(&mut txn, 0, &r.gen(), &once(1).collect()); + } + let time_spent = timer.elapsed().as_millis(); + println!(" add {nbr_doc} : {time_spent}ms"); + txn.abort().unwrap(); + } + } + } +} diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap new file mode 100644 index 000000000..bef20823c --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +b40dd31a65e033ffc6b35c027ce19506 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap new file mode 100644 index 000000000..74c40e6a3 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +7ee22d8e9387e72758f00918eb67e4c6 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap 
b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap new file mode 100644 index 000000000..6fb086d35 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +60f567359382507afdaf45fb075740c3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap new file mode 100644 index 000000000..0271a6c6b --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +b986d6e6cbf425685f409a8b417010e1 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..d801ef19f --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +ee10dd2ae2b5c6621a89a5d0a9aa8ccc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap new file mode 100644 index 000000000..e9988f527 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +fa877559eef78b383b496c15a364a2dc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap new file mode 100644 index 000000000..aa52901da --- /dev/null +++ 
b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap new file mode 100644 index 000000000..64f5012a4 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +be1b08073b9d9788d18080c1320151d7 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap new file mode 100644 index 000000000..aa52901da --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..bb0e9aa69 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +32a45d555df2e001420fea149818d376 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap new file mode 100644 index 000000000..b7705b72e --- /dev/null +++ 
b/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +353d70f52eea66e5031dca989ea8a037 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap new file mode 100644 index 000000000..15030a1ea --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +52a093c909133d84023a4a7b83864808 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap new file mode 100644 index 000000000..949ec6647 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +9d86c72ddb241d0aeca2995d61a3648a diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap new file mode 100644 index 000000000..d8797f1ab --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +c0943177594534bfe5527cbf40fe388e diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..f7949c5f3 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- 
+6ed86f234028ae3df5881bee5512f11e diff --git a/milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap new file mode 100644 index 000000000..919f3fe7c --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +5dbfa134cc44abeb3ab6242fc182e48e diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap new file mode 100644 index 000000000..b006c11ab --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap new file mode 100644 index 000000000..bdeeefc13 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +6ed7bf5d440599b3b10b37549a271fdf diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap new file mode 100644 index 000000000..e037c0295 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap @@ -0,0 +1,19 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k0 1 "[0, ]" +0 0 k1 1 "[1, ]" +0 0 k2 1 "[2, ]" +0 0 k3 1 "[3, ]" +0 0 k4 1 "[4, ]" +0 0 k5 1 "[5, ]" +0 0 k6 1 "[6, ]" +0 0 k7 1 "[7, ]" +0 0 k8 1 "[8, ]" +0 0 k9 1 "[9, ]" +0 0 k10 1 "[10, ]" +0 0 k11 1 "[11, ]" +0 0 k12 1 "[12, ]" +0 0 k13 1 "[13, ]" +0 0 k14 1 "[14, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap 
b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap new file mode 100644 index 000000000..e9ccc990f --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b5203f0df0036ebaa133dd77d63a00eb diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap new file mode 100644 index 000000000..a98803604 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap @@ -0,0 +1,26 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k0 1 "[0, ]" +0 0 k1 1 "[1, ]" +0 0 k2 1 "[2, ]" +0 0 k3 1 "[3, ]" +0 0 k4 1 "[4, ]" +0 0 k5 1 "[5, ]" +0 0 k6 1 "[6, ]" +0 0 k7 1 "[7, ]" +0 0 k8 1 "[8, ]" +0 0 k9 1 "[9, ]" +0 0 k10 1 "[10, ]" +0 0 k11 1 "[11, ]" +0 0 k12 1 "[12, ]" +0 0 k13 1 "[13, ]" +0 0 k14 1 "[14, ]" +0 0 k15 1 "[15, ]" +0 0 k16 1 "[16, ]" +0 1 k0 4 "[0, 1, 2, 3, ]" +0 1 k4 4 "[4, 5, 6, 7, ]" +0 1 k8 4 "[8, 9, 10, 11, ]" +0 1 k12 4 "[12, 13, 14, 15, ]" +0 1 k16 1 "[16, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap new file mode 100644 index 000000000..bb07123a9 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +95497d8579740868ee0bfc655b0bf782 diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap new file mode 100644 index 000000000..8714af061 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- 
+d565c2f7bbd9e13e12de40cfbbfba6bb diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap new file mode 100644 index 000000000..1bba99454 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap @@ -0,0 +1,54 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k216 1 "[216, ]" +0 0 k217 1 "[217, ]" +0 0 k218 1 "[218, ]" +0 0 k219 1 "[219, ]" +0 0 k220 1 "[220, ]" +0 0 k221 1 "[221, ]" +0 0 k222 1 "[222, ]" +0 0 k223 1 "[223, ]" +0 0 k224 1 "[224, ]" +0 0 k225 1 "[225, ]" +0 0 k226 1 "[226, ]" +0 0 k227 1 "[227, ]" +0 0 k228 1 "[228, ]" +0 0 k229 1 "[229, ]" +0 0 k230 1 "[230, ]" +0 0 k231 1 "[231, ]" +0 0 k232 1 "[232, ]" +0 0 k233 1 "[233, ]" +0 0 k234 1 "[234, ]" +0 0 k235 1 "[235, ]" +0 0 k236 1 "[236, ]" +0 0 k237 1 "[237, ]" +0 0 k238 1 "[238, ]" +0 0 k239 1 "[239, ]" +0 0 k240 1 "[240, ]" +0 0 k241 1 "[241, ]" +0 0 k242 1 "[242, ]" +0 0 k243 1 "[243, ]" +0 0 k244 1 "[244, ]" +0 0 k245 1 "[245, ]" +0 0 k246 1 "[246, ]" +0 0 k247 1 "[247, ]" +0 0 k248 1 "[248, ]" +0 0 k249 1 "[249, ]" +0 0 k250 1 "[250, ]" +0 0 k251 1 "[251, ]" +0 0 k252 1 "[252, ]" +0 0 k253 1 "[253, ]" +0 0 k254 1 "[254, ]" +0 0 k255 1 "[255, ]" +0 1 k216 4 "[216, 217, 218, 219, ]" +0 1 k220 4 "[220, 221, 222, 223, ]" +0 1 k224 4 "[224, 225, 226, 227, ]" +0 1 k228 4 "[228, 229, 230, 231, ]" +0 1 k232 4 "[232, 233, 234, 235, ]" +0 1 k236 4 "[236, 237, 238, 239, ]" +0 1 k240 4 "[240, 241, 242, 243, ]" +0 1 k244 4 "[244, 245, 246, 247, ]" +0 1 k248 4 "[248, 249, 250, 251, ]" +0 1 k252 4 "[252, 253, 254, 255, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap new file mode 100644 index 000000000..b006c11ab --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap @@ -0,0 +1,4 @@ +--- 
+source: milli/src/update/facet/incremental.rs +--- + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap new file mode 100644 index 000000000..6815ee609 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +7cb503827ba17e9670296cc9531a1380 diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap new file mode 100644 index 000000000..6860385ee --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b061f43e379e16f0617c05d3313d0078 diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap new file mode 100644 index 000000000..b006c11ab --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- + diff --git a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/after_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/after_delete.hash.snap new file mode 100644 index 000000000..f96b42b27 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/after_delete.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +81fc9489d6b163935b97433477dea63b diff --git a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap new file mode 100644 index 
000000000..c57ca72eb --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b17b2c4ec87a778aae07854c96c08b48 diff --git a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap new file mode 100644 index 000000000..82a7ce716 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap @@ -0,0 +1,20 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k0 1 "[3, 435, 583, 849, ]" +0 0 k1 1 "[35, 494, 693, 796, ]" +0 0 k2 1 "[76, 420, 526, 909, ]" +0 0 k3 1 "[133, 451, 653, 806, ]" +0 0 k4 1 "[131, 464, 656, 853, ]" +0 0 k5 1 "[61, 308, 701, 903, ]" +0 0 k6 1 "[144, 449, 674, 794, ]" +0 0 k7 1 "[182, 451, 735, 941, ]" +0 0 k8 1 "[6, 359, 679, 1003, ]" +0 0 k9 1 "[197, 418, 659, 904, ]" +0 0 k10 1 "[88, 297, 567, 800, ]" +0 0 k11 1 "[150, 309, 530, 946, ]" +0 0 k12 1 "[156, 466, 567, 892, ]" +0 0 k13 1 "[46, 425, 610, 807, ]" +0 0 k14 1 "[236, 433, 549, 891, ]" +0 0 k15 1 "[207, 472, 603, 974, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap new file mode 100644 index 000000000..fd4beeca8 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +7f8aa18d2b3a6422d55c03bede0563db diff --git a/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap new file mode 100644 
index 000000000..fd4beeca8 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +7f8aa18d2b3a6422d55c03bede0563db diff --git a/milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap new file mode 100644 index 000000000..d055892f5 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b3e2de9020d9e0f3941bc3a179c795ba diff --git a/milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap new file mode 100644 index 000000000..919f3fe7c --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +5dbfa134cc44abeb3ab6242fc182e48e diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/after_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/after_delete.hash.snap new file mode 100644 index 000000000..1802eb952 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/after_delete.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +9343355bf535ed4a0c956df2b229d5e6 diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/before_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/before_delete.hash.snap new file mode 100644 index 000000000..2b6805676 --- /dev/null +++ 
b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/before_delete.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +4fc800f49201a336295af0542fdf01ab diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap new file mode 100644 index 000000000..5ef88bfb4 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +fd65ce7d96a07aafb0ef6cfb5bf016b8 diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/milli/src/update/facets.rs @@ -0,0 +1 @@ + diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs new file mode 100644 index 000000000..3331497c9 --- /dev/null +++ b/milli/src/update/index_documents/enrich.rs @@ -0,0 +1,389 @@ +use std::io::{Read, Seek}; +use std::result::Result as StdResult; +use std::{fmt, iter}; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; +use crate::error::{GeoError, InternalError, UserError}; +use crate::update::index_documents::{obkv_to_object, writer_into_reader}; +use crate::{FieldId, Index, Object, Result}; + +/// The symbol used to define levels in a nested primary key. +const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; + +/// The default primary that is used when not specified. +const DEFAULT_PRIMARY_KEY: &str = "id"; + +/// This function validates and enrich the documents by checking that: +/// - we can infer a primary key, +/// - all the documents id exist and are extracted, +/// - the validity of them but also, +/// - the validity of the `_geo` field depending on the settings. 
+/// +/// # Panics +/// +/// - if reader.is_empty(), this function may panic in some cases +pub fn enrich_documents_batch( + rtxn: &heed::RoTxn, + index: &Index, + autogenerate_docids: bool, + reader: DocumentsBatchReader, +) -> Result, UserError>> { + let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index(); + + let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; + let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; + + // The primary key *field id* that has already been set for this index or the one + // we will guess by searching for the first key that contains "id" as a substring. + let primary_key = match index.primary_key(rtxn)? { + Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => { + PrimaryKey::nested(primary_key) + } + Some(primary_key) => match documents_batch_index.id(primary_key) { + Some(id) => PrimaryKey::flat(primary_key, id), + None if autogenerate_docids => { + PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key)) + } + None => { + return match cursor.next_document()? { + Some(first_document) => Ok(Err(UserError::MissingDocumentId { + primary_key: primary_key.to_string(), + document: obkv_to_object(&first_document, &documents_batch_index)?, + })), + None => unreachable!("Called with reader.is_empty()"), + }; + } + }, + None => { + let mut guesses: Vec<(u16, &str)> = documents_batch_index + .iter() + .filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY)) + .map(|(field_id, name)| (*field_id, name.as_str())) + .collect(); + + // sort the keys in a deterministic, obvious way, so that fields are always in the same order. 
+ guesses.sort_by(|(_, left_name), (_, right_name)| { + // shortest name first + left_name.len().cmp(&right_name.len()).then_with( + // then alphabetical order + || left_name.cmp(right_name), + ) + }); + + match guesses.as_slice() { + [] if autogenerate_docids => PrimaryKey::flat( + DEFAULT_PRIMARY_KEY, + documents_batch_index.insert(DEFAULT_PRIMARY_KEY), + ), + [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), + [(field_id, name)] => { + log::info!("Primary key was not specified in index. Inferred to '{name}'"); + PrimaryKey::flat(name, *field_id) + } + multiple => { + return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { + candidates: multiple + .iter() + .map(|(_, candidate)| candidate.to_string()) + .collect(), + })); + } + } + } + }; + + // If the settings specifies that a _geo field must be used therefore we must check the + // validity of it in all the documents of this batch and this is when we return `Some`. + let geo_field_id = match documents_batch_index.id("_geo") { + Some(geo_field_id) if index.sortable_fields(rtxn)?.contains("_geo") => Some(geo_field_id), + _otherwise => None, + }; + + let mut count = 0; + while let Some(document) = cursor.next_document()? { + let document_id = match fetch_or_generate_document_id( + &document, + &documents_batch_index, + primary_key, + autogenerate_docids, + &mut uuid_buffer, + count, + )? { + Ok(document_id) => document_id, + Err(user_error) => return Ok(Err(user_error)), + }; + + if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { + if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? 
{ + return Ok(Err(UserError::from(user_error))); + } + } + + let document_id = serde_json::to_vec(&document_id).map_err(InternalError::SerdeJson)?; + external_ids.insert(count.to_be_bytes(), document_id)?; + + count += 1; + } + + let external_ids = writer_into_reader(external_ids)?; + let primary_key_name = primary_key.name().to_string(); + let reader = EnrichedDocumentsBatchReader::new( + DocumentsBatchReader::new(cursor, documents_batch_index), + primary_key_name, + external_ids, + )?; + + Ok(Ok(reader)) +} + +/// Retrieve the document id after validating it, returning a `UserError` +/// if the id is invalid or can't be guessed. +fn fetch_or_generate_document_id( + document: &obkv::KvReader, + documents_batch_index: &DocumentsBatchIndex, + primary_key: PrimaryKey, + autogenerate_docids: bool, + uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], + count: u32, +) -> Result> { + match primary_key { + PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { + match document.get(primary_key_id) { + Some(document_id_bytes) => { + let document_id = serde_json::from_slice(document_id_bytes) + .map_err(InternalError::SerdeJson)?; + match validate_document_id_value(document_id)? { + Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), + Err(user_error) => Ok(Err(user_error)), + } + } + None if autogenerate_docids => { + let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); + Ok(Ok(DocumentId::generated(uuid.to_string(), count))) + } + None => Ok(Err(UserError::MissingDocumentId { + primary_key: primary_key.to_string(), + document: obkv_to_object(document, documents_batch_index)?, + })), + } + } + nested @ PrimaryKey::Nested { .. 
} => { + let mut matching_documents_ids = Vec::new(); + for (first_level_name, right) in nested.possible_level_names() { + if let Some(field_id) = documents_batch_index.id(first_level_name) { + if let Some(value_bytes) = document.get(field_id) { + let object = serde_json::from_slice(value_bytes) + .map_err(InternalError::SerdeJson)?; + fetch_matching_values(object, right, &mut matching_documents_ids); + + if matching_documents_ids.len() >= 2 { + return Ok(Err(UserError::TooManyDocumentIds { + primary_key: nested.name().to_string(), + document: obkv_to_object(document, documents_batch_index)?, + })); + } + } + } + } + + match matching_documents_ids.pop() { + Some(document_id) => match validate_document_id_value(document_id)? { + Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), + Err(user_error) => Ok(Err(user_error)), + }, + None => Ok(Err(UserError::MissingDocumentId { + primary_key: nested.name().to_string(), + document: obkv_to_object(document, documents_batch_index)?, + })), + } + } + } +} + +/// A type that represent the type of primary key that has been set +/// for this index, a classic flat one or a nested one. +#[derive(Debug, Clone, Copy)] +enum PrimaryKey<'a> { + Flat { name: &'a str, field_id: FieldId }, + Nested { name: &'a str }, +} + +impl PrimaryKey<'_> { + fn flat(name: &str, field_id: FieldId) -> PrimaryKey { + PrimaryKey::Flat { name, field_id } + } + + fn nested(name: &str) -> PrimaryKey { + PrimaryKey::Nested { name } + } + + fn name(&self) -> &str { + match self { + PrimaryKey::Flat { name, .. } => name, + PrimaryKey::Nested { name } => name, + } + } + + /// Returns an `Iterator` that gives all the possible fields names the primary key + /// can have depending of the first level name and deepnes of the objects. 
+ fn possible_level_names(&self) -> impl Iterator + '_ { + let name = self.name(); + name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) + .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) + .chain(iter::once((name, ""))) + } +} + +/// A type that represents a document id that has been retrieved from a document or auto-generated. +/// +/// In case the document id has been auto-generated, the document nth is kept to help +/// users debug if there is an issue with the document itself. +#[derive(Serialize, Deserialize, Clone)] +pub enum DocumentId { + Retrieved { value: String }, + Generated { value: String, document_nth: u32 }, +} + +impl DocumentId { + fn retrieved(value: String) -> DocumentId { + DocumentId::Retrieved { value } + } + + fn generated(value: String, document_nth: u32) -> DocumentId { + DocumentId::Generated { value, document_nth } + } + + fn debug(&self) -> String { + format!("{:?}", self) + } + + pub fn is_generated(&self) -> bool { + matches!(self, DocumentId::Generated { .. }) + } + + pub fn value(&self) -> &str { + match self { + DocumentId::Retrieved { value } => value, + DocumentId::Generated { value, .. 
} => value, + } + } +} + +impl fmt::Debug for DocumentId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + DocumentId::Retrieved { value } => write!(f, "{:?}", value), + DocumentId::Generated { value, document_nth } => { + write!(f, "{{{:?}}} of the {}nth document", value, document_nth) + } + } + } +} + +fn starts_with(selector: &str, key: &str) -> bool { + selector.strip_prefix(key).map_or(false, |tail| { + tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) + }) +} + +pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec) { + match value { + Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), + otherwise => output.push(otherwise), + } +} + +pub fn fetch_matching_values_in_object( + object: Object, + selector: &str, + base_key: &str, + output: &mut Vec, +) { + for (key, value) in object { + let base_key = if base_key.is_empty() { + key.to_string() + } else { + format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) + }; + + if starts_with(selector, &base_key) { + match value { + Value::Object(object) => { + fetch_matching_values_in_object(object, selector, &base_key, output) + } + value => output.push(value), + } + } + } +} + +pub fn validate_document_id(document_id: &str) -> Option<&str> { + if !document_id.is_empty() + && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) + { + Some(document_id) + } else { + None + } +} + +/// Parses a Json encoded document id and validate it, returning a user error when it is one. 
+pub fn validate_document_id_value(document_id: Value) -> Result> { + match document_id { + Value::String(string) => match validate_document_id(&string) { + Some(s) if s.len() == string.len() => Ok(Ok(string)), + Some(s) => Ok(Ok(s.to_string())), + None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), + }, + Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), + content => Ok(Err(UserError::InvalidDocumentId { document_id: content })), + } +} + +/// Try to extract an `f64` from a JSON `Value` and return the `Value` +/// in the `Err` variant if it failed. +pub fn extract_finite_float_from_value(value: Value) -> StdResult { + let number = match value { + Value::Number(ref n) => match n.as_f64() { + Some(number) => number, + None => return Err(value), + }, + Value::String(ref s) => match s.parse::() { + Ok(number) => number, + Err(_) => return Err(value), + }, + value => return Err(value), + }; + + if number.is_finite() { + Ok(number) + } else { + Err(value) + } +} + +pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result> { + use GeoError::*; + let debug_id = || Value::from(id.debug()); + match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? 
{ + Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) { + (Some(lat), Some(lng)) => { + match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) { + (Ok(_), Ok(_)) => Ok(Ok(())), + (Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })), + (Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })), + (Err(lat), Err(lng)) => { + Ok(Err(BadLatitudeAndLongitude { document_id: debug_id(), lat, lng })) + } + } + } + (None, Some(_)) => Ok(Err(MissingLatitude { document_id: debug_id() })), + (Some(_), None) => Ok(Err(MissingLongitude { document_id: debug_id() })), + (None, None) => Ok(Err(MissingLatitudeAndLongitude { document_id: debug_id() })), + }, + value => Ok(Err(NotAnObject { document_id: debug_id(), value })), + } +} diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs new file mode 100644 index 000000000..be9b479bb --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -0,0 +1,174 @@ +use std::collections::HashSet; +use std::convert::TryInto; +use std::fs::File; +use std::{io, mem, str}; + +use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder}; +use roaring::RoaringBitmap; +use serde_json::Value; + +use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; +use crate::error::{InternalError, SerializationError}; +use crate::{ + absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, +}; + +/// Extracts the word and positions where this word appear and +/// prefixes it by the document id. +/// +/// Returns the generated internal documents ids and a grenad reader +/// with the list of extracted words from the given chunk of documents. 
+#[logging_timer::time] +pub fn extract_docid_word_positions( + obkv_documents: grenad::Reader, + indexer: GrenadParameters, + searchable_fields: &Option>, + stop_words: Option<&fst::Set<&[u8]>>, + max_positions_per_attributes: Option, +) -> Result<(RoaringBitmap, grenad::Reader)> { + let max_positions_per_attributes = max_positions_per_attributes + .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); + let max_memory = indexer.max_memory_by_thread(); + + let mut documents_ids = RoaringBitmap::new(); + let mut docid_word_positions_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + concat_u32s_array, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut key_buffer = Vec::new(); + let mut field_buffer = String::new(); + let mut builder = TokenizerBuilder::new(); + if let Some(stop_words) = stop_words { + builder.stop_words(stop_words); + } + let tokenizer = builder.build(); + + let mut cursor = obkv_documents.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + let document_id = key + .try_into() + .map(u32::from_be_bytes) + .map_err(|_| SerializationError::InvalidNumberSerialization)?; + let obkv = obkv::KvReader::::new(value); + + documents_ids.push(document_id); + key_buffer.clear(); + key_buffer.extend_from_slice(&document_id.to_be_bytes()); + + for (field_id, field_bytes) in obkv.iter() { + if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { + let value = + serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + field_buffer.clear(); + if let Some(field) = json_to_string(&value, &mut field_buffer) { + let tokens = process_tokens(tokenizer.tokenize(field)) + .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); + + for (index, token) in tokens { + let token = token.lemma().trim(); + if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(token.as_bytes()); + + let position: u16 = index + .try_into() + .map_err(|_| SerializationError::InvalidNumberSerialization)?; + let position = absolute_from_relative_position(field_id, position); + docid_word_positions_sorter + .insert(&key_buffer, position.to_ne_bytes())?; + } + } + } + } + } + } + + sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader)) +} + +/// Transform a JSON value into a string that can be indexed. +fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> { + fn inner(value: &Value, output: &mut String) -> bool { + use std::fmt::Write; + match value { + Value::Null => false, + Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(), + Value::Number(number) => write!(output, "{}", number).is_ok(), + Value::String(string) => write!(output, "{}", string).is_ok(), + Value::Array(array) => { + let mut count = 0; + for value in array { + if inner(value, output) { + output.push_str(". 
"); + count += 1; + } + } + // check that at least one value was written + count != 0 + } + Value::Object(object) => { + let mut buffer = String::new(); + let mut count = 0; + for (key, value) in object { + buffer.clear(); + let _ = write!(&mut buffer, "{}: ", key); + if inner(value, &mut buffer) { + buffer.push_str(". "); + // We write the "key: value. " pair only when + // we are sure that the value can be written. + output.push_str(&buffer); + count += 1; + } + } + // check that at least one value was written + count != 0 + } + } + } + + if let Value::String(string) = value { + Some(string) + } else if inner(value, buffer) { + Some(buffer) + } else { + None + } +} + +/// take an iterator on tokens and compute their relative position depending on separator kinds +/// if it's an `Hard` separator we add an additional relative proximity of 8 between words, +/// else we keep the standart proximity of 1 between words. +fn process_tokens<'a>( + tokens: impl Iterator>, +) -> impl Iterator)> { + tokens + .skip_while(|token| token.is_separator()) + .scan((0, None), |(offset, prev_kind), token| { + match token.kind { + TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => { + *offset += match *prev_kind { + Some(TokenKind::Separator(SeparatorKind::Hard)) => 8, + Some(_) => 1, + None => 0, + }; + *prev_kind = Some(token.kind) + } + TokenKind::Separator(SeparatorKind::Hard) => { + *prev_kind = Some(token.kind); + } + TokenKind::Separator(SeparatorKind::Soft) + if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => + { + *prev_kind = Some(token.kind); + } + _ => (), + } + Some((*offset, token)) + }) + .filter(|(_, t)| t.is_word()) +} diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs new file mode 100644 index 000000000..33dd5ce5b --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -0,0 +1,45 @@ +use 
std::fs::File; +use std::io; + +use heed::{BytesDecode, BytesEncode}; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, +}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec, +}; +use crate::Result; + +/// Extracts the facet number and the documents ids where this facet number appear. +/// +/// Returns a grenad reader with the list of extracted facet numbers and +/// documents ids from the given chunk of docid facet number positions. +#[logging_timer::time] +pub fn extract_facet_number_docids( + docid_fid_facet_number: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut facet_number_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut cursor = docid_fid_facet_number.into_cursor()?; + while let Some((key_bytes, _)) = cursor.move_on_next()? 
{ + let (field_id, document_id, number) = + FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); + + let key = FacetGroupKey { field_id, level: 0, left_bound: number }; + let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + } + + sorter_into_reader(facet_number_docids_sorter, indexer) +} diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs new file mode 100644 index 000000000..0d9c0981e --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -0,0 +1,59 @@ +use std::fs::File; +use std::io; + +use heed::BytesEncode; + +use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; +use crate::heed_codec::StrRefCodec; +use crate::update::index_documents::merge_cbo_roaring_bitmaps; +use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; + +/// Extracts the facet string and the documents ids where this facet string appear. +/// +/// Returns a grenad reader with the list of extracted facet strings and +/// documents ids from the given chunk of docid facet string positions. +#[logging_timer::time] +pub fn extract_facet_string_docids( + docid_fid_facet_string: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut facet_string_docids_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut cursor = docid_fid_facet_string.into_cursor()?; + while let Some((key, _original_value_bytes)) = cursor.move_on_next()? 
{ + let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); + let field_id = FieldId::from_be_bytes(field_id_bytes); + + let (document_id_bytes, normalized_value_bytes) = + try_split_array_at::<_, 4>(bytes).unwrap(); + let document_id = u32::from_be_bytes(document_id_bytes); + + let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?; + + let normalised_truncated_value: String; + if normalised_value.len() > MAX_FACET_VALUE_LENGTH { + normalised_truncated_value = normalised_value + .char_indices() + .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect(); + normalised_value = normalised_truncated_value.as_str(); + } + let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; + let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + // document id is encoded in native-endian because of the CBO roaring bitmap codec + facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; + } + + sorter_into_reader(facet_string_docids_sorter, indexer) +} diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs new file mode 100644 index 000000000..0a7dfbeb1 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -0,0 +1,157 @@ +use std::collections::{BTreeMap, HashSet}; +use std::convert::TryInto; +use std::fs::File; +use std::io; +use std::mem::size_of; + +use heed::zerocopy::AsBytes; +use heed::BytesEncode; +use roaring::RoaringBitmap; +use serde_json::Value; + +use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; +use crate::error::InternalError; +use crate::facet::value_encoding::f64_into_bytes; +use crate::update::index_documents::{create_writer, writer_into_reader}; +use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH}; + +/// Extracts the facet 
values of each faceted field of each document. +/// +/// Returns the generated grenad reader containing the docid the fid and the orginal value as key +/// and the normalized value as value extracted from the given chunk of documents. +#[logging_timer::time] +pub fn extract_fid_docid_facet_values( + obkv_documents: grenad::Reader, + indexer: GrenadParameters, + faceted_fields: &HashSet, +) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { + let max_memory = indexer.max_memory_by_thread(); + + let mut fid_docid_facet_numbers_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + keep_first, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 2), + ); + + let mut fid_docid_facet_strings_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + keep_first, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 2), + ); + + let mut facet_exists_docids = BTreeMap::::new(); + + let mut key_buffer = Vec::new(); + let mut cursor = obkv_documents.into_cursor()?; + while let Some((docid_bytes, value)) = cursor.move_on_next()? 
{ + let obkv = obkv::KvReader::new(value); + + for (field_id, field_bytes) in obkv.iter() { + if faceted_fields.contains(&field_id) { + key_buffer.clear(); + + // Set key to the field_id + // Note: this encoding is consistent with FieldIdCodec + key_buffer.extend_from_slice(&field_id.to_be_bytes()); + + // Here, we know already that the document must be added to the “field id exists” database + let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); + let document = BEU32::from(document).get(); + + facet_exists_docids.entry(field_id).or_default().insert(document); + + // For the other extraction tasks, prefix the key with the field_id and the document_id + key_buffer.extend_from_slice(docid_bytes); + + let value = + serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + + let (numbers, strings) = extract_facet_values(&value); + + // insert facet numbers in sorter + for number in numbers { + key_buffer.truncate(size_of::() + size_of::()); + if let Some(value_bytes) = f64_into_bytes(number) { + key_buffer.extend_from_slice(&value_bytes); + key_buffer.extend_from_slice(&number.to_be_bytes()); + + fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?; + } + } + + // insert normalized and original facet string in sorter + for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) { + let normalised_truncated_value: String = normalized + .char_indices() + .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) + .map(|(_, c)| c) + .collect(); + + key_buffer.truncate(size_of::() + size_of::()); + key_buffer.extend_from_slice(normalised_truncated_value.as_bytes()); + fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?; + } + } + } + } + + let mut facet_exists_docids_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + for (fid, bitmap) in facet_exists_docids.into_iter() { + let bitmap_bytes = 
CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); + facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + } + let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; + + Ok(( + sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, + sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, + facet_exists_docids_reader, + )) +} + +fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) { + fn inner_extract_facet_values( + value: &Value, + can_recurse: bool, + output_numbers: &mut Vec, + output_strings: &mut Vec<(String, String)>, + ) { + match value { + Value::Null => (), + Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())), + Value::Number(number) => { + if let Some(float) = number.as_f64() { + output_numbers.push(float); + } + } + Value::String(original) => { + let normalized = original.trim().to_lowercase(); + output_strings.push((normalized, original.clone())); + } + Value::Array(values) => { + if can_recurse { + for value in values { + inner_extract_facet_values(value, false, output_numbers, output_strings); + } + } + } + Value::Object(_) => (), + } + } + + let mut facet_number_values = Vec::new(); + let mut facet_string_values = Vec::new(); + inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); + + (facet_number_values, facet_string_values) +} diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs new file mode 100644 index 000000000..315ebdf0c --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -0,0 +1,96 @@ +use std::collections::HashMap; +use std::fs::File; +use std::{cmp, io}; + +use grenad::Sorter; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, + try_split_array_at, GrenadParameters, MergeFn, +}; +use 
crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::{relative_from_absolute_position, DocumentId, FieldId, Result}; + +/// Extracts the field id word count and the documents ids where +/// this field id with this amount of words appear. +/// +/// Returns a grenad reader with the list of extracted field id word counts +/// and documents ids from the given chunk of docid word positions. +#[logging_timer::time] +pub fn extract_fid_word_count_docids( + docid_word_positions: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut fid_word_count_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + // This map is assumed to not consume a lot of memory. + let mut document_fid_wordcount = HashMap::new(); + let mut current_document_id = None; + + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + let (document_id_bytes, _word_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let document_id = u32::from_be_bytes(document_id_bytes); + + let curr_document_id = *current_document_id.get_or_insert(document_id); + if curr_document_id != document_id { + drain_document_fid_wordcount_into_sorter( + &mut fid_word_count_docids_sorter, + &mut document_fid_wordcount, + curr_document_id, + )?; + current_document_id = Some(document_id); + } + + for position in read_u32_ne_bytes(value) { + let (field_id, position) = relative_from_absolute_position(position); + let word_count = position as u32 + 1; + + let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0); + *value = cmp::max(*value, word_count); + } + } + + if let Some(document_id) = current_document_id { + // We must make sure that don't lose the current document field id + // word count map if we break because we reached the end of the chunk. + drain_document_fid_wordcount_into_sorter( + &mut fid_word_count_docids_sorter, + &mut document_fid_wordcount, + document_id, + )?; + } + + sorter_into_reader(fid_word_count_docids_sorter, indexer) +} + +fn drain_document_fid_wordcount_into_sorter( + fid_word_count_docids_sorter: &mut Sorter, + document_fid_wordcount: &mut HashMap, + document_id: DocumentId, +) -> Result<()> { + let mut key_buffer = Vec::new(); + + for (fid, count) in document_fid_wordcount.drain() { + if count <= 10 { + key_buffer.clear(); + key_buffer.extend_from_slice(&fid.to_be_bytes()); + key_buffer.push(count as u8); + + fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + } + } + + Ok(()) +} diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs new file mode 100644 index 000000000..55044e712 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -0,0 +1,65 @@ +use 
std::fs::File; +use std::io; + +use concat_arrays::concat_arrays; +use serde_json::Value; + +use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; +use crate::error::GeoError; +use crate::update::index_documents::extract_finite_float_from_value; +use crate::{FieldId, InternalError, Result}; + +/// Extracts the geographical coordinates contained in each document under the `_geo` field. +/// +/// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude) +#[logging_timer::time] +pub fn extract_geo_points( + obkv_documents: grenad::Reader, + indexer: GrenadParameters, + primary_key_id: FieldId, + (lat_fid, lng_fid): (FieldId, FieldId), +) -> Result> { + let mut writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + let mut cursor = obkv_documents.into_cursor()?; + while let Some((docid_bytes, value)) = cursor.move_on_next()? { + let obkv = obkv::KvReader::new(value); + // since we only needs the primary key when we throw an error we create this getter to + // lazily get it when needed + let document_id = || -> Value { + let document_id = obkv.get(primary_key_id).unwrap(); + serde_json::from_slice(document_id).unwrap() + }; + + // first we get the two fields + let lat = obkv.get(lat_fid); + let lng = obkv.get(lng_fid); + + if let Some((lat, lng)) = lat.zip(lng) { + // then we extract the values + let lat = extract_finite_float_from_value( + serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, + ) + .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; + + let lng = extract_finite_float_from_value( + serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, + ) + .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; + + #[allow(clippy::drop_non_drop)] + let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; + writer.insert(docid_bytes, 
bytes)?; + } else if lat.is_none() && lng.is_some() { + return Err(GeoError::MissingLatitude { document_id: document_id() })?; + } else if lat.is_some() && lng.is_none() { + return Err(GeoError::MissingLongitude { document_id: document_id() })?; + } + } + + writer_into_reader(writer) +} diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs new file mode 100644 index 000000000..da59f9dde --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -0,0 +1,88 @@ +use std::collections::HashSet; +use std::fs::File; +use std::io; +use std::iter::FromIterator; + +use roaring::RoaringBitmap; + +use super::helpers::{ + create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader, + try_split_array_at, GrenadParameters, +}; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::update::index_documents::helpers::read_u32_ne_bytes; +use crate::{relative_from_absolute_position, FieldId, Result}; + +/// Extracts the word and the documents ids where this word appear. +/// +/// Returns a grenad reader with the list of extracted words and +/// documents ids from the given chunk of docid word positions. 
+/// +/// The first returned reader is the one for normal word_docids, and the second one is for +/// exact_word_docids +#[logging_timer::time] +pub fn extract_word_docids( + docid_word_positions: grenad::Reader, + indexer: GrenadParameters, + exact_attributes: &HashSet, +) -> Result<(grenad::Reader, grenad::Reader)> { + let max_memory = indexer.max_memory_by_thread(); + + let mut word_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|x| x / 2), + ); + + let mut exact_word_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|x| x / 2), + ); + + let mut value_buffer = Vec::new(); + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, positions)) = cursor.move_on_next()? { + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let document_id = u32::from_be_bytes(document_id_bytes); + + let bitmap = RoaringBitmap::from_iter(Some(document_id)); + serialize_roaring_bitmap(&bitmap, &mut value_buffer)?; + + // If there are no exact attributes, we do not need to iterate over positions. + if exact_attributes.is_empty() { + word_docids_sorter.insert(word_bytes, &value_buffer)?; + } else { + let mut added_to_exact = false; + let mut added_to_word_docids = false; + for position in read_u32_ne_bytes(positions) { + // as soon as we know that this word had been to both readers, we don't need to + // iterate over the positions. 
+ if added_to_exact && added_to_word_docids { + break; + } + let (fid, _) = relative_from_absolute_position(position); + if exact_attributes.contains(&fid) && !added_to_exact { + exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; + added_to_exact = true; + } else if !added_to_word_docids { + word_docids_sorter.insert(word_bytes, &value_buffer)?; + added_to_word_docids = true; + } + } + } + } + + Ok(( + sorter_into_reader(word_docids_sorter, indexer)?, + sorter_into_reader(exact_word_docids_sorter, indexer)?, + )) +} diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs new file mode 100644 index 000000000..6707fc268 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -0,0 +1,178 @@ +use std::cmp::Ordering; +use std::collections::{BinaryHeap, HashMap}; +use std::fs::File; +use std::{cmp, io, mem, str, vec}; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, + try_split_array_at, GrenadParameters, MergeFn, +}; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::proximity::{positions_proximity, MAX_DISTANCE}; +use crate::{DocumentId, Result}; + +/// Extracts the best proximity between pairs of words and the documents ids where this pair appear. +/// +/// Returns a grenad reader with the list of extracted word pairs proximities and +/// documents ids from the given chunk of docid word positions. 
+#[logging_timer::time] +pub fn extract_word_pair_proximity_docids( + docid_word_positions: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut word_pair_proximity_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 2), + ); + + // This map is assumed to not consume a lot of memory. + let mut document_word_positions_heap = BinaryHeap::new(); + let mut current_document_id = None; + + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let document_id = u32::from_be_bytes(document_id_bytes); + let word = str::from_utf8(word_bytes)?; + + let curr_document_id = *current_document_id.get_or_insert(document_id); + if curr_document_id != document_id { + let document_word_positions_heap = mem::take(&mut document_word_positions_heap); + document_word_positions_into_sorter( + curr_document_id, + document_word_positions_heap, + &mut word_pair_proximity_docids_sorter, + )?; + current_document_id = Some(document_id); + } + + let word = word.to_string(); + let mut positions: Vec<_> = read_u32_ne_bytes(value).collect(); + positions.sort_unstable(); + let mut iter = positions.into_iter(); + if let Some(position) = iter.next() { + document_word_positions_heap.push(PeekedWordPosition { word, position, iter }); + } + } + + if let Some(document_id) = current_document_id { + // We must make sure that don't lose the current document field id + // word count map if we break because we reached the end of the chunk. 
+ let document_word_positions_heap = mem::take(&mut document_word_positions_heap); + document_word_positions_into_sorter( + document_id, + document_word_positions_heap, + &mut word_pair_proximity_docids_sorter, + )?; + } + + sorter_into_reader(word_pair_proximity_docids_sorter, indexer) +} + +/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive. +/// +/// This list is used by the engine to calculate the documents containing words that are +/// close to each other. +fn document_word_positions_into_sorter( + document_id: DocumentId, + mut word_positions_heap: BinaryHeap>>, + word_pair_proximity_docids_sorter: &mut grenad::Sorter, +) -> Result<()> { + let mut word_pair_proximity = HashMap::new(); + let mut ordered_peeked_word_positions = Vec::new(); + while !word_positions_heap.is_empty() { + while let Some(peeked_word_position) = word_positions_heap.pop() { + ordered_peeked_word_positions.push(peeked_word_position); + if ordered_peeked_word_positions.len() == 7 { + break; + } + } + + if let Some((head, tail)) = ordered_peeked_word_positions.split_first() { + for PeekedWordPosition { word, position, .. } in tail { + let prox = positions_proximity(head.position, *position); + if prox > 0 && prox < MAX_DISTANCE { + word_pair_proximity + .entry((head.word.clone(), word.clone())) + .and_modify(|p| { + *p = cmp::min(*p, prox); + }) + .or_insert(prox); + } + } + + // Push the tail in the heap. + let tail_iter = ordered_peeked_word_positions.drain(1..); + word_positions_heap.extend(tail_iter); + + // Advance the head and push it in the heap. 
+ if let Some(mut head) = ordered_peeked_word_positions.pop() { + if let Some(next_position) = head.iter.next() { + let prox = positions_proximity(head.position, next_position); + + if prox > 0 && prox < MAX_DISTANCE { + word_pair_proximity + .entry((head.word.clone(), head.word.clone())) + .and_modify(|p| { + *p = cmp::min(*p, prox); + }) + .or_insert(prox); + } + + word_positions_heap.push(PeekedWordPosition { + word: head.word, + position: next_position, + iter: head.iter, + }); + } + } + } + } + + let mut key_buffer = Vec::new(); + for ((w1, w2), prox) in word_pair_proximity { + key_buffer.clear(); + key_buffer.push(prox as u8); + key_buffer.extend_from_slice(w1.as_bytes()); + key_buffer.push(0); + key_buffer.extend_from_slice(w2.as_bytes()); + + word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + } + + Ok(()) +} + +struct PeekedWordPosition { + word: String, + position: u32, + iter: I, +} + +impl Ord for PeekedWordPosition { + fn cmp(&self, other: &Self) -> Ordering { + self.position.cmp(&other.position).reverse() + } +} + +impl PartialOrd for PeekedWordPosition { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Eq for PeekedWordPosition {} + +impl PartialEq for PeekedWordPosition { + fn eq(&self, other: &Self) -> bool { + self.position == other.position + } +} diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs new file mode 100644 index 000000000..d95db4157 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -0,0 +1,49 @@ +use std::fs::File; +use std::io; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, + try_split_array_at, GrenadParameters, +}; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::{DocumentId, Result}; + +/// 
Extracts the word positions and the documents ids where this word appear. +/// +/// Returns a grenad reader with the list of extracted words at positions and +/// documents ids from the given chunk of docid word positions. +#[logging_timer::time] +pub fn extract_word_position_docids( + docid_word_positions: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut word_position_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut key_buffer = Vec::new(); + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let document_id = DocumentId::from_be_bytes(document_id_bytes); + + for position in read_u32_ne_bytes(value) { + key_buffer.clear(); + key_buffer.extend_from_slice(word_bytes); + key_buffer.extend_from_slice(&position.to_be_bytes()); + + word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + } + } + + sorter_into_reader(word_position_docids_sorter, indexer) +} diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs new file mode 100644 index 000000000..e696ed44b --- /dev/null +++ b/milli/src/update/index_documents/extract/mod.rs @@ -0,0 +1,314 @@ +mod extract_docid_word_positions; +mod extract_facet_number_docids; +mod extract_facet_string_docids; +mod extract_fid_docid_facet_values; +mod extract_fid_word_count_docids; +mod extract_geo_points; +mod extract_word_docids; +mod extract_word_pair_proximity_docids; +mod extract_word_position_docids; + +use std::collections::HashSet; +use std::fs::File; + +use crossbeam_channel::Sender; +use log::debug; +use 
rayon::prelude::*; + +use self::extract_docid_word_positions::extract_docid_word_positions; +use self::extract_facet_number_docids::extract_facet_number_docids; +use self::extract_facet_string_docids::extract_facet_string_docids; +use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; +use self::extract_fid_word_count_docids::extract_fid_word_count_docids; +use self::extract_geo_points::extract_geo_points; +use self::extract_word_docids::extract_word_docids; +use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; +use self::extract_word_position_docids::extract_word_position_docids; +use super::helpers::{ + as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, + GrenadParameters, MergeFn, MergeableReader, +}; +use super::{helpers, TypedChunk}; +use crate::{FieldId, Result}; + +/// Extract data for each databases from obkv documents in parallel. +/// Send data in grenad file over provided Sender. +#[allow(clippy::too_many_arguments)] +pub(crate) fn data_from_obkv_documents( + original_obkv_chunks: impl Iterator>> + Send, + flattened_obkv_chunks: impl Iterator>> + Send, + indexer: GrenadParameters, + lmdb_writer_sx: Sender>, + searchable_fields: Option>, + faceted_fields: HashSet, + primary_key_id: FieldId, + geo_fields_ids: Option<(FieldId, FieldId)>, + stop_words: Option>, + max_positions_per_attributes: Option, + exact_attributes: HashSet, +) -> Result<()> { + original_obkv_chunks + .par_bridge() + .map(|original_documents_chunk| { + send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone()) + }) + .collect::>()?; + + #[allow(clippy::type_complexity)] + let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks + .par_bridge() + .map(|flattened_obkv_chunks| { + send_and_extract_flattened_documents_data( + flattened_obkv_chunks, + indexer, + lmdb_writer_sx.clone(), + &searchable_fields, + &faceted_fields, + primary_key_id, + geo_fields_ids, + 
&stop_words, + max_positions_per_attributes, + ) + }) + .collect(); + + let ( + docid_word_positions_chunks, + ( + docid_fid_facet_numbers_chunks, + (docid_fid_facet_strings_chunks, facet_exists_docids_chunks), + ), + ) = result?; + + // merge facet_exists_docids and send them as a typed chunk + { + let lmdb_writer_sx = lmdb_writer_sx.clone(); + rayon::spawn(move || { + debug!("merge {} database", "facet-id-exists-docids"); + match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + Ok(reader) => { + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader))); + } + Err(e) => { + let _ = lmdb_writer_sx.send(Err(e)); + } + } + }); + } + + spawn_extraction_task::<_, _, Vec>>( + docid_word_positions_chunks.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_word_pair_proximity_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::WordPairProximityDocids, + "word-pair-proximity-docids", + ); + + spawn_extraction_task::<_, _, Vec>>( + docid_word_positions_chunks.clone(), + indexer, + lmdb_writer_sx.clone(), + extract_fid_word_count_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::FieldIdWordcountDocids, + "field-id-wordcount-docids", + ); + + spawn_extraction_task::<_, _, Vec<(grenad::Reader, grenad::Reader)>>( + docid_word_positions_chunks.clone(), + indexer, + lmdb_writer_sx.clone(), + move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), + merge_roaring_bitmaps, + |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + }, + "word-docids", + ); + + spawn_extraction_task::<_, _, Vec>>( + docid_word_positions_chunks, + indexer, + lmdb_writer_sx.clone(), + extract_word_position_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::WordPositionDocids, + "word-position-docids", + ); + + spawn_extraction_task::<_, _, Vec>>( + docid_fid_facet_strings_chunks, + indexer, + lmdb_writer_sx.clone(), + extract_facet_string_docids, + 
merge_cbo_roaring_bitmaps, + TypedChunk::FieldIdFacetStringDocids, + "field-id-facet-string-docids", + ); + + spawn_extraction_task::<_, _, Vec>>( + docid_fid_facet_numbers_chunks, + indexer, + lmdb_writer_sx, + extract_facet_number_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::FieldIdFacetNumberDocids, + "field-id-facet-number-docids", + ); + + Ok(()) +} + +/// Spawn a new task to extract data for a specific DB using extract_fn. +/// Generated grenad chunks are merged using the merge_fn. +/// The result of merged chunks is serialized as TypedChunk using the serialize_fn +/// and sent into lmdb_writer_sx. +fn spawn_extraction_task( + chunks: Vec>, + indexer: GrenadParameters, + lmdb_writer_sx: Sender>, + extract_fn: FE, + merge_fn: MergeFn, + serialize_fn: FS, + name: &'static str, +) where + FE: Fn(grenad::Reader, GrenadParameters) -> Result + + Sync + + Send + + 'static, + FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static, + M: MergeableReader + FromParallelIterator + Send + 'static, + M::Output: Send, +{ + rayon::spawn(move || { + let chunks: Result = + chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer)).collect(); + rayon::spawn(move || match chunks { + Ok(chunks) => { + debug!("merge {} database", name); + let reader = chunks.merge(merge_fn, &indexer); + let _ = lmdb_writer_sx.send(reader.map(serialize_fn)); + } + Err(e) => { + let _ = lmdb_writer_sx.send(Err(e)); + } + }) + }); +} + +/// Extract chunked data and send it into lmdb_writer_sx sender: +/// - documents +fn send_original_documents_data( + original_documents_chunk: Result>, + lmdb_writer_sx: Sender>, +) -> Result<()> { + let original_documents_chunk = + original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; + + // TODO: create a custom internal error + lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap(); + Ok(()) +} + +/// Extract chunked data and send it into lmdb_writer_sx sender: +/// - documents_ids +/// - 
docid_word_positions +/// - docid_fid_facet_numbers +/// - docid_fid_facet_strings +/// - docid_fid_facet_exists +#[allow(clippy::too_many_arguments)] +#[allow(clippy::type_complexity)] +fn send_and_extract_flattened_documents_data( + flattened_documents_chunk: Result>, + indexer: GrenadParameters, + lmdb_writer_sx: Sender>, + searchable_fields: &Option>, + faceted_fields: &HashSet, + primary_key_id: FieldId, + geo_fields_ids: Option<(FieldId, FieldId)>, + stop_words: &Option>, + max_positions_per_attributes: Option, +) -> Result<( + grenad::Reader, + ( + grenad::Reader, + (grenad::Reader, grenad::Reader), + ), +)> { + let flattened_documents_chunk = + flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; + + if let Some(geo_fields_ids) = geo_fields_ids { + let documents_chunk_cloned = flattened_documents_chunk.clone(); + let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); + rayon::spawn(move || { + let result = + extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_fields_ids); + let _ = match result { + Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))), + Err(error) => lmdb_writer_sx_cloned.send(Err(error)), + }; + }); + } + + let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = + rayon::join( + || { + let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( + flattened_documents_chunk.clone(), + indexer, + searchable_fields, + stop_words.as_ref(), + max_positions_per_attributes, + )?; + + // send documents_ids to DB writer + let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))); + + // send docid_word_positions_chunk to DB writer + let docid_word_positions_chunk = + unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? 
}; + let _ = lmdb_writer_sx + .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))); + + Ok(docid_word_positions_chunk) + }, + || { + let ( + docid_fid_facet_numbers_chunk, + docid_fid_facet_strings_chunk, + fid_facet_exists_docids_chunk, + ) = extract_fid_docid_facet_values( + flattened_documents_chunk.clone(), + indexer, + faceted_fields, + )?; + + // send docid_fid_facet_numbers_chunk to DB writer + let docid_fid_facet_numbers_chunk = + unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? }; + + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers( + docid_fid_facet_numbers_chunk.clone(), + ))); + + // send docid_fid_facet_strings_chunk to DB writer + let docid_fid_facet_strings_chunk = + unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? }; + + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings( + docid_fid_facet_strings_chunk.clone(), + ))); + + Ok(( + docid_fid_facet_numbers_chunk, + (docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk), + )) + }, + ); + + Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) +} diff --git a/milli/src/update/index_documents/helpers/clonable_mmap.rs b/milli/src/update/index_documents/helpers/clonable_mmap.rs new file mode 100644 index 000000000..1c4d78506 --- /dev/null +++ b/milli/src/update/index_documents/helpers/clonable_mmap.rs @@ -0,0 +1,24 @@ +use std::sync::Arc; + +use memmap2::Mmap; + +/// Wrapper around Mmap allowing to virtualy clone grenad-chunks +/// in a parallel process like the indexing. 
+#[derive(Debug, Clone)] +pub struct ClonableMmap { + inner: Arc, +} + +impl AsRef<[u8]> for ClonableMmap { + fn as_ref(&self) -> &[u8] { + self.inner.as_ref() + } +} + +impl From for ClonableMmap { + fn from(inner: Mmap) -> ClonableMmap { + ClonableMmap { inner: Arc::new(inner) } + } +} + +pub type CursorClonableMmap = std::io::Cursor; diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs new file mode 100644 index 000000000..03f15945a --- /dev/null +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -0,0 +1,255 @@ +use std::borrow::Cow; +use std::fs::File; +use std::io::{self, Seek, SeekFrom}; +use std::time::Instant; + +use grenad::{CompressionType, Sorter}; +use heed::types::ByteSlice; +use log::debug; + +use super::{ClonableMmap, MergeFn}; +use crate::error::InternalError; +use crate::Result; + +pub type CursorClonableMmap = io::Cursor; + +pub fn create_writer( + typ: grenad::CompressionType, + level: Option, + file: R, +) -> grenad::Writer { + let mut builder = grenad::Writer::builder(); + builder.compression_type(typ); + if let Some(level) = level { + builder.compression_level(level); + } + builder.build(file) +} + +pub fn create_sorter( + sort_algorithm: grenad::SortAlgorithm, + merge: MergeFn, + chunk_compression_type: grenad::CompressionType, + chunk_compression_level: Option, + max_nb_chunks: Option, + max_memory: Option, +) -> grenad::Sorter { + let mut builder = grenad::Sorter::builder(merge); + builder.chunk_compression_type(chunk_compression_type); + if let Some(level) = chunk_compression_level { + builder.chunk_compression_level(level); + } + if let Some(nb_chunks) = max_nb_chunks { + builder.max_nb_chunks(nb_chunks); + } + if let Some(memory) = max_memory { + builder.dump_threshold(memory); + builder.allow_realloc(false); + } + builder.sort_algorithm(sort_algorithm); + builder.build() +} + +pub fn sorter_into_reader( + sorter: grenad::Sorter, + indexer: 
GrenadParameters, +) -> Result> { + let mut writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + sorter.write_into_stream_writer(&mut writer)?; + + writer_into_reader(writer) +} + +pub fn writer_into_reader(writer: grenad::Writer) -> Result> { + let mut file = writer.into_inner()?; + file.seek(SeekFrom::Start(0))?; + grenad::Reader::new(file).map_err(Into::into) +} + +pub unsafe fn as_cloneable_grenad( + reader: &grenad::Reader, +) -> Result> { + let file = reader.get_ref(); + let mmap = memmap2::Mmap::map(file)?; + let cursor = io::Cursor::new(ClonableMmap::from(mmap)); + let reader = grenad::Reader::new(cursor)?; + Ok(reader) +} + +pub trait MergeableReader +where + Self: Sized, +{ + type Output; + + fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result; +} + +impl MergeableReader for Vec> { + type Output = grenad::Reader; + + fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { + let mut merger = MergerBuilder::new(merge_fn); + self.into_iter().try_for_each(|r| merger.push(r))?; + merger.finish(params) + } +} + +impl MergeableReader for Vec<(grenad::Reader, grenad::Reader)> { + type Output = (grenad::Reader, grenad::Reader); + + fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { + let mut m1 = MergerBuilder::new(merge_fn); + let mut m2 = MergerBuilder::new(merge_fn); + for (r1, r2) in self.into_iter() { + m1.push(r1)?; + m2.push(r2)?; + } + Ok((m1.finish(params)?, m2.finish(params)?)) + } +} + +struct MergerBuilder(grenad::MergerBuilder); + +impl MergerBuilder { + fn new(merge_fn: MergeFn) -> Self { + Self(grenad::MergerBuilder::new(merge_fn)) + } + + fn push(&mut self, reader: grenad::Reader) -> Result<()> { + self.0.push(reader.into_cursor()?); + Ok(()) + } + + fn finish(self, params: &GrenadParameters) -> Result> { + let merger = self.0.build(); + let mut writer = create_writer( + params.chunk_compression_type, + 
params.chunk_compression_level, + tempfile::tempfile()?, + ); + merger.write_into_stream_writer(&mut writer)?; + + writer_into_reader(writer) + } +} + +#[derive(Debug, Clone, Copy)] +pub struct GrenadParameters { + pub chunk_compression_type: CompressionType, + pub chunk_compression_level: Option, + pub max_memory: Option, + pub max_nb_chunks: Option, +} + +impl Default for GrenadParameters { + fn default() -> Self { + Self { + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + max_memory: None, + max_nb_chunks: None, + } + } +} + +impl GrenadParameters { + /// This function use the number of threads in the current threadpool to compute the value. + /// This should be called inside of a rayon thread pool, + /// Otherwise, it will take the global number of threads. + pub fn max_memory_by_thread(&self) -> Option { + self.max_memory.map(|max_memory| max_memory / rayon::current_num_threads()) + } +} + +/// Returns an iterator that outputs grenad readers of obkv documents +/// with a maximum size of approximately `documents_chunks_size`. +/// +/// The grenad obkv entries are composed of an incremental document id big-endian +/// encoded as the key and an obkv object with an `u8` for the field as the key +/// and a simple UTF-8 encoded string as the value. +pub fn grenad_obkv_into_chunks( + reader: grenad::Reader, + indexer: GrenadParameters, + documents_chunk_size: usize, +) -> Result>>> { + let mut continue_reading = true; + let mut cursor = reader.into_cursor()?; + + let mut transposer = move || { + if !continue_reading { + return Ok(None); + } + + let mut current_chunk_size = 0u64; + let mut obkv_documents = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + while let Some((document_id, obkv)) = cursor.move_on_next()? 
{ + obkv_documents.insert(document_id, obkv)?; + current_chunk_size += document_id.len() as u64 + obkv.len() as u64; + + if current_chunk_size >= documents_chunk_size as u64 { + return writer_into_reader(obkv_documents).map(Some); + } + } + + continue_reading = false; + writer_into_reader(obkv_documents).map(Some) + }; + + Ok(std::iter::from_fn(move || transposer().transpose())) +} + +pub fn sorter_into_lmdb_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + sorter: Sorter, + merge: MergeFn, +) -> Result<()> { + debug!("Writing MTBL sorter..."); + let before = Instant::now(); + + let mut merger_iter = sorter.into_stream_merger_iter()?; + if database.is_empty(wtxn)? { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + while let Some((k, v)) = merger_iter.next()? { + // safety: we don't keep references from inside the LMDB database. + unsafe { out_iter.append(k, v)? }; + } + } else { + while let Some((k, v)) = merger_iter.next()? { + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; + match iter.next().transpose()? { + Some((key, old_val)) if key == k => { + let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; + let val = merge(k, &vals).map_err(|_| { + // TODO just wrap this error? + InternalError::IndexingMergingKeys { process: "get-put-merge" } + })?; + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(k, &val)? }; + } + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + } + } + } + } + + debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); + Ok(()) +} + +/// Used when trying to merge readers, but you don't actually care about the values. 
+pub fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result> { + Ok(Cow::Owned(Vec::new())) +} diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs new file mode 100644 index 000000000..37af7ab6a --- /dev/null +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -0,0 +1,102 @@ +use std::borrow::Cow; +use std::io; +use std::result::Result as StdResult; + +use roaring::RoaringBitmap; + +use super::read_u32_ne_bytes; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::Result; + +pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result>; + +pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + let capacity = values.iter().map(|v| v.len()).sum::(); + let mut output = Vec::with_capacity(capacity); + values.iter().for_each(|integers| output.extend_from_slice(integers)); + Ok(Cow::Owned(output)) + } +} + +pub fn roaring_bitmap_from_u32s_array(slice: &[u8]) -> RoaringBitmap { + read_u32_ne_bytes(slice).collect() +} + +pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec) -> io::Result<()> { + buffer.clear(); + buffer.reserve(bitmap.serialized_size()); + bitmap.serialize_into(buffer) +} + +pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + let merged = values + .iter() + .map(AsRef::as_ref) + .map(RoaringBitmap::deserialize_from) + .map(StdResult::unwrap) + .reduce(|a, b| a | b) + .unwrap(); + let mut buffer = Vec::new(); + serialize_roaring_bitmap(&merged, &mut buffer)?; + Ok(Cow::Owned(buffer)) + } +} + +pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { + Ok(values[0].clone()) +} + +/// Only the last value associated with an id is kept. 
+pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result> { + Ok(obkvs.last().unwrap().clone()) +} + +/// Merge all the obks in the order we see them. +pub fn merge_obkvs<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result> { + Ok(obkvs + .iter() + .cloned() + .reduce(|acc, current| { + let first = obkv::KvReader::new(&acc); + let second = obkv::KvReader::new(¤t); + let mut buffer = Vec::new(); + merge_two_obkvs(first, second, &mut buffer); + Cow::from(buffer) + }) + .unwrap()) +} + +pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec) { + use itertools::merge_join_by; + use itertools::EitherOrBoth::{Both, Left, Right}; + + buffer.clear(); + + let mut writer = obkv::KvWriter::new(buffer); + for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) { + match eob { + Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(), + } + } + + writer.finish().unwrap(); +} + +pub fn merge_cbo_roaring_bitmaps<'a>( + _key: &[u8], + values: &[Cow<'a, [u8]>], +) -> Result> { + if values.len() == 1 { + Ok(values[0].clone()) + } else { + let mut vec = Vec::new(); + CboRoaringBitmapCodec::merge_into(values, &mut vec)?; + Ok(Cow::from(vec)) + } +} diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs new file mode 100644 index 000000000..a496ccd6e --- /dev/null +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -0,0 +1,78 @@ +mod clonable_mmap; +mod grenad_helpers; +mod merge_functions; + +use std::collections::HashSet; +use std::convert::{TryFrom, TryInto}; + +pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; +use fst::{IntoStreamer, Streamer}; +pub use grenad_helpers::{ + as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, + merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader, + GrenadParameters, MergeableReader, +}; +pub use merge_functions::{ + 
concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_obkvs, + merge_roaring_bitmaps, merge_two_obkvs, roaring_bitmap_from_u32s_array, + serialize_roaring_bitmap, MergeFn, +}; + +use crate::MAX_WORD_LENGTH; + +pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { + key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty() +} + +/// Divides one slice into two at an index, returns `None` if mid is out of bounds. +pub fn try_split_at(slice: &[T], mid: usize) -> Option<(&[T], &[T])> { + if mid <= slice.len() { + Some(slice.split_at(mid)) + } else { + None + } +} + +/// Divides one slice into an array and the tail at an index, +/// returns `None` if `N` is out of bounds. +pub fn try_split_array_at(slice: &[T]) -> Option<([T; N], &[T])> +where + [T; N]: for<'a> TryFrom<&'a [T]>, +{ + let (head, tail) = try_split_at(slice, N)?; + let head = head.try_into().ok()?; + Some((head, tail)) +} + +pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator + '_ { + bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes) +} + +/// Converts an fst Stream into an HashSet of Strings. +pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet> +where + I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>, +{ + let mut hashset = HashSet::new(); + let mut stream = stream.into_stream(); + while let Some(value) = stream.next() { + hashset.insert(value.to_owned()); + } + hashset +} + +// Converts an fst Stream into a Vec of Strings. 
+pub fn fst_stream_into_vec<'f, I, S>(stream: I) -> Vec +where + I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>, +{ + let mut strings = Vec::new(); + let mut stream = stream.into_stream(); + while let Some(word) = stream.next() { + let s = std::str::from_utf8(word).unwrap(); + strings.push(s.to_owned()); + } + strings +} diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs new file mode 100644 index 000000000..f912a756a --- /dev/null +++ b/milli/src/update/index_documents/mod.rs @@ -0,0 +1,1910 @@ +mod enrich; +mod extract; +mod helpers; +mod transform; +mod typed_chunk; + +use std::collections::HashSet; +use std::io::{Cursor, Read, Seek}; +use std::iter::FromIterator; +use std::num::NonZeroU32; +use std::result::Result as StdResult; + +use crossbeam_channel::{Receiver, Sender}; +use heed::types::Str; +use heed::Database; +use log::debug; +use roaring::RoaringBitmap; +use serde::{Deserialize, Serialize}; +use slice_group_by::GroupBy; +use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; + +use self::enrich::enrich_documents_batch; +pub use self::enrich::{ + extract_finite_float_from_value, validate_document_id, validate_document_id_value, + validate_geo_from_json, DocumentId, +}; +pub use self::helpers::{ + as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, + fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, + sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn, +}; +use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; +pub use self::transform::{Transform, TransformOutput}; +use crate::documents::{obkv_to_object, DocumentsBatchReader}; +use crate::error::{Error, InternalError, UserError}; +pub use crate::update::index_documents::helpers::CursorClonableMmap; +use crate::update::{ + self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, + 
WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst, +}; +use crate::{Index, Result, RoaringBitmapCodec}; + +static MERGED_DATABASE_COUNT: usize = 7; +static PREFIX_DATABASE_COUNT: usize = 5; +static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct DocumentAdditionResult { + /// The number of documents that were indexed during the update + pub indexed_documents: u64, + /// The total number of documents in the index after the update + pub number_of_documents: u64, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[non_exhaustive] +pub enum IndexDocumentsMethod { + /// Replace the previous document with the new one, + /// removing all the already known attributes. + ReplaceDocuments, + + /// Merge the previous version of the document with the new version, + /// replacing old attributes values with the new ones and add the new attributes. 
+ UpdateDocuments, +} + +impl Default for IndexDocumentsMethod { + fn default() -> Self { + Self::ReplaceDocuments + } +} + +pub struct IndexDocuments<'t, 'u, 'i, 'a, FP, FA> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + config: IndexDocumentsConfig, + indexer_config: &'a IndexerConfig, + transform: Option>, + progress: FP, + should_abort: FA, + added_documents: u64, +} + +#[derive(Default, Debug, Clone)] +pub struct IndexDocumentsConfig { + pub words_prefix_threshold: Option, + pub max_prefix_length: Option, + pub words_positions_level_group_size: Option, + pub words_positions_min_level_size: Option, + pub update_method: IndexDocumentsMethod, + pub deletion_strategy: DeletionStrategy, + pub autogenerate_docids: bool, +} + +impl<'t, 'u, 'i, 'a, FP, FA> IndexDocuments<'t, 'u, 'i, 'a, FP, FA> +where + FP: Fn(UpdateIndexingStep) + Sync, + FA: Fn() -> bool + Sync, +{ + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + indexer_config: &'a IndexerConfig, + config: IndexDocumentsConfig, + progress: FP, + should_abort: FA, + ) -> Result> { + let transform = Some(Transform::new( + wtxn, + index, + indexer_config, + config.update_method, + config.autogenerate_docids, + )?); + + Ok(IndexDocuments { + transform, + config, + indexer_config, + progress, + should_abort, + wtxn, + index, + added_documents: 0, + }) + } + + /// Adds a batch of documents to the current builder. + /// + /// Since the documents are progressively added to the writer, a failure will cause only + /// return an error and not the `IndexDocuments` struct as it is invalid to use it afterward. + /// + /// Returns the number of documents added to the builder. 
+ pub fn add_documents( + mut self, + reader: DocumentsBatchReader, + ) -> Result<(Self, StdResult)> { + // Early return when there is no document to add + if reader.is_empty() { + return Ok((self, Ok(0))); + } + + // We check for user errors in this validator and if there is one, we can return + // the `IndexDocument` struct as it is valid to send more documents into it. + // However, if there is an internal error we throw it away! + let enriched_documents_reader = match enrich_documents_batch( + self.wtxn, + self.index, + self.config.autogenerate_docids, + reader, + )? { + Ok(reader) => reader, + Err(user_error) => return Ok((self, Err(user_error))), + }; + + let indexed_documents = + self.transform.as_mut().expect("Invalid document addition state").read_documents( + enriched_documents_reader, + self.wtxn, + &self.progress, + &self.should_abort, + )? as u64; + + self.added_documents += indexed_documents; + + Ok((self, Ok(indexed_documents))) + } + + #[logging_timer::time("IndexDocuments::{}")] + pub fn execute(mut self) -> Result { + if self.added_documents == 0 { + let number_of_documents = self.index.number_of_documents(self.wtxn)?; + return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents }); + } + let output = self + .transform + .take() + .expect("Invalid document addition state") + .output_from_sorter(self.wtxn, &self.progress)?; + + let new_facets = output.compute_real_facets(self.wtxn, self.index)?; + self.index.put_faceted_fields(self.wtxn, &new_facets)?; + + // in case new fields were introduced we're going to recreate the searchable fields. + if let Some(faceted_fields) = self.index.user_defined_searchable_fields(self.wtxn)? { + // we can't keep references on the faceted fields while we update the index thus we need to own it. 
+ let faceted_fields: Vec = + faceted_fields.into_iter().map(str::to_string).collect(); + self.index.put_all_searchable_fields_from_fields_ids_map( + self.wtxn, + &faceted_fields.iter().map(String::as_ref).collect::>(), + &output.fields_ids_map, + )?; + } + + let indexed_documents = output.documents_count as u64; + let number_of_documents = self.execute_raw(output)?; + + Ok(DocumentAdditionResult { indexed_documents, number_of_documents }) + } + + /// Returns the total number of documents in the index after the update. + #[logging_timer::time("IndexDocuments::{}")] + pub fn execute_raw(self, output: TransformOutput) -> Result + where + FP: Fn(UpdateIndexingStep) + Sync, + FA: Fn() -> bool + Sync, + { + let TransformOutput { + primary_key, + fields_ids_map, + field_distribution, + mut external_documents_ids, + new_documents_ids, + replaced_documents_ids, + documents_count, + original_documents, + flattened_documents, + } = output; + + // The fields_ids_map is put back to the store now so the rest of the transaction sees an + // up to date field map. + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + + let backup_pool; + let pool = match self.indexer_config.thread_pool { + Some(ref pool) => pool, + #[cfg(not(test))] + None => { + // We initialize a bakcup pool with the default + // settings if none have already been set. + backup_pool = rayon::ThreadPoolBuilder::new().build()?; + &backup_pool + } + #[cfg(test)] + None => { + // We initialize a bakcup pool with the default + // settings if none have already been set. 
+ backup_pool = rayon::ThreadPoolBuilder::new().num_threads(1).build()?; + &backup_pool + } + }; + + let original_documents = grenad::Reader::new(original_documents)?; + let flattened_documents = grenad::Reader::new(flattened_documents)?; + + // create LMDB writer channel + let (lmdb_writer_sx, lmdb_writer_rx): ( + Sender>, + Receiver>, + ) = crossbeam_channel::unbounded(); + + // get the primary key field id + let primary_key_id = fields_ids_map.id(&primary_key).unwrap(); + + // get searchable fields for word databases + let searchable_fields = + self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter); + // get filterable fields for facet databases + let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; + // get the fid of the `_geo.lat` and `_geo.lng` fields. + let geo_fields_ids = match self.index.fields_ids_map(self.wtxn)?.id("_geo") { + Some(gfid) => { + let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid); + let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid); + // if `_geo` is faceted then we get the `lat` and `lng` + if is_sortable || is_filterable { + let field_ids = self + .index + .fields_ids_map(self.wtxn)? + .insert("_geo.lat") + .zip(self.index.fields_ids_map(self.wtxn)?.insert("_geo.lng")) + .ok_or(UserError::AttributeLimitReached)?; + Some(field_ids) + } else { + None + } + } + None => None, + }; + + let stop_words = self.index.stop_words(self.wtxn)?; + let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; + + let pool_params = GrenadParameters { + chunk_compression_type: self.indexer_config.chunk_compression_type, + chunk_compression_level: self.indexer_config.chunk_compression_level, + max_memory: self.indexer_config.max_memory, + max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen. 
+ }; + let documents_chunk_size = + self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB + let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; + + // Run extraction pipeline in parallel. + pool.install(|| { + // split obkv file into several chunks + let original_chunk_iter = + grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size); + + // split obkv file into several chunks + let flattened_chunk_iter = + grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size); + + let result = original_chunk_iter.and_then(|original_chunk| { + let flattened_chunk = flattened_chunk_iter?; + // extract all databases from the chunked obkv douments + extract::data_from_obkv_documents( + original_chunk, + flattened_chunk, + pool_params, + lmdb_writer_sx.clone(), + searchable_fields, + faceted_fields, + primary_key_id, + geo_fields_ids, + stop_words, + max_positions_per_attributes, + exact_attributes, + ) + }); + + if let Err(e) = result { + let _ = lmdb_writer_sx.send(Err(e)); + } + + // needs to be droped to avoid channel waiting lock. + drop(lmdb_writer_sx) + }); + + // We delete the documents that this document addition replaces. This way we are + // able to simply insert all the documents even if they already exist in the database. 
+ if !replaced_documents_ids.is_empty() { + let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?; + deletion_builder.strategy(self.config.deletion_strategy); + debug!("documents to delete {:?}", replaced_documents_ids); + deletion_builder.delete_documents(&replaced_documents_ids); + let deleted_documents_result = deletion_builder.execute_inner()?; + debug!("{} documents actually deleted", deleted_documents_result.deleted_documents); + if !deleted_documents_result.soft_deletion_used { + external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; + } + } + + let index_documents_ids = self.index.documents_ids(self.wtxn)?; + let index_is_empty = index_documents_ids.is_empty(); + let mut final_documents_ids = RoaringBitmap::new(); + let mut word_pair_proximity_docids = None; + let mut word_position_docids = None; + let mut word_docids = None; + let mut exact_word_docids = None; + + let mut databases_seen = 0; + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + + for result in lmdb_writer_rx { + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + + let typed_chunk = match result? { + TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; + word_docids = Some(cloneable_chunk); + let cloneable_chunk = + unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; + exact_word_docids = Some(cloneable_chunk); + TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } + } + TypedChunk::WordPairProximityDocids(chunk) => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? 
}; + word_pair_proximity_docids = Some(cloneable_chunk); + TypedChunk::WordPairProximityDocids(chunk) + } + TypedChunk::WordPositionDocids(chunk) => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + word_position_docids = Some(cloneable_chunk); + TypedChunk::WordPositionDocids(chunk) + } + otherwise => otherwise, + }; + + let (docids, is_merged_database) = + write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?; + if !docids.is_empty() { + final_documents_ids |= docids; + let documents_seen_count = final_documents_ids.len(); + (self.progress)(UpdateIndexingStep::IndexDocuments { + documents_seen: documents_seen_count as usize, + total_documents: documents_count, + }); + debug!( + "We have seen {} documents on {} total document so far", + documents_seen_count, documents_count + ); + } + if is_merged_database { + databases_seen += 1; + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + } + } + + // We write the field distribution into the main database + self.index.put_field_distribution(self.wtxn, &field_distribution)?; + + // We write the primary key field id into the main database + self.index.put_primary_key(self.wtxn, &primary_key)?; + + // We write the external documents ids into the main database. 
+ self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; + + let all_documents_ids = index_documents_ids | new_documents_ids; + self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; + + self.execute_prefix_databases( + word_docids, + exact_word_docids, + word_pair_proximity_docids, + word_position_docids, + )?; + + Ok(all_documents_ids.len()) + } + + #[logging_timer::time("IndexDocuments::{}")] + pub fn execute_prefix_databases( + self, + word_docids: Option>, + exact_word_docids: Option>, + word_pair_proximity_docids: Option>, + word_position_docids: Option>, + ) -> Result<()> + where + FP: Fn(UpdateIndexingStep) + Sync, + FA: Fn() -> bool + Sync, + { + // Merged databases are already been indexed, we start from this count; + let mut databases_seen = MERGED_DATABASE_COUNT; + + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + + databases_seen += 1; + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + + let previous_words_prefixes_fst = + self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?; + + // Run the words prefixes update operation. + let mut builder = WordsPrefixesFst::new(self.wtxn, self.index); + if let Some(value) = self.config.words_prefix_threshold { + builder.threshold(value); + } + if let Some(value) = self.config.max_prefix_length { + builder.max_prefix_length(value); + } + builder.execute()?; + + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + + let current_prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; + + // We retrieve the common words between the previous and new prefix word fst. 
+ let common_prefix_fst_words = fst_stream_into_vec( + previous_words_prefixes_fst.op().add(¤t_prefix_fst).intersection(), + ); + let common_prefix_fst_words: Vec<_> = common_prefix_fst_words + .as_slice() + .linear_group_by_key(|x| x.chars().next().unwrap()) + .collect(); + + // We retrieve the newly added words between the previous and new prefix word fst. + let new_prefix_fst_words = fst_stream_into_vec( + current_prefix_fst.op().add(&previous_words_prefixes_fst).difference(), + ); + + // We compute the set of prefixes that are no more part of the prefix fst. + let del_prefix_fst_words = fst_stream_into_hashset( + previous_words_prefixes_fst.op().add(¤t_prefix_fst).difference(), + ); + + databases_seen += 1; + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + + if let Some(word_docids) = word_docids { + execute_word_prefix_docids( + self.wtxn, + word_docids, + self.index.word_docids, + self.index.word_prefix_docids, + self.indexer_config, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + } + + if let Some(exact_word_docids) = exact_word_docids { + execute_word_prefix_docids( + self.wtxn, + exact_word_docids, + self.index.exact_word_docids, + self.index.exact_word_prefix_docids, + self.indexer_config, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + } + + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + + databases_seen += 1; + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + + if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { + // Run the word prefix pair proximity docids update operation. 
+ PrefixWordPairsProximityDocids::new( + self.wtxn, + self.index, + self.indexer_config.chunk_compression_type, + self.indexer_config.chunk_compression_level, + ) + .execute( + word_pair_proximity_docids, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + } + + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + + databases_seen += 1; + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + + if let Some(word_position_docids) = word_position_docids { + // Run the words prefix position docids update operation. + let mut builder = WordPrefixPositionDocids::new(self.wtxn, self.index); + builder.chunk_compression_type = self.indexer_config.chunk_compression_type; + builder.chunk_compression_level = self.indexer_config.chunk_compression_level; + builder.max_nb_chunks = self.indexer_config.max_nb_chunks; + builder.max_memory = self.indexer_config.max_memory; + if let Some(value) = self.config.words_positions_level_group_size { + builder.level_group_size(value); + } + if let Some(value) = self.config.words_positions_min_level_size { + builder.min_level_size(value); + } + builder.execute( + word_position_docids, + &new_prefix_fst_words, + &common_prefix_fst_words, + &del_prefix_fst_words, + )?; + } + + if (self.should_abort)() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + + databases_seen += 1; + (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen, + total_databases: TOTAL_POSTING_DATABASE_COUNT, + }); + + Ok(()) + } +} + +/// Run the word prefix docids update operation. 
+#[allow(clippy::too_many_arguments)] +fn execute_word_prefix_docids( + txn: &mut heed::RwTxn, + reader: grenad::Reader>, + word_docids_db: Database, + word_prefix_docids_db: Database, + indexer_config: &IndexerConfig, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, +) -> Result<()> { + let cursor = reader.into_cursor()?; + let mut builder = WordPrefixDocids::new(txn, word_docids_db, word_prefix_docids_db); + builder.chunk_compression_type = indexer_config.chunk_compression_type; + builder.chunk_compression_level = indexer_config.chunk_compression_level; + builder.max_nb_chunks = indexer_config.max_nb_chunks; + builder.max_memory = indexer_config.max_memory; + builder.execute(cursor, new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words)?; + Ok(()) +} + +#[cfg(test)] +mod tests { + use big_s::S; + use maplit::hashset; + + use super::*; + use crate::documents::documents_batch_reader_from_objects; + use crate::index::tests::TempIndex; + use crate::search::TermsMatchingStrategy; + use crate::update::DeleteDocuments; + use crate::{db_snap, BEU16}; + + #[test] + fn simple_document_replacement() { + let index = TempIndex::new(); + + // First we send 3 documents with ids from 1 to 3. + index + .add_documents(documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "kevina" }, + { "id": 3, "name": "benoit" } + ])) + .unwrap(); + + // Check that there is 3 documents now. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 3); + drop(rtxn); + + // Second we send 1 document with id 1, to erase the previous ones. + index.add_documents(documents!([ { "id": 1, "name": "updated kevin" } ])).unwrap(); + + // Check that there is **always** 3 documents. 
+ let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 3); + drop(rtxn); + + // Third we send 3 documents again to replace the existing ones. + index + .add_documents(documents!([ + { "id": 1, "name": "updated second kevin" }, + { "id": 2, "name": "updated kevina" }, + { "id": 3, "name": "updated benoit" } + ])) + .unwrap(); + + // Check that there is **always** 3 documents. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 3); + let count = index.all_documents(&rtxn).unwrap().count(); + assert_eq!(count, 3); + + drop(rtxn); + } + + #[test] + fn simple_document_merge() { + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + + // First we send 3 documents with duplicate ids and + // change the index method to merge documents. + index + .add_documents(documents!([ + { "id": 1, "name": "kevin" }, + { "id": 1, "name": "kevina" }, + { "id": 1, "name": "benoit" } + ])) + .unwrap(); + + // Check that there is only 1 document now. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 1); + + // Check that we get only one document from the database. + let docs = index.documents(&rtxn, Some(0)).unwrap(); + assert_eq!(docs.len(), 1); + let (id, doc) = docs[0]; + assert_eq!(id, 0); + + // Check that this document is equal to the last one sent. + let mut doc_iter = doc.iter(); + assert_eq!(doc_iter.next(), Some((0, &b"1"[..]))); + assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..]))); + assert_eq!(doc_iter.next(), None); + drop(rtxn); + + // Second we send 1 document with id 1, to force it to be merged with the previous one. + index.add_documents(documents!([ { "id": 1, "age": 25 } ])).unwrap(); + + // Check that there is **always** 1 document. 
+ let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 1); + + // Check that we get only one document from the database. + // Since the document has been deleted and re-inserted, its internal docid has been incremented to 1 + let docs = index.documents(&rtxn, Some(1)).unwrap(); + assert_eq!(docs.len(), 1); + let (id, doc) = docs[0]; + assert_eq!(id, 1); + + // Check that this document is equal to the last one sent. + let mut doc_iter = doc.iter(); + assert_eq!(doc_iter.next(), Some((0, &b"1"[..]))); + assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..]))); + assert_eq!(doc_iter.next(), Some((2, &b"25"[..]))); + assert_eq!(doc_iter.next(), None); + drop(rtxn); + } + + #[test] + fn not_auto_generated_documents_ids() { + let index = TempIndex::new(); + + let result = index.add_documents(documents!([ + { "name": "kevin" }, + { "name": "kevina" }, + { "name": "benoit" } + ])); + assert!(result.is_err()); + + // Check that there is no document. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 0); + drop(rtxn); + } + + #[test] + fn simple_auto_generated_documents_ids() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + // First we send 3 documents with ids from 1 to 3. + index + .add_documents(documents!([ + { "name": "kevin" }, + { "name": "kevina" }, + { "name": "benoit" } + ])) + .unwrap(); + + // Check that there is 3 documents now. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 3); + + let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap(); + let (_id, obkv) = docs.iter().find(|(_id, kv)| kv.get(0) == Some(br#""kevin""#)).unwrap(); + let kevin_uuid: String = serde_json::from_slice(obkv.get(1).unwrap()).unwrap(); + drop(rtxn); + + // Second we send 1 document with the generated uuid, to erase the previous ones. 
+ index.add_documents(documents!([ { "name": "updated kevin", "id": kevin_uuid } ])).unwrap(); + + // Check that there is **always** 3 documents. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 3); + + // the document 0 has been deleted and reinserted with the id 3 + let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap(); + let kevin_position = + docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); + assert_eq!(kevin_position, 2); + let (_, doc) = docs[kevin_position]; + + // Check that this document is equal to the last + // one sent and that an UUID has been generated. + assert_eq!(doc.get(0), Some(&br#""updated kevin""#[..])); + // This is an UUID, it must be 36 bytes long plus the 2 surrounding string quotes ("). + assert_eq!(doc.get(1).unwrap().len(), 36 + 2); + drop(rtxn); + } + + #[test] + fn reordered_auto_generated_documents_ids() { + let mut index = TempIndex::new(); + + // First we send 3 documents with ids from 1 to 3. + index + .add_documents(documents!([ + { "id": 1, "name": "kevin" }, + { "id": 2, "name": "kevina" }, + { "id": 3, "name": "benoit" } + ])) + .unwrap(); + + // Check that there is 3 documents now. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 3); + drop(rtxn); + + // Second we send 1 document without specifying the id. + index.index_documents_config.autogenerate_docids = true; + index.add_documents(documents!([ { "name": "new kevin" } ])).unwrap(); + + // Check that there is 4 documents now. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 4); + drop(rtxn); + } + + #[test] + fn empty_update() { + let index = TempIndex::new(); + + // First we send 0 documents and only headers. + index.add_documents(documents!([])).unwrap(); + + // Check that there is no documents. 
+ let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 0); + drop(rtxn); + } + + #[test] + fn invalid_documents_ids() { + let index = TempIndex::new(); + + // First we send 1 document with an invalid id. + // There is a space in the document id. + index.add_documents(documents!([ { "id": "brume bleue", "name": "kevin" } ])).unwrap_err(); + + // Then we send 1 document with a valid id. + index.add_documents(documents!([ { "id": 32, "name": "kevin" } ])).unwrap(); + + // Check that there is 1 document now. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 1); + drop(rtxn); + } + + #[test] + fn complex_documents() { + let index = TempIndex::new(); + + // First we send 3 documents with an id for only one of them. + index + .add_documents(documents!([ + { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } }, + { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] }, + { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] } + ])) + .unwrap(); + + // Check that there is 1 documents now. 
+ let rtxn = index.read_txn().unwrap(); + + // Search for a sub object value + let result = index.search(&rtxn).query(r#""value2""#).execute().unwrap(); + assert_eq!(result.documents_ids, vec![0]); + + // Search for a sub array value + let result = index.search(&rtxn).query(r#""fine""#).execute().unwrap(); + assert_eq!(result.documents_ids, vec![1]); + + // Search for a sub array sub object key + let result = index.search(&rtxn).query(r#""amazing""#).execute().unwrap(); + assert_eq!(result.documents_ids, vec![2]); + + drop(rtxn); + } + + #[test] + fn simple_documents_replace() { + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + + index.add_documents(documents!([ + { "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5, "_geo": { "lat": 12, "lng": 42 } }, + { "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 }, + { "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 }, + { "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" }, + { "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" }, + { "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } } + ])).unwrap(); + + db_snap!(index, word_docids, "initial"); + + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + + index + .add_documents(documents!([ + {"id":4,"title":"Harry Potter and the Half-Blood Princess"}, + {"id":456,"title":"The Little Prince"} + ])) + .unwrap(); + + index + .add_documents(documents!([ + { "id": 2, "author": "J. Austen", "date": "1813" } + ])) + .unwrap(); + + // Check that there is **always** 6 documents. 
+ let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 6); + let count = index.all_documents(&rtxn).unwrap().count(); + assert_eq!(count, 6); + + db_snap!(index, word_docids, "updated"); + db_snap!(index, soft_deleted_documents_ids, "updated", @"[0, 1, 4, ]"); + + drop(rtxn); + } + + #[test] + fn mixed_geo_documents() { + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + + // We send 6 documents and mix the ones that have _geo and those that don't have it. + index + .add_documents(documents!([ + { "id": 2, "price": 3.5, "_geo": { "lat": 12, "lng": 42 } }, + { "id": 456 }, + { "id": 1 }, + { "id": 1344 }, + { "id": 4 }, + { "id": 42, "_geo": { "lat": 35, "lng": 23 } } + ])) + .unwrap(); + + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset!(S("_geo"))); + }) + .unwrap(); + } + + #[test] + fn index_all_flavour_of_geo() { + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset!(S("_geo"))); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "id": 0, "_geo": { "lat": 31, "lng": [42] } }, + { "id": 1, "_geo": { "lat": "31" }, "_geo.lng": 42 }, + { "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" }, + { "id": 3, "_geo.lat": 31, "_geo.lng": "42" }, + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let mut search = crate::Search::new(&rtxn, &index); + search.filter(crate::Filter::from_str("_geoRadius(31, 42, 0.000001)").unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + assert_eq!(documents_ids, vec![0, 1, 2, 3]); + } + + #[test] + fn geo_error() { + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset!(S("_geo"))); + }) + .unwrap(); + + let error = index + .add_documents(documents!([ + { "id": 0, "_geo": { "lng": 42 } } + ])) + .unwrap_err(); + assert_eq!( + &error.to_string(), + r#"Could not find latitude in the document with the id: `0`. Was expecting a `_geo.lat` field."# + ); + + let error = index + .add_documents(documents!([ + { "id": 0, "_geo": { "lat": 42 } } + ])) + .unwrap_err(); + assert_eq!( + &error.to_string(), + r#"Could not find longitude in the document with the id: `0`. Was expecting a `_geo.lng` field."# + ); + + let error = index + .add_documents(documents!([ + { "id": 0, "_geo": { "lat": "lol", "lng": 42 } } + ])) + .unwrap_err(); + assert_eq!( + &error.to_string(), + r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `"lol"`."# + ); + + let error = index + .add_documents(documents!([ + { "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } } + ])) + .unwrap_err(); + assert_eq!( + &error.to_string(), + r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `[12,13]`."# + ); + + let error = index + .add_documents(documents!([ + { "id": 0, "_geo": { "lat": 12, "lng": "hello" } } + ])) + .unwrap_err(); + assert_eq!( + &error.to_string(), + r#"Could not parse longitude in the document with the id: `0`. 
Was expecting a finite number but instead got `"hello"`."# + ); + } + + #[test] + fn delete_documents_then_insert() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" }, + { "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" }, + { "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" }, + { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } + ])) + .unwrap(); + let mut wtxn = index.write_txn().unwrap(); + assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); + + // Delete not all of the documents but some of them. + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.delete_external_id("30"); + builder.execute().unwrap(); + + let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); + assert!(external_documents_ids.get("30").is_none()); + wtxn.commit().unwrap(); + + index + .add_documents(documents!([ + { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } + ])) + .unwrap(); + + let wtxn = index.write_txn().unwrap(); + let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); + assert!(external_documents_ids.get("30").is_some()); + wtxn.commit().unwrap(); + + index + .add_documents(documents!([ + { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } + ])) + .unwrap(); + } + + #[test] + fn index_more_than_256_fields() { + let index = TempIndex::new(); + + let mut big_object = serde_json::Map::new(); + big_object.insert(S("id"), serde_json::Value::from("wow")); + for i in 0..1000 { + let key = i.to_string(); + big_object.insert(key, serde_json::Value::from("I am a text!")); + } + + let documents = documents_batch_reader_from_objects([big_object]); + index.add_documents(documents).unwrap(); + } + + #[test] + fn index_more_than_1000_positions_in_a_field() { + let index = TempIndex::new_with_map_size(4096 
* 100_000); // 400 MB + let mut content = String::with_capacity(382101); + for i in 0..=u16::MAX { + content.push_str(&format!("{i} ")); + } + index + .add_documents(documents!({ + "id": "wow", + "content": content + })) + .unwrap(); + + let mut rtxn = index.read_txn().unwrap(); + + assert!(index.word_docids.get(&mut rtxn, "0").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "64").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "256").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "1024").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "32768").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "65535").unwrap().is_some()); + } + + #[test] + fn index_documents_with_zeroes() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { + "id": 2, + "title": "Prideand Prejudice", + "au{hor": "Jane Austin", + "genre": "romance", + "price$": "3.5$", + }, + { + "id": 456, + "title": "Le Petit Prince", + "au{hor": "Antoine de Saint-Exupéry", + "genre": "adventure", + "price$": "10.0$", + }, + { + "id": 1, + "title": "Wonderland", + "au{hor": "Lewis Carroll", + "genre": "fantasy", + "price$": "25.99$", + }, + { + "id": 4, + "title": "Harry Potter ing fantasy\0lood Prince", + "au{hor": "J. K. 
Rowling", + "genre": "fantasy\0", + }, + ])) + .unwrap(); + } + + #[test] + fn index_documents_with_nested_fields() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { + "id": 0, + "title": "The zeroth document", + }, + { + "id": 1, + "title": "The first document", + "nested": { + "object": "field", + "machin": "bidule", + }, + }, + { + "id": 2, + "title": "The second document", + "nested": [ + "array", + { + "object": "field", + }, + { + "prout": "truc", + "machin": "lol", + }, + ], + }, + { + "id": 3, + "title": "The third document", + "nested": "I lied", + }, + ])) + .unwrap(); + + index + .update_settings(|settings| { + let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")]; + settings.set_searchable_fields(searchable_fields); + + let faceted_fields = hashset!(S("title"), S("nested.object"), S("nested.machin")); + settings.set_filterable_fields(faceted_fields); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("title"), S("nested.object"), S("nested.machin"))); + + // testing the simple query search + let mut search = crate::Search::new(&rtxn, &index); + search.query("document"); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + // all documents should be returned + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids.len(), 4); + + search.query("zeroth"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![0]); + search.query("first"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1]); + search.query("second"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![2]); + search.query("third"); + let crate::SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + assert_eq!(documents_ids, vec![3]); + + search.query("field"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1, 2]); + + search.query("lol"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![2]); + + search.query("object"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert!(documents_ids.is_empty()); + + search.query("array"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert!(documents_ids.is_empty()); // nested is not searchable + + search.query("lied"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert!(documents_ids.is_empty()); // nested is not searchable + + // testing the filters + let mut search = crate::Search::new(&rtxn, &index); + search.filter(crate::Filter::from_str(r#"title = "The first document""#).unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1]); + + search.filter(crate::Filter::from_str(r#"nested.object = field"#).unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1, 2]); + + search.filter(crate::Filter::from_str(r#"nested.machin = bidule"#).unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1]); + + search.filter(crate::Filter::from_str(r#"nested = array"#).unwrap().unwrap()); + let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable + assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_)))); + + search.filter(crate::Filter::from_str(r#"nested = "I lied""#).unwrap().unwrap()); + let error = search.execute().map(|_| unreachable!()).unwrap_err(); // nested is not filterable + assert!(matches!(error, crate::Error::UserError(crate::UserError::InvalidFilter(_)))); + } + + #[test] + fn index_documents_with_nested_primary_key() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("complex.nested.id".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "complex": { + "nested": { + "id": 0, + }, + }, + "title": "The zeroth document", + }, + { + "complex.nested": { + "id": 1, + }, + "title": "The first document", + }, + { + "complex": { + "nested.id": 2, + }, + "title": "The second document", + }, + { + "complex.nested.id": 3, + "title": "The third document", + }, + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + // testing the simple query search + let mut search = crate::Search::new(&rtxn, &index); + search.query("document"); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + // all documents should be returned + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids.len(), 4); + + search.query("zeroth"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![0]); + search.query("first"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1]); + search.query("second"); + let crate::SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + assert_eq!(documents_ids, vec![2]); + search.query("third"); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![3]); + } + + #[test] + fn retrieve_a_b_nested_document_id() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("a.b".to_owned()); + }) + .unwrap(); + + // There must be an issue with the primary key not present in the given document + index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap_err(); + } + + #[test] + fn retrieve_a_b_c_nested_document_id() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key("a.b.c".to_owned()); + }) + .unwrap(); + index.add_documents(documents!({ "a" : { "b" : { "c" : 1 }}})).unwrap(); + + let rtxn = index.read_txn().unwrap(); + let external_documents_ids = index.external_documents_ids(&rtxn).unwrap(); + assert!(external_documents_ids.get("1").is_some()); + } + + #[test] + fn test_facets_generation() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { + "id": 0, + "dog": { + "race": { + "bernese mountain": "zeroth", + }, + }, + }, + { + "id": 1, + "dog.race": { + "bernese mountain": "first", + }, + }, + { + "id": 2, + "dog.race.bernese mountain": "second", + }, + { + "id": 3, + "dog": { + "race.bernese mountain": "third" + }, + }, + ])) + .unwrap(); + + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset!(String::from("dog"))); + }) + .unwrap(); + + db_snap!(index, facet_id_string_docids, @r###" + 3 0 first 1 [1, ] + 3 0 second 1 [2, ] + 3 0 third 1 [3, ] + 3 0 zeroth 1 [0, ] + "###); + db_snap!(index, field_id_docid_facet_strings, @r###" + 3 0 zeroth zeroth + 3 1 first first + 3 2 second second + 3 3 third third + "###); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + + let rtxn = index.read_txn().unwrap(); + + 
let hidden = index.faceted_fields(&rtxn).unwrap(); + + assert_eq!(hidden, hashset!(S("dog"), S("dog.race"), S("dog.race.bernese mountain"))); + + for (s, i) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] { + let mut search = crate::Search::new(&rtxn, &index); + let filter = format!(r#""dog.race.bernese mountain" = {s}"#); + search.filter(crate::Filter::from_str(&filter).unwrap().unwrap()); + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids, vec![i]); + } + // Reset the settings + index + .update_settings(|settings| { + settings.reset_filterable_fields(); + }) + .unwrap(); + + db_snap!(index, facet_id_string_docids, @""); + db_snap!(index, field_id_docid_facet_strings, @""); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + + let rtxn = index.read_txn().unwrap(); + + let facets = index.faceted_fields(&rtxn).unwrap(); + + assert_eq!(facets, hashset!()); + + // update the settings to test the sortable + index + .update_settings(|settings| { + settings.set_sortable_fields(hashset!(S("dog.race"))); + }) + .unwrap(); + + db_snap!(index, facet_id_string_docids, @r###" + 3 0 first 1 [1, ] + 3 0 second 1 [2, ] + 3 0 third 1 [3, ] + 3 0 zeroth 1 [0, ] + "###); + db_snap!(index, field_id_docid_facet_strings, @r###" + 3 0 zeroth zeroth + 3 1 first first + 3 2 second second + 3 3 third third + "###); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + + let rtxn = index.read_txn().unwrap(); + + let facets = index.faceted_fields(&rtxn).unwrap(); + + assert_eq!(facets, hashset!(S("dog.race"), S("dog.race.bernese mountain"))); + + let mut search = crate::Search::new(&rtxn, &index); + search.sort_criteria(vec![crate::AscDesc::Asc(crate::Member::Field(S( + "dog.race.bernese mountain", + )))]); + let crate::SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + assert_eq!(documents_ids, vec![1, 2, 3, 0]); + } + + #[test] + fn index_2_times_documents_split_by_zero_document_indexation() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + {"id": 0, "name": "Kerollmops", "score": 78}, + {"id": 1, "name": "ManyTheFish", "score": 75}, + {"id": 2, "name": "Ferdi", "score": 39}, + {"id": 3, "name": "Tommy", "score": 33} + ])) + .unwrap(); + + // Check that there is 4 document now. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 4); + + index.add_documents(documents!([])).unwrap(); + + // Check that there is 4 document now. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 4); + + index + .add_documents(documents!([ + {"id": 0, "name": "Kerollmops", "score": 78}, + {"id": 1, "name": "ManyTheFish", "score": 75}, + {"id": 2, "name": "Ferdi", "score": 39}, + {"id": 3, "name": "Tommy", "score": 33} + ])) + .unwrap(); + + // Check that there is 4 document now. + let rtxn = index.read_txn().unwrap(); + let count = index.number_of_documents(&rtxn).unwrap(); + assert_eq!(count, 4); + } + + #[cfg(feature = "default")] + #[test] + fn test_meilisearch_1714() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + {"id": "123", "title": "小化妆包" }, + {"id": "456", "title": "Ipad 包" } + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + + // Only the first document should match. + let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len(); + assert_eq!(count, 1); + + // Only the second document should match. 
+ let count = index.word_docids.get(&rtxn, "bāo").unwrap().unwrap().len(); + assert_eq!(count, 1); + + let mut search = crate::Search::new(&rtxn, &index); + search.query("化妆包"); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + + // only 1 document should be returned + let crate::SearchResult { documents_ids, .. } = search.execute().unwrap(); + assert_eq!(documents_ids.len(), 1); + } + + /// We try to index documents with words that are too long here, + /// it should not return any error. + #[test] + fn text_with_too_long_words() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + {"id": 1, "title": "a".repeat(256) }, + {"id": 2, "title": "b".repeat(512) }, + {"id": 3, "title": format!("{} {}", "c".repeat(250), "d".repeat(250)) }, + ])) + .unwrap(); + } + + #[test] + fn text_with_too_long_keys() { + let index = TempIndex::new(); + let script = "https://bug.example.com/meilisearch/milli.saml2?ROLE=Programmer-1337&SAMLRequest=Cy1ytcZT1Po%2L2IY2y9Unru8rgnW4qWfPiI0EpT7P8xjJV8PeQikRL%2E8D9A4pj9tmbymbQCQwGmGjPMK7qwXFPX4DH52JO2b7n6TXjuR7zkIFuYdzdY2rwRNBPgCL7ihclEm9zyIjKZQ%2JTqiwfXxWjnI0KEYQYHdwd6Q%2Fx%28BDLNsvmL54CCY2F4RWeRs4eqWfn%2EHqxlhreFzax4AiQ2tgOtV5thOaaWqrhZD%2Py70nuyZWNTKwciGI43AoHg6PThANsQ5rAY5amzN%2ufbs1swETUXlLZuOut5YGpYPZfY6STJWNp4QYSUOUXBZpdElYsH7UHZ7VhJycgyt%28aTK0GW6GbKne2tJM0hgSczOqndg6RFa9WsnSBi4zMcaEfYur4WlSsHDYInF9ROousKqVMZ6H8%2gbUissaLh1eXRGo8KEJbyEHbhVVKGD%28kx4cfKjx9fT3pkeDTdvDrVn25jIzi9wHyt9l1lWc8ICnCvXCVUPP%2BjBG4wILR29gMV9Ux2QOieQm2%2Fycybhr8sBGCl30mHC7blvWt%2T3mrCHQoS3VK49PZNPqBZO9C7vOjOWoszNkJx4QckWV%2FZFvbpzUUkiBiehr9F%2FvQSxz9lzv68GwbTu9fr638p%2FQM%3D&RelayState=https%3A%2F%example.bug.com%2Fde&SigAlg=http%3A%2F%2Fwww.w3.org%2F2000%2F09%2Fxmldsig%23rsa-sha1&Signature=AZFpkhFFII7PodiewTovaGnLQKUVZp0qOCCcBIUkJ6P5by3lE3Lldj9pKaFu4wz4j%2B015HEhDvF0LlAmwwES85vdGh%2FpD%2cIQPRUEjdCbQkQDd3dy1mMXbpXxSe4QYcv9Ni7tqNTQxekpO1gE7rtg6zC66EU55uM9aj9abGQ034Vly%2F6IJ08bvAq%2B%2FB9KruLst
uiNWnlXTfNGsOxGLK7%2BXr94LTkat8m%2FMan6Qr95%2KeR5TmmqaQIE4N9H6o4TopT7mXr5CF2Z3"; + + // Create 200 documents with a long text + let content = { + let documents_iter = (0..200i32) + .into_iter() + .map(|i| serde_json::json!({ "id": i, "script": script })) + .filter_map(|json| match json { + serde_json::Value::Object(object) => Some(object), + _ => None, + }); + documents_batch_reader_from_objects(documents_iter) + }; + // Index those 200 long documents + index.add_documents(content).unwrap(); + + // Index one long document + index + .add_documents(documents!([ + {"id": 400, "script": script }, + ])) + .unwrap(); + } + + #[test] + fn index_documents_in_multiple_transforms() { + let index = TempIndex::new(); + + let doc1 = documents! {[{ + "id": 228142, + "title": "asdsad", + "state": "automated", + "priority": "normal", + "public_uid": "37ccf021", + "project_id": 78207, + "branch_id_number": 0 + }]}; + + let doc2 = documents! {[{ + "id": 228143, + "title": "something", + "state": "automated", + "priority": "normal", + "public_uid": "39c6499b", + "project_id": 78207, + "branch_id_number": 0 + }]}; + + { + let mut wtxn = index.write_txn().unwrap(); + index.put_primary_key(&mut wtxn, "id").unwrap(); + wtxn.commit().unwrap(); + } + + index.add_documents(doc1).unwrap(); + index.add_documents(doc2).unwrap(); + + let wtxn = index.read_txn().unwrap(); + + let map = index.external_documents_ids(&wtxn).unwrap().to_hash_map(); + let ids = map.values().collect::>(); + + assert_eq!(ids.len(), map.len()); + } + + #[test] + fn index_documents_check_exists_database() { + let content = || { + documents!([ + { + "id": 0, + "colour": 0, + }, + { + "id": 1, + "colour": [] + }, + { + "id": 2, + "colour": {} + }, + { + "id": 3, + "colour": null + }, + { + "id": 4, + "colour": [1] + }, + { + "id": 5 + }, + { + "id": 6, + "colour": { + "green": 1 + } + }, + { + "id": 7, + "colour": { + "green": { + "blue": [] + } + } + } + ]) + }; + + let check_ok = |index: &Index| { + let rtxn = 
index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); + + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + + let bitmap_colour = + index.facet_id_exists_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap(); + assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6, 7]); + + let bitmap_colour_green = index + .facet_id_exists_docids + .get(&rtxn, &BEU16::new(colour_green_id)) + .unwrap() + .unwrap(); + assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6, 7]); + }; + + let faceted_fields = hashset!(S("colour")); + + let index = TempIndex::new(); + index.add_documents(content()).unwrap(); + index + .update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }) + .unwrap(); + check_ok(&index); + + let index = TempIndex::new(); + index + .update_settings(|settings| { + settings.set_filterable_fields(faceted_fields.clone()); + }) + .unwrap(); + index.add_documents(content()).unwrap(); + check_ok(&index); + } + + #[test] + fn primary_key_must_not_contain_floats() { + let index = TempIndex::new_with_map_size(4096 * 100); + + let doc1 = documents! {[{ + "id": -228142, + "title": "asdsad", + }]}; + + let doc2 = documents! {[{ + "id": 228143.56, + "title": "something", + }]}; + + let doc3 = documents! {[{ + "id": -228143.56, + "title": "something", + }]}; + + let doc4 = documents! {[{ + "id": 2.0, + "title": "something", + }]}; + + index.add_documents(doc1).unwrap(); + index.add_documents(doc2).unwrap_err(); + index.add_documents(doc3).unwrap_err(); + index.add_documents(doc4).unwrap_err(); + } + + #[test] + fn primary_key_must_not_contain_whitespace() { + let index = TempIndex::new(); + + let doc1 = documents! {[{ + "id": " 1", + "title": "asdsad", + }]}; + + let doc2 = documents! 
{[{ + "id": "\t2", + "title": "something", + }]}; + + let doc3 = documents! {[{ + "id": "\r3", + "title": "something", + }]}; + + let doc4 = documents! {[{ + "id": "\n4", + "title": "something", + }]}; + + index.add_documents(doc1).unwrap_err(); + index.add_documents(doc2).unwrap_err(); + index.add_documents(doc3).unwrap_err(); + index.add_documents(doc4).unwrap_err(); + } + + #[test] + fn primary_key_inference() { + let index = TempIndex::new(); + + let doc_no_id = documents! {[{ + "title": "asdsad", + "state": "automated", + "priority": "normal", + "branch_id_number": 0 + }]}; + assert!(matches!( + index.add_documents(doc_no_id), + Err(Error::UserError(UserError::NoPrimaryKeyCandidateFound)) + )); + + let doc_multiple_ids = documents! {[{ + "id": 228143, + "title": "something", + "state": "automated", + "priority": "normal", + "public_uid": "39c6499b", + "project_id": 78207, + "branch_id_number": 0 + }]}; + + let Err(Error::UserError(UserError::MultiplePrimaryKeyCandidatesFound { + candidates + })) = + index.add_documents(doc_multiple_ids) else { panic!("Expected Error::UserError(MultiplePrimaryKeyCandidatesFound)") }; + + assert_eq!(candidates, vec![S("id"), S("project_id"), S("public_uid"),]); + + let doc_inferable = documents! {[{ + "video": "test.mp4", + "id": 228143, + "title": "something", + "state": "automated", + "priority": "normal", + "public_uid_": "39c6499b", + "project_id_": 78207, + "branch_id_number": 0 + }]}; + + index.add_documents(doc_inferable).unwrap(); + + let txn = index.read_txn().unwrap(); + + assert_eq!(index.primary_key(&txn).unwrap().unwrap(), "id"); + } + + #[test] + fn long_words_must_be_skipped() { + let index = TempIndex::new(); + + // this is obviously too long + let long_word = "lol".repeat(1000); + let doc1 = documents! 
{[{ + "id": "1", + "title": long_word, + }]}; + + index.add_documents(doc1).unwrap(); + + let rtxn = index.read_txn().unwrap(); + let words_fst = index.words_fst(&rtxn).unwrap(); + assert!(!words_fst.contains(&long_word)); + } + + #[test] + fn long_facet_values_must_not_crash() { + let index = TempIndex::new(); + + // this is obviously too long + let long_word = "lol".repeat(1000); + let doc1 = documents! {[{ + "id": "1", + "title": long_word, + }]}; + + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset! { S("title") }); + }) + .unwrap(); + + index.add_documents(doc1).unwrap(); + } +} diff --git a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap new file mode 100644 index 000000000..5b424356a --- /dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/initial/word_docids.snap @@ -0,0 +1,54 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +1 [2, ] +10.0 [1, ] +12 [0, ] +1344 [3, ] +2 [0, ] +23 [5, ] +25.99 [2, ] +3.5 [0, ] +35 [5, ] +4 [4, ] +42 [0, 5, ] +456 [1, ] +adams [5, ] +adventure [1, ] +alice [2, ] +and [0, 4, ] +antoine [1, ] +austin [0, ] +blood [4, ] +carroll [2, ] +de [1, ] +douglas [5, ] +exupery [1, ] +fantasy [2, 3, 4, ] +galaxy [5, ] +guide [5, ] +half [4, ] +harry [4, ] +hitchhiker' [5, ] +hobbit [3, ] +in [2, ] +j [3, 4, ] +jane [0, ] +k [4, ] +le [1, ] +lewis [2, ] +petit [1, ] +potter [4, ] +prejudice [0, ] +pride [0, ] +prince [1, 4, ] +r [3, ] +romance [0, ] +rowling [4, ] +s [5, ] +saint [1, ] +the [3, 4, 5, ] +to [5, ] +tolkien [3, ] +wonderland [2, ] + diff --git a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap new file mode 100644 index 000000000..4f4a9e33a --- 
/dev/null +++ b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap @@ -0,0 +1,58 @@ +--- +source: milli/src/update/index_documents/mod.rs +--- +1 [2, ] +10.0 [1, 7, ] +12 [0, 8, ] +1344 [3, ] +1813 [8, ] +2 [0, 8, ] +23 [5, ] +25.99 [2, ] +3.5 [0, 8, ] +35 [5, ] +4 [4, 6, ] +42 [0, 5, 8, ] +456 [1, 7, ] +adams [5, ] +adventure [1, 7, ] +alice [2, ] +and [0, 4, 6, 8, ] +antoine [1, 7, ] +austen [8, ] +austin [0, ] +blood [4, 6, ] +carroll [2, ] +de [1, 7, ] +douglas [5, ] +exupery [1, 7, ] +fantasy [2, 3, 4, 6, ] +galaxy [5, ] +guide [5, ] +half [4, 6, ] +harry [4, 6, ] +hitchhiker' [5, ] +hobbit [3, ] +in [2, ] +j [3, 4, 6, 8, ] +jane [0, ] +k [4, 6, ] +le [1, ] +lewis [2, ] +little [7, ] +petit [1, ] +potter [4, 6, ] +prejudice [0, 8, ] +pride [0, 8, ] +prince [1, 4, 7, ] +princess [6, ] +r [3, ] +romance [0, 8, ] +rowling [4, 6, ] +s [5, ] +saint [1, 7, ] +the [3, 4, 5, 6, 7, ] +to [5, ] +tolkien [3, ] +wonderland [2, ] + diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs new file mode 100644 index 000000000..68ef2b7ee --- /dev/null +++ b/milli/src/update/index_documents/transform.rs @@ -0,0 +1,703 @@ +use std::borrow::Cow; +use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet}; +use std::fs::File; +use std::io::{Read, Seek, SeekFrom}; + +use fxhash::FxHashMap; +use heed::RoTxn; +use itertools::Itertools; +use obkv::{KvReader, KvWriter}; +use roaring::RoaringBitmap; +use serde_json::Value; +use smartstring::SmartString; + +use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; +use super::{IndexDocumentsMethod, IndexerConfig}; +use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; +use crate::error::{Error, InternalError, UserError}; +use crate::index::{db_name, main_key}; +use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; +use 
crate::{ + ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, + Result, BEU32, +}; + +pub struct TransformOutput { + pub primary_key: String, + pub fields_ids_map: FieldsIdsMap, + pub field_distribution: FieldDistribution, + pub external_documents_ids: ExternalDocumentsIds<'static>, + pub new_documents_ids: RoaringBitmap, + pub replaced_documents_ids: RoaringBitmap, + pub documents_count: usize, + pub original_documents: File, + pub flattened_documents: File, +} + +/// Extract the external ids, deduplicate and compute the new internal documents ids +/// and fields ids, writing all the documents under their internal ids into a final file. +/// +/// Outputs the new `FieldsIdsMap`, the new `UsersIdsDocumentsIds` map, the new documents ids, +/// the replaced documents ids, the number of documents in this update and the file +/// containing all those documents. +pub struct Transform<'a, 'i> { + pub index: &'i Index, + fields_ids_map: FieldsIdsMap, + + indexer_settings: &'a IndexerConfig, + pub autogenerate_docids: bool, + pub index_documents_method: IndexDocumentsMethod, + available_documents_ids: AvailableDocumentsIds, + + original_sorter: grenad::Sorter, + flattened_sorter: grenad::Sorter, + replaced_documents_ids: RoaringBitmap, + new_documents_ids: RoaringBitmap, + // To increase the cache locality and decrease the heap usage we use compact smartstring. + new_external_documents_ids_builder: FxHashMap, u64>, + documents_count: usize, +} + +/// Create a mapping between the field ids found in the document batch and the one that were +/// already present in the index. +/// +/// If new fields are present in the addition, they are added to the index field ids map. +fn create_fields_mapping( + index_field_map: &mut FieldsIdsMap, + batch_field_map: &DocumentsBatchIndex, +) -> Result> { + batch_field_map + .iter() + // we sort by id here to ensure a deterministic mapping of the fields, that preserves + // the original ordering. 
+ .sorted_by_key(|(&id, _)| id) + .map(|(field, name)| match index_field_map.id(name) { + Some(id) => Ok((*field, id)), + None => index_field_map + .insert(name) + .ok_or(Error::UserError(UserError::AttributeLimitReached)) + .map(|id| (*field, id)), + }) + .collect() +} + +impl<'a, 'i> Transform<'a, 'i> { + pub fn new( + wtxn: &mut heed::RwTxn, + index: &'i Index, + indexer_settings: &'a IndexerConfig, + index_documents_method: IndexDocumentsMethod, + autogenerate_docids: bool, + ) -> Result { + // We must choose the appropriate merge function for when two or more documents + // with the same user id must be merged or fully replaced in the same batch. + let merge_function = match index_documents_method { + IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv, + IndexDocumentsMethod::UpdateDocuments => merge_obkvs, + }; + + // We initialize the sorter with the user indexing settings. + let original_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + merge_function, + indexer_settings.chunk_compression_type, + indexer_settings.chunk_compression_level, + indexer_settings.max_nb_chunks, + indexer_settings.max_memory.map(|mem| mem / 2), + ); + + // We initialize the sorter with the user indexing settings. 
+ let flattened_sorter = create_sorter( + grenad::SortAlgorithm::Stable, + merge_function, + indexer_settings.chunk_compression_type, + indexer_settings.chunk_compression_level, + indexer_settings.max_nb_chunks, + indexer_settings.max_memory.map(|mem| mem / 2), + ); + let documents_ids = index.documents_ids(wtxn)?; + let soft_deleted_documents_ids = index.soft_deleted_documents_ids(wtxn)?; + + Ok(Transform { + index, + fields_ids_map: index.fields_ids_map(wtxn)?, + indexer_settings, + autogenerate_docids, + available_documents_ids: AvailableDocumentsIds::from_documents_ids( + &documents_ids, + &soft_deleted_documents_ids, + ), + original_sorter, + flattened_sorter, + index_documents_method, + replaced_documents_ids: RoaringBitmap::new(), + new_documents_ids: RoaringBitmap::new(), + new_external_documents_ids_builder: FxHashMap::default(), + documents_count: 0, + }) + } + + pub fn read_documents( + &mut self, + reader: EnrichedDocumentsBatchReader, + wtxn: &mut heed::RwTxn, + progress_callback: FP, + should_abort: FA, + ) -> Result + where + R: Read + Seek, + FP: Fn(UpdateIndexingStep) + Sync, + FA: Fn() -> bool + Sync, + { + let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); + + let external_documents_ids = self.index.external_documents_ids(wtxn)?; + + let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; + + let primary_key = cursor.primary_key().to_string(); + let primary_key_id = + self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; + + let mut obkv_buffer = Vec::new(); + let mut documents_count = 0; + let mut docid_buffer: Vec = Vec::new(); + let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); + while let Some(enriched_document) = cursor.next_enriched_document()? 
{ + let EnrichedDocument { document, document_id } = enriched_document; + + if should_abort() { + return Err(Error::InternalError(InternalError::AbortedIndexation)); + } + + // drop_and_reuse is called instead of .clear() to communicate to the compiler that field_buffer + // does not keep references from the cursor between loop iterations + let mut field_buffer_cache = drop_and_reuse(field_buffer); + if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { + progress_callback(UpdateIndexingStep::RemapDocumentAddition { + documents_seen: documents_count, + }); + } + + // When the document id has been auto-generated by the `enrich_documents_batch` + // we must insert this document id into the remapped document. + let external_id = document_id.value(); + if document_id.is_generated() { + serde_json::to_writer(&mut docid_buffer, external_id) + .map_err(InternalError::SerdeJson)?; + field_buffer_cache.push((primary_key_id, Cow::from(&docid_buffer))); + } + + for (k, v) in document.iter() { + let mapped_id = + *mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?; + field_buffer_cache.push((mapped_id, Cow::from(v))); + } + + // Insertion in a obkv need to be done with keys ordered. For now they are ordered + // according to the document addition key order, so we sort it according to the + // fieldids map keys order. + field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(f2)); + + // Build the new obkv document. + let mut writer = obkv::KvWriter::new(&mut obkv_buffer); + for (k, v) in field_buffer_cache.iter() { + writer.insert(*k, v)?; + } + + let mut original_docid = None; + + let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) { + Entry::Occupied(entry) => *entry.get() as u32, + Entry::Vacant(entry) => { + // If the document was already in the db we mark it as a replaced document. + // It'll be deleted later. We keep its original docid to insert it in the grenad. 
+ if let Some(docid) = external_documents_ids.get(entry.key()) { + self.replaced_documents_ids.insert(docid); + original_docid = Some(docid); + } + let docid = self + .available_documents_ids + .next() + .ok_or(UserError::DocumentLimitReached)?; + entry.insert(docid as u64); + docid + } + }; + + let mut skip_insertion = false; + if let Some(original_docid) = original_docid { + let original_key = BEU32::new(original_docid); + let base_obkv = self + .index + .documents + .remap_data_type::() + .get(wtxn, &original_key)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::DOCUMENTS, + key: None, + })?; + + // we check if the two documents are exactly equal. If it's the case we can skip this document entirely + if base_obkv == obkv_buffer { + // we're not replacing anything + self.replaced_documents_ids.remove(original_docid); + // and we need to put back the original id as it was before + self.new_external_documents_ids_builder.remove(external_id); + skip_insertion = true; + } else { + // we associate the base document with the new key, everything will get merged later. + self.original_sorter.insert(docid.to_be_bytes(), base_obkv)?; + match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? { + Some(buffer) => { + self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)? + } + None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?, + } + } + } + + if !skip_insertion { + self.new_documents_ids.insert(docid); + // We use the extracted/generated user id as the key for this document. + self.original_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?; + + match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { + Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, + None => { + self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())? 
+ } + } + } + documents_count += 1; + + progress_callback(UpdateIndexingStep::RemapDocumentAddition { + documents_seen: documents_count, + }); + + field_buffer = drop_and_reuse(field_buffer_cache); + docid_buffer.clear(); + obkv_buffer.clear(); + } + + progress_callback(UpdateIndexingStep::RemapDocumentAddition { + documents_seen: documents_count, + }); + + self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; + self.index.put_primary_key(wtxn, &primary_key)?; + self.documents_count += documents_count; + // Now that we have a valid sorter that contains the user id and the obkv we + // give it to the last transforming function which returns the TransformOutput. + Ok(documents_count) + } + + // Flatten a document from the fields ids map contained in self and insert the new + // created fields. Returns `None` if the document doesn't need to be flattened. + fn flatten_from_fields_ids_map(&mut self, obkv: KvReader) -> Result>> { + if obkv + .iter() + .all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value)) + { + return Ok(None); + } + + // store the keys and values the original obkv + the flattened json + // We first extract all the key+value out of the obkv. If a value is not nested + // we keep a reference on its value. If the value is nested we'll get its value + // as an owned `Vec` after flattening it. + let mut key_value: Vec<(FieldId, Cow<[u8]>)> = Vec::new(); + + // the object we're going to use to store the fields that need to be flattened. + let mut doc = serde_json::Map::new(); + + // we recreate a json containing only the fields that needs to be flattened. + // all the raw values get inserted directly in the `key_value` vec. 
+ for (key, value) in obkv.iter() { + if json_depth_checker::should_flatten_from_unchecked_slice(value) { + let key = self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Flatten from fields ids map.", + })?; + + let value = serde_json::from_slice::(value) + .map_err(crate::error::InternalError::SerdeJson)?; + doc.insert(key.to_string(), value); + } else { + key_value.push((key, value.into())); + } + } + + let flattened = flatten_serde_json::flatten(&doc); + + // Once we have the flattened version we insert all the new generated fields_ids + // (if any) in the fields ids map and serialize the value. + for (key, value) in flattened.into_iter() { + let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; + let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; + key_value.push((fid, value.into())); + } + + // we sort the key. If there was a conflict between the obkv and the new generated value the + // keys will be consecutive. + key_value.sort_unstable_by_key(|(key, _)| *key); + + let mut buffer = Vec::new(); + Self::create_obkv_from_key_value(&mut key_value, &mut buffer)?; + Ok(Some(buffer)) + } + + /// Generate an obkv from a slice of key / value sorted by key. + fn create_obkv_from_key_value( + key_value: &mut [(FieldId, Cow<[u8]>)], + output_buffer: &mut Vec, + ) -> Result<()> { + debug_assert!( + key_value.windows(2).all(|vec| vec[0].0 <= vec[1].0), + "The slice of key / value pair must be sorted." + ); + + output_buffer.clear(); + let mut writer = KvWriter::new(output_buffer); + + let mut skip_next_value = false; + for things in key_value.windows(2) { + if skip_next_value { + skip_next_value = false; + continue; + } + let (key1, value1) = &things[0]; + let (key2, value2) = &things[1]; + + // now we're going to look for conflicts between the keys. 
For example the following documents would cause a conflict: + // { "doggo.name": "jean", "doggo": { "name": "paul" } } + // we should find a first "doggo.name" from the obkv and a second one from the flattening. + // but we must generate the following document: + // { "doggo.name": ["jean", "paul"] } + // thus we're going to merge the value from the obkv and the flattened document in a single array and skip the next + // iteration. + if key1 == key2 { + skip_next_value = true; + + let value1 = serde_json::from_slice(value1) + .map_err(crate::error::InternalError::SerdeJson)?; + let value2 = serde_json::from_slice(value2) + .map_err(crate::error::InternalError::SerdeJson)?; + let value = match (value1, value2) { + (Value::Array(mut left), Value::Array(mut right)) => { + left.append(&mut right); + Value::Array(left) + } + (Value::Array(mut array), value) | (value, Value::Array(mut array)) => { + array.push(value); + Value::Array(array) + } + (left, right) => Value::Array(vec![left, right]), + }; + + let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; + writer.insert(*key1, value)?; + } else { + writer.insert(*key1, value1)?; + } + } + + if !skip_next_value { + // the unwrap is safe here, we know there was at least one value in the document + let (key, value) = key_value.last().unwrap(); + writer.insert(*key, value)?; + } + + Ok(()) + } + + fn remove_deleted_documents_from_field_distribution( + &self, + rtxn: &RoTxn, + field_distribution: &mut FieldDistribution, + ) -> Result<()> { + for deleted_docid in self.replaced_documents_ids.iter() { + let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or( + InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, + )?; + + for (key, _) in obkv.iter() { + let name = + self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Computing field distribution in transform.", + })?; + // We checked that the document was in the 
db earlier. If we can't find it it means + // there is an inconsistency between the field distribution and the field id map. + let field = + field_distribution.get_mut(name).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Accessing field distribution in transform.", + })?; + *field -= 1; + if *field == 0 { + // since we were able to get the field right before it's safe to unwrap here + field_distribution.remove(name).unwrap(); + } + } + } + Ok(()) + } + + /// Generate the `TransformOutput` based on the given sorter that can be generated from any + /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document + /// id for the user side and the value must be an obkv where keys are valid fields ids. + pub(crate) fn output_from_sorter( + self, + wtxn: &mut heed::RwTxn, + progress_callback: F, + ) -> Result + where + F: Fn(UpdateIndexingStep) + Sync, + { + let primary_key = self + .index + .primary_key(wtxn)? + .ok_or(Error::InternalError(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::PRIMARY_KEY_KEY), + }))? + .to_string(); + + let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; + + // We create a final writer to write the new documents in order from the sorter. + let mut writer = create_writer( + self.indexer_settings.chunk_compression_type, + self.indexer_settings.chunk_compression_level, + tempfile::tempfile()?, + ); + + // To compute the field distribution we need to; + // 1. Remove all the deleted documents from the field distribution + // 2. 
Add all the new documents to the field distribution + let mut field_distribution = self.index.field_distribution(wtxn)?; + + self.remove_deleted_documents_from_field_distribution(wtxn, &mut field_distribution)?; + + // Here we are going to do the document count + field distribution + `write_into_stream_writer` + let mut iter = self.original_sorter.into_stream_merger_iter()?; + // used only for the callback + let mut documents_count = 0; + + while let Some((key, val)) = iter.next()? { + // send a callback to show at which step we are + documents_count += 1; + progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments { + documents_seen: documents_count, + total_documents: self.documents_count, + }); + + // We increment all the field of the current document in the field distribution. + let obkv = KvReader::new(val); + + for (key, _) in obkv.iter() { + let name = + self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: key, + process: "Computing field distribution in transform.", + })?; + *field_distribution.entry(name.to_string()).or_insert(0) += 1; + } + writer.insert(key, val)?; + } + + let mut original_documents = writer.into_inner()?; + // We then extract the file and reset the seek to be able to read it again. + original_documents.seek(SeekFrom::Start(0))?; + + // We create a final writer to write the new documents in order from the sorter. + let mut writer = create_writer( + self.indexer_settings.chunk_compression_type, + self.indexer_settings.chunk_compression_level, + tempfile::tempfile()?, + ); + // Once we have written all the documents into the final sorter, we write the documents + // into this writer, extract the file and reset the seek to be able to read it again. 
+ self.flattened_sorter.write_into_stream_writer(&mut writer)?; + let mut flattened_documents = writer.into_inner()?; + flattened_documents.seek(SeekFrom::Start(0))?; + + let mut new_external_documents_ids_builder: Vec<_> = + self.new_external_documents_ids_builder.into_iter().collect(); + + new_external_documents_ids_builder + .sort_unstable_by(|(left, _), (right, _)| left.cmp(right)); + let mut fst_new_external_documents_ids_builder = fst::MapBuilder::memory(); + new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| { + fst_new_external_documents_ids_builder.insert(key, value) + })?; + let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map(); + external_documents_ids.insert_ids(&new_external_documents_ids)?; + + Ok(TransformOutput { + primary_key, + fields_ids_map: self.fields_ids_map, + field_distribution, + external_documents_ids: external_documents_ids.into_static(), + new_documents_ids: self.new_documents_ids, + replaced_documents_ids: self.replaced_documents_ids, + documents_count: self.documents_count, + original_documents, + flattened_documents, + }) + } + + /// Clear all databases. Returns a `TransformOutput` with a file that contains the documents + /// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. + /// + // TODO this can be done in parallel by using the rayon `ThreadPool`. + pub fn prepare_for_documents_reindexing( + self, + wtxn: &mut heed::RwTxn<'i, '_>, + old_fields_ids_map: FieldsIdsMap, + mut new_fields_ids_map: FieldsIdsMap, + ) -> Result { + // There already has been a document addition, the primary key should be set by now. + let primary_key = self + .index + .primary_key(wtxn)? + .ok_or(InternalError::DatabaseMissingEntry { + db_name: db_name::MAIN, + key: Some(main_key::PRIMARY_KEY_KEY), + })? 
+ .to_string(); + let field_distribution = self.index.field_distribution(wtxn)?; + + // Delete the soft deleted document ids from the maps inside the external_document_ids structure + let new_external_documents_ids = { + let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; + external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; + external_documents_ids + }; + + let documents_ids = self.index.documents_ids(wtxn)?; + let documents_count = documents_ids.len() as usize; + + // We create a final writer to write the new documents in order from the sorter. + let mut original_writer = create_writer( + self.indexer_settings.chunk_compression_type, + self.indexer_settings.chunk_compression_level, + tempfile::tempfile()?, + ); + + // We create a final writer to write the new documents in order from the sorter. + let mut flattened_writer = create_writer( + self.indexer_settings.chunk_compression_type, + self.indexer_settings.chunk_compression_level, + tempfile::tempfile()?, + ); + + let mut obkv_buffer = Vec::new(); + for result in self.index.all_documents(wtxn)? { + let (docid, obkv) = result?; + + obkv_buffer.clear(); + let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); + + // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. + for (id, name) in new_fields_ids_map.iter() { + if let Some(val) = old_fields_ids_map.id(name).and_then(|id| obkv.get(id)) { + obkv_writer.insert(id, val)?; + } + } + + let buffer = obkv_writer.into_inner()?; + original_writer.insert(docid.to_be_bytes(), &buffer)?; + + // Once we have the document. We're going to flatten it + // and insert it in the flattened sorter. 
+ let mut doc = serde_json::Map::new(); + + let reader = obkv::KvReader::new(buffer); + for (k, v) in reader.iter() { + let key = new_fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId { + field_id: k, + process: "Accessing field distribution in transform.", + })?; + let value = serde_json::from_slice::(v) + .map_err(InternalError::SerdeJson)?; + doc.insert(key.to_string(), value); + } + + let flattened = flatten_serde_json::flatten(&doc); + + // Once we have the flattened version we can convert it back to obkv and + // insert all the new generated fields_ids (if any) in the fields ids map. + let mut buffer: Vec = Vec::new(); + let mut writer = KvWriter::new(&mut buffer); + let mut flattened: Vec<_> = flattened.into_iter().collect(); + // we reorder the field to get all the known field first + flattened.sort_unstable_by_key(|(key, _)| { + new_fields_ids_map.id(key).unwrap_or(FieldId::MAX) + }); + + for (key, value) in flattened { + let fid = + new_fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; + let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; + writer.insert(fid, &value)?; + } + flattened_writer.insert(docid.to_be_bytes(), &buffer)?; + } + + // Once we have written all the documents, we extract + // the file and reset the seek to be able to read it again. 
+ let mut original_documents = original_writer.into_inner()?; + original_documents.seek(SeekFrom::Start(0))?; + + let mut flattened_documents = flattened_writer.into_inner()?; + flattened_documents.seek(SeekFrom::Start(0))?; + + let output = TransformOutput { + primary_key, + fields_ids_map: new_fields_ids_map, + field_distribution, + external_documents_ids: new_external_documents_ids.into_static(), + new_documents_ids: documents_ids, + replaced_documents_ids: RoaringBitmap::default(), + documents_count, + original_documents, + flattened_documents, + }; + + let new_facets = output.compute_real_facets(wtxn, self.index)?; + self.index.put_faceted_fields(wtxn, &new_facets)?; + + // We clear the full database (words-fst, documents ids and documents content). + ClearDocuments::new(wtxn, self.index).execute()?; + + Ok(output) + } +} + +/// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec`. +/// +/// The size and alignment of T and U must match. +fn drop_and_reuse(mut vec: Vec) -> Vec { + debug_assert_eq!(std::mem::align_of::(), std::mem::align_of::()); + debug_assert_eq!(std::mem::size_of::(), std::mem::size_of::()); + vec.clear(); + debug_assert!(vec.is_empty()); + vec.into_iter().map(|_| unreachable!()).collect() +} + +impl TransformOutput { + // find and insert the new field ids + pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result> { + let user_defined_facets = index.user_defined_faceted_fields(rtxn)?; + + Ok(self + .fields_ids_map + .names() + .filter(|&field| crate::is_faceted(field, &user_defined_facets)) + .map(|field| field.to_string()) + .collect()) + } +} diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs new file mode 100644 index 000000000..16784bd92 --- /dev/null +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -0,0 +1,334 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::fs::File; +use std::io; + +use 
grenad::MergerBuilder; +use heed::types::ByteSlice; +use heed::{BytesDecode, RwTxn}; +use roaring::RoaringBitmap; + +use super::helpers::{ + self, merge_ignore_values, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, + valid_lmdb_key, CursorClonableMmap, +}; +use super::{ClonableMmap, MergeFn}; +use crate::facet::FacetType; +use crate::update::facet::FacetsUpdate; +use crate::update::index_documents::helpers::as_cloneable_grenad; +use crate::{ + lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, + Result, +}; + +pub(crate) enum TypedChunk { + DocidWordPositions(grenad::Reader), + FieldIdDocidFacetStrings(grenad::Reader), + FieldIdDocidFacetNumbers(grenad::Reader), + Documents(grenad::Reader), + FieldIdWordcountDocids(grenad::Reader), + NewDocumentsIds(RoaringBitmap), + WordDocids { + word_docids_reader: grenad::Reader, + exact_word_docids_reader: grenad::Reader, + }, + WordPositionDocids(grenad::Reader), + WordPairProximityDocids(grenad::Reader), + FieldIdFacetStringDocids(grenad::Reader), + FieldIdFacetNumberDocids(grenad::Reader), + FieldIdFacetExistsDocids(grenad::Reader), + GeoPoints(grenad::Reader), +} + +/// Write typed chunk in the corresponding LMDB database of the provided index. +/// Return new documents seen. 
+pub(crate) fn write_typed_chunk_into_index(
+    typed_chunk: TypedChunk,
+    index: &Index,
+    wtxn: &mut RwTxn,
+    index_is_empty: bool,
+) -> Result<(RoaringBitmap, bool)> {
+    // Returns `(documents ids, is_merged_database)`. The bitmap is the payload of a
+    // `TypedChunk::NewDocumentsIds` chunk and an empty bitmap for every other variant.
+    // The flag starts as `false` and is set by the arms that go through
+    // `append_entries_into_database`, `FacetsUpdate` or the words FST update.
+    //
+    // NOTE(review): several turbofish / generic argument lists in this file read as `::()`
+    // or are missing entirely; they look like they were lost when the patch was extracted.
+    // Confirm against the real source before relying on the exact types.
+    let mut is_merged_database = false;
+    match typed_chunk {
+        TypedChunk::DocidWordPositions(docid_word_positions_iter) => {
+            // First closure: serialize a value that has no counterpart in the database.
+            // Second closure: union the new positions with the value already stored.
+            write_entries_into_database(
+                docid_word_positions_iter,
+                &index.docid_word_positions,
+                wtxn,
+                index_is_empty,
+                |value, buffer| {
+                    // ensure that values are unique and ordered
+                    let positions = roaring_bitmap_from_u32s_array(value);
+                    BoRoaringBitmapCodec::serialize_into(&positions, buffer);
+                    Ok(buffer)
+                },
+                |new_values, db_values, buffer| {
+                    let new_values = roaring_bitmap_from_u32s_array(new_values);
+                    let positions = match BoRoaringBitmapCodec::bytes_decode(db_values) {
+                        Some(db_values) => new_values | db_values,
+                        None => new_values, // should not happen
+                    };
+                    BoRoaringBitmapCodec::serialize_into(&positions, buffer);
+                    Ok(())
+                },
+            )?;
+        }
+        TypedChunk::Documents(obkv_documents_iter) => {
+            // Documents are stored verbatim with a plain `put`; no merge function is involved.
+            let mut cursor = obkv_documents_iter.into_cursor()?;
+            while let Some((key, value)) = cursor.move_on_next()? {
+                index.documents.remap_types::().put(wtxn, key, value)?;
+            }
+        }
+        TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => {
+            append_entries_into_database(
+                fid_word_count_docids_iter,
+                &index.field_id_word_count_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_cbo_roaring_bitmaps,
+            )?;
+            is_merged_database = true;
+        }
+        TypedChunk::NewDocumentsIds(documents_ids) => {
+            // Early return: nothing is written, and `is_merged_database` is still `false` here.
+            return Ok((documents_ids, is_merged_database))
+        }
+        TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
+            // Cloneable readers are needed because each reader is consumed twice: once to
+            // fill its database and once to build the words FST below.
+            // NOTE(review): the safety requirements of `as_cloneable_grenad` are defined in
+            // `helpers` and are not visible from this function.
+            let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
+            append_entries_into_database(
+                word_docids_iter.clone(),
+                &index.word_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_roaring_bitmaps,
+            )?;
+
+            let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
+            append_entries_into_database(
+                exact_word_docids_iter.clone(),
+                &index.exact_word_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_roaring_bitmaps,
+            )?;
+
+            // create fst from word docids
+            let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?;
+            let db_fst = index.words_fst(wtxn)?;
+
+            // merge new fst with database fst
+            let union_stream = fst.op().add(db_fst.stream()).union();
+            let mut builder = fst::SetBuilder::memory();
+            builder.extend_stream(union_stream)?;
+            let fst = builder.into_set();
+            index.put_words_fst(wtxn, &fst)?;
+            is_merged_database = true;
+        }
+        TypedChunk::WordPositionDocids(word_position_docids_iter) => {
+            append_entries_into_database(
+                word_position_docids_iter,
+                &index.word_position_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_cbo_roaring_bitmaps,
+            )?;
+            is_merged_database = true;
+        }
+        // The two facet docids chunks are handed to `FacetsUpdate` instead of being written
+        // entry by entry from this function.
+        TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
+            let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
+            indexer.execute(wtxn)?;
+            is_merged_database = true;
+        }
+        TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => {
+            let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter);
+            indexer.execute(wtxn)?;
+            is_merged_database = true;
+        }
+        TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
+            append_entries_into_database(
+                facet_id_exists_docids,
+                &index.facet_id_exists_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_cbo_roaring_bitmaps,
+            )?;
+            is_merged_database = true;
+        }
+        TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
+            append_entries_into_database(
+                word_pair_proximity_docids_iter,
+                &index.word_pair_proximity_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_cbo_roaring_bitmaps,
+            )?;
+            is_merged_database = true;
+        }
+        TypedChunk::FieldIdDocidFacetNumbers(fid_docid_facet_number) => {
+            // Entries are written as-is; keys rejected by `valid_lmdb_key` are skipped.
+            let index_fid_docid_facet_numbers =
+                index.field_id_docid_facet_f64s.remap_types::();
+            let mut cursor = fid_docid_facet_number.into_cursor()?;
+            while let Some((key, value)) = cursor.move_on_next()? {
+                if valid_lmdb_key(key) {
+                    index_fid_docid_facet_numbers.put(wtxn, key, value)?;
+                }
+            }
+        }
+        TypedChunk::FieldIdDocidFacetStrings(fid_docid_facet_string) => {
+            // Same handling as the facet numbers arm above, on the strings database.
+            let index_fid_docid_facet_strings =
+                index.field_id_docid_facet_strings.remap_types::();
+            let mut cursor = fid_docid_facet_string.into_cursor()?;
+            while let Some((key, value)) = cursor.move_on_next()? {
+                if valid_lmdb_key(key) {
+                    index_fid_docid_facet_strings.put(wtxn, key, value)?;
+                }
+            }
+        }
+        TypedChunk::GeoPoints(geo_points) => {
+            // Start from the stored R-tree (or its `Default` value when none is stored) and
+            // the stored geo-faceted document ids, extend both with the new points, then
+            // write both back once the chunk has been fully read.
+            let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
+            let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
+
+            let mut cursor = geo_points.into_cursor()?;
+            while let Some((key, value)) = cursor.move_on_next()? {
+                // NOTE(review): the `unwrap`s below panic if the key or value does not have
+                // the expected length; presumably the extractor guarantees the layout, confirm.
+                // convert the key back to a u32 (4 bytes)
+                let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
+
+                // convert the latitude and longitude back to a f64 (8 bytes)
+                // (both are decoded with native endianness, see `from_ne_bytes`)
+                let (lat, tail) = helpers::try_split_array_at::(value).unwrap();
+                let (lng, _) = helpers::try_split_array_at::(tail).unwrap();
+                let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
+                let xyz_point = lat_lng_to_xyz(&point);
+
+                rtree.insert(GeoPoint::new(xyz_point, (docid, point)));
+                geo_faceted_docids.insert(docid);
+            }
+            index.put_geo_rtree(wtxn, &rtree)?;
+            index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
+        }
+    }
+
+    // Every arm except `NewDocumentsIds` ends up here and reports no new documents ids.
+    Ok((RoaringBitmap::new(), is_merged_database))
+}
+
+/// Builds an in-memory `fst` set containing every key yielded by the merged stream of both
+/// readers. Values are ignored: the merge function is `merge_ignore_values` and only the
+/// key of each merged entry is read.
+fn merge_word_docids_reader_into_fst(
+    word_docids_iter: grenad::Reader>,
+    exact_word_docids_iter: grenad::Reader>,
+) -> Result>> {
+    let mut merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn);
+    merger_builder.push(word_docids_iter.into_cursor()?);
+    merger_builder.push(exact_word_docids_iter.into_cursor()?);
+    let mut iter = merger_builder.build().into_stream_merger_iter()?;
+    let mut builder = fst::SetBuilder::memory();
+
+    while let Some((k, _)) = iter.next()? {
+        builder.insert(k)?;
+    }
+
+    Ok(builder.into_set())
+}
+
+/// Deserializes both values as `RoaringBitmap`s and writes their union into `buffer`
+/// through `serialize_roaring_bitmap`.
+fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec) -> Result<()> {
+    let new_value = RoaringBitmap::deserialize_from(new_value)?;
+    let db_value = RoaringBitmap::deserialize_from(db_value)?;
+    let value = new_value | db_value;
+    Ok(serialize_roaring_bitmap(&value, buffer)?)
+}
+
+/// Merges both values through `CboRoaringBitmapCodec::merge_into`, passing the database
+/// value first and the new value second, and writes the result into `buffer`.
+fn merge_cbo_roaring_bitmaps(
+    new_value: &[u8],
+    db_value: &[u8],
+    buffer: &mut Vec,
+) -> Result<()> {
+    Ok(CboRoaringBitmapCodec::merge_into(
+        &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)],
+        buffer,
+    )?)
+}
+
+/// Write the provided entries into the database using the `serialize_value` function.
+/// The `merge_values` function is used if an entry already exists in the database.
+fn write_entries_into_database(
+    data: grenad::Reader,
+    database: &heed::Database,
+    wtxn: &mut RwTxn,
+    index_is_empty: bool,
+    serialize_value: FS,
+    merge_values: FM,
+) -> Result<()>
+where
+    R: io::Read + io::Seek,
+    FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>,
+    FM: Fn(&[u8], &[u8], &mut Vec) -> Result<()>,
+{
+    // `buffer` is scratch space shared by both closures and cleared before every entry.
+    let mut buffer = Vec::new();
+    let database = database.remap_types::();
+
+    let mut cursor = data.into_cursor()?;
+    while let Some((key, value)) = cursor.move_on_next()? {
+        // Entries whose key is rejected by `valid_lmdb_key` are skipped without an error.
+        if valid_lmdb_key(key) {
+            buffer.clear();
+            let value = if index_is_empty {
+                // Empty index: no lookup is done, the value is only serialized.
+                serialize_value(value, &mut buffer)?
+            } else {
+                match database.get(wtxn, key)? {
+                    Some(prev_value) => {
+                        // `merge_values` receives the new value first and the stored value
+                        // second, and writes the merged result into `buffer`.
+                        merge_values(value, prev_value, &mut buffer)?;
+                        &buffer[..]
+                    }
+                    None => serialize_value(value, &mut buffer)?,
+                }
+            };
+            database.put(wtxn, key, value)?;
+        }
+    }
+
+    Ok(())
+}
+
+/// Write the provided entries into the database using the `serialize_value` function.
+/// The `merge_values` function is used if an entry already exists in the database.
+/// All provided entries must be ordered.
+/// If the index is not empty, `write_entries_into_database` is called instead.
+fn append_entries_into_database(
+    data: grenad::Reader,
+    database: &heed::Database,
+    wtxn: &mut RwTxn,
+    index_is_empty: bool,
+    serialize_value: FS,
+    merge_values: FM,
+) -> Result<()>
+where
+    R: io::Read + io::Seek,
+    FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>,
+    FM: Fn(&[u8], &[u8], &mut Vec) -> Result<()>,
+{
+    // Non-empty index: fall back to the get/merge/put path.
+    if !index_is_empty {
+        return write_entries_into_database(
+            data,
+            database,
+            wtxn,
+            false,
+            serialize_value,
+            merge_values,
+        );
+    }
+
+    // Empty index: `merge_values` is never called on this path, entries are only appended.
+    let mut buffer = Vec::new();
+    let mut database = database.iter_mut(wtxn)?.remap_types::();
+
+    let mut cursor = data.into_cursor()?;
+    while let Some((key, value)) = cursor.move_on_next()? {
+        if valid_lmdb_key(key) {
+            buffer.clear();
+            let value = serialize_value(value, &mut buffer)?;
+            // NOTE(review): presumably the "entries must be ordered" requirement documented
+            // above is what makes this `append` sound; confirm against heed's documentation.
+            unsafe { database.append(key, value)? };
+        }
+    }
+
+    Ok(())
+}
diff --git a/milli/src/update/indexer_config.rs b/milli/src/update/indexer_config.rs
new file mode 100644
index 000000000..af7211f90
--- /dev/null
+++ b/milli/src/update/indexer_config.rs
@@ -0,0 +1,29 @@
+use grenad::CompressionType;
+use rayon::ThreadPool;
+
+/// Indexer settings. Every `Option` field defaults to `None` and
+/// `chunk_compression_type` defaults to `CompressionType::None` (see the `Default` impl below).
+#[derive(Debug)]
+pub struct IndexerConfig {
+    pub log_every_n: Option,
+    pub max_nb_chunks: Option,
+    pub documents_chunk_size: Option,
+    pub max_memory: Option,
+    pub chunk_compression_type: CompressionType,
+    pub chunk_compression_level: Option,
+    pub thread_pool: Option,
+    pub max_positions_per_attributes: Option,
+}
+
+impl Default for IndexerConfig {
+    // Written by hand rather than derived; every optional setting is left unset.
+    fn default() -> Self {
+        Self {
+            log_every_n: None,
+            max_nb_chunks: None,
+            documents_chunk_size: None,
+            max_memory: None,
+            chunk_compression_type: CompressionType::None,
+            chunk_compression_level: None,
+            thread_pool: None,
+            max_positions_per_attributes: None,
+        }
+    }
+}
diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs
new file mode 100644
index 000000000..948811a6b
--- /dev/null
+++ b/milli/src/update/mod.rs
@@ -0,0 +1,31 @@
+// Public re-exports of items from the sub-modules declared at the bottom of this file.
+pub use self::available_documents_ids::AvailableDocumentsIds;
+pub use self::clear_documents::ClearDocuments;
+pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDeletionResult};
+pub use self::facet::bulk::FacetsUpdateBulk;
+pub use self::facet::incremental::FacetsUpdateIncrementalInner;
+pub use self::index_documents::{
+    DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
+};
+pub use self::indexer_config::IndexerConfig;
+pub use self::prefix_word_pairs::{
+    PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
+    MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
+};
+pub use self::settings::{Setting, Settings};
+pub use self::update_step::UpdateIndexingStep;
+pub use self::word_prefix_docids::WordPrefixDocids;
+pub use self::words_prefix_position_docids::WordPrefixPositionDocids;
+pub use self::words_prefixes_fst::WordsPrefixesFst;
+
+// All sub-modules are private except `facet`, which is visible to the whole crate.
+mod available_documents_ids;
+mod clear_documents;
+mod delete_documents;
+pub(crate) mod facet;
+mod index_documents;
+mod indexer_config;
+mod prefix_word_pairs;
+mod settings;
+mod update_step;
+mod word_prefix_docids;
+mod words_prefix_position_docids;
+mod words_prefixes_fst;
diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs
new file mode 100644
index 000000000..bed542bdb
--- /dev/null
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@@ -0,0 +1,577 @@
+use std::borrow::Cow;
+use std::collections::HashSet;
+use std::io::BufReader;
+
+use grenad::CompressionType;
+use heed::types::ByteSlice;
+
+use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap};
+use crate::{Index, Result};
+
+mod prefix_word;
+mod word_prefix;
+
+pub use prefix_word::index_prefix_word_database;
+pub use word_prefix::index_word_prefix_database;
+
+// Values given to `max_proximity` and `max_prefix_length` by
+// `PrefixWordPairsProximityDocids::new`.
+pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4;
+pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2;
+
+/// Runs `index_word_prefix_database` and then `index_prefix_word_database` over the same
+/// new word pair proximity entries (see `execute`).
+pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
+    wtxn: &'t mut heed::RwTxn<'i, 'u>,
+    index: &'i Index,
+    max_proximity: u8,
+    max_prefix_length: usize,
+    chunk_compression_type: CompressionType,
+    chunk_compression_level: Option,
+}
+impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
+    /// Creates the update with `max_proximity` and `max_prefix_length` set from the two
+    /// module-level constants.
+    pub fn new(
+        wtxn: &'t mut heed::RwTxn<'i, 'u>,
+        index: &'i Index,
+        chunk_compression_type: CompressionType,
+        chunk_compression_level: Option,
+    ) -> Self {
+        Self {
+            wtxn,
+            index,
+            max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
+            max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
+            chunk_compression_type,
+            chunk_compression_level,
+        }
+    }
+
+    // NOTE(review): the timer label below says `WordPrefixPairProximityDocids` while the
+    // struct is named `PrefixWordPairsProximityDocids`; possibly a stale name, confirm.
+    #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
+    pub fn execute<'a>(
+        self,
+        new_word_pair_proximity_docids: grenad::Reader,
+        new_prefix_fst_words: &'a [String],
+        common_prefix_fst_words: &[&'a [String]],
+        del_prefix_fst_words: &HashSet>,
+    ) -> Result<()> {
+        // Both calls read from `word_pair_proximity_docids` and receive the same arguments
+        // except for the destination database. The reader is cloned for the first call
+        // and moved into the second one.
+        index_word_prefix_database(
+            self.wtxn,
+            self.index.word_pair_proximity_docids,
+            self.index.word_prefix_pair_proximity_docids,
+            self.max_proximity,
+            self.max_prefix_length,
+            new_word_pair_proximity_docids.clone(),
+            new_prefix_fst_words,
+            common_prefix_fst_words,
+            del_prefix_fst_words,
+            self.chunk_compression_type,
+            self.chunk_compression_level,
+        )?;
+
+        index_prefix_word_database(
+            self.wtxn,
+            self.index.word_pair_proximity_docids,
+            self.index.prefix_word_pair_proximity_docids,
+            self.max_proximity,
+            self.max_prefix_length,
+            new_word_pair_proximity_docids,
+            new_prefix_fst_words,
+            common_prefix_fst_words,
+            del_prefix_fst_words,
+            self.chunk_compression_type,
+            self.chunk_compression_level,
+        )?;
+
+        Ok(())
+    }
+}
+
+// This is adapted from `sorter_into_lmdb_database`
+//
+// Looks at the first database entry whose key starts with `new_key`. If that key is exactly
+// `new_key`, the stored value and `new_value` are merged and written back in place;
+// otherwise `new_value` is written with a plain `put`.
+pub fn insert_into_database(
+    wtxn: &mut heed::RwTxn,
+    database: heed::PolyDatabase,
+    new_key: &[u8],
+    new_value: &[u8],
+) -> Result<()> {
+    let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
+    match iter.next().transpose()? {
+        Some((key, old_val)) if new_key == key => {
+            // The error returned by the merge function is discarded and replaced by
+            // `IndexingMergingKeys` (see the TODO below).
+            let val =
+                merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
+                    .map_err(|_| {
+                        // TODO just wrap this error?
+                        crate::error::InternalError::IndexingMergingKeys {
+                            process: "get-put-merge",
+                        }
+                    })?;
+            // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
+            unsafe { iter.put_current(new_key, &val)? };
+        }
+        _ => {
+            // The iterator was created from `wtxn`, so it is dropped before `wtxn` is used
+            // again for the `put`.
+            drop(iter);
+            database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
+        }
+    }
+    Ok(())
+}
+
+// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
+// but it uses `append` if the database is empty, and it assumes that the values in the
+// writer don't conflict with values in the database.
+pub fn write_into_lmdb_database_without_merging( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + writer: grenad::Writer, +) -> Result<()> { + let file = writer.into_inner()?; + let reader = grenad::Reader::new(BufReader::new(file))?; + if database.is_empty(wtxn)? { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + // safety: the key comes from the grenad reader, not the database + unsafe { out_iter.append(k, v)? }; + } + } else { + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::io::Cursor; + use std::iter::FromIterator; + + use roaring::RoaringBitmap; + + use crate::db_snap; + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; + use crate::index::tests::TempIndex; + use crate::update::{DeleteDocuments, DeletionStrategy, IndexDocumentsMethod}; + + fn documents_with_enough_different_words_for_prefixes( + prefixes: &[&str], + start_id: usize, + ) -> Vec { + let mut documents = Vec::new(); + let mut id = start_id; + for prefix in prefixes { + for i in 0..50 { + documents.push( + serde_json::json!({ + "id": id, + "text": format!("{prefix}{i:x}"), + }) + .as_object() + .unwrap() + .clone(), + ); + id += 1; + } + } + documents + } + + #[test] + fn add_new_documents() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + 
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "id": "9000", + "text": "At an amazing and beautiful house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "id": "9001", + "text": "The bell rings at 5 am" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); + + let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100); + documents.push( + serde_json::json!({ + "id": "9002", + "text": "At an extraordinary house" + }) + .as_object() + .unwrap() + .clone(), + ); + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, word_pair_proximity_docids, "update"); + db_snap!(index, word_prefix_pair_proximity_docids, "update"); + db_snap!(index, prefix_word_pair_proximity_docids, "update"); + } + #[test] + fn batch_bug_3043() { + // https://github.com/meilisearch/meilisearch/issues/3043 + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + 
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "text": "x y" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "text": "x a y" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, word_pair_proximity_docids); + db_snap!(index, word_prefix_pair_proximity_docids); + db_snap!(index, prefix_word_pair_proximity_docids); + } + + #[test] + fn hard_delete_and_reupdate() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "id": 9000, + "text": "At an amazing and beautiful house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "id": 9001, + "text": "The bell rings at 5 am" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, documents_ids, "initial"); + 
db_snap!(index, word_docids, "initial"); + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.strategy(DeletionStrategy::AlwaysHard); + delete.delete_documents(&RoaringBitmap::from_iter([50])); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, "first_delete"); + db_snap!(index, word_docids, "first_delete"); + db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); + db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.strategy(DeletionStrategy::AlwaysHard); + delete.delete_documents(&RoaringBitmap::from_iter(0..50)); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, "second_delete"); + db_snap!(index, word_docids, "second_delete"); + db_snap!(index, word_prefix_pair_proximity_docids, "second_delete"); + db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); + + let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + + index.add_documents(batch_reader_from_documents(documents)).unwrap(); + + db_snap!(index, documents_ids, "reupdate"); + db_snap!(index, word_docids, "reupdate"); + db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); + db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); + } + + #[test] + fn soft_delete_and_reupdate() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + 
settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "id": 9000, + "text": "At an amazing and beautiful house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "id": 9001, + "text": "The bell rings at 5 am" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, documents_ids, "initial"); + db_snap!(index, word_docids, "initial"); + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.strategy(DeletionStrategy::AlwaysSoft); + delete.delete_documents(&RoaringBitmap::from_iter([50])); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, "first_delete"); + db_snap!(index, word_docids, "first_delete"); + db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); + db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.strategy(DeletionStrategy::AlwaysSoft); + + delete.delete_documents(&RoaringBitmap::from_iter(0..50)); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, 
documents_ids, "second_delete"); + db_snap!(index, word_docids, "second_delete"); + db_snap!(index, word_prefix_pair_proximity_docids, "second_delete"); + db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); + + let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + + index.add_documents(batch_reader_from_documents(documents)).unwrap(); + + db_snap!(index, documents_ids, "reupdate"); + db_snap!(index, word_docids, "reupdate"); + db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); + db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); + } + + #[test] + fn replace_soft_deletion() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "id": 9000, + "text": "At an amazing house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "id": 9001, + "text": "The bell rings" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = 
batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, documents_ids, "initial"); + db_snap!(index, word_docids, "initial"); + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); + + let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0); + index.add_documents(batch_reader_from_documents(documents)).unwrap(); + + db_snap!(index, documents_ids, "replaced"); + db_snap!(index, word_docids, "replaced"); + db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); + db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); + db_snap!(index, soft_deleted_documents_ids, "replaced", @"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, ]"); + } + + #[test] + fn replace_hard_deletion() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("id".to_owned()); + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "id": 9000, + "text": "At an amazing house" + }) + 
.as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "id": 9001, + "text": "The bell rings" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, documents_ids, "initial"); + db_snap!(index, word_docids, "initial"); + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + db_snap!(index, prefix_word_pair_proximity_docids, "initial"); + + let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0); + index.add_documents(batch_reader_from_documents(documents)).unwrap(); + + db_snap!(index, documents_ids, "replaced"); + db_snap!(index, word_docids, "replaced"); + db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); + db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); + db_snap!(index, soft_deleted_documents_ids, "replaced", @"[]"); + } +} diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs new file mode 100644 index 000000000..60e2e554e --- /dev/null +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -0,0 +1,180 @@ +use std::borrow::Cow; +use std::collections::{BTreeMap, HashSet}; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::BytesDecode; +use log::debug; + +use crate::update::index_documents::{create_writer, CursorClonableMmap}; +use crate::update::prefix_word_pairs::{ + insert_into_database, write_into_lmdb_database_without_merging, +}; +use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; + +#[allow(clippy::too_many_arguments)] +#[logging_timer::time] +pub fn index_prefix_word_database( + wtxn: &mut heed::RwTxn, + word_pair_proximity_docids: heed::Database, + prefix_word_pair_proximity_docids: heed::Database, + max_proximity: u8, + max_prefix_length: usize, + new_word_pair_proximity_docids: grenad::Reader, + new_prefix_fst_words: &[String], + 
common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, +) -> Result<()> { + let max_proximity = max_proximity - 1; + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + + let common_prefixes: Vec<_> = common_prefix_fst_words + .iter() + .flat_map(|s| s.iter()) + .map(|s| s.as_str()) + .filter(|s| s.len() <= max_prefix_length) + .collect(); + + for proximity in 1..max_proximity { + for prefix in common_prefixes.iter() { + let mut prefix_key = vec![proximity]; + prefix_key.extend_from_slice(prefix.as_bytes()); + let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; + // This is the core of the algorithm + execute_on_word_pairs_and_prefixes( + proximity, + prefix.as_bytes(), + // the next two arguments tell how to iterate over the new word pairs + &mut cursor, + |cursor| { + if let Some((key, value)) = cursor.next()? { + let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key) + .ok_or(heed::Error::Decoding)?; + Ok(Some((word2, value))) + } else { + Ok(None) + } + }, + // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap) + |key, value| { + insert_into_database( + wtxn, + *prefix_word_pair_proximity_docids.as_polymorph(), + key, + value, + ) + }, + )?; + } + } + + // Now we do the same thing with the new prefixes and all word pairs in the DB + let new_prefixes: Vec<_> = new_prefix_fst_words + .iter() + .map(|s| s.as_str()) + .filter(|s| s.len() <= max_prefix_length) + .collect(); + + // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) + // element in an intermediary grenad + let mut writer = + create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); + + for proximity in 1..max_proximity { + for prefix in new_prefixes.iter() { + let mut prefix_key = 
vec![proximity]; + prefix_key.extend_from_slice(prefix.as_bytes()); + let mut db_iter = word_pair_proximity_docids + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())? + .remap_key_type::(); + execute_on_word_pairs_and_prefixes( + proximity, + prefix.as_bytes(), + &mut db_iter, + |db_iter| { + db_iter + .next() + .transpose() + .map(|x| x.map(|((_, _, word2), value)| (word2, value))) + .map_err(|e| e.into()) + }, + |key, value| writer.insert(key, value).map_err(|e| e.into()), + )?; + drop(db_iter); + } + } + + // and then we write the grenad into the DB + // Since the grenad contains only new prefixes, we know in advance that none + // of its elements already exist in the DB, thus there is no need to specify + // how to merge conflicting elements + write_into_lmdb_database_without_merging( + wtxn, + *prefix_word_pair_proximity_docids.as_polymorph(), + writer, + )?; + + // All of the word prefix pairs in the database that have a w2 + // that is contained in the `suppr_pw` set must be removed as well. + if !del_prefix_fst_words.is_empty() { + let mut iter = + prefix_word_pair_proximity_docids.remap_data_type::().iter_mut(wtxn)?; + while let Some(((_, prefix, _), _)) = iter.next().transpose()? { + if del_prefix_fst_words.contains(prefix.as_bytes()) { + // Delete this entry as the w2 prefix is no more in the words prefix fst. + unsafe { iter.del_current()? }; + } + } + } + + Ok(()) +} + +/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database. 
+/// +/// Its arguments are: +/// - an iterator over the words following the given `prefix` with the given `proximity` +/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements +fn execute_on_word_pairs_and_prefixes( + proximity: u8, + prefix: &[u8], + iter: &mut I, + mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result>, + mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, +) -> Result<()> { + let mut batch: BTreeMap, Vec>> = BTreeMap::default(); + + // Memory usage check: + // The content of the loop will be called for each `word2` that follows a word beginning + // with `prefix` with the given proximity. + // In practice, I don't think the batch can ever get too big. + while let Some((word2, docids)) = next_word2_and_docids(iter)? { + let entry = batch.entry(word2.to_owned()).or_default(); + entry.push(Cow::Owned(docids.to_owned())); + } + + let mut key_buffer = Vec::with_capacity(512); + key_buffer.push(proximity); + key_buffer.extend_from_slice(prefix); + key_buffer.push(0); + + let mut value_buffer = Vec::with_capacity(65_536); + + for (word2, docids) in batch { + key_buffer.truncate(prefix.len() + 2); + value_buffer.clear(); + + key_buffer.extend_from_slice(&word2); + let data = if docids.len() > 1 { + CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?; + value_buffer.as_slice() + } else { + &docids[0] + }; + insert(key_buffer.as_slice(), data)?; + } + Ok(()) +} diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..6609786a3 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,20 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [101, ] 
+1 a amazing [100, ] +1 a an [100, ] +1 a and [100, ] +1 a beautiful [100, ] +1 b house [100, ] +1 b rings [101, ] +1 be house [100, ] +1 be rings [101, ] +2 a am [101, ] +2 a amazing [100, ] +2 a and [100, ] +2 a beautiful [100, ] +2 a house [100, ] +2 b at [101, ] +2 be at [101, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..c760ae440 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,26 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [101, ] +1 amazing a [100, ] +1 an a [100, ] +1 and b [100, ] +1 and be [100, ] +1 at a [100, ] +1 rings a [101, ] +1 the b [101, ] +1 the be [101, ] +2 amazing b [100, ] +2 amazing be [100, ] +2 an a [100, ] +2 at a [100, 101, ] +2 bell a [101, ] +3 an b [100, ] +3 an be [100, ] +3 at a [100, ] +3 rings a [101, ] +3 the a [101, ] +4 at b [100, ] +4 at be [100, ] +4 bell a [101, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..7644c433d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,29 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [101, ] +1 a amazing [100, ] +1 a an [100, 202, ] +1 a and [100, ] +1 a beautiful [100, ] +1 a extraordinary [202, ] +1 am and [100, ] +1 an amazing [100, ] +1 an beautiful [100, ] +1 an extraordinary [202, ] +1 b house [100, ] +1 b rings [101, ] +1 be house [100, ] +1 be rings [101, ] +2 a 
am [101, ] +2 a amazing [100, ] +2 a and [100, ] +2 a beautiful [100, ] +2 a extraordinary [202, ] +2 a house [100, 202, ] +2 am beautiful [100, ] +2 an and [100, ] +2 an house [100, 202, ] +2 b at [101, ] +2 be at [101, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap new file mode 100644 index 000000000..4fcd0fbd2 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap @@ -0,0 +1,39 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 am [101, ] +1 amazing and [100, ] +1 an amazing [100, ] +1 an extraordinary [202, ] +1 and beautiful [100, ] +1 at 5 [101, ] +1 at an [100, 202, ] +1 beautiful house [100, ] +1 bell rings [101, ] +1 extraordinary house [202, ] +1 rings at [101, ] +1 the bell [101, ] +2 amazing beautiful [100, ] +2 an and [100, ] +2 an house [202, ] +2 and house [100, ] +2 at am [101, ] +2 at amazing [100, ] +2 at extraordinary [202, ] +2 bell at [101, ] +2 rings 5 [101, ] +2 the rings [101, ] +3 amazing house [100, ] +3 an beautiful [100, ] +3 at and [100, ] +3 at house [202, ] +3 bell 5 [101, ] +3 rings am [101, ] +3 the at [101, ] +4 an house [100, ] +4 at beautiful [100, ] +4 bell am [101, ] +4 the 5 [101, ] +5 at house [100, ] +5 the am [101, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..0f2e458a8 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,35 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [101, ] +1 5 am [101, 
] +1 amazing a [100, ] +1 amazing an [100, ] +1 an a [100, ] +1 an am [100, ] +1 and b [100, ] +1 and be [100, ] +1 at a [100, 202, ] +1 at an [100, 202, ] +1 rings a [101, ] +1 the b [101, ] +1 the be [101, ] +2 amazing b [100, ] +2 amazing be [100, ] +2 an a [100, ] +2 an an [100, ] +2 at a [100, 101, ] +2 at am [100, 101, ] +2 bell a [101, ] +3 an b [100, ] +3 an be [100, ] +3 at a [100, ] +3 at an [100, ] +3 rings a [101, ] +3 rings am [101, ] +3 the a [101, ] +4 at b [100, ] +4 at be [100, ] +4 bell a [101, ] +4 bell am [101, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..d212999bb --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap new file mode 100644 index 000000000..816895dcf --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap @@ -0,0 +1,8 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a y [51, ] +1 x a [51, ] +1 x y [50, ] +2 x y [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..03530a2f1 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,7 @@ +--- +source: 
milli/src/update/prefix_word_pairs/mod.rs +--- +1 a y [51, ] +1 x y [50, ] +2 x y [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap new file mode 100644 index 000000000..39e9fbe65 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..61987fd4a --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +2 a am [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap new file mode 100644 index 000000000..1caf1a9a3 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap @@ -0,0 +1,60 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] 
+a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +at [51, ] +bell [51, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..41c71ea59 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,11 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 rings a [51, ] +2 at a [51, ] +2 bell a [51, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap new file mode 100644 index 000000000..78008f83b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b380ba9b5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,14 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a beautiful [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap new file mode 100644 index 000000000..6b5658b74 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap @@ -0,0 +1,65 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +amazing [50, ] +an [50, ] +and [50, ] +at [50, 51, ] +beautiful [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..e55ebed9d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 at a [50, ] +1 rings a [51, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap new file mode 100644 index 000000000..39e9fbe65 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..267a1c01d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 b rings [51, ] +2 b at [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap new file mode 100644 index 000000000..e5336d58c --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap @@ -0,0 +1,60 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +am [51, ] +at [51, ] +b0 [0, ] +b1 [1, ] +b10 [16, ] +b11 [17, ] +b12 [18, ] +b13 [19, ] +b14 [20, ] +b15 [21, ] +b16 [22, ] +b17 [23, ] +b18 [24, ] +b19 [25, ] +b1a [26, ] +b1b [27, ] +b1c [28, ] +b1d [29, ] +b1e [30, ] +b1f [31, ] +b2 [2, ] +b20 [32, ] +b21 [33, ] +b22 [34, ] +b23 [35, ] +b24 [36, ] +b25 [37, ] +b26 [38, ] +b27 [39, ] +b28 [40, ] +b29 [41, ] +b2a [42, ] +b2b [43, ] +b2c [44, ] +b2d [45, ] +b2e [46, ] +b2f [47, ] +b3 [3, ] +b30 [48, ] +b31 [49, ] +b4 [4, ] +b5 [5, ] +b6 [6, ] +b7 [7, ] +b8 [8, ] +b9 [9, ] +ba [10, ] +bb [11, ] +bc [12, ] +bd [13, ] +be [14, ] +bell [51, ] +bf [15, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..4cdf756ac --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 the b [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap new file mode 100644 index 000000000..4dca775e6 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: 
milli/src/update/prefix_word_pairs/mod.rs +--- +[51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..61987fd4a --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +2 a am [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap new file mode 100644 index 000000000..7949d464e --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap @@ -0,0 +1,10 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +am [51, ] +at [51, ] +bell [51, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..41c71ea59 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,11 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 rings a [51, ] +2 at a [51, ] +2 bell a [51, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap new file mode 100644 index 000000000..78008f83b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..78b6a3885 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,9 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a amazing [50, ] +1 a an [50, ] +1 a house [50, ] +2 a amazing [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap new file mode 100644 index 000000000..8c7809973 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap @@ -0,0 +1,61 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c 
[44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +amazing [50, ] +an [50, ] +at [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..65d8b806b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 an a [50, ] +1 at a [50, ] +2 at a [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap new file mode 100644 index 000000000..775d41a3d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..54c9e4b9b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap @@ 
-0,0 +1,5 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 b rings [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap new file mode 100644 index 000000000..f86fdcb8b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap @@ -0,0 +1,61 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +amazing [50, ] +an [50, ] +at [50, ] +b0 [52, ] +b1 [53, ] +b10 [68, ] +b11 [69, ] +b12 [70, ] +b13 [71, ] +b14 [72, ] +b15 [73, ] +b16 [74, ] +b17 [75, ] +b18 [76, ] +b19 [77, ] +b1a [78, ] +b1b [79, ] +b1c [80, ] +b1d [81, ] +b1e [82, ] +b1f [83, ] +b2 [54, ] +b20 [84, ] +b21 [85, ] +b22 [86, ] +b23 [87, ] +b24 [88, ] +b25 [89, ] +b26 [90, ] +b27 [91, ] +b28 [92, ] +b29 [93, ] +b2a [94, ] +b2b [95, ] +b2c [96, ] +b2d [97, ] +b2e [98, ] +b2f [99, ] +b3 [55, ] +b30 [100, ] +b31 [101, ] +b4 [56, ] +b5 [57, ] +b6 [58, ] +b7 [59, ] +b8 [60, ] +b9 [61, ] +ba [62, ] +bb [63, ] +bc [64, ] +bd [65, ] +be [66, ] +bell [51, ] +bf [67, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..4cdf756ac --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 the b [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap new 
file mode 100644 index 000000000..78008f83b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..78b6a3885 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,9 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a amazing [50, ] +1 a an [50, ] +1 a house [50, ] +2 a amazing [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap new file mode 100644 index 000000000..8c7809973 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap @@ -0,0 +1,61 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 
[7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +amazing [50, ] +an [50, ] +at [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..65d8b806b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 an a [50, ] +1 at a [50, ] +2 at a [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap new file mode 100644 index 000000000..775d41a3d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..0241f26a5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,10 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a amazing [50, ] +1 a an [50, ] +1 
a house [50, ] +1 b rings [51, ] +2 a amazing [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap new file mode 100644 index 000000000..6a481eeee --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5f6443e54fae188aa96d4f27fce28939 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..d20582970 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,8 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 an a [50, ] +1 at a [50, ] +1 the b [51, ] +2 at a [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap new file mode 100644 index 000000000..39e9fbe65 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b380ba9b5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,14 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a beautiful [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap new file mode 100644 index 000000000..6b5658b74 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap @@ -0,0 +1,65 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +amazing [50, ] +an [50, ] +and [50, ] +at [50, 51, ] +beautiful [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap 
b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..e55ebed9d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 at a [50, ] +1 rings a [51, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap new file mode 100644 index 000000000..78008f83b --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b380ba9b5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,14 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a beautiful [50, ] +2 a house [50, ] + diff 
--git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap new file mode 100644 index 000000000..6b5658b74 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap @@ -0,0 +1,65 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +amazing [50, ] +an [50, ] +and [50, ] +at [50, 51, ] +beautiful [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..e55ebed9d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 at a [50, ] +1 rings a [51, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap new file mode 100644 index 000000000..c8a1e54b4 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..db62b6566 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,17 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +1 b house [50, ] +1 b rings [51, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a beautiful [50, ] +2 a house [50, ] +2 b at [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap new file mode 100644 index 000000000..7fd726325 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +9f4866b80177e321a33ce434992022b5 diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..8a684b16d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,21 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 and b [50, ] +1 at a [50, ] +1 rings a [51, ] +1 the b [51, ] +2 amazing b [50, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 an b [50, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 at b [50, ] +4 bell a [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap new file mode 100644 index 000000000..4dca775e6 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +[51, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b380ba9b5 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,14 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a 5 [51, ] +1 a amazing [50, ] +1 a an [50, ] +1 a and [50, ] +1 a beautiful [50, ] +2 a am [51, ] +2 a amazing [50, ] +2 a and [50, ] +2 a 
beautiful [50, ] +2 a house [50, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap new file mode 100644 index 000000000..6b5658b74 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap @@ -0,0 +1,65 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +5 [51, ] +a0 [0, ] +a1 [1, ] +a10 [16, ] +a11 [17, ] +a12 [18, ] +a13 [19, ] +a14 [20, ] +a15 [21, ] +a16 [22, ] +a17 [23, ] +a18 [24, ] +a19 [25, ] +a1a [26, ] +a1b [27, ] +a1c [28, ] +a1d [29, ] +a1e [30, ] +a1f [31, ] +a2 [2, ] +a20 [32, ] +a21 [33, ] +a22 [34, ] +a23 [35, ] +a24 [36, ] +a25 [37, ] +a26 [38, ] +a27 [39, ] +a28 [40, ] +a29 [41, ] +a2a [42, ] +a2b [43, ] +a2c [44, ] +a2d [45, ] +a2e [46, ] +a2f [47, ] +a3 [3, ] +a30 [48, ] +a31 [49, ] +a4 [4, ] +a5 [5, ] +a6 [6, ] +a7 [7, ] +a8 [8, ] +a9 [9, ] +aa [10, ] +ab [11, ] +ac [12, ] +ad [13, ] +ae [14, ] +af [15, ] +am [51, ] +amazing [50, ] +an [50, ] +and [50, ] +at [50, 51, ] +beautiful [50, ] +bell [51, ] +house [50, ] +rings [51, ] +the [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..e55ebed9d --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,16 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [51, ] +1 amazing a [50, ] +1 an a [50, ] +1 at a [50, ] +1 rings a [51, ] +2 an a [50, ] +2 at a [50, 51, ] +2 bell a [51, ] +3 at a [50, ] +3 rings a [51, ] +3 the a [51, ] +4 bell a [51, ] + 
diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs new file mode 100644 index 000000000..db607e56c --- /dev/null +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -0,0 +1,727 @@ +/*! +The word-prefix-pair-proximity-docids database is a database whose keys are of +the form `(proximity, word, prefix)` and the values are roaring bitmaps of +the documents which contain `word` followed by another word starting with +`prefix` at a distance of `proximity`. + +The prefixes present in this database are only those that correspond to many +different words in the documents. + +## How is it created/updated? (simplified version) +To compute it, we have access to (mainly) two inputs: + +* a list of sorted prefixes, such as: +```text +c +ca +cat +d +do +dog +``` +Note that only prefixes which correspond to more than a certain number of +different words from the database are included in this list. + +* a sorted list of proximities and word pairs (the proximity is the distance between the two words), +associated with a roaring bitmap, such as: +```text +1 good doggo -> docids1: [8] +1 good door -> docids2: [7, 19, 20] +1 good ghost -> docids3: [1] +2 good dog -> docids4: [2, 5, 6] +2 horror cathedral -> docids5: [1, 2] +``` + +I illustrate a simplified version of the algorithm to create the word-prefix +pair-proximity database below: + +1. **Outer loop:** First, we iterate over each proximity and word pair: +```text +proximity: 1 +word1 : good +word2 : doggo +``` +2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are +in the list of sorted prefixes. And we insert the key `prefix` +and the value (`docids`) to a sorted map which we call the “batch”. 
For example, +at the end of the first outer loop, we may have: +```text +Outer loop 1: +------------------------------ +proximity: 1 +word1 : good +word2 : doggo +docids : docids1 + +prefixes: [d, do, dog] + +batch: [ + d -> [docids1] + do -> [docids1] + dog -> [docids1] +] +``` +3. For illustration purposes, let's run through a second iteration of the outer loop: +```text +Outer loop 2: +------------------------------ +proximity: 1 +word1 : good +word2 : door +docids : docids2 + +prefixes: [d, do, doo] + +batch: [ + d -> [docids1, docids2] + do -> [docids1, docids2] + dog -> [docids1] + doo -> [docids2] +] +``` +Notice that there were some conflicts which were resolved by merging the +conflicting values together. Also, an additional prefix was added at the +end of the batch. + +4. On the third iteration of the outer loop, we have: +```text +Outer loop 3: +------------------------------ +proximity: 1 +word1 : good +word2 : ghost +``` +Because `word2` begins with a different letter than the previous `word2`, +we know that all the prefixes of `word2` are greater than the prefixes of the previous `word2`. + +Therefore, we know that we can insert every element from the batch into the +database before proceeding any further. This operation is called +“flushing the batch”. Flushing the batch should also be done whenever: +* `proximity` is different than the previous `proximity`. +* `word1` is different than the previous `word1`. +* `word2` starts with a different letter than the previous `word2`. + +5. **Flushing the batch:** to flush the batch, we iterate over its elements: +```text +Flushing Batch loop 1: +------------------------------ +proximity : 1 +word1 : good +prefix : d + +docids : [docids1, docids2] +``` +We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using +`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a +roaring bitmap of all the document ids where `word1` is followed by `prefix` +at a distance of `proximity`. 
+Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids` +into the database. + +6. That's it! ... except... + +## How is it created/updated (continued) + +I lied a little bit about the input data. In reality, we get two sets of the +inputs described above, which come from different places: + +* For the list of sorted prefixes, we have: + 1. `new_prefixes`, which are all the prefixes that were not present in the + database before the insertion of the new documents. + + 2. `common_prefixes`, which are the prefixes that are present both in the + database and in the newly added documents. + +* For the list of word pairs and proximities, we have: + 1. `new_word_pairs`, which is the list of word pairs and their proximities + present in the newly added documents. + + 2. `word_pairs_db`, which is the list of word pairs from the database. + This list includes all elements in `new_word_pairs` since `new_word_pairs` + was added to the database prior to calling the `WordPrefix::execute` + function. + +To update the prefix database correctly, we call the algorithm described earlier first +on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). +Thus: + +1. For all the word pairs that were already present in the DB, we insert them +again with the `new_prefixes`. Calling the algorithm on them with the +`common_prefixes` would not result in any new data. + +2. For all the new word pairs, we insert them twice: first with the `common_prefixes`, +and then, because they are part of `word_pairs_db`, with the `new_prefixes`. + +Note, also, that since we read data from the database when iterating over +`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity- +docids from the batch directly into the database (we would have a concurrent +reader and writer). 
Therefore, when calling the algorithm on +`(new_prefixes, word_pairs_db)`, we insert the computed +`((proximity, word, prefix), docids)` elements in an intermediary grenad +Writer instead of the DB. At the end of the outer loop, we finally read from +the grenad and insert its elements in the database. +*/ + +use std::borrow::Cow; +use std::collections::HashSet; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::BytesDecode; +use log::debug; + +use crate::update::index_documents::{create_writer, CursorClonableMmap}; +use crate::update::prefix_word_pairs::{ + insert_into_database, write_into_lmdb_database_without_merging, +}; +use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; + +#[allow(clippy::too_many_arguments)] +#[logging_timer::time] +pub fn index_word_prefix_database( + wtxn: &mut heed::RwTxn, + word_pair_proximity_docids: heed::Database, + word_prefix_pair_proximity_docids: heed::Database, + max_proximity: u8, + max_prefix_length: usize, + new_word_pair_proximity_docids: grenad::Reader, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, +) -> Result<()> { + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + + // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length + let prefixes = PrefixTrieNode::from_sorted_prefixes( + common_prefix_fst_words + .iter() + .flat_map(|s| s.iter()) + .map(|s| s.as_str()) + .filter(|s| s.len() <= max_prefix_length), + ); + + // If the prefix trie is not empty, then we can iterate over all new + // word pairs to look for new (proximity, word1, common_prefix) elements + // to insert in the DB + if !prefixes.is_empty() { + let mut cursor = new_word_pair_proximity_docids.into_cursor()?; + // This is the core of the algorithm + execute_on_word_pairs_and_prefixes( + // 
the first two arguments tell how to iterate over the new word pairs + &mut cursor, + |cursor| { + if let Some((key, value)) = cursor.move_on_next()? { + let (proximity, word1, word2) = + UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + Ok(Some(((proximity, word1, word2), value))) + } else { + Ok(None) + } + }, + &prefixes, + max_proximity, + // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap) + |key, value| { + insert_into_database( + wtxn, + *word_prefix_pair_proximity_docids.as_polymorph(), + key, + value, + ) + }, + )?; + } + + // Now we do the same thing with the new prefixes and all word pairs in the DB + + let prefixes = PrefixTrieNode::from_sorted_prefixes( + new_prefix_fst_words.iter().map(|s| s.as_str()).filter(|s| s.len() <= max_prefix_length), + ); + + if !prefixes.is_empty() { + let mut db_iter = word_pair_proximity_docids + .remap_key_type::() + .remap_data_type::() + .iter(wtxn)?; + + // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix) + // element in an intermediary grenad + let mut writer = + create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); + + execute_on_word_pairs_and_prefixes( + &mut db_iter, + |db_iter| db_iter.next().transpose().map_err(|e| e.into()), + &prefixes, + max_proximity, + |key, value| writer.insert(key, value).map_err(|e| e.into()), + )?; + drop(db_iter); + + // and then we write the grenad into the DB + // Since the grenad contains only new prefixes, we know in advance that none + // of its elements already exist in the DB, thus there is no need to specify + // how to merge conflicting elements + write_into_lmdb_database_without_merging( + wtxn, + *word_prefix_pair_proximity_docids.as_polymorph(), + writer, + )?; + } + + // All of the word prefix pairs in the database that have a w2 + // that is contained in the `suppr_pw` set must be removed as well. 
+ if !del_prefix_fst_words.is_empty() { + let mut iter = + word_prefix_pair_proximity_docids.remap_data_type::().iter_mut(wtxn)?; + while let Some(((_, _, prefix), _)) = iter.next().transpose()? { + if del_prefix_fst_words.contains(prefix.as_bytes()) { + // Delete this entry as the w2 prefix is no more in the words prefix fst. + unsafe { iter.del_current()? }; + } + } + } + + Ok(()) +} + +/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. +/// +/// Its main arguments are: +/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements +/// 2. a prefix trie +/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements +/// +/// For more information about what this function does, read the module documentation. +fn execute_on_word_pairs_and_prefixes( + iter: &mut I, + mut next_word_pair_proximity: impl for<'a> FnMut( + &'a mut I, + ) -> Result< + Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>, + >, + prefixes: &PrefixTrieNode, + max_proximity: u8, + mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, +) -> Result<()> { + let mut batch = PrefixAndProximityBatch::default(); + let mut prev_word2_start = 0; + + // Optimisation: the index at the root of the prefix trie where to search for + let mut prefix_search_start = PrefixTrieNodeSearchStart(0); + + // Optimisation: true if there are no potential prefixes for the current word2 based on its first letter + let mut empty_prefixes = false; + + let mut prefix_buffer = Vec::with_capacity(8); + let mut merge_buffer = Vec::with_capacity(65_536); + + while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? 
{ + // stop indexing if the proximity is over the threshold + if proximity > max_proximity { + break; + }; + let word2_start_different_than_prev = word2[0] != prev_word2_start; + // if there were no potential prefixes for the previous word2 based on its first letter, + // and if the current word2 starts with the same letter, then there is also no potential + // prefixes for the current word2, and we can skip to the next iteration + if empty_prefixes && !word2_start_different_than_prev { + continue; + } + + // if the proximity is different to the previous one, OR + // if word1 is different than the previous word1, OR + // if the start of word2 is different than the previous start of word2, + // THEN we'll need to flush the batch + let prox_different_than_prev = proximity != batch.proximity; + let word1_different_than_prev = word1 != batch.word1; + if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev + { + batch.flush(&mut merge_buffer, &mut insert)?; + batch.proximity = proximity; + // don't forget to reset the value of batch.word1 and prev_word2_start + if word1_different_than_prev { + batch.word1.clear(); + batch.word1.extend_from_slice(word1); + } + if word2_start_different_than_prev { + prev_word2_start = word2[0]; + } + prefix_search_start.0 = 0; + // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2 + empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); + } + + if !empty_prefixes { + // All conditions are satisfied, we can now insert each new prefix of word2 into the batch + prefix_buffer.clear(); + prefixes.for_each_prefix_of( + word2, + &mut prefix_buffer, + &prefix_search_start, + |prefix_buffer| { + batch.insert(prefix_buffer, data.to_vec()); + }, + ); + } + } + batch.flush(&mut merge_buffer, &mut insert)?; + Ok(()) +} +/** +A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps). 
+The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. + +It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently. + +The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content +can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: +- key : (proximity, word1, prefix) as bytes +- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes +*/ +#[derive(Default)] +struct PrefixAndProximityBatch { + proximity: u8, + word1: Vec, + #[allow(clippy::type_complexity)] + batch: Vec<(Vec, Vec>)>, +} + +impl PrefixAndProximityBatch { + /// Insert the new key and value into the batch + /// + /// The key must either exist in the batch or be greater than all existing keys + fn insert(&mut self, new_key: &[u8], new_value: Vec) { + match self.batch.iter_mut().find(|el| el.0 == new_key) { + Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)), + None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])), + } + } + + /// Empties the batch, calling `insert` on each element. + /// + /// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap. + fn flush( + &mut self, + merge_buffer: &mut Vec, + insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, + ) -> Result<()> { + let PrefixAndProximityBatch { proximity, word1, batch } = self; + if batch.is_empty() { + return Ok(()); + } + merge_buffer.clear(); + + let mut buffer = Vec::with_capacity(word1.len() + 1 + 6); + buffer.push(*proximity); + buffer.extend_from_slice(word1); + buffer.push(0); + + for (key, mergeable_data) in batch.drain(..) 
/** A prefix trie. Used to iterate quickly over the prefixes of a word that are
within a set.

## Structure
The trie is made of nodes composed of:
1. a byte character (e.g. 'a')
2. whether the node is an end node or not
3. a list of children nodes, sorted by their byte character

For example, the trie that stores the strings `[ac, ae, ar, cei, cel, ch, r, rel, ri]`
is drawn below. Nodes marked with `*` are "end nodes".

```text
root
├── a
│   ├── c *
│   ├── e *
│   └── r *
├── c
│   ├── e
│   │   ├── i *
│   │   └── l *
│   └── h *
└── r *
    ├── e
    │   └── l *
    └── i *
```
*/
#[derive(Default, Debug)]
struct PrefixTrieNode {
    // child nodes paired with the byte leading to them, in insertion (i.e. sorted) order
    children: Vec<(PrefixTrieNode, u8)>,
    // true when the path from the root to this node spells a stored prefix
    is_end_node: bool,
}

/// Index into the children of the trie root from which a search begins.
#[derive(Debug)]
struct PrefixTrieNodeSearchStart(usize);

impl PrefixTrieNode {
    /// Returns true if the node has no children, i.e. if nothing was inserted below it.
    fn is_empty(&self) -> bool {
        self.children.is_empty()
    }

    /// Returns false if the trie does not contain a prefix of the given word.
    /// Returns true if the trie *may* contain a prefix of the given word.
    ///
    /// Moves the search start to the first node equal to the first letter of the word,
    /// or to 0 otherwise.
    ///
    /// Only the children located at or after the current search start are inspected.
    /// An empty word, an empty trie or a search start past the last child all yield
    /// `false` (and reset the search start to 0) instead of panicking.
    fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool {
        let byte = match word.first() {
            Some(&byte) => byte,
            None => {
                search_start.0 = 0;
                return false;
            }
        };
        let candidates = self.children.get(search_start.0..).unwrap_or(&[]);
        // fast path: the search start already points at the right child
        if candidates.first().map_or(false, |(_, c)| *c == byte) {
            return true;
        }
        match candidates.binary_search_by_key(&byte, |x| x.1) {
            Ok(position) => {
                search_start.0 += position;
                true
            }
            Err(_) => {
                search_start.0 = 0;
                false
            }
        }
    }

    /// Builds a trie from the given prefixes, which must be given in sorted order.
    fn from_sorted_prefixes<'a>(prefixes: impl Iterator<Item = &'a str>) -> Self {
        let mut node = PrefixTrieNode::default();
        for prefix in prefixes {
            node.insert_sorted_prefix(prefix.as_bytes().iter());
        }
        node
    }

    /// Inserts the remaining bytes of a prefix below this node.
    ///
    /// Only the most recently added child is considered for sharing a path, which is
    /// why the prefixes have to be inserted in sorted order.
    fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter<u8>) {
        if let Some(&c) = prefix.next() {
            if let Some((node, byte)) = self.children.last_mut() {
                if *byte == c {
                    node.insert_sorted_prefix(prefix);
                    return;
                }
            }
            let mut new_node = PrefixTrieNode::default();
            new_node.insert_sorted_prefix(prefix);
            self.children.push((new_node, c));
        } else {
            // no byte left: this node terminates the inserted prefix
            self.is_end_node = true;
        }
    }

    /// Call the given closure on each prefix of the word contained in the prefix trie.
    ///
    /// The search starts from the given `search_start`, which only restricts the
    /// children of the root that are inspected for the first byte of the word.
    ///
    /// The bytes of the word are appended to `buffer` as the trie is walked; when the
    /// closure is called, `buffer` holds whatever it contained on entry followed by the
    /// prefix. The buffer is not cleared, neither before nor after the walk.
    /// An empty word has no prefix: the closure is never called.
    fn for_each_prefix_of(
        &self,
        word: &[u8],
        buffer: &mut Vec<u8>,
        search_start: &PrefixTrieNodeSearchStart,
        mut do_fn: impl FnMut(&mut Vec<u8>),
    ) {
        let mut cur_node = self;
        for (depth, &byte) in word.iter().enumerate() {
            buffer.push(byte);
            let candidates = if depth == 0 {
                cur_node.children.get(search_start.0..).unwrap_or(&[])
            } else {
                cur_node.children.as_slice()
            };
            // children are sorted: the first one that is >= byte is the only possible match
            match candidates.iter().find(|(_, c)| *c >= byte) {
                Some((child_node, c)) if *c == byte => {
                    cur_node = child_node;
                    if cur_node.is_end_node {
                        do_fn(buffer);
                    }
                }
                _ => break,
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use roaring::RoaringBitmap;

    use super::*;
    use crate::{CboRoaringBitmapCodec, U8StrStrCodec};

    /// Asserts that the prefixes of `word` found in `trie`, starting the search at
    /// `search_start`, are exactly `expected_prefixes`, in that order.
    fn check_prefixes(
        trie: &PrefixTrieNode,
        search_start: &PrefixTrieNodeSearchStart,
        word: &str,
        expected_prefixes: &[&str],
    ) {
        let mut actual_prefixes = vec![];
        trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), search_start, |x| {
            let s = String::from_utf8(x.to_owned()).unwrap();
            actual_prefixes.push(s);
        });
        assert_eq!(actual_prefixes, expected_prefixes);
    }

    #[test]
    fn test_trie() {
        let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
            "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au",
            "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c",
            "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com",
            "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des",
            "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f",
            "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi",
            "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i",
            "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka",
            "ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar",
            "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni",
            "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi",
            "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res",
            "ri", "ro", "ru", "s", "sa", "san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si",
            "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t",
            "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve",
            "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z",
        ]));

        let mut search_start = PrefixTrieNodeSearchStart(0);

        let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start);
        assert!(!is_empty);
        assert_eq!(search_start.0, 2);

        check_prefixes(&trie, &search_start, "affair", &["a"]);
        check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]);

        let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start);
        assert!(!is_empty);
        assert_eq!(trie.children[search_start.0].1, b'u');

        check_prefixes(&trie, &search_start, "unique", &["u", "un"]);

        // NOTE: this lookup fails, because the search start is already beyond 'a'
        let is_empty = !trie.set_search_start("abba".as_bytes(), &mut search_start);
        assert!(is_empty);
        // search start is reset
        assert_eq!(search_start.0, 0);

        let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
            "arb", "arbre", "cat", "catto",
        ]));
        check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]);
        check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]);
    }

    #[test]
    fn test_execute_on_word_pairs_and_prefixes() {
        let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
            "arb", "arbre", "cat", "catto",
        ]));

        let mut serialised_bitmap123 = vec![];
        let mut bitmap123 = RoaringBitmap::new();
        bitmap123.insert(1);
        bitmap123.insert(2);
        bitmap123.insert(3);
        CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123);

        let mut serialised_bitmap456 = vec![];
        let mut bitmap456 = RoaringBitmap::new();
        bitmap456.insert(4);
        bitmap456.insert(5);
        bitmap456.insert(6);
        CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456);

        let mut serialised_bitmap789 = vec![];
        let mut bitmap789 = RoaringBitmap::new();
        bitmap789.insert(7);
        bitmap789.insert(8);
        bitmap789.insert(9);
        CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789);

        let mut serialised_bitmap_ranges = vec![];
        let mut bitmap_ranges = RoaringBitmap::new();
        bitmap_ranges.insert_range(63_000..65_000);
        bitmap_ranges.insert_range(123_000..128_000);
        CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);

        let word_pairs = [
            ((1, "healthy", "arbres"), &serialised_bitmap123),
            ((1, "healthy", "boat"), &serialised_bitmap123),
            ((1, "healthy", "ca"), &serialised_bitmap123),
            ((1, "healthy", "cats"), &serialised_bitmap456),
            ((1, "healthy", "cattos"), &serialised_bitmap123),
            ((1, "jittery", "cat"), &serialised_bitmap123),
            ((1, "jittery", "cata"), &serialised_bitmap456),
            ((1, "jittery", "catb"), &serialised_bitmap789),
            ((1, "jittery", "catc"), &serialised_bitmap_ranges),
            ((2, "healthy", "arbre"), &serialised_bitmap123),
            ((2, "healthy", "arbres"), &serialised_bitmap456),
            ((2, "healthy", "cats"), &serialised_bitmap789),
            ((2, "healthy", "cattos"), &serialised_bitmap_ranges),
            ((3, "healthy", "arbre"), &serialised_bitmap456),
            ((3, "healthy", "arbres"), &serialised_bitmap789),
        ];

        // the pairs with a proximity of 3 are over the threshold (2) given below
        // and must not produce anything
        let expected_result = [
            ((1, "healthy", "arb"), bitmap123.clone()),
            ((1, "healthy", "arbre"), bitmap123.clone()),
            ((1, "healthy", "cat"), &bitmap456 | &bitmap123),
            ((1, "healthy", "catto"), bitmap123.clone()),
            ((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
            ((2, "healthy", "arb"), &bitmap123 | &bitmap456),
            ((2, "healthy", "arbre"), &bitmap123 | &bitmap456),
            ((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges),
            ((2, "healthy", "catto"), bitmap_ranges.clone()),
        ];

        let mut result = vec![];

        let mut iter =
            IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| {
                ((proximity, word1.as_bytes(), word2.as_bytes()), data.as_slice())
            });
        execute_on_word_pairs_and_prefixes(
            &mut iter,
            |iter| Ok(iter.next()),
            &prefixes,
            2,
            |k, v| {
                let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap();
                let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
                result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap));
                Ok(())
            },
        )
        .unwrap();

        // `zip` stops at the shorter side: without this check, missing or extra
        // elements would go unnoticed
        assert_eq!(result.len(), expected_result.len());

        for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) {
            let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x;
            let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y;

            assert_eq!(actual_word1, expected_word1);
            assert_eq!(actual_prefix, expected_prefix);
            assert_eq!(actual_proximity, expected_proximity);
            assert_eq!(actual_bitmap, expected_bitmap);
        }
    }
}
+ ((proximity, word1.as_bytes(), word2.as_bytes()), data.as_slice()) + }); + execute_on_word_pairs_and_prefixes( + &mut iter, + |iter| Ok(iter.next()), + &prefixes, + 2, + |k, v| { + let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap(); + let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); + result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap)); + Ok(()) + }, + ) + .unwrap(); + + for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) { + let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x; + let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y; + + assert_eq!(actual_word1, expected_word1); + assert_eq!(actual_prefix, expected_prefix); + assert_eq!(actual_proximity, expected_proximity); + assert_eq!(actual_bitmap, expected_bitmap); + } + } +} diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs new file mode 100644 index 000000000..f10bfe4e9 --- /dev/null +++ b/milli/src/update/settings.rs @@ -0,0 +1,1557 @@ +use std::collections::{BTreeSet, HashMap, HashSet}; +use std::result::Result as StdResult; + +use charabia::{Tokenizer, TokenizerBuilder}; +use deserr::{DeserializeError, DeserializeFromValue}; +use itertools::Itertools; +use serde::{Deserialize, Deserializer, Serialize, Serializer}; +use time::OffsetDateTime; + +use super::index_documents::{IndexDocumentsConfig, Transform}; +use super::IndexerConfig; +use crate::criterion::Criterion; +use crate::error::UserError; +use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; +use crate::update::index_documents::IndexDocumentsMethod; +use crate::update::{IndexDocuments, UpdateIndexingStep}; +use crate::{FieldsIdsMap, Index, Result}; + +#[derive(Debug, Clone, PartialEq, Eq, Copy)] +pub enum Setting { + Set(T), + Reset, + NotSet, +} + +impl DeserializeFromValue for Setting +where + T: DeserializeFromValue, + E: DeserializeError, +{ + fn 
deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> std::result::Result { + match value { + deserr::Value::Null => Ok(Setting::Reset), + _ => T::deserialize_from_value(value, location).map(Setting::Set), + } + } + fn default() -> Option { + Some(Self::NotSet) + } +} + +impl Default for Setting { + fn default() -> Self { + Self::NotSet + } +} + +impl Setting { + pub fn set(self) -> Option { + match self { + Self::Set(value) => Some(value), + _ => None, + } + } + + pub const fn as_ref(&self) -> Setting<&T> { + match *self { + Self::Set(ref value) => Setting::Set(value), + Self::Reset => Setting::Reset, + Self::NotSet => Setting::NotSet, + } + } + + pub const fn is_not_set(&self) -> bool { + matches!(self, Self::NotSet) + } + + /// If `Self` is `Reset`, then map self to `Set` with the provided `val`. + pub fn or_reset(self, val: T) -> Self { + match self { + Self::Reset => Self::Set(val), + otherwise => otherwise, + } + } +} + +impl Serialize for Setting { + fn serialize(&self, serializer: S) -> StdResult + where + S: Serializer, + { + match self { + Self::Set(value) => Some(value), + // Usually not_set isn't serialized by setting skip_serializing_if field attribute + Self::NotSet | Self::Reset => None, + } + .serialize(serializer) + } +} + +impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting { + fn deserialize(deserializer: D) -> StdResult + where + D: Deserializer<'de>, + { + Deserialize::deserialize(deserializer).map(|x| match x { + Some(x) => Self::Set(x), + None => Self::Reset, // Reset is forced by sending null value + }) + } +} + +pub struct Settings<'a, 't, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + + indexer_config: &'a IndexerConfig, + + searchable_fields: Setting>, + displayed_fields: Setting>, + filterable_fields: Setting>, + sortable_fields: Setting>, + criteria: Setting>, + stop_words: Setting>, + distinct_field: Setting, + synonyms: Setting>>, + primary_key: Setting, + 
authorize_typos: Setting, + min_word_len_two_typos: Setting, + min_word_len_one_typo: Setting, + exact_words: Setting>, + /// Attributes on which typo tolerance is disabled. + exact_attributes: Setting>, + max_values_per_facet: Setting, + pagination_max_total_hits: Setting, +} + +impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + indexer_config: &'a IndexerConfig, + ) -> Settings<'a, 't, 'u, 'i> { + Settings { + wtxn, + index, + searchable_fields: Setting::NotSet, + displayed_fields: Setting::NotSet, + filterable_fields: Setting::NotSet, + sortable_fields: Setting::NotSet, + criteria: Setting::NotSet, + stop_words: Setting::NotSet, + distinct_field: Setting::NotSet, + synonyms: Setting::NotSet, + primary_key: Setting::NotSet, + authorize_typos: Setting::NotSet, + exact_words: Setting::NotSet, + min_word_len_two_typos: Setting::NotSet, + min_word_len_one_typo: Setting::NotSet, + exact_attributes: Setting::NotSet, + max_values_per_facet: Setting::NotSet, + pagination_max_total_hits: Setting::NotSet, + indexer_config, + } + } + + pub fn reset_searchable_fields(&mut self) { + self.searchable_fields = Setting::Reset; + } + + pub fn set_searchable_fields(&mut self, names: Vec) { + self.searchable_fields = Setting::Set(names); + } + + pub fn reset_displayed_fields(&mut self) { + self.displayed_fields = Setting::Reset; + } + + pub fn set_displayed_fields(&mut self, names: Vec) { + self.displayed_fields = Setting::Set(names); + } + + pub fn reset_filterable_fields(&mut self) { + self.filterable_fields = Setting::Reset; + } + + pub fn set_filterable_fields(&mut self, names: HashSet) { + self.filterable_fields = Setting::Set(names); + } + + pub fn set_sortable_fields(&mut self, names: HashSet) { + self.sortable_fields = Setting::Set(names); + } + + pub fn reset_sortable_fields(&mut self) { + self.sortable_fields = Setting::Reset; + } + + pub fn reset_criteria(&mut self) { + self.criteria = Setting::Reset; + 
} + + pub fn set_criteria(&mut self, criteria: Vec) { + self.criteria = Setting::Set(criteria); + } + + pub fn reset_stop_words(&mut self) { + self.stop_words = Setting::Reset; + } + + pub fn set_stop_words(&mut self, stop_words: BTreeSet) { + self.stop_words = + if stop_words.is_empty() { Setting::Reset } else { Setting::Set(stop_words) } + } + + pub fn reset_distinct_field(&mut self) { + self.distinct_field = Setting::Reset; + } + + pub fn set_distinct_field(&mut self, distinct_field: String) { + self.distinct_field = Setting::Set(distinct_field); + } + + pub fn reset_synonyms(&mut self) { + self.synonyms = Setting::Reset; + } + + pub fn set_synonyms(&mut self, synonyms: HashMap>) { + self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) } + } + + pub fn reset_primary_key(&mut self) { + self.primary_key = Setting::Reset; + } + + pub fn set_primary_key(&mut self, primary_key: String) { + self.primary_key = Setting::Set(primary_key); + } + + pub fn set_autorize_typos(&mut self, val: bool) { + self.authorize_typos = Setting::Set(val); + } + + pub fn reset_authorize_typos(&mut self) { + self.authorize_typos = Setting::Reset; + } + + pub fn set_min_word_len_two_typos(&mut self, val: u8) { + self.min_word_len_two_typos = Setting::Set(val); + } + + pub fn reset_min_word_len_two_typos(&mut self) { + self.min_word_len_two_typos = Setting::Reset; + } + + pub fn set_min_word_len_one_typo(&mut self, val: u8) { + self.min_word_len_one_typo = Setting::Set(val); + } + + pub fn reset_min_word_len_one_typo(&mut self) { + self.min_word_len_one_typo = Setting::Reset; + } + + pub fn set_exact_words(&mut self, words: BTreeSet) { + self.exact_words = Setting::Set(words); + } + + pub fn reset_exact_words(&mut self) { + self.exact_words = Setting::Reset; + } + + pub fn set_exact_attributes(&mut self, attrs: HashSet) { + self.exact_attributes = Setting::Set(attrs); + } + + pub fn reset_exact_attributes(&mut self) { + self.exact_attributes = 
Setting::Reset; + } + + pub fn set_max_values_per_facet(&mut self, value: usize) { + self.max_values_per_facet = Setting::Set(value); + } + + pub fn reset_max_values_per_facet(&mut self) { + self.max_values_per_facet = Setting::Reset; + } + + pub fn set_pagination_max_total_hits(&mut self, value: usize) { + self.pagination_max_total_hits = Setting::Set(value); + } + + pub fn reset_pagination_max_total_hits(&mut self) { + self.pagination_max_total_hits = Setting::Reset; + } + + fn reindex( + &mut self, + progress_callback: &FP, + should_abort: &FA, + old_fields_ids_map: FieldsIdsMap, + ) -> Result<()> + where + FP: Fn(UpdateIndexingStep) + Sync, + FA: Fn() -> bool + Sync, + { + let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + // if the settings are set before any document update, we don't need to do anything, and + // will set the primary key during the first document addition. + if self.index.number_of_documents(self.wtxn)? == 0 { + return Ok(()); + } + + let transform = Transform::new( + self.wtxn, + self.index, + self.indexer_config, + IndexDocumentsMethod::ReplaceDocuments, + false, + )?; + + // We clear the databases and remap the documents fields based on the new `FieldsIdsMap`. + let output = transform.prepare_for_documents_reindexing( + self.wtxn, + old_fields_ids_map, + fields_ids_map, + )?; + + // We index the generated `TransformOutput` which must contain + // all the documents with fields in the newly defined searchable order. 
+ let indexing_builder = IndexDocuments::new( + self.wtxn, + self.index, + self.indexer_config, + IndexDocumentsConfig::default(), + &progress_callback, + &should_abort, + )?; + indexing_builder.execute_raw(output)?; + + Ok(()) + } + + fn update_displayed(&mut self) -> Result { + match self.displayed_fields { + Setting::Set(ref fields) => { + // fields are deduplicated, only the first occurrence is taken into account + let names: Vec<_> = fields.iter().unique().map(String::as_str).collect(); + self.index.put_displayed_fields(self.wtxn, &names)?; + } + Setting::Reset => { + self.index.delete_displayed_fields(self.wtxn)?; + } + Setting::NotSet => return Ok(false), + } + Ok(true) + } + + fn update_distinct_field(&mut self) -> Result { + match self.distinct_field { + Setting::Set(ref attr) => { + self.index.put_distinct_field(self.wtxn, attr)?; + } + Setting::Reset => { + self.index.delete_distinct_field(self.wtxn)?; + } + Setting::NotSet => return Ok(false), + } + Ok(true) + } + + /// Updates the index's searchable attributes. This causes the field map to be recomputed to + /// reflect the order of the searchable attributes. + fn update_searchable(&mut self) -> Result { + match self.searchable_fields { + Setting::Set(ref fields) => { + // Check to see if the searchable fields changed before doing anything else + let old_fields = self.index.searchable_fields(self.wtxn)?; + let did_change = match old_fields { + // If old_fields is Some, let's check to see if the fields actually changed + Some(old_fields) => { + let new_fields = fields.iter().map(String::as_str).collect::>(); + new_fields != old_fields + } + // If old_fields is None, the fields have changed (because they are being set) + None => true, + }; + if !did_change { + return Ok(false); + } + + // every time the searchable attributes are updated, we need to update the + // ids for any settings that uses the facets. (distinct_fields, filterable_fields). 
+ let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + + let mut new_fields_ids_map = FieldsIdsMap::new(); + // fields are deduplicated, only the first occurrence is taken into account + let names = fields.iter().unique().map(String::as_str).collect::>(); + + // Add all the searchable attributes to the field map, and then add the + // remaining fields from the old field map to the new one + for name in names.iter() { + new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; + } + + for (_, name) in old_fields_ids_map.iter() { + new_fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; + } + + self.index.put_all_searchable_fields_from_fields_ids_map( + self.wtxn, + &names, + &new_fields_ids_map, + )?; + self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?; + Ok(true) + } + Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?), + Setting::NotSet => Ok(false), + } + } + + fn update_stop_words(&mut self) -> Result { + match self.stop_words { + Setting::Set(ref stop_words) => { + let current = self.index.stop_words(self.wtxn)?; + // since we can't compare a BTreeSet with an FST we are going to convert the + // BTreeSet to an FST and then compare bytes per bytes the two FSTs. + let fst = fst::Set::from_iter(stop_words)?; + + // Does the new FST differ from the previous one? + if current + .map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) + { + // we want to re-create our FST. 
+ self.index.put_stop_words(self.wtxn, &fst)?; + Ok(true) + } else { + Ok(false) + } + } + Setting::Reset => Ok(self.index.delete_stop_words(self.wtxn)?), + Setting::NotSet => Ok(false), + } + } + + fn update_synonyms(&mut self) -> Result { + match self.synonyms { + Setting::Set(ref synonyms) => { + fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec { + tokenizer + .tokenize(text) + .filter_map(|token| { + if token.is_word() { + Some(token.lemma().to_string()) + } else { + None + } + }) + .collect::>() + } + + let mut builder = TokenizerBuilder::new(); + let stop_words = self.index.stop_words(self.wtxn)?; + if let Some(ref stop_words) = stop_words { + builder.stop_words(stop_words); + } + let tokenizer = builder.build(); + + let mut new_synonyms = HashMap::new(); + for (word, synonyms) in synonyms { + // Normalize both the word and associated synonyms. + let normalized_word = normalize(&tokenizer, word); + let normalized_synonyms = + synonyms.iter().map(|synonym| normalize(&tokenizer, synonym)); + + // Store the normalized synonyms under the normalized word, + // merging the possible duplicate words. + let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new); + entry.extend(normalized_synonyms); + } + + // Make sure that we don't have duplicate synonyms. 
+ new_synonyms.iter_mut().for_each(|(_, synonyms)| { + synonyms.sort_unstable(); + synonyms.dedup(); + }); + + let old_synonyms = self.index.synonyms(self.wtxn)?; + + if new_synonyms != old_synonyms { + self.index.put_synonyms(self.wtxn, &new_synonyms)?; + Ok(true) + } else { + Ok(false) + } + } + Setting::Reset => Ok(self.index.delete_synonyms(self.wtxn)?), + Setting::NotSet => Ok(false), + } + } + + fn update_exact_attributes(&mut self) -> Result { + match self.exact_attributes { + Setting::Set(ref attrs) => { + let old_attrs = self.index.exact_attributes(self.wtxn)?; + let old_attrs = old_attrs.into_iter().map(String::from).collect::>(); + + if attrs != &old_attrs { + let attrs = attrs.iter().map(String::as_str).collect::>(); + self.index.put_exact_attributes(self.wtxn, &attrs)?; + Ok(true) + } else { + Ok(false) + } + } + Setting::Reset => Ok(self.index.delete_exact_attributes(self.wtxn)?), + Setting::NotSet => Ok(false), + } + } + + fn update_filterable(&mut self) -> Result<()> { + match self.filterable_fields { + Setting::Set(ref fields) => { + let mut new_facets = HashSet::new(); + for name in fields { + new_facets.insert(name.clone()); + } + self.index.put_filterable_fields(self.wtxn, &new_facets)?; + } + Setting::Reset => { + self.index.delete_filterable_fields(self.wtxn)?; + } + Setting::NotSet => (), + } + Ok(()) + } + + fn update_sortable(&mut self) -> Result<()> { + match self.sortable_fields { + Setting::Set(ref fields) => { + let mut new_fields = HashSet::new(); + for name in fields { + new_fields.insert(name.clone()); + } + self.index.put_sortable_fields(self.wtxn, &new_fields)?; + } + Setting::Reset => { + self.index.delete_sortable_fields(self.wtxn)?; + } + Setting::NotSet => (), + } + Ok(()) + } + + fn update_criteria(&mut self) -> Result<()> { + match &self.criteria { + Setting::Set(criteria) => { + self.index.put_criteria(self.wtxn, criteria)?; + } + Setting::Reset => { + self.index.delete_criteria(self.wtxn)?; + } + Setting::NotSet => (), + } 
+ Ok(()) + } + + fn update_primary_key(&mut self) -> Result<()> { + match self.primary_key { + Setting::Set(ref primary_key) => { + if self.index.number_of_documents(self.wtxn)? == 0 { + let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?; + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + self.index.put_primary_key(self.wtxn, primary_key)?; + Ok(()) + } else { + let primary_key = self.index.primary_key(self.wtxn)?.unwrap(); + Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into()) + } + } + Setting::Reset => { + if self.index.number_of_documents(self.wtxn)? == 0 { + self.index.delete_primary_key(self.wtxn)?; + Ok(()) + } else { + let primary_key = self.index.primary_key(self.wtxn)?.unwrap(); + Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into()) + } + } + Setting::NotSet => Ok(()), + } + } + + fn update_authorize_typos(&mut self) -> Result<()> { + match self.authorize_typos { + Setting::Set(flag) => { + self.index.put_authorize_typos(self.wtxn, flag)?; + Ok(()) + } + Setting::Reset => { + self.index.put_authorize_typos(self.wtxn, true)?; + Ok(()) + } + Setting::NotSet => Ok(()), + } + } + + fn update_min_typo_word_len(&mut self) -> Result<()> { + let one = self.min_word_len_one_typo.or_reset(DEFAULT_MIN_WORD_LEN_ONE_TYPO); + let two = self.min_word_len_two_typos.or_reset(DEFAULT_MIN_WORD_LEN_TWO_TYPOS); + match (one, two) { + (Setting::Set(one), Setting::Set(two)) => { + if one > two { + return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into()); + } else { + self.index.put_min_word_len_one_typo(self.wtxn, one)?; + self.index.put_min_word_len_two_typos(self.wtxn, two)?; + } + } + (Setting::Set(one), _) => { + let two = self.index.min_word_len_two_typos(self.wtxn)?; + if one > two { + return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into()); + } else { + self.index.put_min_word_len_one_typo(self.wtxn, 
one)?; + } + } + (_, Setting::Set(two)) => { + let one = self.index.min_word_len_one_typo(self.wtxn)?; + if one > two { + return Err(UserError::InvalidMinTypoWordLenSetting(one, two).into()); + } else { + self.index.put_min_word_len_two_typos(self.wtxn, two)?; + } + } + _ => (), + } + + Ok(()) + } + + fn update_exact_words(&mut self) -> Result<()> { + match self.exact_words { + Setting::Set(ref mut words) => { + fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String { + tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect() + } + + let mut builder = TokenizerBuilder::new(); + let stop_words = self.index.stop_words(self.wtxn)?; + if let Some(ref stop_words) = stop_words { + builder.stop_words(stop_words); + } + let tokenizer = builder.build(); + + let mut words: Vec<_> = + words.iter().map(|word| normalize(&tokenizer, word)).collect(); + + // normalization could reorder words + words.sort_unstable(); + + let words = fst::Set::from_iter(words.iter())?; + self.index.put_exact_words(self.wtxn, &words)?; + } + Setting::Reset => { + self.index.put_exact_words(self.wtxn, &fst::Set::default())?; + } + Setting::NotSet => (), + } + + Ok(()) + } + + fn update_max_values_per_facet(&mut self) -> Result<()> { + match self.max_values_per_facet { + Setting::Set(max) => { + self.index.put_max_values_per_facet(self.wtxn, max)?; + } + Setting::Reset => { + self.index.delete_max_values_per_facet(self.wtxn)?; + } + Setting::NotSet => (), + } + + Ok(()) + } + + fn update_pagination_max_total_hits(&mut self) -> Result<()> { + match self.pagination_max_total_hits { + Setting::Set(max) => { + self.index.put_pagination_max_total_hits(self.wtxn, max)?; + } + Setting::Reset => { + self.index.delete_pagination_max_total_hits(self.wtxn)?; + } + Setting::NotSet => (), + } + + Ok(()) + } + + pub fn execute(mut self, progress_callback: FP, should_abort: FA) -> Result<()> + where + FP: Fn(UpdateIndexingStep) + Sync, + FA: Fn() -> bool + Sync, + { + 
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; + + let old_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?; + let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + + self.update_displayed()?; + self.update_filterable()?; + self.update_sortable()?; + self.update_distinct_field()?; + self.update_criteria()?; + self.update_primary_key()?; + self.update_authorize_typos()?; + self.update_min_typo_word_len()?; + self.update_exact_words()?; + self.update_max_values_per_facet()?; + self.update_pagination_max_total_hits()?; + + // If there is new faceted fields we indicate that we must reindex as we must + // index new fields as facets. It means that the distinct attribute, + // an Asc/Desc criterion or a filtered attribute as be added or removed. + let new_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?; + let faceted_updated = old_faceted_fields != new_faceted_fields; + + let stop_words_updated = self.update_stop_words()?; + let synonyms_updated = self.update_synonyms()?; + let searchable_updated = self.update_searchable()?; + let exact_attributes_updated = self.update_exact_attributes()?; + + if stop_words_updated + || faceted_updated + || synonyms_updated + || searchable_updated + || exact_attributes_updated + { + self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use big_s::S; + use heed::types::ByteSlice; + use maplit::{btreeset, hashmap, hashset}; + + use super::*; + use crate::error::Error; + use crate::index::tests::TempIndex; + use crate::update::{ClearDocuments, DeleteDocuments}; + use crate::{Criterion, Filter, SearchResult}; + + #[test] + fn set_and_reset_searchable_fields() { + let index = TempIndex::new(); + + // First we send 3 documents with ids from 1 to 3. 
+ let mut wtxn = index.write_txn().unwrap(); + + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": 1, "name": "kevin", "age": 23 }, + { "id": 2, "name": "kevina", "age": 21}, + { "id": 3, "name": "benoit", "age": 34 } + ]), + ) + .unwrap(); + + // We change the searchable fields to be the "name" field only. + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_searchable_fields(vec!["name".into()]); + }) + .unwrap(); + + wtxn.commit().unwrap(); + + // Check that the searchable field is correctly set to "name" only. + let rtxn = index.read_txn().unwrap(); + // When we search for something that is not in + // the searchable fields it must not return any document. + let result = index.search(&rtxn).query("23").execute().unwrap(); + assert!(result.documents_ids.is_empty()); + + // When we search for something that is in the searchable fields + // we must find the appropriate document. + let result = index.search(&rtxn).query(r#""kevin""#).execute().unwrap(); + let documents = index.documents(&rtxn, result.documents_ids).unwrap(); + assert_eq!(documents.len(), 1); + assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..])); + drop(rtxn); + + // We change the searchable fields to be the "name" field only. + index + .update_settings(|settings| { + settings.reset_searchable_fields(); + }) + .unwrap(); + + // Check that the searchable field have been reset and documents are found now. 
+ let rtxn = index.read_txn().unwrap(); + let searchable_fields = index.searchable_fields(&rtxn).unwrap(); + assert_eq!(searchable_fields, None); + let result = index.search(&rtxn).query("23").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + let documents = index.documents(&rtxn, result.documents_ids).unwrap(); + assert_eq!(documents[0].1.get(0), Some(&br#""kevin""#[..])); + } + + #[test] + fn mixup_searchable_with_displayed_fields() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + let mut wtxn = index.write_txn().unwrap(); + // First we send 3 documents with ids from 1 to 3. + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]), + ) + .unwrap(); + + // In the same transaction we change the displayed fields to be only the "age". + // We also change the searchable fields to be the "name" field only. + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_displayed_fields(vec!["age".into()]); + settings.set_searchable_fields(vec!["name".into()]); + }) + .unwrap(); + wtxn.commit().unwrap(); + + // Check that the displayed fields are correctly set to `None` (default value). + let rtxn = index.read_txn().unwrap(); + let fields_ids = index.displayed_fields(&rtxn).unwrap(); + assert_eq!(fields_ids.unwrap(), (&["age"][..])); + drop(rtxn); + + // We change the searchable fields to be the "name" field only. + index + .update_settings(|settings| { + settings.reset_searchable_fields(); + }) + .unwrap(); + + // Check that the displayed fields always contains only the "age" field. 
+ let rtxn = index.read_txn().unwrap(); + let fields_ids = index.displayed_fields(&rtxn).unwrap(); + assert_eq!(fields_ids.unwrap(), &["age"][..]); + } + + #[test] + fn default_displayed_fields() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + // First we send 3 documents with ids from 1 to 3. + index + .add_documents(documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ])) + .unwrap(); + + // Check that the displayed fields are correctly set to `None` (default value). + let rtxn = index.read_txn().unwrap(); + let fields_ids = index.displayed_fields(&rtxn).unwrap(); + assert_eq!(fields_ids, None); + } + + #[test] + fn set_and_reset_displayed_field() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]), + ) + .unwrap(); + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_displayed_fields(vec!["age".into()]); + }) + .unwrap(); + wtxn.commit().unwrap(); + + // Check that the displayed fields are correctly set to only the "age" field. + let rtxn = index.read_txn().unwrap(); + let fields_ids = index.displayed_fields(&rtxn).unwrap(); + assert_eq!(fields_ids.unwrap(), &["age"][..]); + drop(rtxn); + + // We reset the fields ids to become `None`, the default value. + index + .update_settings(|settings| { + settings.reset_displayed_fields(); + }) + .unwrap(); + + // Check that the displayed fields are correctly set to `None` (default value). 
+ let rtxn = index.read_txn().unwrap(); + let fields_ids = index.displayed_fields(&rtxn).unwrap(); + assert_eq!(fields_ids, None); + } + + #[test] + fn set_filterable_fields() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + // Set the filterable fields to be the age. + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset! { S("age") }); + }) + .unwrap(); + + // Then index some documents. + index + .add_documents(documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ])) + .unwrap(); + + // Check that the displayed fields are correctly set. + let rtxn = index.read_txn().unwrap(); + let fields_ids = index.filterable_fields(&rtxn).unwrap(); + assert_eq!(fields_ids, hashset! { S("age") }); + // Only count the field_id 0 and level 0 facet values. + // TODO we must support typed CSVs for numbers to be understood. + let fidmap = index.fields_ids_map(&rtxn).unwrap(); + for document in index.all_documents(&rtxn).unwrap() { + let document = document.unwrap(); + let json = crate::obkv_to_json(&fidmap.ids().collect::>(), &fidmap, document.1) + .unwrap(); + println!("json: {:?}", json); + } + let count = index + .facet_id_f64_docids + .remap_key_type::() + // The faceted field id is 1u16 + .prefix_iter(&rtxn, &[0, 1, 0]) + .unwrap() + .count(); + assert_eq!(count, 3); + drop(rtxn); + + // Index a little more documents with new and current facets values. + index + .add_documents(documents!([ + { "name": "kevin2", "age": 23}, + { "name": "kevina2", "age": 21 }, + { "name": "benoit", "age": 35 } + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + // Only count the field_id 0 and level 0 facet values. 
+ let count = index + .facet_id_f64_docids + .remap_key_type::() + .prefix_iter(&rtxn, &[0, 1, 0]) + .unwrap() + .count(); + assert_eq!(count, 4); + } + + #[test] + fn set_asc_desc_field() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + // Set the filterable fields to be the age. + index + .update_settings(|settings| { + settings.set_displayed_fields(vec![S("name")]); + settings.set_criteria(vec![Criterion::Asc("age".to_owned())]); + }) + .unwrap(); + + // Then index some documents. + index + .add_documents(documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ])) + .unwrap(); + + // Run an empty query just to ensure that the search results are ordered. + let rtxn = index.read_txn().unwrap(); + let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap(); + let documents = index.documents(&rtxn, documents_ids).unwrap(); + + // Fetch the documents "age" field in the ordre in which the documents appear. + let age_field_id = index.fields_ids_map(&rtxn).unwrap().id("age").unwrap(); + let iter = documents.into_iter().map(|(_, doc)| { + let bytes = doc.get(age_field_id).unwrap(); + let string = std::str::from_utf8(bytes).unwrap(); + string.parse::().unwrap() + }); + + assert_eq!(iter.collect::>(), vec![21, 23, 34]); + } + + #[test] + fn set_distinct_field() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + // Set the filterable fields to be the age. + index + .update_settings(|settings| { + // Don't display the generated `id` field. + settings.set_displayed_fields(vec![S("name"), S("age")]); + settings.set_distinct_field(S("age")); + }) + .unwrap(); + + // Then index some documents. 
+ index + .add_documents(documents!([ + { "name": "kevin", "age": 23 }, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 }, + { "name": "bernard", "age": 34 }, + { "name": "bertrand", "age": 34 }, + { "name": "bernie", "age": 34 }, + { "name": "ben", "age": 34 } + ])) + .unwrap(); + + // Run an empty query just to ensure that the search results are ordered. + let rtxn = index.read_txn().unwrap(); + let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap(); + + // There must be at least one document with a 34 as the age. + assert_eq!(documents_ids.len(), 3); + } + + #[test] + fn set_nested_distinct_field() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + // Set the filterable fields to be the age. + index + .update_settings(|settings| { + // Don't display the generated `id` field. + settings.set_displayed_fields(vec![S("person")]); + settings.set_distinct_field(S("person.age")); + }) + .unwrap(); + + // Then index some documents. + index + .add_documents(documents!([ + { "person": { "name": "kevin", "age": 23 }}, + { "person": { "name": "kevina", "age": 21 }}, + { "person": { "name": "benoit", "age": 34 }}, + { "person": { "name": "bernard", "age": 34 }}, + { "person": { "name": "bertrand", "age": 34 }}, + { "person": { "name": "bernie", "age": 34 }}, + { "person": { "name": "ben", "age": 34 }} + ])) + .unwrap(); + + // Run an empty query just to ensure that the search results are ordered. + let rtxn = index.read_txn().unwrap(); + let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap(); + + // There must be at least one document with a 34 as the age. + assert_eq!(documents_ids.len(), 3); + } + + #[test] + fn default_stop_words() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + // First we send 3 documents with ids from 1 to 3. 
+ index + .add_documents(documents!([ + { "name": "kevin", "age": 23}, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ])) + .unwrap(); + + // Ensure there is no stop_words by default + let rtxn = index.read_txn().unwrap(); + let stop_words = index.stop_words(&rtxn).unwrap(); + assert!(stop_words.is_none()); + } + + #[test] + fn set_and_reset_stop_words() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + let mut wtxn = index.write_txn().unwrap(); + // First we send 3 documents with ids from 1 to 3. + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "name": "kevin", "age": 23, "maxim": "I love dogs" }, + { "name": "kevina", "age": 21, "maxim": "Doggos are the best" }, + { "name": "benoit", "age": 34, "maxim": "The crepes are really good" }, + ]), + ) + .unwrap(); + + // In the same transaction we provide some stop_words + let set = btreeset! { "i".to_string(), "the".to_string(), "are".to_string() }; + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_stop_words(set.clone()); + }) + .unwrap(); + + wtxn.commit().unwrap(); + + // Ensure stop_words are effectively stored + let rtxn = index.read_txn().unwrap(); + let stop_words = index.stop_words(&rtxn).unwrap(); + assert!(stop_words.is_some()); // at this point the index should return something + + let stop_words = stop_words.unwrap(); + let expected = fst::Set::from_iter(&set).unwrap(); + assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes()); + + // when we search for something that is a non prefix stop_words it should be ignored + // thus we should get a placeholder search (all the results = 3) + let result = index.search(&rtxn).query("the ").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 3); + let result = index.search(&rtxn).query("i ").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 3); + let result = index.search(&rtxn).query("are ").execute().unwrap(); 
+ assert_eq!(result.documents_ids.len(), 3); + + let result = index.search(&rtxn).query("dog").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos + let result = index.search(&rtxn).query("benoît").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data + + // now we'll reset the stop_words and ensure it's None + index + .update_settings(|settings| { + settings.reset_stop_words(); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let stop_words = index.stop_words(&rtxn).unwrap(); + assert!(stop_words.is_none()); + + // now we can search for the stop words + let result = index.search(&rtxn).query("the").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + let result = index.search(&rtxn).query("i").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + let result = index.search(&rtxn).query("are").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + + // the rest of the search is still not impacted + let result = index.search(&rtxn).query("dog").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos + let result = index.search(&rtxn).query("benoît").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); // there is one benoit in our data + } + + #[test] + fn set_and_reset_synonyms() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + let mut wtxn = index.write_txn().unwrap(); + // Send 3 documents with ids from 1 to 3. 
+ index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "name": "kevin", "age": 23, "maxim": "I love dogs"}, + { "name": "kevina", "age": 21, "maxim": "Doggos are the best"}, + { "name": "benoit", "age": 34, "maxim": "The crepes are really good"}, + ]), + ) + .unwrap(); + + // In the same transaction provide some synonyms + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_synonyms(hashmap! { + "blini".to_string() => vec!["crepes".to_string()], + "super like".to_string() => vec!["love".to_string()], + "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()] + }); + }) + .unwrap(); + wtxn.commit().unwrap(); + + // Ensure synonyms are effectively stored + let rtxn = index.read_txn().unwrap(); + let synonyms = index.synonyms(&rtxn).unwrap(); + assert!(!synonyms.is_empty()); // at this point the index should return something + + // Check that we can use synonyms + let result = index.search(&rtxn).query("blini").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + let result = index.search(&rtxn).query("super like").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + let result = index.search(&rtxn).query("puppies").execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + + // Reset the synonyms + index + .update_settings(|settings| { + settings.reset_synonyms(); + }) + .unwrap(); + + // Ensure synonyms are reset + let rtxn = index.read_txn().unwrap(); + let synonyms = index.synonyms(&rtxn).unwrap(); + assert!(synonyms.is_empty()); + + // Check that synonyms are no longer work + let result = index.search(&rtxn).query("blini").execute().unwrap(); + assert!(result.documents_ids.is_empty()); + let result = index.search(&rtxn).query("super like").execute().unwrap(); + assert!(result.documents_ids.is_empty()); + let result = index.search(&rtxn).query("puppies").execute().unwrap(); + assert!(result.documents_ids.is_empty()); + } + + #[test] + fn 
setting_searchable_recomputes_other_settings() { + let index = TempIndex::new(); + + // Set all the settings except searchable + index + .update_settings(|settings| { + settings.set_displayed_fields(vec!["hello".to_string()]); + settings.set_filterable_fields(hashset! { S("age"), S("toto") }); + settings.set_criteria(vec![Criterion::Asc(S("toto"))]); + }) + .unwrap(); + + // check the output + let rtxn = index.read_txn().unwrap(); + assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap()); + // since no documents have been pushed the primary key is still unset + assert!(index.primary_key(&rtxn).unwrap().is_none()); + assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap()); + drop(rtxn); + + // We set toto and age as searchable to force reordering of the fields + index + .update_settings(|settings| { + settings.set_searchable_fields(vec!["toto".to_string(), "age".to_string()]); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + assert_eq!(&["hello"][..], index.displayed_fields(&rtxn).unwrap().unwrap()); + assert!(index.primary_key(&rtxn).unwrap().is_none()); + assert_eq!(vec![Criterion::Asc("toto".to_string())], index.criteria(&rtxn).unwrap()); + } + + #[test] + fn setting_not_filterable_cant_filter() { + let index = TempIndex::new(); + + // Set all the settings except searchable + index + .update_settings(|settings| { + settings.set_displayed_fields(vec!["hello".to_string()]); + // It is only Asc(toto), there is a facet database but it is denied to filter with toto. 
+ settings.set_criteria(vec![Criterion::Asc(S("toto"))]); + }) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let filter = Filter::from_str("toto = 32").unwrap().unwrap(); + let _ = filter.evaluate(&rtxn, &index).unwrap_err(); + } + + #[test] + fn setting_primary_key() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + let mut wtxn = index.write_txn().unwrap(); + // Set the primary key settings + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_primary_key(S("mykey")); + }) + .unwrap(); + assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey")); + + // Then index some documents with the "mykey" primary key. + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "mykey": 1, "name": "kevin", "age": 23 }, + { "mykey": 2, "name": "kevina", "age": 21 }, + { "mykey": 3, "name": "benoit", "age": 34 }, + { "mykey": 4, "name": "bernard", "age": 34 }, + { "mykey": 5, "name": "bertrand", "age": 34 }, + { "mykey": 6, "name": "bernie", "age": 34 }, + { "mykey": 7, "name": "ben", "age": 34 } + ]), + ) + .unwrap(); + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let error = index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.reset_primary_key(); + }) + .unwrap_err(); + assert!(matches!(error, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_)))); + wtxn.abort().unwrap(); + + // But if we clear the database... 
+ let mut wtxn = index.write_txn().unwrap(); + let builder = ClearDocuments::new(&mut wtxn, &index); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + // ...we can change the primary key + index + .update_settings(|settings| { + settings.set_primary_key(S("myid")); + }) + .unwrap(); + } + + #[test] + fn setting_impact_relevancy() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + // Set the genres setting + index + .update_settings(|settings| { + settings.set_filterable_fields(hashset! { S("genres") }); + }) + .unwrap(); + + index.add_documents(documents!([ + { + "id": 11, + "title": "Star Wars", + "overview": + "Princess Leia is captured and held hostage by the evil Imperial forces in their effort to take over the galactic Empire. Venturesome Luke Skywalker and dashing captain Han Solo team together with the loveable robot duo R2-D2 and C-3PO to rescue the beautiful princess and restore peace and justice in the Empire.", + "genres": ["Adventure", "Action", "Science Fiction"], + "poster": "https://image.tmdb.org/t/p/w500/6FfCtAuVAW8XJjZ7eWeLibRLWTw.jpg", + "release_date": 233366400 + }, + { + "id": 30, + "title": "Magnetic Rose", + "overview": "", + "genres": ["Animation", "Science Fiction"], + "poster": "https://image.tmdb.org/t/p/w500/gSuHDeWemA1menrwfMRChnSmMVN.jpg", + "release_date": 819676800 + } + ])).unwrap(); + + let rtxn = index.read_txn().unwrap(); + let SearchResult { documents_ids, .. 
} = index.search(&rtxn).query("S").execute().unwrap(); + let first_id = documents_ids[0]; + let documents = index.documents(&rtxn, documents_ids).unwrap(); + let (_, content) = documents.iter().find(|(id, _)| *id == first_id).unwrap(); + + let fid = index.fields_ids_map(&rtxn).unwrap().id("title").unwrap(); + let line = std::str::from_utf8(content.get(fid).unwrap()).unwrap(); + assert_eq!(line, r#""Star Wars""#); + } + + #[test] + fn test_disable_typo() { + let index = TempIndex::new(); + + let mut txn = index.write_txn().unwrap(); + assert!(index.authorize_typos(&txn).unwrap()); + + index + .update_settings_using_wtxn(&mut txn, |settings| { + settings.set_autorize_typos(false); + }) + .unwrap(); + + assert!(!index.authorize_typos(&txn).unwrap()); + } + + #[test] + fn update_min_word_len_for_typo() { + let index = TempIndex::new(); + + // Set the genres setting + index + .update_settings(|settings| { + settings.set_min_word_len_one_typo(8); + settings.set_min_word_len_two_typos(8); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), 8); + assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), 8); + + index + .update_settings(|settings| { + settings.reset_min_word_len_one_typo(); + settings.reset_min_word_len_two_typos(); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + assert_eq!(index.min_word_len_one_typo(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_ONE_TYPO); + assert_eq!(index.min_word_len_two_typos(&txn).unwrap(), DEFAULT_MIN_WORD_LEN_TWO_TYPOS); + } + + #[test] + fn update_invalid_min_word_len_for_typo() { + let index = TempIndex::new(); + + // Set the genres setting + index + .update_settings(|settings| { + settings.set_min_word_len_one_typo(10); + settings.set_min_word_len_two_typos(7); + }) + .unwrap_err(); + } + + #[test] + fn update_exact_words_normalization() { + let index = TempIndex::new(); + + let mut txn = index.write_txn().unwrap(); + // Set the genres setting + index + 
.update_settings_using_wtxn(&mut txn, |settings| { + let words = btreeset! { S("Ab"), S("ac") }; + settings.set_exact_words(words); + }) + .unwrap(); + + let exact_words = index.exact_words(&txn).unwrap().unwrap(); + for word in exact_words.into_fst().stream().into_str_vec().unwrap() { + assert!(word.0 == "ac" || word.0 == "ab"); + } + } + + #[test] + fn test_correct_settings_init() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + // we don't actually update the settings, just check their content + let Settings { + wtxn: _, + index: _, + indexer_config: _, + searchable_fields, + displayed_fields, + filterable_fields, + sortable_fields, + criteria, + stop_words, + distinct_field, + synonyms, + primary_key, + authorize_typos, + min_word_len_two_typos, + min_word_len_one_typo, + exact_words, + exact_attributes, + max_values_per_facet, + pagination_max_total_hits, + } = settings; + assert!(matches!(searchable_fields, Setting::NotSet)); + assert!(matches!(displayed_fields, Setting::NotSet)); + assert!(matches!(filterable_fields, Setting::NotSet)); + assert!(matches!(sortable_fields, Setting::NotSet)); + assert!(matches!(criteria, Setting::NotSet)); + assert!(matches!(stop_words, Setting::NotSet)); + assert!(matches!(distinct_field, Setting::NotSet)); + assert!(matches!(synonyms, Setting::NotSet)); + assert!(matches!(primary_key, Setting::NotSet)); + assert!(matches!(authorize_typos, Setting::NotSet)); + assert!(matches!(min_word_len_two_typos, Setting::NotSet)); + assert!(matches!(min_word_len_one_typo, Setting::NotSet)); + assert!(matches!(exact_words, Setting::NotSet)); + assert!(matches!(exact_attributes, Setting::NotSet)); + assert!(matches!(max_values_per_facet, Setting::NotSet)); + assert!(matches!(pagination_max_total_hits, Setting::NotSet)); + }) + .unwrap(); + } + + #[test] + fn settings_must_ignore_soft_deleted() { + use serde_json::json; + + let index = TempIndex::new(); + + let mut docs = vec![]; + for i in 0..10 { + 
docs.push(json!({ "id": i, "title": format!("{:x}", i) })); + } + index.add_documents(documents! { docs }).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + (0..5).for_each(|id| drop(builder.delete_external_id(&id.to_string()))); + builder.execute().unwrap(); // removes documents 0 to 4 + + index + .update_settings_using_wtxn(&mut wtxn, |settings| { + settings.set_searchable_fields(vec!["id".to_string()]); + }) + .unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); // read-only check after the commit + let docs: StdResult<Vec<_>, _> = index.all_documents(&rtxn).unwrap().collect(); + let docs = docs.unwrap(); + assert_eq!(docs.len(), 5); // 10 added, 5 deleted + } +} diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap 
b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: 
milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap new file mode 100644 index 
000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap new file mode 100644 index 000000000..6d69b2ffb --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap new file mode 100644 index 000000000..88d3a98aa --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/delete_documents.rs +--- +benoit [2, ] + diff --git 
a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap new file mode 100644 index 000000000..6d69b2ffb --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..9139b7a05 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[0, 1, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap new file mode 100644 index 000000000..15c881e87 --- /dev/null +++ 
b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/update/delete_documents.rs +--- +benoit [2, ] +kevin [0, ] +kevina [1, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap new file mode 100644 index 000000000..7481b11c4 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] +2 [21, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap new file mode 100644 index 000000000..87856f6dc --- /dev/null +++ 
b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/delete_documents.rs +--- +2 0 2.2 1 [21, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap new file mode 100644 index 000000000..ab1d2175f --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap @@ -0,0 +1,17 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] +1 0 aquarium 1 [5, ] +1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ] +1 0 cartoon 1 [2, 7, 15, 17, ] +1 0 colorfulness 1 [13, ] +1 0 design 1 [2, 18, ] +1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ] +1 0 geometry 1 [19, ] +1 0 letter 1 [1, ] +1 0 outdoor 1 [4, ] +1 0 painting 1 [3, ] +1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ] +2 0 design 1 [21, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git 
a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap new file mode 100644 index 000000000..d8125dfcf --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap @@ -0,0 +1,37 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1_36 [3, ] +1_37 [4, ] +1_38 [5, ] +1_39 [6, ] +1_40 [7, ] +1_41 [8, ] +1_42 [9, ] +1_43 [10, ] +1_44 [11, ] +1_45 [12, ] +1_46 [13, ] +1_47 [14, ] +1_5 [1, ] +1_52 [15, ] +1_57 [16, ] +1_58 [17, ] +1_68 [18, ] +1_69 [19, ] +1_7 [2, ] +1_71 [21, ] +2.2 [21, ] +abstract [2, 6, 10, 13, 14, 15, 16, 17, ] +aquarium [5, ] +art [4, 5, 8, 9, 10, 12, 17, ] +cartoon [2, 7, 15, 17, ] +colorfulness [13, ] +design [2, 18, 21, ] +drawing [3, 4, 5, 8, 10, 11, 16, ] +geometry [19, ] +letter [1, ] +outdoor [4, ] +painting [3, ] +pattern [2, 3, 9, 10, 13, 14, 16, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap 
b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap new file mode 100644 index 000000000..a7ee4348d --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] +2 [20, 21, 22, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap new file mode 100644 index 000000000..cfa649653 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +2 0 1.2 1 [20, 22, ] +2 0 2.2 1 [21, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap new file mode 100644 index 000000000..8336bd712 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap @@ -0,0 +1,19 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] +1 0 aquarium 1 [5, ] +1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ] +1 0 cartoon 1 [2, 7, 15, 17, ] +1 0 colorfulness 1 [13, ] +1 0 design 1 [2, 18, ] +1 0 drawing 1 
[3, 4, 5, 8, 10, 11, 16, ] +1 0 geometry 1 [19, ] +1 0 letter 1 [1, ] +1 0 outdoor 1 [4, ] +1 0 painting 1 [3, ] +1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ] +1 0 sign 1 [0, ] +2 0 design 1 [21, ] +2 0 geometry 1 [20, 22, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..dfac98e59 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[0, 20, 22, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap new file mode 100644 index 000000000..7909d9b06 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap @@ -0,0 +1,42 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1.2 [20, 22, ] +1_36 [3, ] +1_37 [4, ] +1_38 [5, ] +1_39 [6, ] +1_4 [0, ] +1_40 [7, ] +1_41 [8, ] +1_42 [9, ] +1_43 [10, ] +1_44 [11, ] +1_45 [12, ] +1_46 [13, ] +1_47 [14, ] +1_5 [1, ] +1_52 [15, ] +1_57 [16, ] +1_58 [17, ] +1_68 [18, ] +1_69 [19, ] +1_7 [2, ] +1_70 [20, ] +1_71 [21, ] +1_72 [22, ] +2.2 [21, ] +abstract [2, 6, 10, 13, 14, 15, 16, 17, ] +aquarium [5, ] +art [4, 5, 8, 9, 10, 12, 17, ] +cartoon [2, 7, 15, 17, ] +colorfulness [13, ] +design [2, 18, 21, ] +drawing [3, 4, 5, 8, 10, 11, 16, ] +geometry [19, 20, 22, ] +letter [1, ] +outdoor [4, ] 
+painting [3, ] +pattern [2, 3, 9, 10, 13, 14, 16, ] +sign [0, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap new file mode 100644 index 000000000..18a9d9309 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap @@ -0,0 +1,31 @@ +--- +source: milli/src/update/delete_documents.rs +--- +3 0 48.9021 1 [19, ] +3 0 49.9314 1 [17, ] +3 0 50.1793 1 [15, ] +3 0 50.2844 1 [14, ] +3 0 50.3518 1 [13, ] +3 0 50.4502 1 [12, ] +3 0 50.6053 1 [8, ] +3 0 50.6224 1 [3, ] +3 0 50.6299 1 [0, ] +3 0 50.6312 1 [2, ] +3 0 50.6415 1 [1, ] +3 0 50.7453 1 [7, ] +3 0 50.8466 1 [10, ] +3 0 51.0537 1 [9, ] +4 0 2.271 1 [17, ] +4 0 2.3708 1 [19, ] +4 0 2.7637 1 [14, ] +4 0 3.0569 1 [0, ] +4 0 3.1106 1 [1, 2, ] +4 0 3.1476 1 [3, ] +4 0 3.2189 1 [15, ] +4 0 3.2206 1 [7, ] +4 0 3.3758 1 [8, ] +4 0 3.5326 1 [13, ] +4 0 3.6957 1 [9, ] +4 0 3.9623 1 [12, ] +4 0 4.337 1 [10, ] + diff --git 
a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap new file mode 100644 index 000000000..c909a3cd8 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap @@ -0,0 +1,53 @@ +--- +source: milli/src/update/delete_documents.rs +--- +3 0 48.9021 1 [19, ] +3 0 49.4449 1 [18, ] +3 0 49.9314 1 [17, ] +3 0 50.1112 1 [16, ] +3 0 50.1793 1 [15, ] +3 0 50.2844 1 [14, ] +3 0 
50.3518 1 [13, ] +3 0 50.4095 1 [11, ] +3 0 50.4502 1 [12, ] +3 0 50.6053 1 [8, ] +3 0 50.6224 1 [3, ] +3 0 50.6299 1 [0, ] +3 0 50.6312 1 [2, ] +3 0 50.6415 1 [1, ] +3 0 50.6552 1 [4, ] +3 0 50.6924 1 [5, ] +3 0 50.7263 1 [6, ] +3 0 50.7453 1 [7, ] +3 0 50.8466 1 [10, ] +3 0 51.0537 1 [9, ] +3 1 48.9021 4 [16, 17, 18, 19, ] +3 1 50.1793 4 [11, 13, 14, 15, ] +3 1 50.4502 4 [0, 3, 8, 12, ] +3 1 50.6312 4 [1, 2, 4, 5, ] +3 1 50.7263 4 [6, 7, 9, 10, ] +4 0 2.271 1 [17, ] +4 0 2.3708 1 [19, ] +4 0 2.7637 1 [14, ] +4 0 2.7913 1 [18, ] +4 0 2.8547 1 [16, ] +4 0 3.0569 1 [0, ] +4 0 3.1106 1 [1, 2, ] +4 0 3.1476 1 [3, ] +4 0 3.1541 1 [6, ] +4 0 3.1763 1 [5, ] +4 0 3.1897 1 [4, ] +4 0 3.2189 1 [15, ] +4 0 3.2206 1 [7, ] +4 0 3.3758 1 [8, ] +4 0 3.5326 1 [13, ] +4 0 3.6957 1 [9, ] +4 0 3.9623 1 [12, ] +4 0 4.337 1 [10, ] +4 0 4.4347 1 [11, ] +4 1 2.271 4 [14, 17, 18, 19, ] +4 1 2.8547 4 [0, 1, 2, 3, 16, ] +4 1 3.1541 4 [4, 5, 6, 15, ] +4 1 3.2206 4 [7, 8, 9, 13, ] +4 1 3.9623 3 [10, 11, 12, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap new file mode 100644 index 
000000000..1260b12de --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[4, 5, 6, 11, 16, 18, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..efcd7af8c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: 
milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..efcd7af8c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..efcd7af8c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, 15, ] diff --git a/milli/src/update/update_step.rs b/milli/src/update/update_step.rs new file mode 100644 index 000000000..fd5739caf --- /dev/null +++ b/milli/src/update/update_step.rs @@ -0,0 +1,35 @@ +use UpdateIndexingStep::*; + +#[derive(Debug, Clone, Copy)] +pub 
enum UpdateIndexingStep { + /// Remap document addition fields to the ones present in the database, adding new fields to the + /// schema on the go. + RemapDocumentAddition { documents_seen: usize }, + + /// This step checks the external document ids, computes the internal ids and merges + /// the documents that are already present in the database. + ComputeIdsAndMergeDocuments { documents_seen: usize, total_documents: usize }, + + /// Extract the documents words using the tokenizer and compute the documents + /// facets. Stores those words, facets and documents ids on disk. + IndexDocuments { documents_seen: usize, total_documents: usize }, + + /// Merge the previously extracted data (words and facets) into the final LMDB database. + /// These extracted data are split into multiple databases. + MergeDataIntoFinalDatabase { databases_seen: usize, total_databases: usize }, +} + +impl UpdateIndexingStep { + pub const fn step(&self) -> usize { + match self { + RemapDocumentAddition { .. } => 0, + ComputeIdsAndMergeDocuments { .. } => 1, + IndexDocuments { .. } => 2, + MergeDataIntoFinalDatabase { .. 
} => 3, + } + } + + pub const fn number_of_steps(&self) -> usize { + 4 + } +} diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs new file mode 100644 index 000000000..b235c44a6 --- /dev/null +++ b/milli/src/update/word_prefix_docids.rs @@ -0,0 +1,136 @@ +use std::collections::{HashMap, HashSet}; + +use grenad::CompressionType; +use heed::types::{ByteSlice, Str}; +use heed::Database; + +use crate::update::index_documents::{ + create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, + CursorClonableMmap, MergeFn, +}; +use crate::{Result, RoaringBitmapCodec}; + +pub struct WordPrefixDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + word_docids: Database, + word_prefix_docids: Database, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, +} + +impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + word_docids: Database, + word_prefix_docids: Database, + ) -> WordPrefixDocids<'t, 'u, 'i> { + WordPrefixDocids { + wtxn, + word_docids, + word_prefix_docids, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + max_nb_chunks: None, + max_memory: None, + } + } + + #[logging_timer::time("WordPrefixDocids::{}")] + pub fn execute( + self, + mut new_word_docids_iter: grenad::ReaderCursor, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, + ) -> Result<()> { + // It is forbidden to keep a mutable reference into the database + // and write into it at the same time, therefore we write into another file. 
+ let mut prefix_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_roaring_bitmaps, + self.chunk_compression_type, + self.chunk_compression_level, + self.max_nb_chunks, + self.max_memory, + ); + + if !common_prefix_fst_words.is_empty() { + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some((word, data)) = new_word_docids_iter.move_on_next()? { + current_prefixes = match current_prefixes.take() { + Some(prefixes) if word.starts_with(prefixes[0].as_bytes()) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; + common_prefix_fst_words + .iter() + .find(|prefixes| word.starts_with(prefixes[0].as_bytes())) + } + }; + + if let Some(prefixes) = current_prefixes { + for prefix in prefixes.iter() { + if word.starts_with(prefix.as_bytes()) { + match prefixes_cache.get_mut(prefix.as_bytes()) { + Some(value) => value.push(data.to_owned()), + None => { + prefixes_cache + .insert(prefix.clone().into(), vec![data.to_owned()]); + } + } + } + } + } + } + + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; + } + + // We fetch the docids associated to the newly added word prefix fst only. + let db = self.word_docids.remap_data_type::(); + for prefix in new_prefix_fst_words { + let prefix = std::str::from_utf8(prefix.as_bytes())?; + for result in db.prefix_iter(self.wtxn, prefix)? { + let (_word, data) = result?; + prefix_docids_sorter.insert(prefix, data)?; + } + } + + // We remove all the entries that are no more required in this word prefix docids database. + let mut iter = self.word_prefix_docids.iter_mut(self.wtxn)?.lazily_decode_data(); + while let Some((prefix, _)) = iter.next().transpose()? { + if del_prefix_fst_words.contains(prefix.as_bytes()) { + unsafe { iter.del_current()? }; + } + } + + drop(iter); + + // We finally write the word prefix docids into the LMDB database. 
+ sorter_into_lmdb_database( + self.wtxn, + *self.word_prefix_docids.as_polymorph(), + prefix_docids_sorter, + merge_roaring_bitmaps, + )?; + + Ok(()) + } +} + +fn write_prefixes_in_sorter( + prefixes: &mut HashMap, Vec>>, + sorter: &mut grenad::Sorter, +) -> Result<()> { + for (key, data_slices) in prefixes.drain() { + for data in data_slices { + if valid_lmdb_key(&key) { + sorter.insert(&key, data)?; + } + } + } + + Ok(()) +} diff --git a/milli/src/update/words_prefix_position_docids.rs b/milli/src/update/words_prefix_position_docids.rs new file mode 100644 index 000000000..5dbc9f89b --- /dev/null +++ b/milli/src/update/words_prefix_position_docids.rs @@ -0,0 +1,178 @@ +use std::collections::{HashMap, HashSet}; +use std::num::NonZeroU32; +use std::{cmp, str}; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesDecode, BytesEncode}; +use log::debug; + +use crate::error::SerializationError; +use crate::heed_codec::StrBEU32Codec; +use crate::index::main_key::WORDS_PREFIXES_FST_KEY; +use crate::update::index_documents::{ + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, + CursorClonableMmap, MergeFn, +}; +use crate::{Index, Result}; + +pub struct WordPrefixPositionDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + pub(crate) chunk_compression_type: CompressionType, + pub(crate) chunk_compression_level: Option, + pub(crate) max_nb_chunks: Option, + pub(crate) max_memory: Option, + level_group_size: NonZeroU32, + min_level_size: NonZeroU32, +} + +impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordPrefixPositionDocids<'t, 'u, 'i> { + WordPrefixPositionDocids { + wtxn, + index, + chunk_compression_type: CompressionType::None, + chunk_compression_level: None, + max_nb_chunks: None, + max_memory: None, + level_group_size: NonZeroU32::new(4).unwrap(), + min_level_size: NonZeroU32::new(5).unwrap(), 
+ } + } + + pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self { + self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap(); + self + } + + pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self { + self.min_level_size = value; + self + } + + #[logging_timer::time("WordPrefixPositionDocids::{}")] + pub fn execute( + self, + new_word_position_docids: grenad::Reader, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, + ) -> Result<()> { + debug!("Computing and writing the word levels positions docids into LMDB on disk..."); + + let mut prefix_position_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_cbo_roaring_bitmaps, + self.chunk_compression_type, + self.chunk_compression_level, + self.max_nb_chunks, + self.max_memory, + ); + + let mut new_word_position_docids_iter = new_word_position_docids.into_cursor()?; + + if !common_prefix_fst_words.is_empty() { + // We fetch all the new common prefixes between the previous and new prefix fst. + let mut buffer = Vec::new(); + let mut current_prefixes: Option<&&[String]> = None; + let mut prefixes_cache = HashMap::new(); + while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? 
{ + let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + + current_prefixes = match current_prefixes.take() { + Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), + _otherwise => { + write_prefixes_in_sorter( + &mut prefixes_cache, + &mut prefix_position_docids_sorter, + )?; + common_prefix_fst_words + .iter() + .find(|prefixes| word.starts_with(&prefixes[0])) + } + }; + + if let Some(prefixes) = current_prefixes { + for prefix in prefixes.iter() { + if word.starts_with(prefix) { + buffer.clear(); + buffer.extend_from_slice(prefix.as_bytes()); + buffer.extend_from_slice(&pos.to_be_bytes()); + match prefixes_cache.get_mut(&buffer) { + Some(value) => value.push(data.to_owned()), + None => { + prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); + } + } + } + } + } + } + + write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_position_docids_sorter)?; + } + + // We fetch the docids associated to the newly added word prefix fst only. + let db = self.index.word_position_docids.remap_data_type::(); + for prefix_bytes in new_prefix_fst_words { + let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| { + SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } + })?; + + // iter over all lines of the DB where the key is prefixed by the current prefix. + let iter = db + .remap_key_type::() + .prefix_iter(self.wtxn, prefix_bytes.as_bytes())? + .remap_key_type::(); + for result in iter { + let ((word, pos), data) = result?; + if word.starts_with(prefix) { + let key = (prefix, pos); + let bytes = StrBEU32Codec::bytes_encode(&key).unwrap(); + prefix_position_docids_sorter.insert(bytes, data)?; + } + } + } + + // We remove all the entries that are no more required in this word prefix position + // docids database. + let mut iter = + self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data(); + while let Some(((prefix, _), _)) = iter.next().transpose()? 
{ + if del_prefix_fst_words.contains(prefix.as_bytes()) { + unsafe { iter.del_current()? }; + } + } + + drop(iter); + + // We finally write all the word prefix position docids into the LMDB database. + sorter_into_lmdb_database( + self.wtxn, + *self.index.word_prefix_position_docids.as_polymorph(), + prefix_position_docids_sorter, + merge_cbo_roaring_bitmaps, + )?; + + Ok(()) + } +} + +fn write_prefixes_in_sorter( + prefixes: &mut HashMap, Vec>>, + sorter: &mut grenad::Sorter, +) -> Result<()> { + for (key, data_slices) in prefixes.drain() { + for data in data_slices { + if valid_lmdb_key(&key) { + sorter.insert(&key, data)?; + } + } + } + + Ok(()) +} diff --git a/milli/src/update/words_prefixes_fst.rs b/milli/src/update/words_prefixes_fst.rs new file mode 100644 index 000000000..57fed0922 --- /dev/null +++ b/milli/src/update/words_prefixes_fst.rs @@ -0,0 +1,96 @@ +use std::iter::{repeat_with, FromIterator}; +use std::str; + +use fst::{SetBuilder, Streamer}; + +use crate::{Index, Result, SmallString32}; + +pub struct WordsPrefixesFst<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + threshold: u32, + max_prefix_length: usize, +} + +impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + ) -> WordsPrefixesFst<'t, 'u, 'i> { + WordsPrefixesFst { wtxn, index, threshold: 100, max_prefix_length: 4 } + } + + /// Set the number of words required to make a prefix be part of the words prefixes + /// database. If a word prefix is supposed to match more than this number of words in the + /// dictionary, then this prefix is added to the words prefixes data structures. + /// + /// Default value is 100. This value must be at least 50 and will be clamped + /// to this bound otherwise. + pub fn threshold(&mut self, value: u32) -> &mut Self { + self.threshold = value.max(50); + self + } + + /// Set the maximum length of prefixes in bytes. + /// + /// Default value is `4` bytes. 
This value must be between 1 and 25 and will be clamped + /// to these bounds otherwise. + pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { + self.max_prefix_length = value.clamp(1, 25); + self + } + + #[logging_timer::time("WordsPrefixesFst::{}")] + pub fn execute(self) -> Result<()> { + let words_fst = self.index.words_fst(self.wtxn)?; + + let mut current_prefix = vec![SmallString32::new(); self.max_prefix_length]; + let mut current_prefix_count = vec![0; self.max_prefix_length]; + let mut builders = + repeat_with(SetBuilder::memory).take(self.max_prefix_length).collect::>(); + + let mut stream = words_fst.stream(); + while let Some(bytes) = stream.next() { + for n in 0..self.max_prefix_length { + let current_prefix = &mut current_prefix[n]; + let current_prefix_count = &mut current_prefix_count[n]; + let builder = &mut builders[n]; + + // We try to get the first n + 1 bytes out of this string but we only want + // to split at valid characters bounds. If we try to split in the middle of + // a character we ignore this word and go to the next one. + let word = str::from_utf8(bytes)?; + let prefix = match word.get(..=n) { + Some(prefix) => prefix, + None => continue, + }; + + // This is the first iteration of the loop, + // or the current word doesn't start with the current prefix. + if *current_prefix_count == 0 || prefix != current_prefix.as_str() { + *current_prefix = SmallString32::from(prefix); + *current_prefix_count = 0; + } + + *current_prefix_count += 1; + + // There are enough words corresponding to this prefix to add it to the cache. + if *current_prefix_count >= self.threshold { + builder.insert(prefix)?; + } + } + } + + // We merge all of the previously computed prefixes into one final set. 
+ let prefix_fsts: Vec<_> = builders.into_iter().map(|sb| sb.into_set()).collect(); + let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); + let mut builder = fst::SetBuilder::memory(); + builder.extend_stream(op.r#union())?; + let prefix_fst = builder.into_set(); + + // Set the words prefixes FST in the database. + self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?; + + Ok(()) + } +} diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson new file mode 100644 index 000000000..2e77f9faf --- /dev/null +++ b/milli/tests/assets/test_set.ndjson @@ -0,0 +1,17 @@ +{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":"", "opt1": [null], "tag_in": 1} +{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":"", "opt1": [], "tag_in": 2} +{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":"", "opt1": null, "tag_in": 3} +{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the 
world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":"", "opt1": 4, "tag_in": "four"} +{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":"", "opt1": "E", "tag_in": "five"} +{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":"", "opt1": ["F"], "tag_in": null} +{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":"", "opt1": [7]} +{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":"", "opt1": ["H", 8], "tag_in": 8} +{"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello 
world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":"", "tag_in": "nine"} +{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":"", "opt1": {}, "tag_in": 10} +{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":"", "opt1": [{"opt2": 11}] , "tag_in": "eleven"} +{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":"", "opt1": {"opt2": [12]}, "tag_in": 12} +{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":"", "opt1": [13, [{"opt2": null}]]} +{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america 
unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":"", "opt1": {"a": 1, "opt2": {"opt3": 14}}} +{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":"", "opt1": [[[[]]]]} +{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":"", "opt1.opt2": 16} +{"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":9339230,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","_geo": { "lat": 34.39548365683149, "lng": 132.4535960928883 },"":""} diff --git a/milli/tests/mod.rs b/milli/tests/mod.rs new file mode 100644 index 000000000..11095a6a9 --- /dev/null +++ b/milli/tests/mod.rs @@ -0,0 +1 @@ +mod search; diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs new file mode 100644 index 000000000..3c6dd8cc0 --- /dev/null +++ b/milli/tests/search/distinct.rs @@ -0,0 +1,179 @@ +use std::collections::HashSet; + +use big_s::S; +use milli::update::Settings; +use milli::{Criterion, Search, SearchResult, TermsMatchingStrategy}; +use Criterion::*; + +use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; + +macro_rules! 
test_distinct { + ($func:ident, $distinct:ident, $exhaustive:ident, $limit:expr, $criteria:expr, $n_res:expr) => { + #[test] + fn $func() { + let criteria = $criteria; + let index = search::setup_search_index_with_criteria(&criteria); + + // update distinct attribute + let mut wtxn = index.write_txn().unwrap(); + let config = milli::update::IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + builder.set_distinct_field(S(stringify!($distinct))); + builder.execute(|_| (), || false).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.query(search::TEST_QUERY); + search.limit($limit); + search.exhaustive_number_hits($exhaustive); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + + let SearchResult { documents_ids, candidates, .. } = search.execute().unwrap(); + + assert_eq!(candidates.len(), $n_res); + + let mut distinct_values = HashSet::new(); + let expected_external_ids: Vec<_> = + search::expected_order(&criteria, true, TermsMatchingStrategy::default(), &[]) + .into_iter() + .filter_map(|d| { + if distinct_values.contains(&d.$distinct) { + None + } else { + distinct_values.insert(d.$distinct.to_owned()); + Some(d.id) + } + }) + .take($limit) + .collect(); + + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + assert_eq!(documents_ids, expected_external_ids); + } + }; +} + +test_distinct!( + exhaustive_distinct_string_default_criteria, + tag, + true, + 1, + vec![Words, Typo, Proximity, Attribute, Exactness], + 3 +); +test_distinct!( + exhaustive_distinct_number_default_criteria, + asc_desc_rank, + true, + 1, + vec![Words, Typo, Proximity, Attribute, Exactness], + 7 +); +test_distinct!( + exhaustive_distinct_number_weird_order_criteria, + asc_desc_rank, + true, + 0, + vec![Desc(S("attribute_rank")), Desc(S("exactness_rank")), Exactness, Typo], + 7 +); + 
+test_distinct!( + distinct_string_default_criteria, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Typo, Proximity, Attribute, Exactness], + 3 +); +test_distinct!( + distinct_number_default_criteria, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Typo, Proximity, Attribute, Exactness], + 7 +); +test_distinct!( + distinct_string_criterion_words, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words], + 3 +); +test_distinct!( + distinct_number_criterion_words, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words], + 7 +); +test_distinct!( + distinct_string_criterion_words_typo, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Typo], + 3 +); +test_distinct!( + distinct_number_criterion_words_typo, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Typo], + 7 +); +test_distinct!( + distinct_string_criterion_words_proximity, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Proximity], + 3 +); +test_distinct!( + distinct_number_criterion_words_proximity, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Proximity], + 7 +); +test_distinct!( + distinct_string_criterion_words_attribute, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Attribute], + 3 +); +test_distinct!( + distinct_number_criterion_words_attribute, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Attribute], + 7 +); +test_distinct!( + distinct_string_criterion_words_exactness, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Exactness], + 3 +); +test_distinct!( + distinct_number_criterion_words_exactness, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Exactness], + 7 +); diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs new file mode 100644 index 000000000..e2f89f2db --- /dev/null +++ b/milli/tests/search/facet_distribution.rs @@ -0,0 +1,74 @@ +use 
std::io::Cursor; + +use big_s::S; +use heed::EnvOpenOptions; +use maplit::hashset; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; +use milli::{FacetDistribution, Index, Object}; +use serde_json::Deserializer; + +#[test] +fn test_facet_distribution_with_no_facet_values() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_filterable_fields(hashset! { + S("genres"), + S("tags"), + }); + builder.execute(|_| (), || false).unwrap(); + + // index documents + let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; + let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); + let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); + let reader = Cursor::new( + r#"{ + "id": 123, + "title": "What a week, hu...", + "genres": [], + "tags": ["blue"] + } + { + "id": 345, + "title": "I am the pig!", + "tags": ["red"] + }"#, + ); + + for result in Deserializer::from_reader(reader).into_iter::() { + let object = result.unwrap(); + documents_builder.append_json_object(&object).unwrap(); + } + + let vector = documents_builder.into_inner().unwrap(); + + // index documents + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let txn = index.read_txn().unwrap(); + let mut distrib = FacetDistribution::new(&txn, 
&index); + distrib.facets(vec!["genres"]); + let result = distrib.execute().unwrap(); + assert_eq!(result["genres"].len(), 0); + + let mut distrib = FacetDistribution::new(&txn, &index); + distrib.facets(vec!["tags"]); + let result = distrib.execute().unwrap(); + assert_eq!(result["tags"].len(), 2); +} diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs new file mode 100644 index 000000000..18de24ac3 --- /dev/null +++ b/milli/tests/search/filters.rs @@ -0,0 +1,91 @@ +use either::{Either, Left, Right}; +use milli::{Criterion, Filter, Search, SearchResult, TermsMatchingStrategy}; +use Criterion::*; + +use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; + +macro_rules! test_filter { + ($func:ident, $filter:expr) => { + #[test] + fn $func() { + let criteria = vec![Words, Typo, Proximity, Attribute, Exactness]; + let index = search::setup_search_index_with_criteria(&criteria); + let rtxn = index.read_txn().unwrap(); + + let filter_conditions = + Filter::from_array::, &str>>, _>($filter).unwrap().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + search.filter(filter_conditions); + + let SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + + let filtered_ids = search::expected_filtered_ids($filter); + let expected_external_ids: Vec<_> = + search::expected_order(&criteria, true, TermsMatchingStrategy::default(), &[]) + .into_iter() + .filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None }) + .collect(); + + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + assert_eq!(documents_ids, expected_external_ids); + } + }; +} + +test_filter!(eq_simple_string_filter, vec![Right("tag=red")]); +test_filter!(eq_simple_number_filter, vec![Right("asc_desc_rank=1")]); +test_filter!(eq_string_and_filter_return_empty, vec![Right("tag=red"), Right("tag=green")]); +test_filter!(eq_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank=1")]); +test_filter!(eq_string_or_filter, vec![Left(vec!["tag=red", "tag=green"])]); +test_filter!(eq_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank=1"])]); +test_filter!(eq_number_or_filter, vec![Left(vec!["asc_desc_rank=3", "asc_desc_rank=1"])]); +test_filter!(neq_simple_string_filter, vec![Right("tag!=red")]); +test_filter!(neq_simple_number_filter, vec![Right("asc_desc_rank!=1")]); +test_filter!(neq_simple_string_in_number_column_filter, vec![Right("asc_desc_rank!=red")]); +test_filter!(geo_radius, vec![Right("_geoRadius(50.630010347667806, 3.086251829166809, 100000)")]); +test_filter!( + not_geo_radius, + vec![Right("NOT _geoRadius(50.630010347667806, 3.086251829166809, 1000000)")] +); +test_filter!(eq_complex_filter, vec![Left(vec!["tag=red", "tag=green"]), Right("asc_desc_rank=3")]); +test_filter!( + eq_complex_filter_2, + vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank=3", "asc_desc_rank=1"])] +); +test_filter!(greater_simple_number_filter, vec![Right("asc_desc_rank>1")]); +test_filter!(greater_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank>1")]); +test_filter!(greater_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank>1"])]); 
+test_filter!(greater_number_or_filter, vec![Left(vec!["asc_desc_rank>3", "asc_desc_rank>1"])]); +test_filter!( + greater_complex_filter, + vec![Left(vec!["tag=red", "tag=green"]), Right("asc_desc_rank>3")] +); +test_filter!( + greater_complex_filter_2, + vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank>3", "asc_desc_rank>1"])] +); +test_filter!(lower_simple_number_filter, vec![Right("asc_desc_rank<1")]); +test_filter!(lower_mix_and_filter, vec![Right("tag=red"), Right("asc_desc_rank<1")]); +test_filter!(lower_mix_or_filter, vec![Left(vec!["tag=red", "asc_desc_rank<1"])]); +test_filter!(lower_number_or_filter, vec![Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])]); +test_filter!( + lower_complex_filter, + vec![Left(vec!["tag=red", "tag=green"]), Right("asc_desc_rank<3")] +); +test_filter!( + lower_complex_filter_2, + vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])] +); +test_filter!(exists_filter_1, vec![Right("opt1 EXISTS")]); +test_filter!(exists_filter_1_not, vec![Right("opt1 NOT EXISTS")]); +test_filter!(exists_filter_1_not_alt, vec![Right("NOT opt1 EXISTS")]); +test_filter!(exists_filter_1_double_not, vec![Right("NOT opt1 NOT EXISTS")]); + +test_filter!(in_filter, vec![Right("tag_in IN[1, 2, 3, four, five]")]); +test_filter!(not_in_filter, vec![Right("tag_in NOT IN[1, 2, 3, four, five]")]); +test_filter!(not_not_in_filter, vec![Right("NOT tag_in NOT IN[1, 2, 3, four, five]")]); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs new file mode 100644 index 000000000..c2f8acd4d --- /dev/null +++ b/milli/tests/search/mod.rs @@ -0,0 +1,300 @@ +use std::cmp::Reverse; +use std::collections::HashSet; +use std::io::Cursor; + +use big_s::S; +use either::{Either, Left, Right}; +use heed::EnvOpenOptions; +use maplit::{hashmap, hashset}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; +use 
milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy}; +use serde::{Deserialize, Deserializer}; +use slice_group_by::GroupBy; + +mod distinct; +mod facet_distribution; +mod filters; +mod phrase_search; +mod query_criteria; +mod sort; +mod typo_tolerance; + +pub const TEST_QUERY: &str = "hello world america"; + +pub const EXTERNAL_DOCUMENTS_IDS: &[&str; 17] = + &["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q"]; + +pub const CONTENT: &str = include_str!("../assets/test_set.ndjson"); + +pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_criteria(criteria.to_vec()); + builder.set_filterable_fields(hashset! { + S("tag"), + S("asc_desc_rank"), + S("_geo"), + S("opt1"), + S("opt1.opt2"), + S("tag_in") + }); + builder.set_sortable_fields(hashset! { + S("tag"), + S("asc_desc_rank"), + }); + builder.set_synonyms(hashmap! 
{ + S("hello") => vec![S("good morning")], + S("world") => vec![S("earth")], + S("america") => vec![S("the united states")], + }); + builder.set_searchable_fields(vec![S("title"), S("description")]); + builder.execute(|_| (), || false).unwrap(); + + // index documents + let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; + let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); + let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); + let reader = Cursor::new(CONTENT.as_bytes()); + + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { + let object = result.unwrap(); + documents_builder.append_json_object(&object).unwrap(); + } + + let vector = documents_builder.into_inner().unwrap(); + + // index documents + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index +} + +pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec { + let mut rtxn = index.read_txn().unwrap(); + let docid_map = index.external_documents_ids(&mut rtxn).unwrap(); + let docid_map: std::collections::HashMap<_, _> = + EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect(); + internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect() +} + +pub fn expected_order( + criteria: &[Criterion], + authorize_typo: bool, + optional_words: TermsMatchingStrategy, + sort_by: &[AscDesc], +) -> Vec { + let dataset = + serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); + let mut groups: Vec> = vec![dataset]; + + for criterion in criteria { + let mut new_groups = Vec::new(); + for group in 
groups.iter_mut() { + match criterion { + Criterion::Attribute => { + group.sort_by_key(|d| d.attribute_rank); + new_groups + .extend(group.linear_group_by_key(|d| d.attribute_rank).map(Vec::from)); + } + Criterion::Exactness => { + group.sort_by_key(|d| d.exact_rank); + new_groups.extend(group.linear_group_by_key(|d| d.exact_rank).map(Vec::from)); + } + Criterion::Proximity => { + group.sort_by_key(|d| d.proximity_rank); + new_groups + .extend(group.linear_group_by_key(|d| d.proximity_rank).map(Vec::from)); + } + Criterion::Sort if sort_by == [AscDesc::Asc(Member::Field(S("tag")))] => { + group.sort_by_key(|d| d.sort_by_rank); + new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from)); + } + Criterion::Sort if sort_by == [AscDesc::Desc(Member::Field(S("tag")))] => { + group.sort_by_key(|d| Reverse(d.sort_by_rank)); + new_groups.extend(group.linear_group_by_key(|d| d.sort_by_rank).map(Vec::from)); + } + Criterion::Typo => { + group.sort_by_key(|d| d.typo_rank); + new_groups.extend(group.linear_group_by_key(|d| d.typo_rank).map(Vec::from)); + } + Criterion::Words => { + group.sort_by_key(|d| d.word_rank); + new_groups.extend(group.linear_group_by_key(|d| d.word_rank).map(Vec::from)); + } + Criterion::Asc(field_name) if field_name == "asc_desc_rank" => { + group.sort_by_key(|d| d.asc_desc_rank); + new_groups + .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); + } + Criterion::Desc(field_name) if field_name == "asc_desc_rank" => { + group.sort_by_key(|d| Reverse(d.asc_desc_rank)); + new_groups + .extend(group.linear_group_by_key(|d| d.asc_desc_rank).map(Vec::from)); + } + Criterion::Asc(_) | Criterion::Desc(_) | Criterion::Sort => { + new_groups.push(group.clone()) + } + } + } + groups = std::mem::take(&mut new_groups); + } + + if authorize_typo && optional_words == TermsMatchingStrategy::default() { + groups.into_iter().flatten().collect() + } else if optional_words == TermsMatchingStrategy::default() { + 
groups.into_iter().flatten().filter(|d| d.typo_rank == 0).collect() + } else if authorize_typo { + groups.into_iter().flatten().filter(|d| d.word_rank == 0).collect() + } else { + groups.into_iter().flatten().filter(|d| d.word_rank == 0 && d.typo_rank == 0).collect() + } +} + +fn execute_filter(filter: &str, document: &TestDocument) -> Option { + let mut id = None; + if let Some((field, filter)) = filter.split_once("!=") { + if field == "tag" && document.tag != filter { + id = Some(document.id.clone()) + } else if field == "asc_desc_rank" + && Ok(&document.asc_desc_rank) != filter.parse::().as_ref() + { + id = Some(document.id.clone()) + } + } else if let Some((field, filter)) = filter.split_once('=') { + if field == "tag" && document.tag == filter { + id = Some(document.id.clone()) + } else if field == "asc_desc_rank" + && document.asc_desc_rank == filter.parse::().unwrap() + { + id = Some(document.id.clone()) + } + } else if let Some(("asc_desc_rank", filter)) = filter.split_once('<') { + if document.asc_desc_rank < filter.parse().unwrap() { + id = Some(document.id.clone()) + } + } else if let Some(("asc_desc_rank", filter)) = filter.split_once('>') { + if document.asc_desc_rank > filter.parse().unwrap() { + id = Some(document.id.clone()) + } + } else if filter.starts_with("_geoRadius") { + id = (document.geo_rank < 100000).then(|| document.id.clone()); + } else if filter.starts_with("NOT _geoRadius") { + id = (document.geo_rank > 1000000).then(|| document.id.clone()); + } else if matches!(filter, "opt1 EXISTS" | "NOT opt1 NOT EXISTS") { + id = document.opt1.is_some().then(|| document.id.clone()); + } else if matches!(filter, "NOT opt1 EXISTS" | "opt1 NOT EXISTS") { + id = document.opt1.is_none().then(|| document.id.clone()); + } else if matches!(filter, "opt1.opt2 EXISTS") { + if document.opt1opt2.is_some() { + id = Some(document.id.clone()); + } else if let Some(opt1) = &document.opt1 { + id = contains_key_rec(opt1, "opt2").then(|| document.id.clone()); + } + } 
else if matches!( + filter, + "tag_in IN[1, 2, 3, four, five]" | "NOT tag_in NOT IN[1, 2, 3, four, five]" + ) { + id = matches!(document.id.as_str(), "A" | "B" | "C" | "D" | "E") + .then(|| document.id.clone()); + } else if matches!(filter, "tag_in NOT IN[1, 2, 3, four, five]") { + id = (!matches!(document.id.as_str(), "A" | "B" | "C" | "D" | "E")) + .then(|| document.id.clone()); + } + id +} + +pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool { + match v { + serde_json::Value::Array(v) => { + for v in v.iter() { + if contains_key_rec(v, key) { + return true; + } + } + false + } + serde_json::Value::Object(v) => { + for (k, v) in v.iter() { + if k == key || contains_key_rec(v, key) { + return true; + } + } + false + } + _ => false, + } +} + +pub fn expected_filtered_ids(filters: Vec, &str>>) -> HashSet { + let dataset: Vec = + serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); + + let mut filtered_ids: HashSet<_> = dataset.iter().map(|d| d.id.clone()).collect(); + for either in filters { + let ids = match either { + Left(array) => array + .into_iter() + .map(|f| { + let ids: HashSet = + dataset.iter().filter_map(|d| execute_filter(f, d)).collect(); + ids + }) + .reduce(|a, b| a.union(&b).cloned().collect()) + .unwrap(), + Right(filter) => { + let ids: HashSet = + dataset.iter().filter_map(|d| execute_filter(filter, d)).collect(); + ids + } + }; + + filtered_ids = filtered_ids.intersection(&ids).cloned().collect(); + } + + filtered_ids +} + +#[derive(Debug, Clone, Deserialize, PartialEq, Eq)] +pub struct TestDocument { + pub id: String, + pub word_rank: u32, + pub typo_rank: u32, + pub proximity_rank: u32, + pub attribute_rank: u32, + pub exact_rank: u32, + pub asc_desc_rank: u32, + pub sort_by_rank: u32, + pub geo_rank: u32, + pub title: String, + pub description: String, + pub tag: String, + #[serde(default, deserialize_with = "some_option")] + pub opt1: Option, + #[serde(default, deserialize_with = 
"some_option", rename = "opt1.opt2")] + pub opt1opt2: Option, +} + +fn some_option<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let result = serde_json::Value::deserialize(deserializer)?; + Ok(Some(result)) +} diff --git a/milli/tests/search/phrase_search.rs b/milli/tests/search/phrase_search.rs new file mode 100644 index 000000000..ca5eaad48 --- /dev/null +++ b/milli/tests/search/phrase_search.rs @@ -0,0 +1,56 @@ +use milli::update::{IndexerConfig, Settings}; +use milli::{Criterion, Index, Search, TermsMatchingStrategy}; + +use crate::search::Criterion::{Attribute, Exactness, Proximity}; + +fn set_stop_words(index: &Index, stop_words: &[&str]) { + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + + let mut builder = Settings::new(&mut wtxn, &index, &config); + let stop_words = stop_words.into_iter().map(|s| s.to_string()).collect(); + builder.set_stop_words(stop_words); + builder.execute(|_| (), || false).unwrap(); + wtxn.commit().unwrap(); +} + +fn test_phrase_search_with_stop_words_given_criteria(criteria: &[Criterion]) { + let index = super::setup_search_index_with_criteria(&criteria); + + // Add stop_words + set_stop_words(&index, &["a", "an", "the", "of"]); + + // Phrase search containing stop words + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("\"the use of force\""); + search.limit(10); + search.authorize_typos(false); + search.terms_matching_strategy(TermsMatchingStrategy::All); + + let result = search.execute().unwrap(); + // 1 document should match + assert_eq!(result.documents_ids.len(), 1); + + // test for a single stop word only, no other search terms + let mut search = Search::new(&txn, &index); + search.query("\"the\""); + search.limit(10); + search.authorize_typos(false); + search.terms_matching_strategy(TermsMatchingStrategy::All); + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 0); +} 
+ +#[test] +fn test_phrase_search_with_stop_words_no_criteria() { + let criteria = []; + test_phrase_search_with_stop_words_given_criteria(&criteria); +} + +#[test] +fn test_phrase_search_with_stop_words_all_criteria() { + let criteria = [Proximity, Attribute, Exactness]; + test_phrase_search_with_stop_words_given_criteria(&criteria); +} diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs new file mode 100644 index 000000000..16058e941 --- /dev/null +++ b/milli/tests/search/query_criteria.rs @@ -0,0 +1,470 @@ +use std::cmp::Reverse; +use std::io::Cursor; + +use big_s::S; +use heed::EnvOpenOptions; +use itertools::Itertools; +use maplit::hashset; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; +use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy}; +use rand::Rng; +use Criterion::*; + +use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; + +const ALLOW_TYPOS: bool = true; +const DISALLOW_TYPOS: bool = false; +const ALLOW_OPTIONAL_WORDS: TermsMatchingStrategy = TermsMatchingStrategy::Last; +const DISALLOW_OPTIONAL_WORDS: TermsMatchingStrategy = TermsMatchingStrategy::All; +const ASC_DESC_CANDIDATES_THRESHOLD: usize = 1000; + +macro_rules! test_criterion { + ($func:ident, $optional_word:ident, $authorize_typos:ident, $criteria:expr, $sort_criteria:expr) => { + #[test] + fn $func() { + let criteria = $criteria; + let index = search::setup_search_index_with_criteria(&criteria); + let rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.authorize_typos($authorize_typos); + search.terms_matching_strategy($optional_word); + search.sort_criteria($sort_criteria); + + let SearchResult { documents_ids, .. 
} = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = search::expected_order( + &criteria, + $authorize_typos, + $optional_word, + &$sort_criteria[..], + ) + .into_iter() + .map(|d| d.id) + .collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + assert_eq!(documents_ids, expected_external_ids); + } + }; +} + +test_criterion!(none_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![], vec![]); +test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![], vec![]); +test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words], vec![]); +test_criterion!( + attribute_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Attribute], + vec![] +); +test_criterion!(typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Typo], vec![]); +test_criterion!( + attribute_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Attribute], + vec![] +); +test_criterion!( + exactness_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Exactness], + vec![] +); +test_criterion!( + exactness_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Exactness], + vec![] +); +test_criterion!( + proximity_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Proximity], + vec![] +); +test_criterion!( + proximity_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Proximity], + vec![] +); +test_criterion!( + asc_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Asc(S("asc_desc_rank"))], + vec![] +); +test_criterion!( + asc_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Asc(S("asc_desc_rank"))], + vec![] +); +test_criterion!( + desc_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Desc(S("asc_desc_rank"))], + vec![] +); +test_criterion!( + desc_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Desc(S("asc_desc_rank"))], + vec![] +); +test_criterion!( + 
asc_unexisting_field_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Asc(S("unexisting_field"))], + vec![] +); +test_criterion!( + asc_unexisting_field_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Asc(S("unexisting_field"))], + vec![] +); +test_criterion!( + desc_unexisting_field_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Desc(S("unexisting_field"))], + vec![] +); +test_criterion!( + desc_unexisting_field_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Desc(S("unexisting_field"))], + vec![] +); +test_criterion!(empty_sort_by_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Sort], vec![]); +test_criterion!( + empty_sort_by_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Sort], + vec![] +); +test_criterion!( + sort_by_asc_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Sort], + vec![AscDesc::Asc(Member::Field(S("tag")))] +); +test_criterion!( + sort_by_asc_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Sort], + vec![AscDesc::Asc(Member::Field(S("tag")))] +); +test_criterion!( + sort_by_desc_allow_typo, + DISALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Sort], + vec![AscDesc::Desc(Member::Field(S("tag")))] +); +test_criterion!( + sort_by_desc_disallow_typo, + DISALLOW_OPTIONAL_WORDS, + DISALLOW_TYPOS, + vec![Sort], + vec![AscDesc::Desc(Member::Field(S("tag")))] +); +test_criterion!( + default_criteria_order, + ALLOW_OPTIONAL_WORDS, + ALLOW_TYPOS, + vec![Words, Typo, Proximity, Attribute, Exactness], + vec![] +); + +#[test] +fn criteria_mixup() { + use Criterion::*; + let index = search::setup_search_index_with_criteria(&[ + Words, + Attribute, + Desc(S("asc_desc_rank")), + Exactness, + Proximity, + Typo, + ]); + + #[rustfmt::skip] + let criteria_mix = { + // Criterion doesn't implement Copy, we create a new Criterion using a closure + let desc = || Desc(S("asc_desc_rank")); + // all possible criteria order + vec![ + vec![Words, Attribute, 
desc(), Exactness, Proximity, Typo], + vec![Words, Attribute, desc(), Exactness, Typo, Proximity], + vec![Words, Attribute, desc(), Proximity, Exactness, Typo], + vec![Words, Attribute, desc(), Proximity, Typo, Exactness], + vec![Words, Attribute, desc(), Typo, Exactness, Proximity], + vec![Words, Attribute, desc(), Typo, Proximity, Exactness], + vec![Words, Attribute, Exactness, desc(), Proximity, Typo], + vec![Words, Attribute, Exactness, desc(), Typo, Proximity], + vec![Words, Attribute, Exactness, Proximity, desc(), Typo], + vec![Words, Attribute, Exactness, Proximity, Typo, desc()], + vec![Words, Attribute, Exactness, Typo, desc(), Proximity], + vec![Words, Attribute, Exactness, Typo, Proximity, desc()], + vec![Words, Attribute, Proximity, desc(), Exactness, Typo], + vec![Words, Attribute, Proximity, desc(), Typo, Exactness], + vec![Words, Attribute, Proximity, Exactness, desc(), Typo], + vec![Words, Attribute, Proximity, Exactness, Typo, desc()], + vec![Words, Attribute, Proximity, Typo, desc(), Exactness], + vec![Words, Attribute, Proximity, Typo, Exactness, desc()], + vec![Words, Attribute, Typo, desc(), Exactness, Proximity], + vec![Words, Attribute, Typo, desc(), Proximity, Exactness], + vec![Words, Attribute, Typo, Exactness, desc(), Proximity], + vec![Words, Attribute, Typo, Exactness, Proximity, desc()], + vec![Words, Attribute, Typo, Proximity, desc(), Exactness], + vec![Words, Attribute, Typo, Proximity, Exactness, desc()], + vec![Words, desc(), Attribute, Exactness, Proximity, Typo], + vec![Words, desc(), Attribute, Exactness, Typo, Proximity], + vec![Words, desc(), Attribute, Proximity, Exactness, Typo], + vec![Words, desc(), Attribute, Proximity, Typo, Exactness], + vec![Words, desc(), Attribute, Typo, Exactness, Proximity], + vec![Words, desc(), Attribute, Typo, Proximity, Exactness], + vec![Words, desc(), Exactness, Attribute, Proximity, Typo], + vec![Words, desc(), Exactness, Attribute, Typo, Proximity], + vec![Words, desc(), Exactness, 
Proximity, Attribute, Typo], + vec![Words, desc(), Exactness, Proximity, Typo, Attribute], + vec![Words, desc(), Exactness, Typo, Attribute, Proximity], + vec![Words, desc(), Exactness, Typo, Proximity, Attribute], + vec![Words, desc(), Proximity, Attribute, Exactness, Typo], + vec![Words, desc(), Proximity, Attribute, Typo, Exactness], + vec![Words, desc(), Proximity, Exactness, Attribute, Typo], + vec![Words, desc(), Proximity, Exactness, Typo, Attribute], + vec![Words, desc(), Proximity, Typo, Attribute, Exactness], + vec![Words, desc(), Proximity, Typo, Exactness, Attribute], + vec![Words, desc(), Typo, Attribute, Exactness, Proximity], + vec![Words, desc(), Typo, Attribute, Proximity, Exactness], + vec![Words, desc(), Typo, Exactness, Attribute, Proximity], + vec![Words, desc(), Typo, Exactness, Proximity, Attribute], + vec![Words, desc(), Typo, Proximity, Attribute, Exactness], + vec![Words, desc(), Typo, Proximity, Exactness, Attribute], + vec![Words, Exactness, Attribute, desc(), Proximity, Typo], + vec![Words, Exactness, Attribute, desc(), Typo, Proximity], + vec![Words, Exactness, Attribute, Proximity, desc(), Typo], + vec![Words, Exactness, Attribute, Proximity, Typo, desc()], + vec![Words, Exactness, Attribute, Typo, desc(), Proximity], + vec![Words, Exactness, Attribute, Typo, Proximity, desc()], + vec![Words, Exactness, desc(), Attribute, Proximity, Typo], + vec![Words, Exactness, desc(), Attribute, Typo, Proximity], + vec![Words, Exactness, desc(), Proximity, Attribute, Typo], + vec![Words, Exactness, desc(), Proximity, Typo, Attribute], + vec![Words, Exactness, desc(), Typo, Attribute, Proximity], + vec![Words, Exactness, desc(), Typo, Proximity, Attribute], + vec![Words, Exactness, Proximity, Attribute, desc(), Typo], + vec![Words, Exactness, Proximity, Attribute, Typo, desc()], + vec![Words, Exactness, Proximity, desc(), Attribute, Typo], + vec![Words, Exactness, Proximity, desc(), Typo, Attribute], + vec![Words, Exactness, Proximity, Typo, 
Attribute, desc()], + vec![Words, Exactness, Proximity, Typo, desc(), Attribute], + vec![Words, Exactness, Typo, Attribute, desc(), Proximity], + vec![Words, Exactness, Typo, Attribute, Proximity, desc()], + vec![Words, Exactness, Typo, desc(), Attribute, Proximity], + vec![Words, Exactness, Typo, desc(), Proximity, Attribute], + vec![Words, Exactness, Typo, Proximity, Attribute, desc()], + vec![Words, Exactness, Typo, Proximity, desc(), Attribute], + vec![Words, Proximity, Attribute, desc(), Exactness, Typo], + vec![Words, Proximity, Attribute, desc(), Typo, Exactness], + vec![Words, Proximity, Attribute, Exactness, desc(), Typo], + vec![Words, Proximity, Attribute, Exactness, Typo, desc()], + vec![Words, Proximity, Attribute, Typo, desc(), Exactness], + vec![Words, Proximity, Attribute, Typo, Exactness, desc()], + vec![Words, Proximity, desc(), Attribute, Exactness, Typo], + vec![Words, Proximity, desc(), Attribute, Typo, Exactness], + vec![Words, Proximity, desc(), Exactness, Attribute, Typo], + vec![Words, Proximity, desc(), Exactness, Typo, Attribute], + vec![Words, Proximity, desc(), Typo, Attribute, Exactness], + vec![Words, Proximity, desc(), Typo, Exactness, Attribute], + vec![Words, Proximity, Exactness, Attribute, desc(), Typo], + vec![Words, Proximity, Exactness, Attribute, Typo, desc()], + vec![Words, Proximity, Exactness, desc(), Attribute, Typo], + vec![Words, Proximity, Exactness, desc(), Typo, Attribute], + vec![Words, Proximity, Exactness, Typo, Attribute, desc()], + vec![Words, Proximity, Exactness, Typo, desc(), Attribute], + vec![Words, Proximity, Typo, Attribute, desc(), Exactness], + vec![Words, Proximity, Typo, Attribute, Exactness, desc()], + vec![Words, Proximity, Typo, desc(), Attribute, Exactness], + vec![Words, Proximity, Typo, desc(), Exactness, Attribute], + vec![Words, Proximity, Typo, Exactness, Attribute, desc()], + vec![Words, Proximity, Typo, Exactness, desc(), Attribute], + vec![Words, Typo, Attribute, desc(), Exactness, 
Proximity], + vec![Words, Typo, Attribute, desc(), Proximity, Exactness], + vec![Words, Typo, Attribute, Exactness, desc(), Proximity], + vec![Words, Typo, Attribute, Exactness, Proximity, desc()], + vec![Words, Typo, Attribute, Proximity, desc(), Exactness], + vec![Words, Typo, Attribute, Proximity, Exactness, desc()], + vec![Words, Typo, desc(), Attribute, Exactness, Proximity], + vec![Words, Typo, desc(), Attribute, Proximity, Exactness], + vec![Words, Typo, desc(), Exactness, Attribute, Proximity], + vec![Words, Typo, desc(), Exactness, Proximity, Attribute], + vec![Words, Typo, desc(), Proximity, Attribute, Exactness], + vec![Words, Typo, desc(), Proximity, Exactness, Attribute], + vec![Words, Typo, Exactness, Attribute, desc(), Proximity], + vec![Words, Typo, Exactness, Attribute, Proximity, desc()], + vec![Words, Typo, Exactness, desc(), Attribute, Proximity], + vec![Words, Typo, Exactness, desc(), Proximity, Attribute], + vec![Words, Typo, Exactness, Proximity, Attribute, desc()], + vec![Words, Typo, Exactness, Proximity, desc(), Attribute], + vec![Words, Typo, Proximity, Attribute, desc(), Exactness], + vec![Words, Typo, Proximity, Attribute, Exactness, desc()], + vec![Words, Typo, Proximity, desc(), Attribute, Exactness], + vec![Words, Typo, Proximity, desc(), Exactness, Attribute], + vec![Words, Typo, Proximity, Exactness, Attribute, desc()], + vec![Words, Typo, Proximity, Exactness, desc(), Attribute], + ] + }; + + let config = IndexerConfig::default(); + for criteria in criteria_mix { + eprintln!("Testing with criteria order: {:?}", &criteria); + //update criteria + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + builder.set_criteria(criteria.clone()); + builder.execute(|_| (), || false).unwrap(); + wtxn.commit().unwrap(); + + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.query(search::TEST_QUERY); + 
search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.terms_matching_strategy(ALLOW_OPTIONAL_WORDS); + search.authorize_typos(ALLOW_TYPOS); + + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + + let expected_external_ids: Vec<_> = + search::expected_order(&criteria, ALLOW_TYPOS, ALLOW_OPTIONAL_WORDS, &[]) + .into_iter() + .map(|d| d.id) + .collect(); + let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + assert_eq!(documents_ids, expected_external_ids); + } +} + +#[test] +fn criteria_ascdesc() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(12 * 1024 * 1024); // 12 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_sortable_fields(hashset! { + S("name"), + S("age"), + }); + builder.execute(|_| (), || false).unwrap(); + + // index documents + let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; + let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); + + let mut batch_builder = DocumentsBatchBuilder::new(Vec::new()); + + (0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| { + let mut rng = rand::thread_rng(); + + let age = rng.gen::().to_string(); + let name = rng + .sample_iter(&rand::distributions::Alphanumeric) + .map(char::from) + .filter(|c| *c >= 'a' && *c <= 'z') + .take(10) + .collect::(); + + let json = serde_json::json!({ + "name": name, + "age": age, + }); + + let object = match json { + serde_json::Value::Object(object) => object, + _ => panic!(), + }; + + batch_builder.append_json_object(&object).unwrap(); + }); + + let vector = batch_builder.into_inner().unwrap(); + + let reader = 
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); + let (builder, user_error) = builder.add_documents(reader).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let documents = index.all_documents(&rtxn).unwrap().map(|doc| doc.unwrap()).collect::>(); + + for criterion in [Asc(S("name")), Desc(S("name")), Asc(S("age")), Desc(S("age"))] { + eprintln!("Testing with criterion: {:?}", &criterion); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, &config); + builder.set_criteria(vec![criterion.clone()]); + builder.execute(|_| (), || false).unwrap(); + wtxn.commit().unwrap(); + + let mut rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&mut rtxn, &index); + search.limit(ASC_DESC_CANDIDATES_THRESHOLD + 1); + + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + + let expected_document_ids = match criterion { + Asc(field_name) if field_name == "name" => { + documents.iter().sorted_by_key(|(_, obkv)| obkv.get(0).unwrap()) + } + Desc(field_name) if field_name == "name" => { + documents.iter().sorted_by_key(|(_, obkv)| Reverse(obkv.get(0).unwrap())) + } + Asc(field_name) if field_name == "age" => { + documents.iter().sorted_by_key(|(_, obkv)| obkv.get(1).unwrap()) + } + Desc(field_name) if field_name == "age" => { + documents.iter().sorted_by_key(|(_, obkv)| Reverse(obkv.get(1).unwrap())) + } + _ => continue, + } + .map(|(id, _)| *id) + .collect::>(); + + assert_eq!(documents_ids, expected_document_ids); + } +} diff --git a/milli/tests/search/sort.rs b/milli/tests/search/sort.rs new file mode 100644 index 000000000..16d21eac8 --- /dev/null +++ b/milli/tests/search/sort.rs @@ -0,0 +1,23 @@ +use big_s::S; +use milli::Criterion::{Attribute, Exactness, Proximity, Typo, Words}; +use milli::{AscDesc, Error, Member, Search, TermsMatchingStrategy, UserError}; + +use crate::search::{self, 
EXTERNAL_DOCUMENTS_IDS}; + +#[test] +fn sort_ranking_rule_missing() { + let criteria = vec![Words, Typo, Proximity, Attribute, Exactness]; + // sortables: `tag` and `asc_desc_rank` + let index = search::setup_search_index_with_criteria(&criteria); + let rtxn = index.read_txn().unwrap(); + + let mut search = Search::new(&rtxn, &index); + search.query(search::TEST_QUERY); + search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + search.sort_criteria(vec![AscDesc::Asc(Member::Field(S("tag")))]); + + let result = search.execute(); + assert!(matches!(result, Err(Error::UserError(UserError::SortRankingRuleMissing)))); +} diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs new file mode 100644 index 000000000..c939186e5 --- /dev/null +++ b/milli/tests/search/typo_tolerance.rs @@ -0,0 +1,208 @@ +use std::collections::BTreeSet; + +use heed::EnvOpenOptions; +use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; +use milli::{Criterion, Index, Search, TermsMatchingStrategy}; +use serde_json::json; +use tempfile::tempdir; +use Criterion::*; + +#[test] +fn test_typo_tolerance_one_typo() { + let criteria = [Typo]; + let index = super::setup_search_index_with_criteria(&criteria); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zeal"); + search.limit(10); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + + let mut search = Search::new(&txn, &index); + search.query("zean"); + search.limit(10); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 0); + 
} + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_min_word_len_one_typo(4); + builder.execute(|_| (), || false).unwrap(); + + // one typo is now tolerated on 4-letter words + let mut search = Search::new(&txn, &index); + search.query("zean"); + search.limit(10); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); +} + +#[test] +fn test_typo_tolerance_two_typo() { + let criteria = [Typo]; + let index = super::setup_search_index_with_criteria(&criteria); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zealand"); + search.limit(10); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + + let mut search = Search::new(&txn, &index); + search.query("zealemd"); + search.limit(10); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 0); + } + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + builder.set_min_word_len_two_typos(7); + builder.execute(|_| (), || false).unwrap(); + + // two typos are now tolerated on 7-letter words + let mut search = Search::new(&txn, &index); + search.query("zealemd"); + search.limit(10); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); +} + +#[test] +fn test_typo_disabled_on_word() { + let 
tmp = tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(4096 * 100); + let index = Index::new(options, tmp.path()).unwrap(); + + let mut builder = milli::documents::DocumentsBatchBuilder::new(Vec::new()); + let doc1 = json!({ + "id": 1usize, + "data": "zealand", + }); + + let doc2 = json!({ + "id": 2usize, + "data": "zearand", + }); + + builder.append_json_object(doc1.as_object().unwrap()).unwrap(); + builder.append_json_object(doc2.as_object().unwrap()).unwrap(); + let vector = builder.into_inner().unwrap(); + + let documents = + milli::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap(); + + let mut txn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let builder = + IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| (), || false).unwrap(); + + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + txn.commit().unwrap(); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zealand"); + search.limit(10); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 2); + } + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + let mut exact_words = BTreeSet::new(); + // `zealand` doesn't allow typos anymore + exact_words.insert("zealand".to_string()); + builder.set_exact_words(exact_words); + builder.execute(|_| (), || false).unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("zealand"); + search.limit(10); + search.authorize_typos(true); + 
search.terms_matching_strategy(TermsMatchingStrategy::default()); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); +} + +#[test] +fn test_disable_typo_on_attribute() { + let criteria = [Typo]; + let index = super::setup_search_index_with_criteria(&criteria); + + // basic typo search with default typo settings + { + let txn = index.read_txn().unwrap(); + + let mut search = Search::new(&txn, &index); + // typo in `antebel(l)um` + search.query("antebelum"); + search.limit(10); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 1); + } + + let mut txn = index.write_txn().unwrap(); + + let config = IndexerConfig::default(); + let mut builder = Settings::new(&mut txn, &index, &config); + // disable typos on `description` + builder.set_exact_attributes(vec!["description".to_string()].into_iter().collect()); + builder.execute(|_| (), || false).unwrap(); + + let mut search = Search::new(&txn, &index); + search.query("antebelum"); + search.limit(10); + search.authorize_typos(true); + search.terms_matching_strategy(TermsMatchingStrategy::default()); + + let result = search.execute().unwrap(); + assert_eq!(result.documents_ids.len(), 0); +} diff --git a/script/pre-commit b/script/pre-commit new file mode 100755 index 000000000..4819a3b52 --- /dev/null +++ b/script/pre-commit @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +cargo check --workspace --all-targets &>/dev/null +result=$? + +if [[ ${result} -ne 0 ]] ; then + cat <<\EOF +The project does not compile. You might want to fix your error before committing. + +If you still want to commit you can do it by appending +--no-verify +at the end of your previous command. + +If you are running a variant of bash you can directly paste this command in your terminal: +!! --no-verify +EOF + exit 1 +fi + +cargo fmt --all -- --check &>/dev/null +result=$?
+
+if [[ ${result} -ne 0 ]] ; then
+    cat <<\EOF
+The project is badly formatted. Please run:
+cargo fmt --all
+
+If you want to create your commit without proper formatting you can add
+--no-verify
+at the end of your commit.
+
+If you are running a variant of bash you can directly paste this command in your terminal:
+!! --no-verify
+EOF
+    exit 1
+fi
diff --git a/script/qc_loop.sh b/script/qc_loop.sh
new file mode 100755
index 000000000..c479307cc
--- /dev/null
+++ b/script/qc_loop.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+#
+# Runs `cargo test qc_` (release profile) over and over until a run fails,
+# then exits with the status of that failing run.
+
+export RUST_BACKTRACE=1
+
+while true
+do
+    cargo test qc_ --release -- --nocapture
+    # Save the status right away: the `[[ ... ]]` test below overwrites `$?`,
+    # so `exit $?` inside the branch would always exit with 0.
+    status=$?
+    if [[ ${status} -ne 0 ]] ; then
+        exit "${status}"
+    fi
+done