4621: Bring back changes from v1.8.0 into main r=curquiza a=curquiza



Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Tamo <tamo@meilisearch.com>
Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com>
Co-authored-by: Clément Renault <clement@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2024-05-06 13:46:39 +00:00 committed by GitHub
commit 4d5971f343
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
56 changed files with 1450 additions and 802 deletions

247
Cargo.lock generated
View File

@ -354,9 +354,9 @@ dependencies = [
[[package]] [[package]]
name = "anyhow" name = "anyhow"
version = "1.0.80" version = "1.0.82"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" checksum = "f538837af36e6f6a9be0faa67f9a314f8119e4e4b5867c6ab40ed60360142519"
dependencies = [ dependencies = [
"backtrace", "backtrace",
] ]
@ -889,9 +889,9 @@ dependencies = [
[[package]] [[package]]
name = "charabia" name = "charabia"
version = "0.8.8" version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60dc1a562fc8cb53d552d371758a4ecd76d15cc7489d2b968529cd9cadcbd854" checksum = "933f20f2269b24d32fd5503e7b3c268af902190daf8d9d2b73ed2e75d77c00b4"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"cow-utils", "cow-utils",
@ -901,9 +901,7 @@ dependencies = [
"fst", "fst",
"irg-kvariants", "irg-kvariants",
"jieba-rs", "jieba-rs",
"lindera-core", "lindera",
"lindera-dictionary",
"lindera-tokenizer",
"litemap", "litemap",
"once_cell", "once_cell",
"pinyin", "pinyin",
@ -1715,9 +1713,9 @@ dependencies = [
[[package]] [[package]]
name = "env_logger" name = "env_logger"
version = "0.11.2" version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9"
dependencies = [ dependencies = [
"anstream", "anstream",
"anstyle", "anstyle",
@ -2661,6 +2659,15 @@ dependencies = [
"simple_asn1", "simple_asn1",
] ]
[[package]]
name = "kanaria"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff"
dependencies = [
"bitflags 1.3.2",
]
[[package]] [[package]]
name = "kstring" name = "kstring"
version = "2.0.0" version = "2.0.0"
@ -2766,10 +2773,67 @@ dependencies = [
] ]
[[package]] [[package]]
name = "lindera-cc-cedict-builder" name = "lindera"
version = "0.28.0" version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca21f2ee3ca40e7f3ebbd568d041be1531c2c28dbf540e737aeba934ab53f330" checksum = "a1bbf252ea3490053dc397539ece0b510924f2f72605fa28d3e858d86f43ec88"
dependencies = [
"lindera-analyzer",
"lindera-core",
"lindera-dictionary",
"lindera-filter",
"lindera-tokenizer",
]
[[package]]
name = "lindera-analyzer"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87febfec0e2859ce2154fb90dd6f66b774ddb0b6e264b44f8e3d1303c9dcedd7"
dependencies = [
"anyhow",
"bincode",
"byteorder",
"encoding",
"kanaria",
"lindera-cc-cedict-builder",
"lindera-core",
"lindera-dictionary",
"lindera-filter",
"lindera-ipadic-builder",
"lindera-ko-dic-builder",
"lindera-tokenizer",
"lindera-unidic-builder",
"once_cell",
"regex",
"serde",
"serde_json",
"thiserror",
"unicode-blocks",
"unicode-normalization",
"unicode-segmentation",
"yada",
]
[[package]]
name = "lindera-cc-cedict"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcb91bb8a93ab0f95dbc3c43b5105354bb059134ef731154f75a64b5d919e71d"
dependencies = [
"bincode",
"byteorder",
"lindera-cc-cedict-builder",
"lindera-core",
"lindera-decompress",
"once_cell",
]
[[package]]
name = "lindera-cc-cedict-builder"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6022a8309a287dbef425fd09a61585351670c83001d74f6c089979e2330b683"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
@ -2778,6 +2842,7 @@ dependencies = [
"encoding", "encoding",
"env_logger", "env_logger",
"glob", "glob",
"lindera-compress",
"lindera-core", "lindera-core",
"lindera-decompress", "lindera-decompress",
"log", "log",
@ -2786,9 +2851,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera-compress" name = "lindera-compress"
version = "0.28.0" version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34da125091f3b3a49351f418484a16cb2a23f6888cd53fe219edad19d263da5d" checksum = "32363cbcf433f915e7d77c2a0c410db2d6b23442e80715cf2cf6b9864078a500"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"flate2", "flate2",
@ -2797,9 +2862,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera-core" name = "lindera-core"
version = "0.28.0" version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09d4b717a8a31b73a3cbd3552e0abda14e0c85d97dc8b911035342533defdbad" checksum = "d9a0e858753a02b1a3524fae4fbb11ca4b3a947128fd7854b797386562678be8"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
@ -2814,9 +2879,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera-decompress" name = "lindera-decompress"
version = "0.28.0" version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "98f4476c99cb4ffa54fbfc42953adf69ada7276cfbb594bce9829547de012058" checksum = "0e406345f6f8b665b9a129c67079c18ca9d97e9d171d102b4106a64a592c285e"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"flate2", "flate2",
@ -2825,29 +2890,73 @@ dependencies = [
[[package]] [[package]]
name = "lindera-dictionary" name = "lindera-dictionary"
version = "0.28.0" version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a45b92f0ce331c2202c6cec3135e4bfce29525ab3bb97a613c27c8e0a29fa967" checksum = "3e2a3ec0e5fd6768a27c6ec1040e8470d3a5926418f7afe065859e98aabb3bfe"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
"byteorder", "byteorder",
"lindera-cc-cedict",
"lindera-cc-cedict-builder", "lindera-cc-cedict-builder",
"lindera-core", "lindera-core",
"lindera-ipadic",
"lindera-ipadic-builder", "lindera-ipadic-builder",
"lindera-ipadic-neologd",
"lindera-ipadic-neologd-builder", "lindera-ipadic-neologd-builder",
"lindera-ko-dic", "lindera-ko-dic",
"lindera-ko-dic-builder", "lindera-ko-dic-builder",
"lindera-unidic", "lindera-unidic",
"lindera-unidic-builder", "lindera-unidic-builder",
"serde", "serde",
"strum",
"strum_macros",
]
[[package]]
name = "lindera-filter"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1badaf51bad051185ea4917ba91bbbf2d6f8167e155647e21e0eaaef0982a95d"
dependencies = [
"anyhow",
"csv",
"kanaria",
"lindera-cc-cedict-builder",
"lindera-core",
"lindera-dictionary",
"lindera-ipadic-builder",
"lindera-ko-dic-builder",
"lindera-unidic-builder",
"once_cell",
"regex",
"serde",
"serde_json",
"unicode-blocks",
"unicode-normalization",
"unicode-segmentation",
"yada",
]
[[package]]
name = "lindera-ipadic"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "129ec16366354998f9791467ad38731539197747f649e573ead845358271ce25"
dependencies = [
"bincode",
"byteorder",
"lindera-core",
"lindera-decompress",
"lindera-ipadic-builder",
"once_cell",
] ]
[[package]] [[package]]
name = "lindera-ipadic-builder" name = "lindera-ipadic-builder"
version = "0.28.0" version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "642dee52201852df209cb43423ff1ca4d161a329f5cdba049a7b5820118345f2" checksum = "7f0979a56bc57e9c9be2996dff232c47aa146a2e7baebf5dd567e388eba3dd90"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
@ -2857,6 +2966,7 @@ dependencies = [
"encoding_rs_io", "encoding_rs_io",
"env_logger", "env_logger",
"glob", "glob",
"lindera-compress",
"lindera-core", "lindera-core",
"lindera-decompress", "lindera-decompress",
"log", "log",
@ -2865,10 +2975,24 @@ dependencies = [
] ]
[[package]] [[package]]
name = "lindera-ipadic-neologd-builder" name = "lindera-ipadic-neologd"
version = "0.28.0" version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "325144b154e68159373e944d1cd7f67c6ff9965a2af41240a8e41732b3fdb3af" checksum = "20076660c4e79ef0316735b44e18ec7644e54786acdee8946c972d5f97086d0f"
dependencies = [
"bincode",
"byteorder",
"lindera-core",
"lindera-decompress",
"lindera-ipadic-neologd-builder",
"once_cell",
]
[[package]]
name = "lindera-ipadic-neologd-builder"
version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eccd18ed5f65d1d64ac0cbfa1d6827bfbbaf6530520ae6847e6a91ee38f47e20"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
@ -2878,6 +3002,7 @@ dependencies = [
"encoding_rs_io", "encoding_rs_io",
"env_logger", "env_logger",
"glob", "glob",
"lindera-compress",
"lindera-core", "lindera-core",
"lindera-decompress", "lindera-decompress",
"log", "log",
@ -2887,9 +3012,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera-ko-dic" name = "lindera-ko-dic"
version = "0.28.0" version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b484a2f9964e7424264fda304beb6ff6ad883c347accfe1115e777dedef3661d" checksum = "59073171566c3e498ca048e84c2d0a7e117a42f36c8eb7d7163e65ac38bd6d48"
dependencies = [ dependencies = [
"bincode", "bincode",
"byteorder", "byteorder",
@ -2900,13 +3025,14 @@ dependencies = [
"lindera-ko-dic-builder", "lindera-ko-dic-builder",
"once_cell", "once_cell",
"tar", "tar",
"ureq",
] ]
[[package]] [[package]]
name = "lindera-ko-dic-builder" name = "lindera-ko-dic-builder"
version = "0.28.0" version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9413d4d9bf7af921f5ac64414a290c7ba81695e8ba08dd2f6c950b57c281a69" checksum = "ae176afa8535ca2a5ee9471873f85d531db0a6c32a3c42b41084506aac22b577"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
@ -2924,9 +3050,9 @@ dependencies = [
[[package]] [[package]]
name = "lindera-tokenizer" name = "lindera-tokenizer"
version = "0.28.0" version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9987c818462d51ca67e131e40f0386e25e8c557e195059b1257f95731561185d" checksum = "457285bdde84571aa510c9e05371904305a55e8a541fa1473d4393062f06932d"
dependencies = [ dependencies = [
"bincode", "bincode",
"lindera-core", "lindera-core",
@ -2938,26 +3064,27 @@ dependencies = [
[[package]] [[package]]
name = "lindera-unidic" name = "lindera-unidic"
version = "0.28.0" version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c379cf436b2627cd7d3498642e491eadbff9b3e01231c516ce9f9b1893ab7c3" checksum = "5839980be552dfa639b70964c61914a9ad014148663679b0e148aa72e5e30f23"
dependencies = [ dependencies = [
"bincode", "bincode",
"byteorder", "byteorder",
"encoding", "encoding",
"flate2",
"lindera-core", "lindera-core",
"lindera-decompress", "lindera-decompress",
"lindera-unidic-builder", "lindera-unidic-builder",
"once_cell", "once_cell",
"tar",
"ureq", "ureq",
"zip",
] ]
[[package]] [[package]]
name = "lindera-unidic-builder" name = "lindera-unidic-builder"
version = "0.28.0" version = "0.30.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "601ec33b5174141396a7a4ca066278863840221fec32d0be19091e7fae91ed94" checksum = "dcaab8f061d5b944b1e424f49c7efbf8f276e8a72e4f4ff956d01e46d481f008"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"bincode", "bincode",
@ -4214,9 +4341,9 @@ dependencies = [
[[package]] [[package]]
name = "regex" name = "regex"
version = "1.10.2" version = "1.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"memchr", "memchr",
@ -4226,9 +4353,9 @@ dependencies = [
[[package]] [[package]]
name = "regex-automata" name = "regex-automata"
version = "0.4.3" version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea"
dependencies = [ dependencies = [
"aho-corasick", "aho-corasick",
"memchr", "memchr",
@ -4795,6 +4922,28 @@ version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "strum"
version = "0.26.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29"
dependencies = [
"strum_macros",
]
[[package]]
name = "strum_macros"
version = "0.26.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6cf59daf282c0a494ba14fd21610a0325f9f90ec9d1231dea26bcb1d696c946"
dependencies = [
"heck",
"proc-macro2",
"quote",
"rustversion",
"syn 2.0.58",
]
[[package]] [[package]]
name = "subtle" name = "subtle"
version = "2.5.0" version = "2.5.0"
@ -5324,6 +5473,12 @@ version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
[[package]]
name = "unicode-blocks"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b"
[[package]] [[package]]
name = "unicode-ident" name = "unicode-ident"
version = "1.0.12" version = "1.0.12"
@ -5332,9 +5487,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]] [[package]]
name = "unicode-normalization" name = "unicode-normalization"
version = "0.1.22" version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5"
dependencies = [ dependencies = [
"tinyvec", "tinyvec",
] ]
@ -5350,9 +5505,9 @@ dependencies = [
[[package]] [[package]]
name = "unicode-segmentation" name = "unicode-segmentation"
version = "1.10.1" version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202"
[[package]] [[package]]
name = "unicode-width" name = "unicode-width"
@ -5942,9 +6097,9 @@ dependencies = [
[[package]] [[package]]
name = "yada" name = "yada"
version = "0.5.0" version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6d12cb7a57bbf2ab670ed9545bae3648048547f9039279a89ce000208e585c1" checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd"
[[package]] [[package]]
name = "yaml-rust" name = "yaml-rust"

View File

@ -256,8 +256,8 @@ pub(crate) mod test {
pub fn create_test_settings() -> Settings<Checked> { pub fn create_test_settings() -> Settings<Checked> {
let settings = Settings { let settings = Settings {
displayed_attributes: Setting::Set(vec![S("race"), S("name")]), displayed_attributes: Setting::Set(vec![S("race"), S("name")]).into(),
searchable_attributes: Setting::Set(vec![S("name"), S("race")]), searchable_attributes: Setting::Set(vec![S("name"), S("race")]).into(),
filterable_attributes: Setting::Set(btreeset! { S("race"), S("age") }), filterable_attributes: Setting::Set(btreeset! { S("race"), S("age") }),
sortable_attributes: Setting::Set(btreeset! { S("age") }), sortable_attributes: Setting::Set(btreeset! { S("age") }),
ranking_rules: Setting::NotSet, ranking_rules: Setting::NotSet,

View File

@ -315,8 +315,8 @@ impl From<v5::ResponseError> for v6::ResponseError {
impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> { impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
fn from(settings: v5::Settings<T>) -> Self { fn from(settings: v5::Settings<T>) -> Self {
v6::Settings { v6::Settings {
displayed_attributes: settings.displayed_attributes.into(), displayed_attributes: v6::Setting::from(settings.displayed_attributes).into(),
searchable_attributes: settings.searchable_attributes.into(), searchable_attributes: v6::Setting::from(settings.searchable_attributes).into(),
filterable_attributes: settings.filterable_attributes.into(), filterable_attributes: settings.filterable_attributes.into(),
sortable_attributes: settings.sortable_attributes.into(), sortable_attributes: settings.sortable_attributes.into(),
ranking_rules: { ranking_rules: {

View File

@ -3041,6 +3041,7 @@ mod tests {
source: Setting::Set(milli::vector::settings::EmbedderSource::Rest), source: Setting::Set(milli::vector::settings::EmbedderSource::Rest),
api_key: Setting::Set(S("My super secret")), api_key: Setting::Set(S("My super secret")),
url: Setting::Set(S("http://localhost:7777")), url: Setting::Set(S("http://localhost:7777")),
dimensions: Setting::Set(4),
..Default::default() ..Default::default()
}; };
embedders.insert(S("default"), Setting::Set(embedding_settings)); embedders.insert(S("default"), Setting::Set(embedding_settings));

View File

@ -7,6 +7,7 @@ expression: task.details
"default": { "default": {
"source": "rest", "source": "rest",
"apiKey": "MyXXXX...", "apiKey": "MyXXXX...",
"dimensions": 4,
"url": "http://localhost:7777" "url": "http://localhost:7777"
} }
} }

View File

@ -6,7 +6,7 @@ expression: embedding_config.embedder_options
"Rest": { "Rest": {
"api_key": "My super secret", "api_key": "My super secret",
"distribution": null, "distribution": null,
"dimensions": null, "dimensions": 4,
"url": "http://localhost:7777", "url": "http://localhost:7777",
"query": null, "query": null,
"input_field": [ "input_field": [

View File

@ -7,6 +7,7 @@ expression: task.details
"default": { "default": {
"source": "rest", "source": "rest",
"apiKey": "MyXXXX...", "apiKey": "MyXXXX...",
"dimensions": 4,
"url": "http://localhost:7777" "url": "http://localhost:7777"
} }
} }

View File

@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs
[] []
---------------------------------------------------------------------- ----------------------------------------------------------------------
### All Tasks: ### All Tasks:
0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} 0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), 
filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
---------------------------------------------------------------------- ----------------------------------------------------------------------
### Status: ### Status:
enqueued [0,] enqueued [0,]

View File

@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs
[] []
---------------------------------------------------------------------- ----------------------------------------------------------------------
### All Tasks: ### All Tasks:
0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), 
filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
---------------------------------------------------------------------- ----------------------------------------------------------------------
### Status: ### Status:
enqueued [] enqueued []

View File

@ -44,6 +44,7 @@ all-tokenizations = ["milli/all-tokenizations"]
# chinese specialized tokenization # chinese specialized tokenization
chinese = ["milli/chinese"] chinese = ["milli/chinese"]
chinese-pinyin = ["milli/chinese-pinyin"]
# hebrew specialized tokenization # hebrew specialized tokenization
hebrew = ["milli/hebrew"] hebrew = ["milli/hebrew"]
# japanese specialized tokenization # japanese specialized tokenization
@ -56,3 +57,5 @@ greek = ["milli/greek"]
khmer = ["milli/khmer"] khmer = ["milli/khmer"]
# allow vietnamese specialized tokenization # allow vietnamese specialized tokenization
vietnamese = ["milli/vietnamese"] vietnamese = ["milli/vietnamese"]
# force swedish character recomposition
swedish-recomposition = ["milli/swedish-recomposition"]

View File

@ -3,7 +3,7 @@ use std::convert::Infallible;
use std::fmt; use std::fmt;
use std::marker::PhantomData; use std::marker::PhantomData;
use std::num::NonZeroUsize; use std::num::NonZeroUsize;
use std::ops::ControlFlow; use std::ops::{ControlFlow, Deref};
use std::str::FromStr; use std::str::FromStr;
use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef};
@ -143,21 +143,13 @@ impl MergeWithError<milli::CriterionError> for DeserrJsonError<InvalidSettingsRa
)] )]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct Settings<T> { pub struct Settings<T> {
#[serde( #[serde(default, skip_serializing_if = "Setting::is_not_set")]
default,
serialize_with = "serialize_with_wildcard",
skip_serializing_if = "Setting::is_not_set"
)]
#[deserr(default, error = DeserrJsonError<InvalidSettingsDisplayedAttributes>)] #[deserr(default, error = DeserrJsonError<InvalidSettingsDisplayedAttributes>)]
pub displayed_attributes: Setting<Vec<String>>, pub displayed_attributes: WildcardSetting,
#[serde( #[serde(default, skip_serializing_if = "Setting::is_not_set")]
default,
serialize_with = "serialize_with_wildcard",
skip_serializing_if = "Setting::is_not_set"
)]
#[deserr(default, error = DeserrJsonError<InvalidSettingsSearchableAttributes>)] #[deserr(default, error = DeserrJsonError<InvalidSettingsSearchableAttributes>)]
pub searchable_attributes: Setting<Vec<String>>, pub searchable_attributes: WildcardSetting,
#[serde(default, skip_serializing_if = "Setting::is_not_set")] #[serde(default, skip_serializing_if = "Setting::is_not_set")]
#[deserr(default, error = DeserrJsonError<InvalidSettingsFilterableAttributes>)] #[deserr(default, error = DeserrJsonError<InvalidSettingsFilterableAttributes>)]
@ -251,8 +243,8 @@ impl<T> Settings<T> {
impl Settings<Checked> { impl Settings<Checked> {
pub fn cleared() -> Settings<Checked> { pub fn cleared() -> Settings<Checked> {
Settings { Settings {
displayed_attributes: Setting::Reset, displayed_attributes: Setting::Reset.into(),
searchable_attributes: Setting::Reset, searchable_attributes: Setting::Reset.into(),
filterable_attributes: Setting::Reset, filterable_attributes: Setting::Reset,
sortable_attributes: Setting::Reset, sortable_attributes: Setting::Reset,
ranking_rules: Setting::Reset, ranking_rules: Setting::Reset,
@ -319,7 +311,7 @@ impl Settings<Checked> {
impl Settings<Unchecked> { impl Settings<Unchecked> {
pub fn check(self) -> Settings<Checked> { pub fn check(self) -> Settings<Checked> {
let displayed_attributes = match self.displayed_attributes { let displayed_attributes = match self.displayed_attributes.0 {
Setting::Set(fields) => { Setting::Set(fields) => {
if fields.iter().any(|f| f == "*") { if fields.iter().any(|f| f == "*") {
Setting::Reset Setting::Reset
@ -330,7 +322,7 @@ impl Settings<Unchecked> {
otherwise => otherwise, otherwise => otherwise,
}; };
let searchable_attributes = match self.searchable_attributes { let searchable_attributes = match self.searchable_attributes.0 {
Setting::Set(fields) => { Setting::Set(fields) => {
if fields.iter().any(|f| f == "*") { if fields.iter().any(|f| f == "*") {
Setting::Reset Setting::Reset
@ -342,8 +334,8 @@ impl Settings<Unchecked> {
}; };
Settings { Settings {
displayed_attributes, displayed_attributes: displayed_attributes.into(),
searchable_attributes, searchable_attributes: searchable_attributes.into(),
filterable_attributes: self.filterable_attributes, filterable_attributes: self.filterable_attributes,
sortable_attributes: self.sortable_attributes, sortable_attributes: self.sortable_attributes,
ranking_rules: self.ranking_rules, ranking_rules: self.ranking_rules,
@ -412,13 +404,13 @@ pub fn apply_settings_to_builder(
_kind, _kind,
} = settings; } = settings;
match searchable_attributes { match searchable_attributes.deref() {
Setting::Set(ref names) => builder.set_searchable_fields(names.clone()), Setting::Set(ref names) => builder.set_searchable_fields(names.clone()),
Setting::Reset => builder.reset_searchable_fields(), Setting::Reset => builder.reset_searchable_fields(),
Setting::NotSet => (), Setting::NotSet => (),
} }
match displayed_attributes { match displayed_attributes.deref() {
Setting::Set(ref names) => builder.set_displayed_fields(names.clone()), Setting::Set(ref names) => builder.set_displayed_fields(names.clone()),
Setting::Reset => builder.reset_displayed_fields(), Setting::Reset => builder.reset_displayed_fields(),
Setting::NotSet => (), Setting::NotSet => (),
@ -690,11 +682,13 @@ pub fn settings(
displayed_attributes: match displayed_attributes { displayed_attributes: match displayed_attributes {
Some(attrs) => Setting::Set(attrs), Some(attrs) => Setting::Set(attrs),
None => Setting::Reset, None => Setting::Reset,
}, }
.into(),
searchable_attributes: match searchable_attributes { searchable_attributes: match searchable_attributes {
Some(attrs) => Setting::Set(attrs), Some(attrs) => Setting::Set(attrs),
None => Setting::Reset, None => Setting::Reset,
}, }
.into(),
filterable_attributes: Setting::Set(filterable_attributes), filterable_attributes: Setting::Set(filterable_attributes),
sortable_attributes: Setting::Set(sortable_attributes), sortable_attributes: Setting::Set(sortable_attributes),
ranking_rules: Setting::Set(criteria.iter().map(|c| c.clone().into()).collect()), ranking_rules: Setting::Set(criteria.iter().map(|c| c.clone().into()).collect()),
@ -848,6 +842,41 @@ impl From<ProximityPrecisionView> for ProximityPrecision {
} }
} }
/// Newtype around `Setting<Vec<String>>` used for the `displayed_attributes`
/// and `searchable_attributes` fields of `Settings`.
///
/// The wrapper exists so the type can carry its own `Serialize` implementation
/// (which delegates to `serialize_with_wildcard`) instead of requiring a
/// `serialize_with` attribute on every field that uses it.
// NOTE(review): presumably `serialize_with_wildcard` renders a reset setting as
// the `"*"` wildcard; confirm against that helper, it is not visible here.
#[derive(Debug, Clone, Default, Deserialize, PartialEq, Eq)]
pub struct WildcardSetting(Setting<Vec<String>>);
impl From<Setting<Vec<String>>> for WildcardSetting {
fn from(setting: Setting<Vec<String>>) -> Self {
Self(setting)
}
}
impl Serialize for WildcardSetting {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
serialize_with_wildcard(&self.0, serializer)
}
}
impl<E: deserr::DeserializeError> Deserr<E> for WildcardSetting {
    /// Deserializes the inner `Setting` with its own `Deserr` implementation
    /// and wraps the result; any error from the inner call is propagated.
    fn deserialize_from_value<V: deserr::IntoValue>(
        value: deserr::Value<V>,
        location: ValuePointerRef<'_>,
    ) -> Result<Self, E> {
        let inner = Setting::deserialize_from_value(value, location)?;
        Ok(Self(inner))
    }
}
impl std::ops::Deref for WildcardSetting {
type Target = Setting<Vec<String>>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
#[cfg(test)] #[cfg(test)]
pub(crate) mod test { pub(crate) mod test {
use super::*; use super::*;
@ -856,8 +885,8 @@ pub(crate) mod test {
fn test_setting_check() { fn test_setting_check() {
// test no changes // test no changes
let settings = Settings { let settings = Settings {
displayed_attributes: Setting::Set(vec![String::from("hello")]), displayed_attributes: Setting::Set(vec![String::from("hello")]).into(),
searchable_attributes: Setting::Set(vec![String::from("hello")]), searchable_attributes: Setting::Set(vec![String::from("hello")]).into(),
filterable_attributes: Setting::NotSet, filterable_attributes: Setting::NotSet,
sortable_attributes: Setting::NotSet, sortable_attributes: Setting::NotSet,
ranking_rules: Setting::NotSet, ranking_rules: Setting::NotSet,
@ -883,8 +912,9 @@ pub(crate) mod test {
// test wildcard // test wildcard
// test no changes // test no changes
let settings = Settings { let settings = Settings {
displayed_attributes: Setting::Set(vec![String::from("*")]), displayed_attributes: Setting::Set(vec![String::from("*")]).into(),
searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")]), searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")])
.into(),
filterable_attributes: Setting::NotSet, filterable_attributes: Setting::NotSet,
sortable_attributes: Setting::NotSet, sortable_attributes: Setting::NotSet,
ranking_rules: Setting::NotSet, ranking_rules: Setting::NotSet,
@ -904,7 +934,7 @@ pub(crate) mod test {
}; };
let checked = settings.check(); let checked = settings.check();
assert_eq!(checked.displayed_attributes, Setting::Reset); assert_eq!(checked.displayed_attributes, Setting::Reset.into());
assert_eq!(checked.searchable_attributes, Setting::Reset); assert_eq!(checked.searchable_attributes, Setting::Reset.into());
} }
} }

View File

@ -149,12 +149,14 @@ mini-dashboard = [
"zip", "zip",
] ]
chinese = ["meilisearch-types/chinese"] chinese = ["meilisearch-types/chinese"]
chinese-pinyin = ["meilisearch-types/chinese-pinyin"]
hebrew = ["meilisearch-types/hebrew"] hebrew = ["meilisearch-types/hebrew"]
japanese = ["meilisearch-types/japanese"] japanese = ["meilisearch-types/japanese"]
thai = ["meilisearch-types/thai"] thai = ["meilisearch-types/thai"]
greek = ["meilisearch-types/greek"] greek = ["meilisearch-types/greek"]
khmer = ["meilisearch-types/khmer"] khmer = ["meilisearch-types/khmer"]
vietnamese = ["meilisearch-types/vietnamese"] vietnamese = ["meilisearch-types/vietnamese"]
swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
[package.metadata.mini-dashboard] [package.metadata.mini-dashboard]
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip" assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip"

View File

@ -7,7 +7,6 @@ use serde_json::Value;
use super::{find_user_id, Analytics, DocumentDeletionKind, DocumentFetchKind}; use super::{find_user_id, Analytics, DocumentDeletionKind, DocumentFetchKind};
use crate::routes::indexes::documents::UpdateDocumentsQuery; use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::tasks::TasksFilterQuery;
use crate::Opt; use crate::Opt;
pub struct MockAnalytics { pub struct MockAnalytics {
@ -86,6 +85,4 @@ impl Analytics for MockAnalytics {
} }
fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
fn get_tasks(&self, _query: &TasksFilterQuery, _request: &HttpRequest) {}
fn health_seen(&self, _request: &HttpRequest) {}
} }

View File

@ -14,7 +14,6 @@ use platform_dirs::AppDirs;
use serde_json::Value; use serde_json::Value;
use crate::routes::indexes::documents::UpdateDocumentsQuery; use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::tasks::TasksFilterQuery;
// if the analytics feature is disabled // if the analytics feature is disabled
// the `SegmentAnalytics` point to the mock instead of the real analytics // the `SegmentAnalytics` point to the mock instead of the real analytics
@ -117,10 +116,4 @@ pub trait Analytics: Sync + Send {
index_creation: bool, index_creation: bool,
request: &HttpRequest, request: &HttpRequest,
); );
// this method should be called to aggregate the get tasks requests.
fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest);
// this method should be called to aggregate a add documents request
fn health_seen(&self, request: &HttpRequest);
} }

View File

@ -33,7 +33,6 @@ use crate::option::{
}; };
use crate::routes::indexes::documents::UpdateDocumentsQuery; use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::indexes::facet_search::FacetSearchQuery; use crate::routes::indexes::facet_search::FacetSearchQuery;
use crate::routes::tasks::TasksFilterQuery;
use crate::routes::{create_all_stats, Stats}; use crate::routes::{create_all_stats, Stats};
use crate::search::{ use crate::search::{
FacetSearchResult, MatchingStrategy, SearchQuery, SearchQueryWithIndex, SearchResult, FacetSearchResult, MatchingStrategy, SearchQuery, SearchQueryWithIndex, SearchResult,
@ -81,8 +80,6 @@ pub enum AnalyticsMsg {
AggregateUpdateDocuments(DocumentsAggregator), AggregateUpdateDocuments(DocumentsAggregator),
AggregateGetFetchDocuments(DocumentsFetchAggregator), AggregateGetFetchDocuments(DocumentsFetchAggregator),
AggregatePostFetchDocuments(DocumentsFetchAggregator), AggregatePostFetchDocuments(DocumentsFetchAggregator),
AggregateTasks(TasksAggregator),
AggregateHealth(HealthAggregator),
} }
pub struct SegmentAnalytics { pub struct SegmentAnalytics {
@ -152,8 +149,6 @@ impl SegmentAnalytics {
update_documents_aggregator: DocumentsAggregator::default(), update_documents_aggregator: DocumentsAggregator::default(),
get_fetch_documents_aggregator: DocumentsFetchAggregator::default(), get_fetch_documents_aggregator: DocumentsFetchAggregator::default(),
post_fetch_documents_aggregator: DocumentsFetchAggregator::default(), post_fetch_documents_aggregator: DocumentsFetchAggregator::default(),
get_tasks_aggregator: TasksAggregator::default(),
health_aggregator: HealthAggregator::default(),
}); });
tokio::spawn(segment.run(index_scheduler.clone(), auth_controller.clone())); tokio::spawn(segment.run(index_scheduler.clone(), auth_controller.clone()));
@ -231,16 +226,6 @@ impl super::Analytics for SegmentAnalytics {
let aggregate = DocumentsFetchAggregator::from_query(documents_query, request); let aggregate = DocumentsFetchAggregator::from_query(documents_query, request);
let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate)); let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate));
} }
fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest) {
let aggregate = TasksAggregator::from_query(query, request);
let _ = self.sender.try_send(AnalyticsMsg::AggregateTasks(aggregate));
}
fn health_seen(&self, request: &HttpRequest) {
let aggregate = HealthAggregator::from_query(request);
let _ = self.sender.try_send(AnalyticsMsg::AggregateHealth(aggregate));
}
} }
/// This structure represent the `infos` field we send in the analytics. /// This structure represent the `infos` field we send in the analytics.
@ -394,8 +379,6 @@ pub struct Segment {
update_documents_aggregator: DocumentsAggregator, update_documents_aggregator: DocumentsAggregator,
get_fetch_documents_aggregator: DocumentsFetchAggregator, get_fetch_documents_aggregator: DocumentsFetchAggregator,
post_fetch_documents_aggregator: DocumentsFetchAggregator, post_fetch_documents_aggregator: DocumentsFetchAggregator,
get_tasks_aggregator: TasksAggregator,
health_aggregator: HealthAggregator,
} }
impl Segment { impl Segment {
@ -458,8 +441,6 @@ impl Segment {
Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg), Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateGetFetchDocuments(agreg)) => self.get_fetch_documents_aggregator.aggregate(agreg), Some(AnalyticsMsg::AggregateGetFetchDocuments(agreg)) => self.get_fetch_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregatePostFetchDocuments(agreg)) => self.post_fetch_documents_aggregator.aggregate(agreg), Some(AnalyticsMsg::AggregatePostFetchDocuments(agreg)) => self.post_fetch_documents_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateTasks(agreg)) => self.get_tasks_aggregator.aggregate(agreg),
Some(AnalyticsMsg::AggregateHealth(agreg)) => self.health_aggregator.aggregate(agreg),
None => (), None => (),
} }
} }
@ -513,8 +494,6 @@ impl Segment {
update_documents_aggregator, update_documents_aggregator,
get_fetch_documents_aggregator, get_fetch_documents_aggregator,
post_fetch_documents_aggregator, post_fetch_documents_aggregator,
get_tasks_aggregator,
health_aggregator,
} = self; } = self;
if let Some(get_search) = if let Some(get_search) =
@ -562,12 +541,6 @@ impl Segment {
{ {
let _ = self.batcher.push(post_fetch_documents).await; let _ = self.batcher.push(post_fetch_documents).await;
} }
if let Some(get_tasks) = take(get_tasks_aggregator).into_event(user, "Tasks Seen") {
let _ = self.batcher.push(get_tasks).await;
}
if let Some(health) = take(health_aggregator).into_event(user, "Health Seen") {
let _ = self.batcher.push(health).await;
}
let _ = self.batcher.flush().await; let _ = self.batcher.flush().await;
} }
} }
@ -1503,176 +1476,6 @@ impl DocumentsDeletionAggregator {
} }
} }
/// Accumulates analytics about requests built from a `TasksFilterQuery`.
///
/// One instance is created per request (see `from_query`) and instances are
/// merged with `aggregate`; `into_event` turns the result into a `Track` event.
#[derive(Default, Serialize)]
pub struct TasksAggregator {
// Not serialized; `into_event` reads it and returns `None` when it is unset.
#[serde(skip)]
timestamp: Option<OffsetDateTime>,
// context
#[serde(rename = "user-agent")]
user_agents: HashSet<String>,
// Each `filtered_by_*` flag is true when the matching query filter was
// present in at least one of the aggregated requests.
filtered_by_uid: bool,
filtered_by_index_uid: bool,
filtered_by_type: bool,
filtered_by_status: bool,
filtered_by_canceled_by: bool,
filtered_by_before_enqueued_at: bool,
filtered_by_after_enqueued_at: bool,
filtered_by_before_started_at: bool,
filtered_by_after_started_at: bool,
filtered_by_before_finished_at: bool,
filtered_by_after_finished_at: bool,
// Number of requests merged into this aggregator (saturating).
total_received: usize,
}
impl TasksAggregator {
    /// Builds an aggregator describing a single request.
    ///
    /// Each `filtered_by_*` flag records whether the corresponding filter of
    /// `query` is set; `limit` and `from` are deliberately ignored. The
    /// timestamp is the current UTC time and `total_received` starts at 1.
    pub fn from_query(query: &TasksFilterQuery, request: &HttpRequest) -> Self {
        // Exhaustive destructuring: adding a field to `TasksFilterQuery`
        // forces this function to be revisited.
        let TasksFilterQuery {
            limit: _,
            from: _,
            uids,
            index_uids,
            types,
            statuses,
            canceled_by,
            before_enqueued_at,
            after_enqueued_at,
            before_started_at,
            after_started_at,
            before_finished_at,
            after_finished_at,
        } = query;

        Self {
            timestamp: Some(OffsetDateTime::now_utc()),
            user_agents: extract_user_agents(request).into_iter().collect(),
            filtered_by_uid: uids.is_some(),
            filtered_by_index_uid: index_uids.is_some(),
            filtered_by_type: types.is_some(),
            filtered_by_status: statuses.is_some(),
            filtered_by_canceled_by: canceled_by.is_some(),
            filtered_by_before_enqueued_at: before_enqueued_at.is_some(),
            filtered_by_after_enqueued_at: after_enqueued_at.is_some(),
            filtered_by_before_started_at: before_started_at.is_some(),
            filtered_by_after_started_at: after_started_at.is_some(),
            filtered_by_before_finished_at: before_finished_at.is_some(),
            filtered_by_after_finished_at: after_finished_at.is_some(),
            total_received: 1,
        }
    }

    /// Aggregate one [TasksAggregator] into another.
    ///
    /// The timestamp already stored in `self` wins; flags are OR-ed together,
    /// user agents are merged and the request counters are added (saturating).
    pub fn aggregate(&mut self, other: Self) {
        let Self {
            timestamp,
            user_agents,
            total_received,
            filtered_by_uid,
            filtered_by_index_uid,
            filtered_by_type,
            filtered_by_status,
            filtered_by_canceled_by,
            filtered_by_before_enqueued_at,
            filtered_by_after_enqueued_at,
            filtered_by_before_started_at,
            filtered_by_after_started_at,
            filtered_by_before_finished_at,
            filtered_by_after_finished_at,
        } = other;

        if self.timestamp.is_none() {
            self.timestamp = timestamp;
        }

        // merge the other set of user agents into ours
        self.user_agents.extend(user_agents);

        self.filtered_by_uid |= filtered_by_uid;
        self.filtered_by_index_uid |= filtered_by_index_uid;
        self.filtered_by_type |= filtered_by_type;
        self.filtered_by_status |= filtered_by_status;
        self.filtered_by_canceled_by |= filtered_by_canceled_by;
        self.filtered_by_before_enqueued_at |= filtered_by_before_enqueued_at;
        self.filtered_by_after_enqueued_at |= filtered_by_after_enqueued_at;
        self.filtered_by_before_started_at |= filtered_by_before_started_at;
        self.filtered_by_after_started_at |= filtered_by_after_started_at;
        self.filtered_by_before_finished_at |= filtered_by_before_finished_at;
        self.filtered_by_after_finished_at |= filtered_by_after_finished_at;

        self.total_received = self.total_received.saturating_add(total_received);
    }

    /// Converts the aggregator into a `Track` event named `event_name`.
    ///
    /// Returns `None` when no timestamp was recorded or when the aggregator
    /// cannot be converted to a JSON value.
    pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
        // if we had no timestamp it means we never encountered any events and
        // thus we don't need to send this event.
        let timestamp = self.timestamp?;

        Some(Track {
            timestamp: Some(timestamp),
            user: user.clone(),
            event: event_name.to_string(),
            properties: serde_json::to_value(self).ok()?,
            ..Default::default()
        })
    }
}
/// Accumulates analytics about requests counted through `from_query`.
///
/// Instances are merged with `aggregate` and converted into a `Track` event
/// by `into_event`.
#[derive(Default, Serialize)]
pub struct HealthAggregator {
// Not serialized; `into_event` reads it and returns `None` when it is unset.
#[serde(skip)]
timestamp: Option<OffsetDateTime>,
// context
#[serde(rename = "user-agent")]
user_agents: HashSet<String>,
// Number of requests merged into this aggregator (saturating).
#[serde(rename = "requests.total_received")]
total_received: usize,
}
impl HealthAggregator {
    /// Creates an aggregator standing for exactly one received request,
    /// stamped with the current UTC time and the user agents of `request`.
    pub fn from_query(request: &HttpRequest) -> Self {
        let timestamp = Some(OffsetDateTime::now_utc());
        let user_agents = extract_user_agents(request).into_iter().collect();

        Self { timestamp, user_agents, total_received: 1 }
    }

    /// Aggregate one [HealthAggregator] into another.
    pub fn aggregate(&mut self, other: Self) {
        // keep our own timestamp when we have one, otherwise adopt the other's
        self.timestamp = self.timestamp.take().or(other.timestamp);

        // union of both user agent sets
        self.user_agents.extend(other.user_agents);

        self.total_received = self.total_received.saturating_add(other.total_received);
    }

    /// Turns the aggregator into a `Track` event named `event_name`, or
    /// `None` when there is nothing to report.
    pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
        // A missing timestamp means no request was ever aggregated,
        // so there is no event to send.
        let timestamp = self.timestamp?;
        let user = user.clone();
        let event = event_name.to_string();
        let properties = serde_json::to_value(self).ok()?;

        Some(Track { timestamp: Some(timestamp), user, event, properties, ..Default::default() })
    }
}
#[derive(Default, Serialize)] #[derive(Default, Serialize)]
pub struct DocumentsFetchAggregator { pub struct DocumentsFetchAggregator {
#[serde(skip)] #[serde(skip)]

View File

@ -13,6 +13,7 @@ use byte_unit::{Byte, ByteError};
use clap::Parser; use clap::Parser;
use meilisearch_types::features::InstanceTogglableFeatures; use meilisearch_types::features::InstanceTogglableFeatures;
use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::milli::ThreadPoolNoAbortBuilder;
use rustls::server::{ use rustls::server::{
AllowAnyAnonymousOrAuthenticatedClient, AllowAnyAuthenticatedClient, ServerSessionMemoryCache, AllowAnyAnonymousOrAuthenticatedClient, AllowAnyAuthenticatedClient, ServerSessionMemoryCache,
}; };
@ -666,7 +667,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig {
type Error = anyhow::Error; type Error = anyhow::Error;
fn try_from(other: &IndexerOpts) -> Result<Self, Self::Error> { fn try_from(other: &IndexerOpts) -> Result<Self, Self::Error> {
let thread_pool = rayon::ThreadPoolBuilder::new() let thread_pool = ThreadPoolNoAbortBuilder::new()
.thread_name(|index| format!("indexing-thread:{index}")) .thread_name(|index| format!("indexing-thread:{index}"))
.num_threads(*other.max_indexing_threads) .num_threads(*other.max_indexing_threads)
.build()?; .build()?;

View File

@ -269,12 +269,8 @@ impl From<index_scheduler::IndexStats> for IndexStats {
pub async fn get_index_stats( pub async fn get_index_stats(
index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>,
index_uid: web::Path<String>, index_uid: web::Path<String>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = IndexUid::try_from(index_uid.into_inner())?;
analytics.publish("Stats Seen".to_string(), json!({ "per_index_uid": true }), Some(&req));
let stats = IndexStats::from(index_scheduler.index_stats(&index_uid)?); let stats = IndexStats::from(index_scheduler.index_stats(&index_uid)?);
debug!(returns = ?stats, "Get index stats"); debug!(returns = ?stats, "Get index stats");

View File

@ -137,10 +137,8 @@ macro_rules! make_setting_route {
let settings = settings(&index, &rtxn, meilisearch_types::settings::SecretPolicy::HideSecrets)?; let settings = settings(&index, &rtxn, meilisearch_types::settings::SecretPolicy::HideSecrets)?;
debug!(returns = ?settings, "Update settings"); debug!(returns = ?settings, "Update settings");
let mut json = serde_json::json!(&settings);
let val = json[$camelcase_attr].take();
Ok(HttpResponse::Ok().json(val)) Ok(HttpResponse::Ok().json(settings.$attr))
} }
pub fn resources() -> Resource { pub fn resources() -> Resource {

View File

@ -8,11 +8,9 @@ use meilisearch_types::error::{Code, ResponseError};
use meilisearch_types::settings::{Settings, Unchecked}; use meilisearch_types::settings::{Settings, Unchecked};
use meilisearch_types::tasks::{Kind, Status, Task, TaskId}; use meilisearch_types::tasks::{Kind, Status, Task, TaskId};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_json::json;
use time::OffsetDateTime; use time::OffsetDateTime;
use tracing::debug; use tracing::debug;
use crate::analytics::Analytics;
use crate::extractors::authentication::policies::*; use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData; use crate::extractors::authentication::GuardedData;
use crate::search_queue::SearchQueue; use crate::search_queue::SearchQueue;
@ -296,10 +294,7 @@ pub struct Stats {
async fn get_stats( async fn get_stats(
index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>,
auth_controller: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<AuthController>>, auth_controller: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<AuthController>>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
analytics.publish("Stats Seen".to_string(), json!({ "per_index_uid": false }), Some(&req));
let filters = index_scheduler.filters(); let filters = index_scheduler.filters();
let stats = create_all_stats((*index_scheduler).clone(), (*auth_controller).clone(), filters)?; let stats = create_all_stats((*index_scheduler).clone(), (*auth_controller).clone(), filters)?;
@ -355,11 +350,7 @@ struct VersionResponse {
async fn get_version( async fn get_version(
_index_scheduler: GuardedData<ActionPolicy<{ actions::VERSION }>, Data<IndexScheduler>>, _index_scheduler: GuardedData<ActionPolicy<{ actions::VERSION }>, Data<IndexScheduler>>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> HttpResponse { ) -> HttpResponse {
analytics.publish("Version Seen".to_string(), json!(null), Some(&req));
let build_info = build_info::BuildInfo::from_build(); let build_info = build_info::BuildInfo::from_build();
HttpResponse::Ok().json(VersionResponse { HttpResponse::Ok().json(VersionResponse {
@ -377,14 +368,10 @@ async fn get_version(
} }
pub async fn get_health( pub async fn get_health(
req: HttpRequest,
index_scheduler: Data<IndexScheduler>, index_scheduler: Data<IndexScheduler>,
auth_controller: Data<AuthController>, auth_controller: Data<AuthController>,
search_queue: Data<SearchQueue>, search_queue: Data<SearchQueue>,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
analytics.health_seen(&req);
search_queue.health().unwrap(); search_queue.health().unwrap();
index_scheduler.health().unwrap(); index_scheduler.health().unwrap();
auth_controller.health().unwrap(); auth_controller.health().unwrap();

View File

@ -270,12 +270,8 @@ pub struct AllTasks {
async fn get_tasks( async fn get_tasks(
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>,
params: AwebQueryParameter<TasksFilterQuery, DeserrQueryParamError>, params: AwebQueryParameter<TasksFilterQuery, DeserrQueryParamError>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let mut params = params.into_inner(); let mut params = params.into_inner();
analytics.get_tasks(&params, &req);
// We +1 just to know if there is more after this "page" or not. // We +1 just to know if there is more after this "page" or not.
params.limit.0 = params.limit.0.saturating_add(1); params.limit.0 = params.limit.0.saturating_add(1);
let limit = params.limit.0; let limit = params.limit.0;
@ -298,8 +294,6 @@ async fn get_tasks(
async fn get_task( async fn get_task(
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>, index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>,
task_uid: web::Path<String>, task_uid: web::Path<String>,
req: HttpRequest,
analytics: web::Data<dyn Analytics>,
) -> Result<HttpResponse, ResponseError> { ) -> Result<HttpResponse, ResponseError> {
let task_uid_string = task_uid.into_inner(); let task_uid_string = task_uid.into_inner();
@ -310,8 +304,6 @@ async fn get_task(
} }
}; };
analytics.publish("Tasks Seen".to_string(), json!({ "per_task_uid": true }), Some(&req));
let query = index_scheduler::Query { uids: Some(vec![task_uid]), ..Query::default() }; let query = index_scheduler::Query { uids: Some(vec![task_uid]), ..Query::default() };
let filters = index_scheduler.filters(); let filters = index_scheduler.filters();
let (tasks, _) = index_scheduler.get_tasks_from_authorized_indexes(query, filters)?; let (tasks, _) = index_scheduler.get_tasks_from_authorized_indexes(query, filters)?;

View File

@ -1,3 +1,4 @@
use core::fmt;
use std::cmp::min; use std::cmp::min;
use std::collections::{BTreeMap, BTreeSet, HashSet}; use std::collections::{BTreeMap, BTreeSet, HashSet};
use std::str::FromStr; use std::str::FromStr;
@ -39,7 +40,7 @@ pub const DEFAULT_HIGHLIGHT_PRE_TAG: fn() -> String = || "<em>".to_string();
pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "</em>".to_string(); pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "</em>".to_string();
pub const DEFAULT_SEMANTIC_RATIO: fn() -> SemanticRatio = || SemanticRatio(0.5); pub const DEFAULT_SEMANTIC_RATIO: fn() -> SemanticRatio = || SemanticRatio(0.5);
#[derive(Debug, Clone, Default, PartialEq, Deserr)] #[derive(Clone, Default, PartialEq, Deserr)]
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
pub struct SearchQuery { pub struct SearchQuery {
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)] #[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
@ -88,6 +89,110 @@ pub struct SearchQuery {
pub attributes_to_search_on: Option<Vec<String>>, pub attributes_to_search_on: Option<Vec<String>>,
} }
// Since this structure is logged A LOT we're going to reduce the number of things it logs to the bare minimum.
// - Only what IS used, we know everything else is set to None so there is no need to print it
// - Re-order the fields so that the most important ones for debugging come first
impl fmt::Debug for SearchQuery {
    /// Compact `Debug` output for `SearchQuery`.
    ///
    /// Optional fields are printed only when they are set, boolean flags only
    /// when they are `true`, and a vector of 10 or more dimensions is
    /// summarized by its first three values and its length. Every field is
    /// printed at most once, under its own name.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Exhaustive destructuring: adding a field to `SearchQuery` forces
        // this implementation to be revisited.
        let Self {
            q,
            vector,
            hybrid,
            offset,
            limit,
            page,
            hits_per_page,
            attributes_to_retrieve,
            attributes_to_crop,
            crop_length,
            attributes_to_highlight,
            show_matches_position,
            show_ranking_score,
            show_ranking_score_details,
            filter,
            sort,
            facets,
            highlight_pre_tag,
            highlight_post_tag,
            crop_marker,
            matching_strategy,
            attributes_to_search_on,
        } = self;

        let mut debug = f.debug_struct("SearchQuery");

        // First, everything related to the number of documents to retrieve
        debug.field("limit", &limit).field("offset", &offset);
        if let Some(page) = page {
            debug.field("page", &page);
        }
        if let Some(hits_per_page) = hits_per_page {
            debug.field("hits_per_page", &hits_per_page);
        }

        // Then, everything related to the queries
        if let Some(q) = q {
            debug.field("q", &q);
        }
        if let Some(v) = vector {
            if v.len() < 10 {
                debug.field("vector", &v);
            } else {
                // Long vectors would flood the logs: print a short summary.
                debug.field(
                    "vector",
                    &format!("[{}, {}, {}, ... {} dimensions]", v[0], v[1], v[2], v.len()),
                );
            }
        }
        if let Some(hybrid) = hybrid {
            debug.field("hybrid", &hybrid);
        }
        if let Some(attributes_to_search_on) = attributes_to_search_on {
            debug.field("attributes_to_search_on", &attributes_to_search_on);
        }
        if let Some(filter) = filter {
            debug.field("filter", &filter);
        }
        if let Some(sort) = sort {
            debug.field("sort", &sort);
        }
        if let Some(facets) = facets {
            debug.field("facets", &facets);
        }
        debug.field("matching_strategy", &matching_strategy);

        // Then everything related to the formatting
        debug.field("crop_length", &crop_length);
        if *show_matches_position {
            debug.field("show_matches_position", show_matches_position);
        }
        if *show_ranking_score {
            debug.field("show_ranking_score", show_ranking_score);
        }
        if *show_ranking_score_details {
            debug.field("show_ranking_score_details", show_ranking_score_details);
        }
        if let Some(attributes_to_retrieve) = attributes_to_retrieve {
            debug.field("attributes_to_retrieve", &attributes_to_retrieve);
        }
        if let Some(attributes_to_crop) = attributes_to_crop {
            debug.field("attributes_to_crop", &attributes_to_crop);
        }
        if let Some(attributes_to_highlight) = attributes_to_highlight {
            debug.field("attributes_to_highlight", &attributes_to_highlight);
        }
        debug.field("highlight_pre_tag", &highlight_pre_tag);
        debug.field("highlight_post_tag", &highlight_post_tag);
        debug.field("crop_marker", &crop_marker);

        debug.finish()
    }
}
#[derive(Debug, Clone, Default, PartialEq, Deserr)] #[derive(Debug, Clone, Default, PartialEq, Deserr)]
#[deserr(error = DeserrJsonError<InvalidHybridQuery>, rename_all = camelCase, deny_unknown_fields)] #[deserr(error = DeserrJsonError<InvalidHybridQuery>, rename_all = camelCase, deny_unknown_fields)]
pub struct HybridQuery { pub struct HybridQuery {
@ -370,7 +475,7 @@ pub struct SearchHit {
pub ranking_score_details: Option<serde_json::Map<String, serde_json::Value>>, pub ranking_score_details: Option<serde_json::Map<String, serde_json::Value>>,
} }
#[derive(Serialize, Debug, Clone, PartialEq)] #[derive(Serialize, Clone, PartialEq)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct SearchResult { pub struct SearchResult {
pub hits: Vec<SearchHit>, pub hits: Vec<SearchHit>,
@ -393,6 +498,46 @@ pub struct SearchResult {
pub used_negative_operator: bool, pub used_negative_operator: bool,
} }
/// Hand-written `Debug` output for [`SearchResult`].
///
/// The hits themselves are not printed, only how many were returned, and
/// optional or `false` fields are left out entirely.
impl fmt::Debug for SearchResult {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Destructure without `..` so that adding a field to `SearchResult`
        // makes this impl fail to compile until the new field is handled here.
        let SearchResult {
            hits,
            query,
            processing_time_ms,
            hits_info,
            facet_distribution,
            facet_stats,
            semantic_hit_count,
            degraded,
            used_negative_operator,
        } = self;

        // Only the number of hits is shown, never their content.
        let hits_summary = format!("[{} hits returned]", hits.len());

        let mut out = f.debug_struct("SearchResult");

        // Always-present fields. The processing time comes first because it is
        // the most important thing when looking at a search result.
        out.field("processing_time_ms", &processing_time_ms)
            .field("hits", &hits_summary)
            .field("query", &query)
            .field("hits_info", &hits_info);

        // Boolean flags are only printed when they are set.
        if *used_negative_operator {
            out.field("used_negative_operator", used_negative_operator);
        }
        if *degraded {
            out.field("degraded", degraded);
        }

        // Optional fields are only printed when they hold a value.
        if let Some(distribution) = facet_distribution {
            out.field("facet_distribution", &distribution);
        }
        if let Some(stats) = facet_stats {
            out.field("facet_stats", &stats);
        }
        if let Some(count) = semantic_hit_count {
            out.field("semantic_hit_count", &count);
        }

        out.finish()
    }
}
#[derive(Serialize, Debug, Clone, PartialEq)] #[derive(Serialize, Debug, Clone, PartialEq)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct SearchResultWithIndex { pub struct SearchResultWithIndex {

View File

@ -113,7 +113,8 @@ async fn secrets_are_hidden_in_settings() {
"default": { "default": {
"source": "rest", "source": "rest",
"url": "https://localhost:7777", "url": "https://localhost:7777",
"apiKey": "My super secret value you will never guess" "apiKey": "My super secret value you will never guess",
"dimensions": 4,
} }
} }
})) }))
@ -184,6 +185,7 @@ async fn secrets_are_hidden_in_settings() {
"default": { "default": {
"source": "rest", "source": "rest",
"apiKey": "My suXXXXXX...", "apiKey": "My suXXXXXX...",
"dimensions": 4,
"documentTemplate": "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", "documentTemplate": "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}",
"url": "https://localhost:7777", "url": "https://localhost:7777",
"query": null, "query": null,
@ -211,6 +213,7 @@ async fn secrets_are_hidden_in_settings() {
"default": { "default": {
"source": "rest", "source": "rest",
"apiKey": "My suXXXXXX...", "apiKey": "My suXXXXXX...",
"dimensions": 4,
"url": "https://localhost:7777" "url": "https://localhost:7777"
} }
} }

View File

@ -17,7 +17,7 @@ bincode = "1.3.3"
bstr = "1.9.0" bstr = "1.9.0"
bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] } bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] }
byteorder = "1.5.0" byteorder = "1.5.0"
charabia = { version = "0.8.8", default-features = false } charabia = { version = "0.8.10", default-features = false }
concat-arrays = "0.1.2" concat-arrays = "0.1.2"
crossbeam-channel = "0.5.11" crossbeam-channel = "0.5.11"
deserr = "0.6.1" deserr = "0.6.1"
@ -115,6 +115,7 @@ lmdb-posix-sem = ["heed/posix-sem"]
# allow chinese specialized tokenization # allow chinese specialized tokenization
chinese = ["charabia/chinese"] chinese = ["charabia/chinese"]
chinese-pinyin = ["chinese", "charabia/chinese-normalization-pinyin"]
# allow hebrew specialized tokenization # allow hebrew specialized tokenization
hebrew = ["charabia/hebrew"] hebrew = ["charabia/hebrew"]
@ -135,7 +136,11 @@ greek = ["charabia/greek"]
# allow khmer specialized tokenization # allow khmer specialized tokenization
khmer = ["charabia/khmer"] khmer = ["charabia/khmer"]
# allow vietnamese specialized tokenization
vietnamese = ["charabia/vietnamese"] vietnamese = ["charabia/vietnamese"]
# force swedish character recomposition
swedish-recomposition = ["charabia/swedish-recomposition"]
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306> # allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
cuda = ["candle-core/cuda"] cuda = ["candle-core/cuda"]

View File

@ -9,6 +9,7 @@ use serde_json::Value;
use thiserror::Error; use thiserror::Error;
use crate::documents::{self, DocumentsBatchCursorError}; use crate::documents::{self, DocumentsBatchCursorError};
use crate::thread_pool_no_abort::PanicCatched;
use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; use crate::{CriterionError, DocumentId, FieldId, Object, SortError};
pub fn is_reserved_keyword(keyword: &str) -> bool { pub fn is_reserved_keyword(keyword: &str) -> bool {
@ -39,17 +40,19 @@ pub enum InternalError {
Fst(#[from] fst::Error), Fst(#[from] fst::Error),
#[error(transparent)] #[error(transparent)]
DocumentsError(#[from] documents::Error), DocumentsError(#[from] documents::Error),
#[error("Invalid compression type have been specified to grenad.")] #[error("Invalid compression type have been specified to grenad")]
GrenadInvalidCompressionType, GrenadInvalidCompressionType,
#[error("Invalid grenad file with an invalid version format.")] #[error("Invalid grenad file with an invalid version format")]
GrenadInvalidFormatVersion, GrenadInvalidFormatVersion,
#[error("Invalid merge while processing {process}.")] #[error("Invalid merge while processing {process}")]
IndexingMergingKeys { process: &'static str }, IndexingMergingKeys { process: &'static str },
#[error("{}", HeedError::InvalidDatabaseTyping)] #[error("{}", HeedError::InvalidDatabaseTyping)]
InvalidDatabaseTyping, InvalidDatabaseTyping,
#[error(transparent)] #[error(transparent)]
RayonThreadPool(#[from] ThreadPoolBuildError), RayonThreadPool(#[from] ThreadPoolBuildError),
#[error(transparent)] #[error(transparent)]
PanicInThreadPool(#[from] PanicCatched),
#[error(transparent)]
SerdeJson(#[from] serde_json::Error), SerdeJson(#[from] serde_json::Error),
#[error(transparent)] #[error(transparent)]
Serialization(#[from] SerializationError), Serialization(#[from] SerializationError),
@ -57,9 +60,9 @@ pub enum InternalError {
Store(#[from] MdbError), Store(#[from] MdbError),
#[error(transparent)] #[error(transparent)]
Utf8(#[from] str::Utf8Error), Utf8(#[from] str::Utf8Error),
#[error("An indexation process was explicitly aborted.")] #[error("An indexation process was explicitly aborted")]
AbortedIndexation, AbortedIndexation,
#[error("The matching words list contains at least one invalid member.")] #[error("The matching words list contains at least one invalid member")]
InvalidMatchingWords, InvalidMatchingWords,
#[error(transparent)] #[error(transparent)]
ArroyError(#[from] arroy::Error), ArroyError(#[from] arroy::Error),

View File

@ -678,6 +678,23 @@ impl Index {
.get(rtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY) .get(rtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY)
} }
/// Identical to `user_defined_searchable_fields`, but returns ids instead.
///
/// Returns `Ok(None)` when `user_defined_searchable_fields` returns `None`.
/// Otherwise each field name is looked up in the fields ids map, in order;
/// names that have no id in the map are silently skipped.
pub fn user_defined_searchable_fields_ids(&self, rtxn: &RoTxn) -> Result<Option<Vec<FieldId>>> {
    let names = match self.user_defined_searchable_fields(rtxn)? {
        Some(names) => names,
        None => return Ok(None),
    };

    // The map is only read once we know there are names to resolve.
    let fields_ids_map = self.fields_ids_map(rtxn)?;
    let ids = names.into_iter().filter_map(|name| fields_ids_map.id(name)).collect();
    Ok(Some(ids))
}
/* filterable fields */ /* filterable fields */
/// Writes the filterable fields names in the database. /// Writes the filterable fields names in the database.
@ -824,11 +841,11 @@ impl Index {
/// Identical to `user_defined_faceted_fields`, but returns ids instead. /// Identical to `user_defined_faceted_fields`, but returns ids instead.
pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> { pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> {
let fields = self.faceted_fields(rtxn)?; let fields = self.user_defined_faceted_fields(rtxn)?;
let fields_ids_map = self.fields_ids_map(rtxn)?; let fields_ids_map = self.fields_ids_map(rtxn)?;
let mut fields_ids = HashSet::new(); let mut fields_ids = HashSet::new();
for name in fields.into_iter() { for name in fields {
if let Some(field_id) = fields_ids_map.id(&name) { if let Some(field_id) = fields_ids_map.id(&name) {
fields_ids.insert(field_id); fields_ids.insert(field_id);
} }

View File

@ -21,6 +21,7 @@ pub mod prompt;
pub mod proximity; pub mod proximity;
pub mod score_details; pub mod score_details;
mod search; mod search;
mod thread_pool_no_abort;
pub mod update; pub mod update;
pub mod vector; pub mod vector;
@ -42,6 +43,7 @@ pub use search::new::{
SearchLogger, VisualSearchLogger, SearchLogger, VisualSearchLogger,
}; };
use serde_json::Value; use serde_json::Value;
pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
pub use {charabia as tokenizer, heed}; pub use {charabia as tokenizer, heed};
pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
@ -128,7 +130,7 @@ impl fmt::Debug for TimeBudget {
impl Default for TimeBudget { impl Default for TimeBudget {
fn default() -> Self { fn default() -> Self {
Self::new(std::time::Duration::from_millis(150)) Self::new(std::time::Duration::from_millis(1500))
} }
} }

View File

@ -97,6 +97,7 @@ impl<'a> FacetDistribution<'a> {
) -> heed::Result<()> { ) -> heed::Result<()> {
match facet_type { match facet_type {
FacetType::Number => { FacetType::Number => {
let mut lexicographic_distribution = BTreeMap::new();
let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec(); let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();
let distribution_prelength = distribution.len(); let distribution_prelength = distribution.len();
@ -111,14 +112,17 @@ impl<'a> FacetDistribution<'a> {
for result in iter { for result in iter {
let ((_, _, value), ()) = result?; let ((_, _, value), ()) = result?;
*distribution.entry(value.to_string()).or_insert(0) += 1; *lexicographic_distribution.entry(value.to_string()).or_insert(0) += 1;
if distribution.len() - distribution_prelength == self.max_values_per_facet if lexicographic_distribution.len() - distribution_prelength
== self.max_values_per_facet
{ {
break; break;
} }
} }
} }
distribution.extend(lexicographic_distribution);
} }
FacetType::String => { FacetType::String => {
let mut normalized_distribution = BTreeMap::new(); let mut normalized_distribution = BTreeMap::new();

View File

@ -0,0 +1,69 @@
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use rayon::{ThreadPool, ThreadPoolBuilder};
use thiserror::Error;
/// A rayon `ThreadPool` wrapper that records panics caught by the pool's
/// panic handler and exposes an `install` function that reports them as
/// an error instead of a plain return value.
#[derive(Debug)]
pub struct ThreadPoolNoAbort {
thread_pool: ThreadPool,
/// Set to true if the thread pool caught a panic.
/// The flag is read and reset to false at the end of every `install` call.
pool_catched_panic: Arc<AtomicBool>,
}
impl ThreadPoolNoAbort {
    /// Runs `op` inside the wrapped rayon thread pool.
    ///
    /// Returns `Ok` with the value produced by `op`, or `Err(PanicCatched)`
    /// if the pool's panic flag was set by the time `op` returned. In the
    /// error case the value produced by `op` is dropped.
    ///
    /// NOTE(review): the flag is stored once per pool, not once per call, so
    /// with concurrent `install` calls a panic is presumably reported to
    /// whichever caller swaps the flag first. Confirm this is acceptable.
    pub fn install<OP, R>(&self, op: OP) -> Result<R, PanicCatched>
    where
        OP: FnOnce() -> R + Send,
        R: Send,
    {
        let result = self.thread_pool.install(op);

        // Read and reset the panic flag in a single atomic step, so the next
        // `install` call starts from a clean state.
        let panicked = self.pool_catched_panic.swap(false, Ordering::SeqCst);
        if panicked {
            return Err(PanicCatched);
        }
        Ok(result)
    }

    /// Returns the number of threads of the wrapped rayon thread pool.
    pub fn current_num_threads(&self) -> usize {
        let pool = &self.thread_pool;
        pool.current_num_threads()
    }
}
/// Error returned by `ThreadPoolNoAbort::install` when the pool's panic flag
/// was set while the operation was running. It carries no details about the
/// panic itself; its message points the reader to the logs.
#[derive(Error, Debug)]
#[error("A panic occured. Read the logs to find more information about it")]
pub struct PanicCatched;
/// Builder for `ThreadPoolNoAbort`.
///
/// Thin wrapper around a rayon `ThreadPoolBuilder` (the tuple field), which
/// accumulates the configuration until `build` is called.
#[derive(Default)]
pub struct ThreadPoolNoAbortBuilder(ThreadPoolBuilder);
impl ThreadPoolNoAbortBuilder {
pub fn new() -> ThreadPoolNoAbortBuilder {
ThreadPoolNoAbortBuilder::default()
}
pub fn thread_name<F>(mut self, closure: F) -> Self
where
F: FnMut(usize) -> String + 'static,
{
self.0 = self.0.thread_name(closure);
self
}
pub fn num_threads(mut self, num_threads: usize) -> ThreadPoolNoAbortBuilder {
self.0 = self.0.num_threads(num_threads);
self
}
pub fn build(mut self) -> Result<ThreadPoolNoAbort, rayon::ThreadPoolBuildError> {
let pool_catched_panic = Arc::new(AtomicBool::new(false));
self.0 = self.0.panic_handler({
let catched_panic = pool_catched_panic.clone();
move |_result| catched_panic.store(true, Ordering::SeqCst)
});
Ok(ThreadPoolNoAbort { thread_pool: self.0.build()?, pool_catched_panic })
}
}

View File

@ -71,8 +71,8 @@ pub enum DelAddOperation {
/// putting each deletion obkv's keys under an DelAdd::Deletion /// putting each deletion obkv's keys under an DelAdd::Deletion
/// and putting each addition obkv's keys under an DelAdd::Addition /// and putting each addition obkv's keys under an DelAdd::Addition
pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>( pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
deletion: obkv::KvReader<K>, deletion: &obkv::KvReader<K>,
addition: obkv::KvReader<K>, addition: &obkv::KvReader<K>,
buffer: &mut Vec<u8>, buffer: &mut Vec<u8>,
) -> Result<(), std::io::Error> { ) -> Result<(), std::io::Error> {
use itertools::merge_join_by; use itertools::merge_join_by;

View File

@ -1,4 +1,4 @@
use std::collections::{HashMap, HashSet}; use std::collections::HashMap;
use std::convert::TryInto; use std::convert::TryInto;
use std::fs::File; use std::fs::File;
use std::io::BufReader; use std::io::BufReader;
@ -12,6 +12,7 @@ use serde_json::Value;
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters}; use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
use crate::error::{InternalError, SerializationError}; use crate::error::{InternalError, SerializationError};
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd}; use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>; pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
@ -25,10 +26,7 @@ pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, R
pub fn extract_docid_word_positions<R: io::Read + io::Seek>( pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>, obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
searchable_fields: &Option<HashSet<FieldId>>, settings_diff: &InnerIndexSettingsDiff,
stop_words: Option<&fst::Set<Vec<u8>>>,
allowed_separators: Option<&[&str]>,
dictionary: Option<&[&str]>,
max_positions_per_attributes: Option<u32>, max_positions_per_attributes: Option<u32>,
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> { ) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
puffin::profile_function!(); puffin::profile_function!();
@ -36,6 +34,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
let max_positions_per_attributes = max_positions_per_attributes let max_positions_per_attributes = max_positions_per_attributes
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
let max_memory = indexer.max_memory_by_thread(); let max_memory = indexer.max_memory_by_thread();
let force_reindexing = settings_diff.reindex_searchable();
// initialize destination values. // initialize destination values.
let mut documents_ids = RoaringBitmap::new(); let mut documents_ids = RoaringBitmap::new();
@ -56,8 +55,37 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
let mut value_buffer = Vec::new(); let mut value_buffer = Vec::new();
// initialize tokenizer. // initialize tokenizer.
let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None); let old_stop_words = settings_diff.old.stop_words.as_ref();
let tokenizer = builder.build(); let old_separators: Option<Vec<_>> = settings_diff
.old
.allowed_separators
.as_ref()
.map(|s| s.iter().map(String::as_str).collect());
let old_dictionary: Option<Vec<_>> =
settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
let mut del_builder = tokenizer_builder(
old_stop_words,
old_separators.as_deref(),
old_dictionary.as_deref(),
None,
);
let del_tokenizer = del_builder.build();
let new_stop_words = settings_diff.new.stop_words.as_ref();
let new_separators: Option<Vec<_>> = settings_diff
.new
.allowed_separators
.as_ref()
.map(|s| s.iter().map(String::as_str).collect());
let new_dictionary: Option<Vec<_>> =
settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
let mut add_builder = tokenizer_builder(
new_stop_words,
new_separators.as_deref(),
new_dictionary.as_deref(),
None,
);
let add_tokenizer = add_builder.build();
// iterate over documents. // iterate over documents.
let mut cursor = obkv_documents.into_cursor()?; let mut cursor = obkv_documents.into_cursor()?;
@ -69,7 +97,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
let obkv = KvReader::<FieldId>::new(value); let obkv = KvReader::<FieldId>::new(value);
// if the searchable fields didn't change, skip the searchable indexing for this document. // if the searchable fields didn't change, skip the searchable indexing for this document.
if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) { if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) {
continue; continue;
} }
@ -85,11 +113,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
// deletions // deletions
lang_safe_tokens_from_document( lang_safe_tokens_from_document(
&obkv, &obkv,
searchable_fields, &settings_diff.old,
&tokenizer, &del_tokenizer,
stop_words,
allowed_separators,
dictionary,
max_positions_per_attributes, max_positions_per_attributes,
DelAdd::Deletion, DelAdd::Deletion,
&mut del_buffers, &mut del_buffers,
@ -99,11 +124,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
// additions // additions
lang_safe_tokens_from_document( lang_safe_tokens_from_document(
&obkv, &obkv,
searchable_fields, &settings_diff.new,
&tokenizer, &add_tokenizer,
stop_words,
allowed_separators,
dictionary,
max_positions_per_attributes, max_positions_per_attributes,
DelAdd::Addition, DelAdd::Addition,
&mut add_buffers, &mut add_buffers,
@ -118,8 +140,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>> // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
value_buffer.clear(); value_buffer.clear();
del_add_from_two_obkvs( del_add_from_two_obkvs(
KvReader::<FieldId>::new(del_obkv), &KvReader::<FieldId>::new(del_obkv),
KvReader::<FieldId>::new(add_obkv), &KvReader::<FieldId>::new(add_obkv),
&mut value_buffer, &mut value_buffer,
)?; )?;
@ -160,8 +182,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
/// Check if any searchable fields of a document changed. /// Check if any searchable fields of a document changed.
fn searchable_fields_changed( fn searchable_fields_changed(
obkv: &KvReader<FieldId>, obkv: &KvReader<FieldId>,
searchable_fields: &Option<HashSet<FieldId>>, settings_diff: &InnerIndexSettingsDiff,
) -> bool { ) -> bool {
let searchable_fields = &settings_diff.new.searchable_fields_ids;
for (field_id, field_bytes) in obkv.iter() { for (field_id, field_bytes) in obkv.iter() {
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
let del_add = KvReaderDelAdd::new(field_bytes); let del_add = KvReaderDelAdd::new(field_bytes);
@ -206,14 +229,10 @@ fn tokenizer_builder<'a>(
/// Extract words mapped with their positions of a document, /// Extract words mapped with their positions of a document,
/// ensuring no Language detection mistakes was made. /// ensuring no Language detection mistakes was made.
#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct
fn lang_safe_tokens_from_document<'a>( fn lang_safe_tokens_from_document<'a>(
obkv: &KvReader<FieldId>, obkv: &KvReader<FieldId>,
searchable_fields: &Option<HashSet<FieldId>>, settings: &InnerIndexSettings,
tokenizer: &Tokenizer, tokenizer: &Tokenizer,
stop_words: Option<&fst::Set<Vec<u8>>>,
allowed_separators: Option<&[&str]>,
dictionary: Option<&[&str]>,
max_positions_per_attributes: u32, max_positions_per_attributes: u32,
del_add: DelAdd, del_add: DelAdd,
buffers: &'a mut Buffers, buffers: &'a mut Buffers,
@ -222,7 +241,7 @@ fn lang_safe_tokens_from_document<'a>(
tokens_from_document( tokens_from_document(
obkv, obkv,
searchable_fields, &settings.searchable_fields_ids,
tokenizer, tokenizer,
max_positions_per_attributes, max_positions_per_attributes,
del_add, del_add,
@ -246,12 +265,15 @@ fn lang_safe_tokens_from_document<'a>(
// then we don't rerun the extraction. // then we don't rerun the extraction.
if !script_language.is_empty() { if !script_language.is_empty() {
// build a new temporary tokenizer including the allow list. // build a new temporary tokenizer including the allow list.
let mut builder = tokenizer_builder( let stop_words = settings.stop_words.as_ref();
stop_words, let separators: Option<Vec<_>> = settings
allowed_separators, .allowed_separators
dictionary, .as_ref()
Some(&script_language), .map(|s| s.iter().map(String::as_str).collect());
); let dictionary: Option<Vec<_>> =
settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
let mut builder =
tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
let tokenizer = builder.build(); let tokenizer = builder.build();
script_language_word_count.clear(); script_language_word_count.clear();
@ -259,7 +281,7 @@ fn lang_safe_tokens_from_document<'a>(
// rerun the extraction. // rerun the extraction.
tokens_from_document( tokens_from_document(
obkv, obkv,
searchable_fields, &settings.searchable_fields_ids,
&tokenizer, &tokenizer,
max_positions_per_attributes, max_positions_per_attributes,
del_add, del_add,
@ -276,7 +298,7 @@ fn lang_safe_tokens_from_document<'a>(
/// Extract words mapped with their positions of a document. /// Extract words mapped with their positions of a document.
fn tokens_from_document<'a>( fn tokens_from_document<'a>(
obkv: &KvReader<FieldId>, obkv: &KvReader<FieldId>,
searchable_fields: &Option<HashSet<FieldId>>, searchable_fields: &Option<Vec<FieldId>>,
tokenizer: &Tokenizer, tokenizer: &Tokenizer,
max_positions_per_attributes: u32, max_positions_per_attributes: u32,
del_add: DelAdd, del_add: DelAdd,

View File

@ -10,6 +10,7 @@ use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec, FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
}; };
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::Result; use crate::Result;
/// Extracts the facet number and the documents ids where this facet number appear. /// Extracts the facet number and the documents ids where this facet number appear.
@ -20,6 +21,7 @@ use crate::Result;
pub fn extract_facet_number_docids<R: io::Read + io::Seek>( pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
fid_docid_facet_number: grenad::Reader<R>, fid_docid_facet_number: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> { ) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!(); puffin::profile_function!();

View File

@ -15,6 +15,7 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::{ use crate::update::index_documents::helpers::{
merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps, merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
}; };
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
/// Extracts the facet string and the documents ids where this facet string appear. /// Extracts the facet string and the documents ids where this facet string appear.
@ -25,6 +26,7 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
pub fn extract_facet_string_docids<R: io::Read + io::Seek>( pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>, docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> { ) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
puffin::profile_function!(); puffin::profile_function!();

View File

@ -1,5 +1,5 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::{BTreeMap, HashSet}; use std::collections::BTreeMap;
use std::convert::TryInto; use std::convert::TryInto;
use std::fs::File; use std::fs::File;
use std::io::{self, BufReader}; use std::io::{self, BufReader};
@ -20,6 +20,7 @@ use crate::error::InternalError;
use crate::facet::value_encoding::f64_into_bytes; use crate::facet::value_encoding::f64_into_bytes;
use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::update::index_documents::{create_writer, writer_into_reader};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH}; use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH};
/// The length of the elements that are always in the buffer when inserting new values. /// The length of the elements that are always in the buffer when inserting new values.
@ -43,7 +44,7 @@ pub struct ExtractedFacetValues {
pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>, obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
faceted_fields: &HashSet<FieldId>, settings_diff: &InnerIndexSettingsDiff,
geo_fields_ids: Option<(FieldId, FieldId)>, geo_fields_ids: Option<(FieldId, FieldId)>,
) -> Result<ExtractedFacetValues> { ) -> Result<ExtractedFacetValues> {
puffin::profile_function!(); puffin::profile_function!();
@ -82,7 +83,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
let obkv = obkv::KvReader::new(value); let obkv = obkv::KvReader::new(value);
for (field_id, field_bytes) in obkv.iter() { for (field_id, field_bytes) in obkv.iter() {
if faceted_fields.contains(&field_id) { let delete_faceted = settings_diff.old.faceted_fields_ids.contains(&field_id);
let add_faceted = settings_diff.new.faceted_fields_ids.contains(&field_id);
if delete_faceted || add_faceted {
numbers_key_buffer.clear(); numbers_key_buffer.clear();
strings_key_buffer.clear(); strings_key_buffer.clear();
@ -99,11 +102,12 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
strings_key_buffer.extend_from_slice(docid_bytes); strings_key_buffer.extend_from_slice(docid_bytes);
let del_add_obkv = obkv::KvReader::new(field_bytes); let del_add_obkv = obkv::KvReader::new(field_bytes);
let del_value = match del_add_obkv.get(DelAdd::Deletion) { let del_value = match del_add_obkv.get(DelAdd::Deletion).filter(|_| delete_faceted)
{
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
None => None, None => None,
}; };
let add_value = match del_add_obkv.get(DelAdd::Addition) { let add_value = match del_add_obkv.get(DelAdd::Addition).filter(|_| add_faceted) {
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
None => None, None => None,
}; };

View File

@ -10,6 +10,7 @@ use super::helpers::{
use crate::error::SerializationError; use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::Result; use crate::Result;
const MAX_COUNTED_WORDS: usize = 30; const MAX_COUNTED_WORDS: usize = 30;
@ -23,6 +24,7 @@ const MAX_COUNTED_WORDS: usize = 30;
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>( pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>, docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> { ) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!(); puffin::profile_function!();

View File

@ -17,8 +17,9 @@ use crate::error::UserError;
use crate::prompt::Prompt; use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::try_split_at; use crate::update::index_documents::helpers::try_split_at;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::Embedder; use crate::vector::Embedder;
use crate::{DocumentId, FieldsIdsMap, InternalError, Result, VectorOrArrayOfVectors}; use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, VectorOrArrayOfVectors};
/// The length of the elements that are always in the buffer when inserting new values. /// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>(); const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@ -71,12 +72,15 @@ impl VectorStateDelta {
pub fn extract_vector_points<R: io::Read + io::Seek>( pub fn extract_vector_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>, obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
field_id_map: &FieldsIdsMap, settings_diff: &InnerIndexSettingsDiff,
prompt: &Prompt, prompt: &Prompt,
embedder_name: &str, embedder_name: &str,
) -> Result<ExtractedVectorPoints> { ) -> Result<ExtractedVectorPoints> {
puffin::profile_function!(); puffin::profile_function!();
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
// (docid, _index) -> KvWriterDelAdd -> Vector // (docid, _index) -> KvWriterDelAdd -> Vector
let mut manual_vectors_writer = create_writer( let mut manual_vectors_writer = create_writer(
indexer.chunk_compression_type, indexer.chunk_compression_type,
@ -98,8 +102,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
tempfile::tempfile()?, tempfile::tempfile()?,
); );
let vectors_fid = field_id_map.id("_vectors");
let mut key_buffer = Vec::new(); let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?; let mut cursor = obkv_documents.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? { while let Some((key, value)) = cursor.move_on_next()? {
@ -116,15 +118,29 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
// lazily get it when needed // lazily get it when needed
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
let vectors_field = vectors_fid // the vector field id may have changed
let old_vectors_fid = old_fields_ids_map.id("_vectors");
// filter the old vector fid if the settings has been changed forcing reindexing.
let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors());
let new_vectors_fid = new_fields_ids_map.id("_vectors");
let vectors_field = {
let del = old_vectors_fid
.and_then(|vectors_fid| obkv.get(vectors_fid)) .and_then(|vectors_fid| obkv.get(vectors_fid))
.map(KvReaderDelAdd::new) .map(KvReaderDelAdd::new)
.map(|obkv| to_vector_maps(obkv, document_id)) .map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id))
.transpose()?; .transpose()?
.flatten();
let add = new_vectors_fid
.and_then(|vectors_fid| obkv.get(vectors_fid))
.map(KvReaderDelAdd::new)
.map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id))
.transpose()?
.flatten();
(del, add)
};
let (del_map, add_map) = vectors_field.unzip(); let (del_map, add_map) = vectors_field;
let del_map = del_map.flatten();
let add_map = add_map.flatten();
let del_value = del_map.and_then(|mut map| map.remove(embedder_name)); let del_value = del_map.and_then(|mut map| map.remove(embedder_name));
let add_value = add_map.and_then(|mut map| map.remove(embedder_name)); let add_value = add_map.and_then(|mut map| map.remove(embedder_name));
@ -155,7 +171,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
VectorStateDelta::NowGenerated(prompt.render( VectorStateDelta::NowGenerated(prompt.render(
obkv, obkv,
DelAdd::Addition, DelAdd::Addition,
field_id_map, new_fields_ids_map,
)?) )?)
} else { } else {
VectorStateDelta::NowRemoved VectorStateDelta::NowRemoved
@ -182,10 +198,16 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
if document_is_kept { if document_is_kept {
// Don't give up if the old prompt was failing // Don't give up if the old prompt was failing
let old_prompt = let old_prompt = Some(prompt)
prompt.render(obkv, DelAdd::Deletion, field_id_map).unwrap_or_default(); // TODO: this filter works because we erase the vec database when a embedding setting changes.
let new_prompt = prompt.render(obkv, DelAdd::Addition, field_id_map)?; // When vector pipeline will be optimized, this should be removed.
if old_prompt != new_prompt { .filter(|_| !settings_diff.reindex_vectors())
.map(|p| {
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
});
let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if old_prompt.as_ref() != Some(&new_prompt) {
let old_prompt = old_prompt.unwrap_or_default();
tracing::trace!( tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
); );
@ -207,6 +229,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
&mut manual_vectors_writer, &mut manual_vectors_writer,
&mut key_buffer, &mut key_buffer,
delta, delta,
settings_diff,
)?; )?;
} }
@ -220,15 +243,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
}) })
} }
fn to_vector_maps(
obkv: KvReaderDelAdd,
document_id: impl Fn() -> Value,
) -> Result<(Option<serde_json::Map<String, Value>>, Option<serde_json::Map<String, Value>>)> {
let del = to_vector_map(obkv, DelAdd::Deletion, &document_id)?;
let add = to_vector_map(obkv, DelAdd::Addition, &document_id)?;
Ok((del, add))
}
fn to_vector_map( fn to_vector_map(
obkv: KvReaderDelAdd, obkv: KvReaderDelAdd,
side: DelAdd, side: DelAdd,
@ -256,10 +270,15 @@ fn push_vectors_diff(
manual_vectors_writer: &mut Writer<BufWriter<File>>, manual_vectors_writer: &mut Writer<BufWriter<File>>,
key_buffer: &mut Vec<u8>, key_buffer: &mut Vec<u8>,
delta: VectorStateDelta, delta: VectorStateDelta,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<()> { ) -> Result<()> {
puffin::profile_function!(); puffin::profile_function!();
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
if must_remove { if must_remove
// TODO: the condition below works because we erase the vec database when an embedding setting changes.
// Once the vector pipeline is optimized, this should be removed.
&& !settings_diff.reindex_vectors()
{
key_buffer.truncate(TRUNCATE_SIZE); key_buffer.truncate(TRUNCATE_SIZE);
remove_vectors_writer.insert(&key_buffer, [])?; remove_vectors_writer.insert(&key_buffer, [])?;
} }
@ -287,6 +306,9 @@ fn push_vectors_diff(
match eob { match eob {
EitherOrBoth::Both(_, _) => (), // no need to touch anything EitherOrBoth::Both(_, _) => (), // no need to touch anything
EitherOrBoth::Left(vector) => { EitherOrBoth::Left(vector) => {
// TODO: the condition below works because we erase the vec database when an embedding setting changes.
// Once the vector pipeline is optimized, this should be removed.
if !settings_diff.reindex_vectors() {
// We insert only the Del part of the Obkv to inform // We insert only the Del part of the Obkv to inform
// that we only want to remove all those vectors. // that we only want to remove all those vectors.
let mut obkv = KvWriterDelAdd::memory(); let mut obkv = KvWriterDelAdd::memory();
@ -294,6 +316,7 @@ fn push_vectors_diff(
let bytes = obkv.into_inner()?; let bytes = obkv.into_inner()?;
manual_vectors_writer.insert(&key_buffer, bytes)?; manual_vectors_writer.insert(&key_buffer, bytes)?;
} }
}
EitherOrBoth::Right(vector) => { EitherOrBoth::Right(vector) => {
// We insert only the Add part of the Obkv to inform // We insert only the Add part of the Obkv to inform
// that we only want to add all those vectors. // that we only want to add all those vectors.
@ -339,7 +362,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
prompt_reader: grenad::Reader<R>, prompt_reader: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
embedder: Arc<Embedder>, embedder: Arc<Embedder>,
request_threads: &rayon::ThreadPool, request_threads: &ThreadPoolNoAbort,
) -> Result<grenad::Reader<BufReader<File>>> { ) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!(); puffin::profile_function!();
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism

View File

@ -1,20 +1,23 @@
use std::collections::{BTreeSet, HashSet}; use std::collections::BTreeSet;
use std::fs::File; use std::fs::File;
use std::io::{self, BufReader}; use std::io::{self, BufReader};
use heed::BytesDecode; use heed::{BytesDecode, BytesEncode};
use obkv::KvReaderU16; use obkv::KvReaderU16;
use roaring::RoaringBitmap;
use super::helpers::{ use super::helpers::{
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
try_split_array_at, writer_into_reader, GrenadParameters, writer_into_reader, GrenadParameters,
}; };
use crate::error::SerializationError; use crate::error::SerializationError;
use crate::heed_codec::StrBEU16Codec; use crate::heed_codec::StrBEU16Codec;
use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::sorter_into_reader;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::update::MergeFn; use crate::update::MergeFn;
use crate::{DocumentId, FieldId, Result}; use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
/// Extracts the word and the documents ids where this word appear. /// Extracts the word and the documents ids where this word appear.
/// ///
@ -27,7 +30,7 @@ use crate::{DocumentId, FieldId, Result};
pub fn extract_word_docids<R: io::Read + io::Seek>( pub fn extract_word_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>, docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
exact_attributes: &HashSet<FieldId>, settings_diff: &InnerIndexSettingsDiff,
) -> Result<( ) -> Result<(
grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>,
@ -43,7 +46,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,
indexer.max_nb_chunks, indexer.max_nb_chunks,
max_memory.map(|x| x / 3), max_memory.map(|m| m / 3),
); );
let mut key_buffer = Vec::new(); let mut key_buffer = Vec::new();
let mut del_words = BTreeSet::new(); let mut del_words = BTreeSet::new();
@ -85,13 +88,19 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
add_words.clear(); add_words.clear();
} }
let mut word_fid_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let mut word_docids_sorter = create_sorter( let mut word_docids_sorter = create_sorter(
grenad::SortAlgorithm::Unstable, grenad::SortAlgorithm::Unstable,
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,
indexer.max_nb_chunks, indexer.max_nb_chunks,
max_memory.map(|x| x / 3), max_memory.map(|m| m / 3),
); );
let mut exact_word_docids_sorter = create_sorter( let mut exact_word_docids_sorter = create_sorter(
@ -100,31 +109,45 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,
indexer.max_nb_chunks, indexer.max_nb_chunks,
max_memory.map(|x| x / 3), max_memory.map(|m| m / 3),
);
let mut word_fid_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
); );
let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?; let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
// TODO: replace sorters by writers by accumulating values into a buffer before inserting them. let mut buffer = Vec::new();
// NOTE: replacing sorters by bitmap merging is less efficient, so, use sorters.
while let Some((key, value)) = iter.next()? { while let Some((key, value)) = iter.next()? {
// only keep the value if there is a change to apply in the DB. // only keep the value if there is a change to apply in the DB.
if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) { if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
word_fid_docids_writer.insert(key, value)?; word_fid_docids_writer.insert(key, value)?;
} }
let (word, fid) = StrBEU16Codec::bytes_decode(key) let (w, fid) = StrBEU16Codec::bytes_decode(key)
.map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; .map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
// every words contained in an attribute set to exact must be pushed in the exact_words list. // merge all deletions
if exact_attributes.contains(&fid) { let obkv = KvReaderDelAdd::new(value);
exact_word_docids_sorter.insert(word.as_bytes(), value)?; if let Some(value) = obkv.get(DelAdd::Deletion) {
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
obkv.insert(DelAdd::Deletion, value)?;
if delete_from_exact {
exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
} else { } else {
word_docids_sorter.insert(word.as_bytes(), value)?; word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
}
}
// merge all additions
if let Some(value) = obkv.get(DelAdd::Addition) {
let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
buffer.clear();
let mut obkv = KvWriterDelAdd::new(&mut buffer);
obkv.insert(DelAdd::Addition, value)?;
if add_in_exact {
exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
} else {
word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
}
} }
} }
@ -178,3 +201,45 @@ fn words_into_sorter(
Ok(()) Ok(())
} }
/// Writes the docids delta of `word` into `writer` as a single del/add obkv.
///
/// - Nothing is written when `deletions` and `additions` are equal.
/// - The `Deletion` side is only emitted when `deletions` is non-empty and
///   is not a subset of `additions`.
/// - The `Addition` side is emitted whenever `additions` is non-empty.
///
/// Returns an error if a bitmap cannot be encoded or if writing to the
/// in-memory obkv or to `writer` fails.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
fn docids_into_writers<W>(
    word: &str,
    deletions: &RoaringBitmap,
    additions: &RoaringBitmap,
    writer: &mut grenad::Writer<W>,
) -> Result<()>
where
    W: std::io::Write,
{
    if deletions == additions {
        // if the same value is deleted and added, do nothing.
        return Ok(());
    }

    // Write each value in the same KvDelAdd before inserting it in the final writer.
    let mut obkv = KvWriterDelAdd::memory();

    // deletions:
    if !deletions.is_empty() && !deletions.is_subset(additions) {
        obkv.insert(
            DelAdd::Deletion,
            CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
                SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
            })?,
        )?;
    }

    // additions:
    if !additions.is_empty() {
        obkv.insert(
            DelAdd::Addition,
            CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
                SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
            })?,
        )?;
    }

    // insert everything in the same writer.
    // Finalizing the obkv is fallible: propagate the error like the `insert`
    // calls above instead of panicking.
    writer.insert(word.as_bytes(), obkv.into_inner()?)?;

    Ok(())
}

View File

@ -11,8 +11,9 @@ use super::helpers::{
}; };
use crate::error::SerializationError; use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::proximity::{index_proximity, MAX_DISTANCE}; use crate::proximity::{index_proximity, ProximityPrecision, MAX_DISTANCE};
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{DocumentId, Result}; use crate::{DocumentId, Result};
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear. /// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
@ -23,8 +24,21 @@ use crate::{DocumentId, Result};
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>( pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>, docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> { ) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!(); puffin::profile_function!();
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
// early return if the data shouldn't be deleted nor created.
if !any_deletion && !any_addition {
let writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
return writer_into_reader(writer);
}
let max_memory = indexer.max_memory_by_thread(); let max_memory = indexer.max_memory_by_thread();
@ -77,6 +91,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
let (del, add): (Result<_>, Result<_>) = rayon::join( let (del, add): (Result<_>, Result<_>) = rayon::join(
|| { || {
if !any_deletion {
return Ok(());
}
// deletions // deletions
if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) { if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
for (position, word) in KvReaderU16::new(deletion).iter() { for (position, word) in KvReaderU16::new(deletion).iter() {
@ -106,6 +124,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
Ok(()) Ok(())
}, },
|| { || {
if !any_addition {
return Ok(());
}
// additions // additions
if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) { if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
for (position, word) in KvReaderU16::new(addition).iter() { for (position, word) in KvReaderU16::new(addition).iter() {

View File

@ -11,6 +11,7 @@ use super::helpers::{
use crate::error::SerializationError; use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::update::MergeFn; use crate::update::MergeFn;
use crate::{bucketed_position, DocumentId, Result}; use crate::{bucketed_position, DocumentId, Result};
@ -22,6 +23,7 @@ use crate::{bucketed_position, DocumentId, Result};
pub fn extract_word_position_docids<R: io::Read + io::Seek>( pub fn extract_word_position_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>, docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters, indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> { ) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!(); puffin::profile_function!();

View File

@ -9,9 +9,9 @@ mod extract_word_docids;
mod extract_word_pair_proximity_docids; mod extract_word_pair_proximity_docids;
mod extract_word_position_docids; mod extract_word_position_docids;
use std::collections::HashSet;
use std::fs::File; use std::fs::File;
use std::io::BufReader; use std::io::BufReader;
use std::sync::Arc;
use crossbeam_channel::Sender; use crossbeam_channel::Sender;
use rayon::prelude::*; use rayon::prelude::*;
@ -30,9 +30,8 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
use self::extract_word_position_docids::extract_word_position_docids; use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk}; use super::{helpers, TypedChunk};
use crate::proximity::ProximityPrecision; use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::EmbeddingConfigs; use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};
use crate::{FieldId, FieldsIdsMap, Result};
/// Extract data for each databases from obkv documents in parallel. /// Extract data for each databases from obkv documents in parallel.
/// Send data in grenad file over provided Sender. /// Send data in grenad file over provided Sender.
@ -43,18 +42,10 @@ pub(crate) fn data_from_obkv_documents(
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send, flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
searchable_fields: Option<HashSet<FieldId>>,
faceted_fields: HashSet<FieldId>,
primary_key_id: FieldId, primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>, geo_fields_ids: Option<(FieldId, FieldId)>,
field_id_map: FieldsIdsMap, settings_diff: Arc<InnerIndexSettingsDiff>,
stop_words: Option<fst::Set<Vec<u8>>>,
allowed_separators: Option<&[&str]>,
dictionary: Option<&[&str]>,
max_positions_per_attributes: Option<u32>, max_positions_per_attributes: Option<u32>,
exact_attributes: HashSet<FieldId>,
proximity_precision: ProximityPrecision,
embedders: EmbeddingConfigs,
) -> Result<()> { ) -> Result<()> {
puffin::profile_function!(); puffin::profile_function!();
@ -67,8 +58,7 @@ pub(crate) fn data_from_obkv_documents(
original_documents_chunk, original_documents_chunk,
indexer, indexer,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
field_id_map.clone(), settings_diff.clone(),
embedders.clone(),
) )
}) })
.collect::<Result<()>>() .collect::<Result<()>>()
@ -81,13 +71,9 @@ pub(crate) fn data_from_obkv_documents(
flattened_obkv_chunks, flattened_obkv_chunks,
indexer, indexer,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
&searchable_fields,
&faceted_fields,
primary_key_id, primary_key_id,
geo_fields_ids, geo_fields_ids,
&stop_words, settings_diff.clone(),
&allowed_separators,
&dictionary,
max_positions_per_attributes, max_positions_per_attributes,
) )
}) })
@ -100,13 +86,12 @@ pub(crate) fn data_from_obkv_documents(
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>( run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
docid_word_positions_chunk.clone(), docid_word_positions_chunk.clone(),
indexer, indexer,
settings_diff.clone(),
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
extract_fid_word_count_docids, extract_fid_word_count_docids,
TypedChunk::FieldIdWordCountDocids, TypedChunk::FieldIdWordCountDocids,
"field-id-wordcount-docids", "field-id-wordcount-docids",
); );
let exact_attributes = exact_attributes.clone();
run_extraction_task::< run_extraction_task::<
_, _,
_, _,
@ -118,10 +103,9 @@ pub(crate) fn data_from_obkv_documents(
>( >(
docid_word_positions_chunk.clone(), docid_word_positions_chunk.clone(),
indexer, indexer,
settings_diff.clone(),
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
move |doc_word_pos, indexer| { extract_word_docids,
extract_word_docids(doc_word_pos, indexer, &exact_attributes)
},
|( |(
word_docids_reader, word_docids_reader,
exact_word_docids_reader, exact_word_docids_reader,
@ -139,6 +123,7 @@ pub(crate) fn data_from_obkv_documents(
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>( run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
docid_word_positions_chunk.clone(), docid_word_positions_chunk.clone(),
indexer, indexer,
settings_diff.clone(),
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
extract_word_position_docids, extract_word_position_docids,
TypedChunk::WordPositionDocids, TypedChunk::WordPositionDocids,
@ -152,6 +137,7 @@ pub(crate) fn data_from_obkv_documents(
>( >(
fid_docid_facet_strings_chunk.clone(), fid_docid_facet_strings_chunk.clone(),
indexer, indexer,
settings_diff.clone(),
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
extract_facet_string_docids, extract_facet_string_docids,
TypedChunk::FieldIdFacetStringDocids, TypedChunk::FieldIdFacetStringDocids,
@ -161,23 +147,23 @@ pub(crate) fn data_from_obkv_documents(
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>( run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
fid_docid_facet_numbers_chunk.clone(), fid_docid_facet_numbers_chunk.clone(),
indexer, indexer,
settings_diff.clone(),
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
extract_facet_number_docids, extract_facet_number_docids,
TypedChunk::FieldIdFacetNumberDocids, TypedChunk::FieldIdFacetNumberDocids,
"field-id-facet-number-docids", "field-id-facet-number-docids",
); );
if proximity_precision == ProximityPrecision::ByWord {
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>( run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
docid_word_positions_chunk.clone(), docid_word_positions_chunk.clone(),
indexer, indexer,
settings_diff.clone(),
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
extract_word_pair_proximity_docids, extract_word_pair_proximity_docids,
TypedChunk::WordPairProximityDocids, TypedChunk::WordPairProximityDocids,
"word-pair-proximity-docids", "word-pair-proximity-docids",
); );
} }
}
Ok(()) Ok(())
}) })
@ -195,12 +181,17 @@ pub(crate) fn data_from_obkv_documents(
fn run_extraction_task<FE, FS, M>( fn run_extraction_task<FE, FS, M>(
chunk: grenad::Reader<CursorClonableMmap>, chunk: grenad::Reader<CursorClonableMmap>,
indexer: GrenadParameters, indexer: GrenadParameters,
settings_diff: Arc<InnerIndexSettingsDiff>,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
extract_fn: FE, extract_fn: FE,
serialize_fn: FS, serialize_fn: FS,
name: &'static str, name: &'static str,
) where ) where
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M> FE: Fn(
grenad::Reader<CursorClonableMmap>,
GrenadParameters,
&InnerIndexSettingsDiff,
) -> Result<M>
+ Sync + Sync
+ Send + Send
+ 'static, + 'static,
@ -213,7 +204,7 @@ fn run_extraction_task<FE, FS, M>(
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks"); let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
let _entered = child_span.enter(); let _entered = child_span.enter();
puffin::profile_scope!("extract_multiple_chunks", name); puffin::profile_scope!("extract_multiple_chunks", name);
match extract_fn(chunk, indexer) { match extract_fn(chunk, indexer, &settings_diff) {
Ok(chunk) => { Ok(chunk) => {
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk))); let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
} }
@ -230,8 +221,7 @@ fn send_original_documents_data(
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>, original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
field_id_map: FieldsIdsMap, settings_diff: Arc<InnerIndexSettingsDiff>,
embedders: EmbeddingConfigs,
) -> Result<()> { ) -> Result<()> {
let original_documents_chunk = let original_documents_chunk =
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
@ -239,17 +229,19 @@ fn send_original_documents_data(
let documents_chunk_cloned = original_documents_chunk.clone(); let documents_chunk_cloned = original_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
let request_threads = rayon::ThreadPoolBuilder::new() let request_threads = ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM) .num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}")) .thread_name(|index| format!("embedding-request-{index}"))
.build()?; .build()?;
if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
let settings_diff = settings_diff.clone();
rayon::spawn(move || { rayon::spawn(move || {
for (name, (embedder, prompt)) in embedders { for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() {
let result = extract_vector_points( let result = extract_vector_points(
documents_chunk_cloned.clone(), documents_chunk_cloned.clone(),
indexer, indexer,
&field_id_map, &settings_diff,
&prompt, &prompt,
&name, &name,
); );
@ -288,6 +280,7 @@ fn send_original_documents_data(
} }
} }
}); });
}
// TODO: create a custom internal error // TODO: create a custom internal error
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))); let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));
@ -306,13 +299,9 @@ fn send_and_extract_flattened_documents_data(
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>, flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
indexer: GrenadParameters, indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>, lmdb_writer_sx: Sender<Result<TypedChunk>>,
searchable_fields: &Option<HashSet<FieldId>>,
faceted_fields: &HashSet<FieldId>,
primary_key_id: FieldId, primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>, geo_fields_ids: Option<(FieldId, FieldId)>,
stop_words: &Option<fst::Set<Vec<u8>>>, settings_diff: Arc<InnerIndexSettingsDiff>,
allowed_separators: &Option<&[&str]>,
dictionary: &Option<&[&str]>,
max_positions_per_attributes: Option<u32>, max_positions_per_attributes: Option<u32>,
) -> Result<( ) -> Result<(
grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>,
@ -341,10 +330,7 @@ fn send_and_extract_flattened_documents_data(
extract_docid_word_positions( extract_docid_word_positions(
flattened_documents_chunk.clone(), flattened_documents_chunk.clone(),
indexer, indexer,
searchable_fields, &settings_diff,
stop_words.as_ref(),
*allowed_separators,
*dictionary,
max_positions_per_attributes, max_positions_per_attributes,
)?; )?;
@ -367,7 +353,7 @@ fn send_and_extract_flattened_documents_data(
} = extract_fid_docid_facet_values( } = extract_fid_docid_facet_values(
flattened_documents_chunk.clone(), flattened_documents_chunk.clone(),
indexer, indexer,
faceted_fields, &settings_diff,
geo_fields_ids, geo_fields_ids,
)?; )?;

View File

@ -6,9 +6,9 @@ mod typed_chunk;
use std::collections::{HashMap, HashSet}; use std::collections::{HashMap, HashSet};
use std::io::{Read, Seek}; use std::io::{Read, Seek};
use std::iter::FromIterator;
use std::num::NonZeroU32; use std::num::NonZeroU32;
use std::result::Result as StdResult; use std::result::Result as StdResult;
use std::sync::Arc;
use crossbeam_channel::{Receiver, Sender}; use crossbeam_channel::{Receiver, Sender};
use grenad::{Merger, MergerBuilder}; use grenad::{Merger, MergerBuilder};
@ -33,6 +33,7 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
pub use self::transform::{Transform, TransformOutput}; pub use self::transform::{Transform, TransformOutput};
use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::{Error, InternalError, UserError}; use crate::error::{Error, InternalError, UserError};
use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder;
pub use crate::update::index_documents::helpers::CursorClonableMmap; pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{ use crate::update::{
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
@ -259,21 +260,6 @@ where
.expect("Invalid document addition state") .expect("Invalid document addition state")
.output_from_sorter(self.wtxn, &self.progress)?; .output_from_sorter(self.wtxn, &self.progress)?;
let new_facets = output.compute_real_facets(self.wtxn, self.index)?;
self.index.put_faceted_fields(self.wtxn, &new_facets)?;
// in case new fields were introduced we're going to recreate the searchable fields.
if let Some(faceted_fields) = self.index.user_defined_searchable_fields(self.wtxn)? {
// we can't keep references on the faceted fields while we update the index thus we need to own it.
let faceted_fields: Vec<String> =
faceted_fields.into_iter().map(str::to_string).collect();
self.index.put_all_searchable_fields_from_fields_ids_map(
self.wtxn,
&faceted_fields.iter().map(String::as_ref).collect::<Vec<_>>(),
&output.fields_ids_map,
)?;
}
let indexed_documents = output.documents_count as u64; let indexed_documents = output.documents_count as u64;
let number_of_documents = self.execute_raw(output)?; let number_of_documents = self.execute_raw(output)?;
@ -296,32 +282,35 @@ where
let TransformOutput { let TransformOutput {
primary_key, primary_key,
fields_ids_map, mut settings_diff,
field_distribution, field_distribution,
documents_count, documents_count,
original_documents, original_documents,
flattened_documents, flattened_documents,
} = output; } = output;
// The fields_ids_map is put back to the store now so the rest of the transaction sees an // update the internal facet and searchable list,
// up to date field map. // because they might have changed due to the nested documents flattening.
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; settings_diff.new.recompute_facets(self.wtxn, self.index)?;
settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
let settings_diff = Arc::new(settings_diff);
let backup_pool; let backup_pool;
let pool = match self.indexer_config.thread_pool { let pool = match self.indexer_config.thread_pool {
Some(ref pool) => pool, Some(ref pool) => pool,
#[cfg(not(test))]
None => { None => {
// We initialize a bakcup pool with the default // We initialize a backup pool with the default
// settings if none have already been set. // settings if none have already been set.
backup_pool = rayon::ThreadPoolBuilder::new().build()?; #[allow(unused_mut)]
&backup_pool let mut pool_builder = ThreadPoolNoAbortBuilder::new();
}
#[cfg(test)] #[cfg(test)]
None => { {
// We initialize a bakcup pool with the default pool_builder = pool_builder.num_threads(1);
// settings if none have already been set. }
backup_pool = rayon::ThreadPoolBuilder::new().num_threads(1).build()?;
backup_pool = pool_builder.build()?;
&backup_pool &backup_pool
} }
}; };
@ -333,13 +322,8 @@ where
) = crossbeam_channel::unbounded(); ) = crossbeam_channel::unbounded();
// get the primary key field id // get the primary key field id
let primary_key_id = fields_ids_map.id(&primary_key).unwrap(); let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
// get searchable fields for word databases
let searchable_fields =
self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter);
// get filterable fields for facet databases
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
// get the fid of the `_geo.lat` and `_geo.lng` fields. // get the fid of the `_geo.lat` and `_geo.lng` fields.
let mut field_id_map = self.index.fields_ids_map(self.wtxn)?; let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
@ -362,12 +346,6 @@ where
None => None, None => None,
}; };
let stop_words = self.index.stop_words(self.wtxn)?;
let separators = self.index.allowed_separators(self.wtxn)?;
let dictionary = self.index.dictionary(self.wtxn)?;
let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default();
let pool_params = GrenadParameters { let pool_params = GrenadParameters {
chunk_compression_type: self.indexer_config.chunk_compression_type, chunk_compression_type: self.indexer_config.chunk_compression_type,
chunk_compression_level: self.indexer_config.chunk_compression_level, chunk_compression_level: self.indexer_config.chunk_compression_level,
@ -400,8 +378,6 @@ where
let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes;
let cloned_embedder = self.embedders.clone();
let mut final_documents_ids = RoaringBitmap::new(); let mut final_documents_ids = RoaringBitmap::new();
let mut databases_seen = 0; let mut databases_seen = 0;
let mut word_position_docids = None; let mut word_position_docids = None;
@ -410,7 +386,6 @@ where
let mut exact_word_docids = None; let mut exact_word_docids = None;
let mut chunk_accumulator = ChunkAccumulator::default(); let mut chunk_accumulator = ChunkAccumulator::default();
let mut dimension = HashMap::new(); let mut dimension = HashMap::new();
let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
let current_span = tracing::Span::current(); let current_span = tracing::Span::current();
@ -428,10 +403,6 @@ where
let flattened_chunk_iter = let flattened_chunk_iter =
grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size); grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size);
let separators: Option<Vec<_>> =
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
let dictionary: Option<Vec<_>> =
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
let result = original_chunk_iter.and_then(|original_chunk| { let result = original_chunk_iter.and_then(|original_chunk| {
let flattened_chunk = flattened_chunk_iter?; let flattened_chunk = flattened_chunk_iter?;
// extract all databases from the chunked obkv documents // extract all databases from the chunked obkv documents
@ -440,18 +411,10 @@ where
flattened_chunk, flattened_chunk,
pool_params, pool_params,
lmdb_writer_sx.clone(), lmdb_writer_sx.clone(),
searchable_fields,
faceted_fields,
primary_key_id, primary_key_id,
geo_fields_ids, geo_fields_ids,
field_id_map, settings_diff.clone(),
stop_words,
separators.as_deref(),
dictionary.as_deref(),
max_positions_per_attributes, max_positions_per_attributes,
exact_attributes,
proximity_precision,
cloned_embedder,
) )
}); });
@ -571,7 +534,7 @@ where
} }
Ok(()) Ok(())
})?; }).map_err(InternalError::from)??;
// We write the field distribution into the main database // We write the field distribution into the main database
self.index.put_field_distribution(self.wtxn, &field_distribution)?; self.index.put_field_distribution(self.wtxn, &field_distribution)?;
@ -600,7 +563,8 @@ where
writer.build(wtxn, &mut rng, None)?; writer.build(wtxn, &mut rng, None)?;
} }
Result::Ok(()) Result::Ok(())
})?; })
.map_err(InternalError::from)??;
} }
self.execute_prefix_databases( self.execute_prefix_databases(

View File

@ -1,12 +1,11 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::btree_map::Entry as BEntry; use std::collections::btree_map::Entry as BEntry;
use std::collections::hash_map::Entry as HEntry; use std::collections::hash_map::Entry as HEntry;
use std::collections::{HashMap, HashSet}; use std::collections::HashMap;
use std::fs::File; use std::fs::File;
use std::io::{Read, Seek}; use std::io::{Read, Seek};
use fxhash::FxHashMap; use fxhash::FxHashMap;
use heed::RoTxn;
use itertools::Itertools; use itertools::Itertools;
use obkv::{KvReader, KvReaderU16, KvWriter}; use obkv::{KvReader, KvReaderU16, KvWriter};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
@ -21,14 +20,17 @@ use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
use crate::error::{Error, InternalError, UserError}; use crate::error::{Error, InternalError, UserError};
use crate::index::{db_name, main_key}; use crate::index::{db_name, main_key};
use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; use crate::update::del_add::{
del_add_from_two_obkvs, into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd,
};
use crate::update::index_documents::GrenadParameters; use crate::update::index_documents::GrenadParameters;
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result}; use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result};
pub struct TransformOutput { pub struct TransformOutput {
pub primary_key: String, pub primary_key: String,
pub fields_ids_map: FieldsIdsMap, pub settings_diff: InnerIndexSettingsDiff,
pub field_distribution: FieldDistribution, pub field_distribution: FieldDistribution,
pub documents_count: usize, pub documents_count: usize,
pub original_documents: File, pub original_documents: File,
@ -282,7 +284,9 @@ impl<'a, 'i> Transform<'a, 'i> {
self.original_sorter self.original_sorter
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
let base_obkv = KvReader::new(base_obkv); let base_obkv = KvReader::new(base_obkv);
if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? { if let Some(flattened_obkv) =
Self::flatten_from_fields_ids_map(&base_obkv, &mut self.fields_ids_map)?
{
// we recreate our buffer with the flattened documents // we recreate our buffer with the flattened documents
document_sorter_value_buffer.clear(); document_sorter_value_buffer.clear();
document_sorter_value_buffer.push(Operation::Addition as u8); document_sorter_value_buffer.push(Operation::Addition as u8);
@ -315,7 +319,9 @@ impl<'a, 'i> Transform<'a, 'i> {
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
let flattened_obkv = KvReader::new(&obkv_buffer); let flattened_obkv = KvReader::new(&obkv_buffer);
if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { if let Some(obkv) =
Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)?
{
document_sorter_value_buffer.clear(); document_sorter_value_buffer.clear();
document_sorter_value_buffer.push(Operation::Addition as u8); document_sorter_value_buffer.push(Operation::Addition as u8);
into_del_add_obkv( into_del_add_obkv(
@ -524,7 +530,9 @@ impl<'a, 'i> Transform<'a, 'i> {
// flatten it and push it as to delete in the flattened_sorter // flatten it and push it as to delete in the flattened_sorter
let flattened_obkv = KvReader::new(base_obkv); let flattened_obkv = KvReader::new(base_obkv);
if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { if let Some(obkv) =
Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)?
{
// we recreate our buffer with the flattened documents // we recreate our buffer with the flattened documents
document_sorter_value_buffer.clear(); document_sorter_value_buffer.clear();
document_sorter_value_buffer.push(Operation::Deletion as u8); document_sorter_value_buffer.push(Operation::Deletion as u8);
@ -541,8 +549,15 @@ impl<'a, 'i> Transform<'a, 'i> {
// Flatten a document from the fields ids map contained in self and insert the new // Flatten a document from the fields ids map contained in self and insert the new
// created fields. Returns `None` if the document doesn't need to be flattened. // created fields. Returns `None` if the document doesn't need to be flattened.
#[tracing::instrument(level = "trace", skip(self, obkv), target = "indexing::transform")] #[tracing::instrument(
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> { level = "trace",
skip(obkv, fields_ids_map),
target = "indexing::transform"
)]
fn flatten_from_fields_ids_map(
obkv: &KvReader<FieldId>,
fields_ids_map: &mut FieldsIdsMap,
) -> Result<Option<Vec<u8>>> {
if obkv if obkv
.iter() .iter()
.all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value)) .all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value))
@ -563,7 +578,7 @@ impl<'a, 'i> Transform<'a, 'i> {
// all the raw values get inserted directly in the `key_value` vec. // all the raw values get inserted directly in the `key_value` vec.
for (key, value) in obkv.iter() { for (key, value) in obkv.iter() {
if json_depth_checker::should_flatten_from_unchecked_slice(value) { if json_depth_checker::should_flatten_from_unchecked_slice(value) {
let key = self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { let key = fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
field_id: key, field_id: key,
process: "Flatten from fields ids map.", process: "Flatten from fields ids map.",
})?; })?;
@ -581,7 +596,7 @@ impl<'a, 'i> Transform<'a, 'i> {
// Once we have the flattened version we insert all the new generated fields_ids // Once we have the flattened version we insert all the new generated fields_ids
// (if any) in the fields ids map and serialize the value. // (if any) in the fields ids map and serialize the value.
for (key, value) in flattened.into_iter() { for (key, value) in flattened.into_iter() {
let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; let fid = fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
key_value.push((fid, value.into())); key_value.push((fid, value.into()));
} }
@ -792,9 +807,19 @@ impl<'a, 'i> Transform<'a, 'i> {
fst_new_external_documents_ids_builder.insert(key, value) fst_new_external_documents_ids_builder.insert(key, value)
})?; })?;
let old_inner_settings = InnerIndexSettings::from_index(self.index, wtxn)?;
let mut new_inner_settings = old_inner_settings.clone();
new_inner_settings.fields_ids_map = self.fields_ids_map;
let settings_diff = InnerIndexSettingsDiff {
old: old_inner_settings,
new: new_inner_settings,
embedding_configs_updated: false,
settings_update_only: false,
};
Ok(TransformOutput { Ok(TransformOutput {
primary_key, primary_key,
fields_ids_map: self.fields_ids_map, settings_diff,
field_distribution, field_distribution,
documents_count: self.documents_count, documents_count: self.documents_count,
original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
@ -804,6 +829,44 @@ impl<'a, 'i> Transform<'a, 'i> {
}) })
} }
/// Rebind the field ids of the provided document from the old `FieldsIdsMap`
/// to the new one (both taken from `settings_diff`), then fill the provided
/// buffers with del/add delta documents.
///
/// - `old_obkv`: the document as currently stored, keyed by the old field ids.
/// - `settings_diff`: supplies `old.fields_ids_map` and `new.fields_ids_map`.
/// - `original_obkv_buffer`: cleared, then filled from the (old, new) non-flattened documents.
/// - `flattened_obkv_buffer`: cleared, then filled from the (old, new) flattened documents.
///
/// Errors from the obkv writer, from `flatten_from_fields_ids_map` and from
/// `del_add_from_two_obkvs` are propagated with `?`.
fn rebind_existing_document(
old_obkv: KvReader<FieldId>,
settings_diff: &InnerIndexSettingsDiff,
original_obkv_buffer: &mut Vec<u8>,
flattened_obkv_buffer: &mut Vec<u8>,
) -> Result<()> {
// Both maps are cloned because `flatten_from_fields_ids_map` takes a `&mut FieldsIdsMap`
// while `settings_diff` is only borrowed immutably. Any field id inserted during
// flattening therefore lands in these local clones only.
let mut old_fields_ids_map = settings_diff.old.fields_ids_map.clone();
let mut new_fields_ids_map = settings_diff.new.fields_ids_map.clone();
let mut obkv_writer = KvWriter::<_, FieldId>::memory();
// We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv.
// A field is copied only when its name exists in the old map and the old document
// holds a value for the corresponding old id.
for (id, name) in new_fields_ids_map.iter() {
if let Some(val) = old_fields_ids_map.id(name).and_then(|id| old_obkv.get(id)) {
obkv_writer.insert(id, val)?;
}
}
let data = obkv_writer.into_inner()?;
let new_obkv = KvReader::<FieldId>::new(&data);
// Take the non-flattened version if `flatten_from_fields_ids_map` returns `None`.
let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?;
let old_flattened =
old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::<FieldId>::new);
let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?;
let new_flattened =
new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::<FieldId>::new);
// The buffers are reused across calls by the caller, hence the explicit clear.
original_obkv_buffer.clear();
flattened_obkv_buffer.clear();
// NOTE(review): `del_add_from_two_obkvs` is defined in `crate::update::del_add`;
// presumably it encodes the difference between the two versions — confirm there.
del_add_from_two_obkvs(&old_obkv, &new_obkv, original_obkv_buffer)?;
del_add_from_two_obkvs(&old_flattened, &new_flattened, flattened_obkv_buffer)?;
Ok(())
}
/// Clear all databases. Returns a `TransformOutput` with a file that contains the documents /// Clear all databases. Returns a `TransformOutput` with a file that contains the documents
/// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. /// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument.
/// ///
@ -811,8 +874,7 @@ impl<'a, 'i> Transform<'a, 'i> {
pub fn prepare_for_documents_reindexing( pub fn prepare_for_documents_reindexing(
self, self,
wtxn: &mut heed::RwTxn<'i>, wtxn: &mut heed::RwTxn<'i>,
old_fields_ids_map: FieldsIdsMap, settings_diff: InnerIndexSettingsDiff,
mut new_fields_ids_map: FieldsIdsMap,
) -> Result<TransformOutput> { ) -> Result<TransformOutput> {
// There already has been a document addition, the primary key should be set by now. // There already has been a document addition, the primary key should be set by now.
let primary_key = self let primary_key = self
@ -848,78 +910,27 @@ impl<'a, 'i> Transform<'a, 'i> {
self.indexer_settings.max_memory.map(|mem| mem / 2), self.indexer_settings.max_memory.map(|mem| mem / 2),
); );
let mut obkv_buffer = Vec::new(); let mut original_obkv_buffer = Vec::new();
let mut flattened_obkv_buffer = Vec::new();
let mut document_sorter_key_buffer = Vec::new(); let mut document_sorter_key_buffer = Vec::new();
let mut document_sorter_value_buffer = Vec::new();
for result in self.index.external_documents_ids().iter(wtxn)? { for result in self.index.external_documents_ids().iter(wtxn)? {
let (external_id, docid) = result?; let (external_id, docid) = result?;
let obkv = self.index.documents.get(wtxn, &docid)?.ok_or( let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
)?; )?;
obkv_buffer.clear(); Self::rebind_existing_document(
let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer); old_obkv,
&settings_diff,
// We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. &mut original_obkv_buffer,
for (id, name) in new_fields_ids_map.iter() { &mut flattened_obkv_buffer,
if let Some(val) = old_fields_ids_map.id(name).and_then(|id| obkv.get(id)) { )?;
obkv_writer.insert(id, val)?;
}
}
let buffer = obkv_writer.into_inner()?;
document_sorter_key_buffer.clear(); document_sorter_key_buffer.clear();
document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
document_sorter_value_buffer.clear(); original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?;
into_del_add_obkv( flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?;
KvReaderU16::new(buffer),
DelAddOperation::Addition,
&mut document_sorter_value_buffer,
)?;
original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
// Once we have the document. We're going to flatten it
// and insert it in the flattened sorter.
let mut doc = serde_json::Map::new();
let reader = obkv::KvReader::new(buffer);
for (k, v) in reader.iter() {
let key = new_fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId {
field_id: k,
process: "Accessing field distribution in transform.",
})?;
let value = serde_json::from_slice::<serde_json::Value>(v)
.map_err(InternalError::SerdeJson)?;
doc.insert(key.to_string(), value);
}
let flattened = flatten_serde_json::flatten(&doc);
// Once we have the flattened version we can convert it back to obkv and
// insert all the new generated fields_ids (if any) in the fields ids map.
let mut buffer: Vec<u8> = Vec::new();
let mut writer = KvWriter::new(&mut buffer);
let mut flattened: Vec<_> = flattened.into_iter().collect();
// we reorder the field to get all the known field first
flattened.sort_unstable_by_key(|(key, _)| {
new_fields_ids_map.id(key).unwrap_or(FieldId::MAX)
});
for (key, value) in flattened {
let fid =
new_fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
writer.insert(fid, &value)?;
}
document_sorter_value_buffer.clear();
into_del_add_obkv(
KvReaderU16::new(&buffer),
DelAddOperation::Addition,
&mut document_sorter_value_buffer,
)?;
flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
} }
let grenad_params = GrenadParameters { let grenad_params = GrenadParameters {
@ -934,22 +945,14 @@ impl<'a, 'i> Transform<'a, 'i> {
let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?; let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?;
let output = TransformOutput { Ok(TransformOutput {
primary_key, primary_key,
fields_ids_map: new_fields_ids_map,
field_distribution, field_distribution,
settings_diff,
documents_count, documents_count,
original_documents: original_documents.into_inner().into_inner(), original_documents: original_documents.into_inner().into_inner(),
flattened_documents: flattened_documents.into_inner().into_inner(), flattened_documents: flattened_documents.into_inner().into_inner(),
}; })
let new_facets = output.compute_real_facets(wtxn, self.index)?;
self.index.put_faceted_fields(wtxn, &new_facets)?;
// We clear the full database (words-fst, documents ids and documents content).
ClearDocuments::new(wtxn, self.index).execute()?;
Ok(output)
} }
} }
@ -964,20 +967,6 @@ fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> {
vec.into_iter().map(|_| unreachable!()).collect() vec.into_iter().map(|_| unreachable!()).collect()
} }
impl TransformOutput {
// find and insert the new field ids
pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result<HashSet<String>> {
let user_defined_facets = index.user_defined_faceted_fields(rtxn)?;
Ok(self
.fields_ids_map
.names()
.filter(|&field| crate::is_faceted(field, &user_defined_facets))
.map(|field| field.to_string())
.collect())
}
}
#[cfg(test)] #[cfg(test)]
mod test { mod test {
use super::*; use super::*;

View File

@ -1,5 +1,6 @@
use grenad::CompressionType; use grenad::CompressionType;
use rayon::ThreadPool;
use crate::thread_pool_no_abort::ThreadPoolNoAbort;
#[derive(Debug)] #[derive(Debug)]
pub struct IndexerConfig { pub struct IndexerConfig {
@ -9,7 +10,7 @@ pub struct IndexerConfig {
pub max_memory: Option<usize>, pub max_memory: Option<usize>,
pub chunk_compression_type: CompressionType, pub chunk_compression_type: CompressionType,
pub chunk_compression_level: Option<u32>, pub chunk_compression_level: Option<u32>,
pub thread_pool: Option<ThreadPool>, pub thread_pool: Option<ThreadPoolNoAbort>,
pub max_positions_per_attributes: Option<u32>, pub max_positions_per_attributes: Option<u32>,
pub skip_index_budget: bool, pub skip_index_budget: bool,
} }

View File

@ -20,7 +20,7 @@ use crate::update::index_documents::IndexDocumentsMethod;
use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::update::{IndexDocuments, UpdateIndexingStep};
use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings};
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
use crate::{FieldsIdsMap, Index, Result}; use crate::{FieldId, FieldsIdsMap, Index, Result};
#[derive(Debug, Clone, PartialEq, Eq, Copy)] #[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub enum Setting<T> { pub enum Setting<T> {
@ -385,14 +385,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
#[tracing::instrument( #[tracing::instrument(
level = "trace" level = "trace"
skip(self, progress_callback, should_abort, old_fields_ids_map), skip(self, progress_callback, should_abort, settings_diff),
target = "indexing::documents" target = "indexing::documents"
)] )]
fn reindex<FP, FA>( fn reindex<FP, FA>(
&mut self, &mut self,
progress_callback: &FP, progress_callback: &FP,
should_abort: &FA, should_abort: &FA,
old_fields_ids_map: FieldsIdsMap, settings_diff: InnerIndexSettingsDiff,
) -> Result<()> ) -> Result<()>
where where
FP: Fn(UpdateIndexingStep) + Sync, FP: Fn(UpdateIndexingStep) + Sync,
@ -400,7 +400,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
{ {
puffin::profile_function!(); puffin::profile_function!();
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
// if the settings are set before any document update, we don't need to do anything, and // if the settings are set before any document update, we don't need to do anything, and
// will set the primary key during the first document addition. // will set the primary key during the first document addition.
if self.index.number_of_documents(self.wtxn)? == 0 { if self.index.number_of_documents(self.wtxn)? == 0 {
@ -416,14 +415,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
)?; )?;
// We clear the databases and remap the documents fields based on the new `FieldsIdsMap`. // We clear the databases and remap the documents fields based on the new `FieldsIdsMap`.
let output = transform.prepare_for_documents_reindexing( let output = transform.prepare_for_documents_reindexing(self.wtxn, settings_diff)?;
self.wtxn,
old_fields_ids_map,
fields_ids_map,
)?;
let embedder_configs = self.index.embedding_configs(self.wtxn)?;
let embedders = self.embedders(embedder_configs)?;
// We index the generated `TransformOutput` which must contain // We index the generated `TransformOutput` which must contain
// all the documents with fields in the newly defined searchable order. // all the documents with fields in the newly defined searchable order.
@ -436,32 +428,11 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
&should_abort, &should_abort,
)?; )?;
let indexing_builder = indexing_builder.with_embedders(embedders);
indexing_builder.execute_raw(output)?; indexing_builder.execute_raw(output)?;
Ok(()) Ok(())
} }
fn embedders(
&self,
embedding_configs: Vec<(String, EmbeddingConfig)>,
) -> Result<EmbeddingConfigs> {
let res: Result<_> = embedding_configs
.into_iter()
.map(|(name, EmbeddingConfig { embedder_options, prompt })| {
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
let embedder = Arc::new(
Embedder::new(embedder_options.clone())
.map_err(crate::vector::Error::from)
.map_err(crate::Error::from)?,
);
Ok((name, (embedder, prompt)))
})
.collect();
res.map(EmbeddingConfigs::new)
}
fn update_displayed(&mut self) -> Result<bool> { fn update_displayed(&mut self) -> Result<bool> {
match self.displayed_fields { match self.displayed_fields {
Setting::Set(ref fields) => { Setting::Set(ref fields) => {
@ -1038,6 +1009,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
} }
Setting::NotSet => false, Setting::NotSet => false,
}; };
// if any changes force a reindexing
// clear the vector database.
if update {
self.index.vector_arroy.clear(self.wtxn)?;
}
Ok(update) Ok(update)
} }
@ -1066,20 +1044,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
{ {
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
// Note: this MUST be before `update_sortable` so that we can get the old value to compare with the updated value afterwards let old_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
let existing_fields: HashSet<_> = self
.index
.field_distribution(self.wtxn)?
.into_iter()
.filter_map(|(field, count)| (count != 0).then_some(field))
.collect();
let old_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?;
let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
// never trigger re-indexing
self.update_displayed()?; self.update_displayed()?;
self.update_filterable()?;
self.update_sortable()?;
self.update_distinct_field()?; self.update_distinct_field()?;
self.update_criteria()?; self.update_criteria()?;
self.update_primary_key()?; self.update_primary_key()?;
@ -1089,16 +1057,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.update_max_values_per_facet()?; self.update_max_values_per_facet()?;
self.update_sort_facet_values_by()?; self.update_sort_facet_values_by()?;
self.update_pagination_max_total_hits()?; self.update_pagination_max_total_hits()?;
self.update_search_cutoff()?;
let faceted_updated = self.update_faceted(existing_fields, old_faceted_fields)?; // could trigger re-indexing
let stop_words_updated = self.update_stop_words()?; self.update_filterable()?;
let non_separator_tokens_updated = self.update_non_separator_tokens()?; self.update_sortable()?;
let separator_tokens_updated = self.update_separator_tokens()?; self.update_stop_words()?;
let dictionary_updated = self.update_dictionary()?; self.update_non_separator_tokens()?;
let synonyms_updated = self.update_synonyms()?; self.update_separator_tokens()?;
let searchable_updated = self.update_searchable()?; self.update_dictionary()?;
let exact_attributes_updated = self.update_exact_attributes()?; self.update_synonyms()?;
let proximity_precision = self.update_proximity_precision()?; self.update_searchable()?;
self.update_exact_attributes()?;
self.update_proximity_precision()?;
// TODO: very rough approximation of the needs for reindexing where any change will result in // TODO: very rough approximation of the needs for reindexing where any change will result in
// a full reindexing. // a full reindexing.
// What can be done instead: // What can be done instead:
@ -1107,53 +1078,193 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
// 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage // 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
let embedding_configs_updated = self.update_embedding_configs()?; let embedding_configs_updated = self.update_embedding_configs()?;
// never trigger re-indexing let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
self.update_search_cutoff()?; let inner_settings_diff = InnerIndexSettingsDiff {
old: old_inner_settings,
new: new_inner_settings,
embedding_configs_updated,
settings_update_only: true,
};
if stop_words_updated if inner_settings_diff.any_reindexing_needed() {
|| non_separator_tokens_updated self.reindex(&progress_callback, &should_abort, inner_settings_diff)?;
|| separator_tokens_updated
|| dictionary_updated
|| faceted_updated
|| synonyms_updated
|| searchable_updated
|| exact_attributes_updated
|| proximity_precision
|| embedding_configs_updated
{
self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?;
} }
Ok(()) Ok(())
} }
fn update_faceted(
&self,
existing_fields: HashSet<String>,
old_faceted_fields: HashSet<String>,
) -> Result<bool> {
if existing_fields.iter().any(|field| field.contains('.')) {
return Ok(true);
} }
/// A pair of `InnerIndexSettings` snapshots (`old` and `new`) plus two flags,
/// used to decide which parts of the index must be reindexed.
pub struct InnerIndexSettingsDiff {
// Settings as they were before the update.
pub(crate) old: InnerIndexSettings,
// Settings as they are after the update.
pub(crate) new: InnerIndexSettings,
// TODO: compare directly the embedders.
// Returned as-is by `reindex_vectors`; the embedders themselves are not compared.
pub(crate) embedding_configs_updated: bool,
// Returned as-is by `settings_update_only`.
pub(crate) settings_update_only: bool,
}
impl InnerIndexSettingsDiff {
/// Returns `true` when at least one of `reindex_searchable`, `reindex_facets`
/// or `reindex_vectors` returns `true`.
/// The checks are chained with `||`, so evaluation stops at the first `true`.
pub fn any_reindexing_needed(&self) -> bool {
self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
}
/// Returns `true` as soon as one of the following differs between `old` and `new`:
/// a pair of entries yielded at the same position by the two fields ids maps,
/// the stop words (compared through the raw bytes of their FST), the allowed
/// separators, the dictionary, the user-defined searchable fields,
/// the exact attributes, or the proximity precision.
///
/// NOTE(review): synonyms are not part of this comparison — confirm that
/// a synonyms change is not expected to trigger a reindexing.
pub fn reindex_searchable(&self) -> bool {
// `zip` stops at the end of the shorter iterator, so entries that exist
// in only one of the two maps are not compared here.
self.old
.fields_ids_map
.iter()
.zip(self.new.fields_ids_map.iter())
.any(|(old, new)| old != new)
|| self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
!= self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes())
|| self.old.allowed_separators != self.new.allowed_separators
|| self.old.dictionary != self.new.dictionary
|| self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields
|| self.old.exact_attributes != self.new.exact_attributes
|| self.old.proximity_precision != self.new.proximity_precision
}
pub fn reindex_facets(&self) -> bool {
let existing_fields = &self.new.existing_fields;
if existing_fields.iter().any(|field| field.contains('.')) {
return true;
}
let old_faceted_fields = &self.old.user_defined_faceted_fields;
if old_faceted_fields.iter().any(|field| field.contains('.')) { if old_faceted_fields.iter().any(|field| field.contains('.')) {
return Ok(true); return true;
} }
// If there are new faceted fields we indicate that we must reindex, as we must
// index the new fields as facets. It means that the distinct attribute,
// an Asc/Desc criterion or a filtered attribute has been added or removed.
let new_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?; let new_faceted_fields = &self.new.user_defined_faceted_fields;
if new_faceted_fields.iter().any(|field| field.contains('.')) { if new_faceted_fields.iter().any(|field| field.contains('.')) {
return Ok(true); return true;
} }
let faceted_updated = let faceted_updated =
(&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields); (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields);
Ok(faceted_updated) self.old
.fields_ids_map
.iter()
.zip(self.new.fields_ids_map.iter())
.any(|(old, new)| old != new)
|| faceted_updated
} }
/// Returns the `embedding_configs_updated` flag stored in this diff.
/// No comparison of the old and new embedding configs is performed here.
pub fn reindex_vectors(&self) -> bool {
self.embedding_configs_updated
}
/// Returns the `settings_update_only` flag stored in this diff.
pub fn settings_update_only(&self) -> bool {
self.settings_update_only
}
}
/// Owned snapshot of the index settings, as read by `InnerIndexSettings::from_index`.
#[derive(Clone)]
pub(crate) struct InnerIndexSettings {
// From `index.stop_words`, with the set data converted to an owned `Vec<u8>`.
pub stop_words: Option<fst::Set<Vec<u8>>>,
// From `index.allowed_separators`.
pub allowed_separators: Option<BTreeSet<String>>,
// From `index.dictionary`.
pub dictionary: Option<BTreeSet<String>>,
// From `index.fields_ids_map`.
pub fields_ids_map: FieldsIdsMap,
// From `index.user_defined_faceted_fields`.
pub user_defined_faceted_fields: HashSet<String>,
// From `index.user_defined_searchable_fields`, converted to owned strings.
pub user_defined_searchable_fields: Option<Vec<String>>,
// From `index.faceted_fields_ids`; read again by `recompute_facets`.
pub faceted_fields_ids: HashSet<FieldId>,
// From `index.searchable_fields_ids`.
pub searchable_fields_ids: Option<Vec<FieldId>>,
// From `index.exact_attributes_ids`.
pub exact_attributes: HashSet<FieldId>,
// From `index.proximity_precision`, or the type's default when the index returns `None`.
pub proximity_precision: ProximityPrecision,
// Built by `embedders` from `index.embedding_configs`.
pub embedding_configs: EmbeddingConfigs,
// Field names whose count in the index field distribution is not zero.
pub existing_fields: HashSet<String>,
}
impl InnerIndexSettings {
    /// Loads every setting of this struct from `index` using the read
    /// transaction `rtxn`.
    ///
    /// # Errors
    ///
    /// Returns the first error produced by one of the index accessors or by
    /// the construction of the embedders.
    pub fn from_index(index: &Index, rtxn: &heed::RoTxn) -> Result<Self> {
        // The struct stores the stop words over an owned `Vec<u8>`, hence the
        // conversion of the set's backing data.
        let stop_words = index.stop_words(rtxn)?.map(|set| set.map_data(Vec::from).unwrap());
        let allowed_separators = index.allowed_separators(rtxn)?;
        let dictionary = index.dictionary(rtxn)?;
        let fields_ids_map = index.fields_ids_map(rtxn)?;
        // Turn the names handed out by the index into owned strings.
        let user_searchables = index
            .user_defined_searchable_fields(rtxn)?
            .map(|names| names.into_iter().map(String::from).collect());
        let user_facets = index.user_defined_faceted_fields(rtxn)?;
        let searchable_ids = index.searchable_fields_ids(rtxn)?;
        let faceted_ids = index.faceted_fields_ids(rtxn)?;
        let exact_attributes = index.exact_attributes_ids(rtxn)?;
        let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
        let embedding_configs = embedders(index.embedding_configs(rtxn)?)?;

        // Only keep the fields that are counted at least once in the
        // field distribution.
        let mut existing_fields = HashSet::new();
        for (field, count) in index.field_distribution(rtxn)? {
            if count != 0 {
                existing_fields.insert(field);
            }
        }

        Ok(Self {
            stop_words,
            allowed_separators,
            dictionary,
            fields_ids_map,
            user_defined_faceted_fields: user_facets,
            user_defined_searchable_fields: user_searchables,
            faceted_fields_ids: faceted_ids,
            searchable_fields_ids: searchable_ids,
            exact_attributes,
            proximity_precision,
            embedding_configs,
            existing_fields,
        })
    }

    /// Recomputes the faceted fields from the names present in
    /// `fields_ids_map`, writes them to the index, then reloads
    /// `faceted_fields_ids` from the index.
    pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
        // A name is kept when `crate::is_faceted` accepts it against the
        // user-defined faceted fields.
        let faceted_names = self
            .fields_ids_map
            .names()
            .filter(|&name| crate::is_faceted(name, &self.user_defined_faceted_fields))
            .map(ToString::to_string)
            .collect();
        index.put_faceted_fields(wtxn, &faceted_names)?;
        self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?;
        Ok(())
    }

    /// Rewrites the searchable fields of the index from the user-defined ones
    /// and the current `fields_ids_map`, then reloads `searchable_fields_ids`.
    ///
    /// Nothing is written when no searchable fields were defined by the user.
    pub fn recompute_searchables(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
        // New fields may have been introduced, so the searchable fields are
        // recreated from scratch.
        if let Some(user_defined) = &self.user_defined_searchable_fields {
            let names = user_defined.iter().map(String::as_ref).collect::<Vec<_>>();
            index.put_all_searchable_fields_from_fields_ids_map(
                wtxn,
                &names,
                &self.fields_ids_map,
            )?;
            self.searchable_fields_ids = index.searchable_fields_ids(wtxn)?;
        }
        Ok(())
    }
}
fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result<EmbeddingConfigs> {
let res: Result<_> = embedding_configs
.into_iter()
.map(|(name, EmbeddingConfig { embedder_options, prompt })| {
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
let embedder = Arc::new(
Embedder::new(embedder_options.clone())
.map_err(crate::vector::Error::from)
.map_err(crate::Error::from)?,
);
Ok((name, (embedder, prompt)))
})
.collect();
res.map(EmbeddingConfigs::new)
} }
fn validate_prompt( fn validate_prompt(
@ -1643,6 +1754,70 @@ mod tests {
.unwrap() .unwrap()
.count(); .count();
assert_eq!(count, 4); assert_eq!(count, 4);
// Set the filterable fields to be the age and the name.
index
.update_settings(|settings| {
settings.set_filterable_fields(hashset! { S("age"), S("name") });
})
.unwrap();
// Check that the filterable fields are correctly set.
let rtxn = index.read_txn().unwrap();
let fields_ids = index.filterable_fields(&rtxn).unwrap();
assert_eq!(fields_ids, hashset! { S("age"), S("name") });
let rtxn = index.read_txn().unwrap();
// Only count the field_id 0 and level 0 facet values.
let count = index
.facet_id_f64_docids
.remap_key_type::<Bytes>()
.prefix_iter(&rtxn, &[0, 1, 0])
.unwrap()
.count();
assert_eq!(count, 4);
let rtxn = index.read_txn().unwrap();
// Only count the field_id 0 and level 0 facet values.
let count = index
.facet_id_string_docids
.remap_key_type::<Bytes>()
.prefix_iter(&rtxn, &[0, 0])
.unwrap()
.count();
assert_eq!(count, 5);
// Remove the age from the filterable fields.
index
.update_settings(|settings| {
settings.set_filterable_fields(hashset! { S("name") });
})
.unwrap();
// Check that the filterable fields are correctly set.
let rtxn = index.read_txn().unwrap();
let fields_ids = index.filterable_fields(&rtxn).unwrap();
assert_eq!(fields_ids, hashset! { S("name") });
let rtxn = index.read_txn().unwrap();
// Only count the field_id 0 and level 0 facet values.
let count = index
.facet_id_f64_docids
.remap_key_type::<Bytes>()
.prefix_iter(&rtxn, &[0, 1, 0])
.unwrap()
.count();
assert_eq!(count, 0);
let rtxn = index.read_txn().unwrap();
// Only count the field_id 0 and level 0 facet values.
let count = index
.facet_id_string_docids
.remap_key_type::<Bytes>()
.prefix_iter(&rtxn, &[0, 0])
.unwrap()
.count();
assert_eq!(count, 5);
} }
#[test] #[test]

View File

@ -3,6 +3,7 @@ use std::path::PathBuf;
use hf_hub::api::sync::ApiError; use hf_hub::api::sync::ApiError;
use crate::error::FaultSource; use crate::error::FaultSource;
use crate::PanicCatched;
#[derive(Debug, thiserror::Error)] #[derive(Debug, thiserror::Error)]
#[error("Error while generating embeddings: {inner}")] #[error("Error while generating embeddings: {inner}")]
@ -80,6 +81,8 @@ pub enum EmbedErrorKind {
OpenAiUnexpectedDimension(usize, usize), OpenAiUnexpectedDimension(usize, usize),
#[error("no embedding was produced")] #[error("no embedding was produced")]
MissingEmbedding, MissingEmbedding,
#[error(transparent)]
PanicInThreadPool(#[from] PanicCatched),
} }
impl EmbedError { impl EmbedError {

View File

@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize};
use self::error::{EmbedError, NewEmbedderError}; use self::error::{EmbedError, NewEmbedderError};
use crate::prompt::{Prompt, PromptData}; use crate::prompt::{Prompt, PromptData};
use crate::ThreadPoolNoAbort;
pub mod error; pub mod error;
pub mod hf; pub mod hf;
@ -254,7 +255,7 @@ impl Embedder {
pub fn embed_chunks( pub fn embed_chunks(
&self, &self,
text_chunks: Vec<Vec<String>>, text_chunks: Vec<Vec<String>>,
threads: &rayon::ThreadPool, threads: &ThreadPoolNoAbort,
) -> std::result::Result<Vec<Vec<Embeddings<f32>>>, EmbedError> { ) -> std::result::Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
match self { match self {
Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks), Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks),

View File

@ -3,6 +3,8 @@ use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _};
use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind}; use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind};
use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
use super::{DistributionShift, Embeddings}; use super::{DistributionShift, Embeddings};
use crate::error::FaultSource;
use crate::ThreadPoolNoAbort;
#[derive(Debug)] #[derive(Debug)]
pub struct Embedder { pub struct Embedder {
@ -71,11 +73,16 @@ impl Embedder {
pub fn embed_chunks( pub fn embed_chunks(
&self, &self,
text_chunks: Vec<Vec<String>>, text_chunks: Vec<Vec<String>>,
threads: &rayon::ThreadPool, threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> { ) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
threads.install(move || { threads
.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
}) })
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
} }
pub fn chunk_count_hint(&self) -> usize { pub fn chunk_count_hint(&self) -> usize {

View File

@ -4,7 +4,9 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator as _};
use super::error::{EmbedError, NewEmbedderError}; use super::error::{EmbedError, NewEmbedderError};
use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
use super::{DistributionShift, Embeddings}; use super::{DistributionShift, Embeddings};
use crate::error::FaultSource;
use crate::vector::error::EmbedErrorKind; use crate::vector::error::EmbedErrorKind;
use crate::ThreadPoolNoAbort;
#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)]
pub struct EmbedderOptions { pub struct EmbedderOptions {
@ -241,11 +243,16 @@ impl Embedder {
pub fn embed_chunks( pub fn embed_chunks(
&self, &self,
text_chunks: Vec<Vec<String>>, text_chunks: Vec<Vec<String>>,
threads: &rayon::ThreadPool, threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> { ) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
threads.install(move || { threads
.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
}) })
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
} }
pub fn chunk_count_hint(&self) -> usize { pub fn chunk_count_hint(&self) -> usize {

View File

@ -2,9 +2,12 @@ use deserr::Deserr;
use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::error::EmbedErrorKind;
use super::{ use super::{
DistributionShift, EmbedError, Embedding, Embeddings, NewEmbedderError, REQUEST_PARALLELISM, DistributionShift, EmbedError, Embedding, Embeddings, NewEmbedderError, REQUEST_PARALLELISM,
}; };
use crate::error::FaultSource;
use crate::ThreadPoolNoAbort;
// retrying in case of failure // retrying in case of failure
@ -158,11 +161,16 @@ impl Embedder {
pub fn embed_chunks( pub fn embed_chunks(
&self, &self,
text_chunks: Vec<Vec<String>>, text_chunks: Vec<Vec<String>>,
threads: &rayon::ThreadPool, threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> { ) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> {
threads.install(move || { threads
.install(move || {
text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect()
}) })
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
} }
pub fn chunk_count_hint(&self) -> usize { pub fn chunk_count_hint(&self) -> usize {

View File

@ -301,10 +301,14 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
fn from(value: EmbeddingConfig) -> Self { fn from(value: EmbeddingConfig) -> Self {
let EmbeddingConfig { embedder_options, prompt } = value; let EmbeddingConfig { embedder_options, prompt } = value;
match embedder_options { match embedder_options {
super::EmbedderOptions::HuggingFace(options) => Self { super::EmbedderOptions::HuggingFace(super::hf::EmbedderOptions {
model,
revision,
distribution,
}) => Self {
source: Setting::Set(EmbedderSource::HuggingFace), source: Setting::Set(EmbedderSource::HuggingFace),
model: Setting::Set(options.model), model: Setting::Set(model),
revision: options.revision.map(Setting::Set).unwrap_or_default(), revision: revision.map(Setting::Set).unwrap_or_default(),
api_key: Setting::NotSet, api_key: Setting::NotSet,
dimensions: Setting::NotSet, dimensions: Setting::NotSet,
document_template: Setting::Set(prompt.template), document_template: Setting::Set(prompt.template),
@ -314,14 +318,19 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
path_to_embeddings: Setting::NotSet, path_to_embeddings: Setting::NotSet,
embedding_object: Setting::NotSet, embedding_object: Setting::NotSet,
input_type: Setting::NotSet, input_type: Setting::NotSet,
distribution: options.distribution.map(Setting::Set).unwrap_or_default(), distribution: distribution.map(Setting::Set).unwrap_or_default(),
}, },
super::EmbedderOptions::OpenAi(options) => Self { super::EmbedderOptions::OpenAi(super::openai::EmbedderOptions {
api_key,
embedding_model,
dimensions,
distribution,
}) => Self {
source: Setting::Set(EmbedderSource::OpenAi), source: Setting::Set(EmbedderSource::OpenAi),
model: Setting::Set(options.embedding_model.name().to_owned()), model: Setting::Set(embedding_model.name().to_owned()),
revision: Setting::NotSet, revision: Setting::NotSet,
api_key: options.api_key.map(Setting::Set).unwrap_or_default(), api_key: api_key.map(Setting::Set).unwrap_or_default(),
dimensions: options.dimensions.map(Setting::Set).unwrap_or_default(), dimensions: dimensions.map(Setting::Set).unwrap_or_default(),
document_template: Setting::Set(prompt.template), document_template: Setting::Set(prompt.template),
url: Setting::NotSet, url: Setting::NotSet,
query: Setting::NotSet, query: Setting::NotSet,
@ -329,29 +338,37 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
path_to_embeddings: Setting::NotSet, path_to_embeddings: Setting::NotSet,
embedding_object: Setting::NotSet, embedding_object: Setting::NotSet,
input_type: Setting::NotSet, input_type: Setting::NotSet,
distribution: options.distribution.map(Setting::Set).unwrap_or_default(), distribution: distribution.map(Setting::Set).unwrap_or_default(),
}, },
super::EmbedderOptions::Ollama(options) => Self { super::EmbedderOptions::Ollama(super::ollama::EmbedderOptions {
embedding_model,
url,
api_key,
distribution,
}) => Self {
source: Setting::Set(EmbedderSource::Ollama), source: Setting::Set(EmbedderSource::Ollama),
model: Setting::Set(options.embedding_model.to_owned()), model: Setting::Set(embedding_model),
revision: Setting::NotSet, revision: Setting::NotSet,
api_key: Setting::NotSet, api_key: api_key.map(Setting::Set).unwrap_or_default(),
dimensions: Setting::NotSet, dimensions: Setting::NotSet,
document_template: Setting::Set(prompt.template), document_template: Setting::Set(prompt.template),
url: Setting::NotSet, url: url.map(Setting::Set).unwrap_or_default(),
query: Setting::NotSet, query: Setting::NotSet,
input_field: Setting::NotSet, input_field: Setting::NotSet,
path_to_embeddings: Setting::NotSet, path_to_embeddings: Setting::NotSet,
embedding_object: Setting::NotSet, embedding_object: Setting::NotSet,
input_type: Setting::NotSet, input_type: Setting::NotSet,
distribution: options.distribution.map(Setting::Set).unwrap_or_default(), distribution: distribution.map(Setting::Set).unwrap_or_default(),
}, },
super::EmbedderOptions::UserProvided(options) => Self { super::EmbedderOptions::UserProvided(super::manual::EmbedderOptions {
dimensions,
distribution,
}) => Self {
source: Setting::Set(EmbedderSource::UserProvided), source: Setting::Set(EmbedderSource::UserProvided),
model: Setting::NotSet, model: Setting::NotSet,
revision: Setting::NotSet, revision: Setting::NotSet,
api_key: Setting::NotSet, api_key: Setting::NotSet,
dimensions: Setting::Set(options.dimensions), dimensions: Setting::Set(dimensions),
document_template: Setting::NotSet, document_template: Setting::NotSet,
url: Setting::NotSet, url: Setting::NotSet,
query: Setting::NotSet, query: Setting::NotSet,
@ -359,7 +376,7 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
path_to_embeddings: Setting::NotSet, path_to_embeddings: Setting::NotSet,
embedding_object: Setting::NotSet, embedding_object: Setting::NotSet,
input_type: Setting::NotSet, input_type: Setting::NotSet,
distribution: options.distribution.map(Setting::Set).unwrap_or_default(), distribution: distribution.map(Setting::Set).unwrap_or_default(),
}, },
super::EmbedderOptions::Rest(super::rest::EmbedderOptions { super::EmbedderOptions::Rest(super::rest::EmbedderOptions {
api_key, api_key,

View File

@ -217,9 +217,7 @@ fn add_memory_samples(
memory_counters: &mut Option<MemoryCounterHandles>, memory_counters: &mut Option<MemoryCounterHandles>,
last_memory: &mut MemoryStats, last_memory: &mut MemoryStats,
) -> Option<MemoryStats> { ) -> Option<MemoryStats> {
let Some(stats) = memory else { let stats = memory?;
return None;
};
let memory_counters = let memory_counters =
memory_counters.get_or_insert_with(|| MemoryCounterHandles::new(profile, main)); memory_counters.get_or_insert_with(|| MemoryCounterHandles::new(profile, main));

View File

@ -0,0 +1,68 @@
{
"name": "movies-subset-hf-embeddings",
"run_count": 5,
"extra_cli_args": [
"--max-indexing-threads=4"
],
"assets": {
"movies-100.json": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json",
"sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6"
}
},
"commands": [
{
"route": "experimental-features",
"method": "PATCH",
"body": {
"inline": {
"vectorStore": true
}
},
"synchronous": "DontWait"
},
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"searchableAttributes": [
"title",
"overview"
],
"filterableAttributes": [
"genres",
"release_date"
],
"sortableAttributes": [
"release_date"
]
}
},
"synchronous": "WaitForTask"
},
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"embedders": {
"default": {
"source": "huggingFace"
}
}
}
},
"synchronous": "WaitForTask"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "movies-100.json"
},
"synchronous": "WaitForTask"
}
]
}

View File

@ -0,0 +1,72 @@
{
"name": "settings-add-embeddings-hf",
"run_count": 5,
"extra_cli_args": [
"--max-indexing-threads=4"
],
"assets": {
"movies-100.json": {
"local_location": null,
"remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json",
"sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6"
}
},
"commands": [
{
"route": "experimental-features",
"method": "PATCH",
"body": {
"inline": {
"vectorStore": true
}
},
"synchronous": "DontWait"
},
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"searchableAttributes": [
"title",
"overview"
],
"filterableAttributes": [
"genres",
"release_date"
],
"sortableAttributes": [
"release_date"
]
}
},
"synchronous": "DontWait"
},
{
"route": "indexes/movies/documents",
"method": "POST",
"body": {
"asset": "movies-100.json"
},
"synchronous": "WaitForTask"
},
{
"route": "indexes/movies/settings",
"method": "PATCH",
"body": {
"inline": {
"embedders": {
"default": {
"source": "huggingFace",
"model": null,
"revision": null,
"documentTemplate": null,
"distribution": null
}
}
}
},
"synchronous": "WaitForTask"
}
]
}

View File

@ -1,6 +1,6 @@
{ {
"name": "settings-add-remove-filters.json", "name": "settings-add-remove-filters.json",
"run_count": 2, "run_count": 5,
"extra_cli_args": [ "extra_cli_args": [
"--max-indexing-threads=4" "--max-indexing-threads=4"
], ],

View File

@ -1,6 +1,6 @@
{ {
"name": "settings-proximity-precision.json", "name": "settings-proximity-precision.json",
"run_count": 2, "run_count": 5,
"extra_cli_args": [ "extra_cli_args": [
"--max-indexing-threads=4" "--max-indexing-threads=4"
], ],

View File

@ -1,6 +1,6 @@
{ {
"name": "settings-remove-add-swap-searchable.json", "name": "settings-remove-add-swap-searchable.json",
"run_count": 2, "run_count": 5,
"extra_cli_args": [ "extra_cli_args": [
"--max-indexing-threads=4" "--max-indexing-threads=4"
], ],

View File

@ -1,6 +1,6 @@
{ {
"name": "settings-typo.json", "name": "settings-typo.json",
"run_count": 2, "run_count": 5,
"extra_cli_args": [ "extra_cli_args": [
"--max-indexing-threads=4" "--max-indexing-threads=4"
], ],