mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-18 08:48:32 +08:00
Initial commit
This commit is contained in:
parent
4573f00a0d
commit
91ba938953
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
/target
|
749
Cargo.lock
generated
Normal file
749
Cargo.lock
generated
Normal file
@ -0,0 +1,749 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "0.7.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8716408b8bc624ed7f65d223ddb9ac2d044c0547b6fa4b0d554f3a9540496ada"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f"
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8aac770f1885fd7e387acedd76065302551364496e46b3dd00860b2f8359b9d"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
|
||||
|
||||
[[package]]
|
||||
name = "bitpacking"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3744aff20a3437a99ebc0bb7733e9e60c7bf590478c9b897e95b38d57e5acb68"
|
||||
dependencies = [
|
||||
"crunchy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bstr"
|
||||
version = "0.2.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "31accafdb70df7871592c058eca3985b71104e15ac32f64706022c58867da931"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.0.54"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7bbb73db36c1246e9034e307d0fba23f9a2e251faa47ade70c1bd252220c8311"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
|
||||
|
||||
[[package]]
|
||||
name = "clap"
|
||||
version = "2.33.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bdfa80d47f954d53a35a64987ca1422f495b8d6483c0fe9f7117b36c2a792129"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"textwrap",
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cloudabi"
|
||||
version = "0.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crc32fast"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285"
|
||||
dependencies = [
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
"maybe-uninit",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
"lazy_static",
|
||||
"maybe-uninit",
|
||||
"memoffset",
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-queue"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c695eeca1e7173472a32221542ae469b3e9aac3a4fc81f7696bcad82029493db"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.7.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c3c7c73a2d1e9fc0886a08b93e98eb643461230d5f1925e4036204d5f2e261a8"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"cfg-if",
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crunchy"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
|
||||
|
||||
[[package]]
|
||||
name = "csv"
|
||||
version = "1.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "00affe7f6ab566df61b4be3ce8cf16bc2576bca0963ceb0955e45d514bf9a279"
|
||||
dependencies = [
|
||||
"bstr",
|
||||
"csv-core",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv-core"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3"
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36"
|
||||
dependencies = [
|
||||
"log",
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fs2"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fs_extra"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f2a4a2034423744d2cc7ca2068453168dcdb82c438419e639a26bd87839c674"
|
||||
|
||||
[[package]]
|
||||
name = "fst"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a7293de202dbfe786c0b3fe6110a027836c5438ed06db7b715c9955ff4bfea51"
|
||||
|
||||
[[package]]
|
||||
name = "fxhash"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.1.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "20564e78d53d2bb135c343b3f47714a56af2061f1c928fdb541dc7b9fdd94205"
|
||||
dependencies = [
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.1.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91780f809e750b0a89f5544be56617ff6b1227ee485bcb06ebe10cdf89bd3b71"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "0.4.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b8b7a7c0c47db5545ed3fef7468ee7bb5b74691498139e4b3f6a20685dc6dd8e"
|
||||
|
||||
[[package]]
|
||||
name = "jemalloc-sys"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"fs_extra",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jemallocator"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69"
|
||||
dependencies = [
|
||||
"jemalloc-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.70"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3baa92041a6fec78c687fa0cc2b3fae8884f743d672cf551bed1d6dac6988d0f"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c4da24a77a3d8a6d4862d95f72e6fdb9c09a643ecdb402d754004a557f2bec75"
|
||||
dependencies = [
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "maybe-uninit"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
|
||||
|
||||
[[package]]
|
||||
name = "mega-mini-indexer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bitpacking",
|
||||
"byteorder",
|
||||
"csv",
|
||||
"fst",
|
||||
"fxhash",
|
||||
"jemallocator",
|
||||
"quickcheck",
|
||||
"rayon",
|
||||
"sdset",
|
||||
"sled",
|
||||
"slice-group-by",
|
||||
"smallstr",
|
||||
"structopt",
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400"
|
||||
|
||||
[[package]]
|
||||
name = "memoffset"
|
||||
version = "0.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b4fc2c02a7e374099d4ee95a193111f72d2110197fe200272371758f6c3643d8"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_cpus"
|
||||
version = "1.13.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3a704eb390aafdc107b0e392f56a82b668e3a71366993b5340f5833fd62505e"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
"parking_lot_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot_core"
|
||||
version = "0.7.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d58c7c768d4ba344e3e8d72518ac13e259d7c7ade24167003b8488e10b6740a3"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cloudabi",
|
||||
"libc",
|
||||
"redox_syscall",
|
||||
"smallvec",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "237a5ed80e274dbc66f86bd59c1e25edc039660be53194b5fe0a482e0f2612ea"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "98e9e4b82e0ef281812565ea4751049f1bdcdfccda7d3f459f2e138a40c08678"
|
||||
dependencies = [
|
||||
"proc-macro-error-attr",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error-attr"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4f5444ead4e9935abd7f27dc51f7e852a0569ac888096d5ec2499470794e2e53"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"syn-mid",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1502d12e458c49a4c9cbff560d0fe0060c252bc29799ed94ca2ed4bb665a0101"
|
||||
dependencies = [
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quickcheck"
|
||||
version = "0.9.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a44883e74aa97ad63db83c4bf8ca490f02b2fc02f92575e720c8551e843c945f"
|
||||
dependencies = [
|
||||
"env_logger",
|
||||
"log",
|
||||
"rand",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "54a21852a652ad6f610c9510194f398ff6f8692e334fd1145fed931f7fbe44ea"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
"libc",
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
"rand_hc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
|
||||
dependencies = [
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db6ce3297f9c85e16621bb8cca38a06779ffc31bb8184e1be4bed2be4678a098"
|
||||
dependencies = [
|
||||
"crossbeam-deque",
|
||||
"either",
|
||||
"rayon-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08a89b46efaf957e52b18062fb2f4660f8b8a4dde1807ca002690868ef2c85a9"
|
||||
dependencies = [
|
||||
"crossbeam-deque",
|
||||
"crossbeam-queue",
|
||||
"crossbeam-utils",
|
||||
"lazy_static",
|
||||
"num_cpus",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.1.56"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84"
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.3.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a6020f034922e3194c711b82a627453881bc4682166cabb07134a10c26ba7692"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
"thread_local",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ae1ded71d66a4a97f5e961fd0cb25a5f366a42a41570d16a763a69c092c26ae4"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7fe5bd57d1d7414c6b5ed48563a2c855d995ff777729dcd91c369ec7fea395ae"
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed3d612bc64430efeb3f7ee6ef26d590dce0c43249217bddc62112540c7941e1"
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
||||
|
||||
[[package]]
|
||||
name = "sdset"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cbb21fe0588557792176c89bc7b943027b14f346d03c6be6a199c2860277d93a"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.110"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99e7b308464d16b56eba9964e4972a3eee817760ab60d88c3f86e1fecb08204c"
|
||||
|
||||
[[package]]
|
||||
name = "sled"
|
||||
version = "0.31.0"
|
||||
source = "git+https://github.com/spacejam/sled.git?rev=2fe05c9#2fe05c933a4a68d4dbbc06a16a3058236fcc6350"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
"fs2",
|
||||
"fxhash",
|
||||
"libc",
|
||||
"log",
|
||||
"parking_lot",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slice-group-by"
|
||||
version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f7474f0b646d228360ab62ed974744617bc869d959eac8403bfa3665931a7fb"
|
||||
|
||||
[[package]]
|
||||
name = "smallstr"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e922794d168678729ffc7e07182721a14219c65814e66e91b839a272fe5ae4f"
|
||||
dependencies = [
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c7cb5678e1615754284ec264d9bb5b4c27d2018577fd90ac0ceb578591ed5ee4"
|
||||
|
||||
[[package]]
|
||||
name = "structopt"
|
||||
version = "0.3.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "863246aaf5ddd0d6928dfeb1a9ca65f505599e4e1b399935ef7e75107516b4ef"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"lazy_static",
|
||||
"structopt-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "structopt-derive"
|
||||
version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d239ca4b13aee7a2142e6795cbd69e457665ff8037aed33b3effdc430d2f927a"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro-error",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "95b5f192649e48a5302a13f2feb224df883b98933222369e4b3b0fe2a5447269"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn-mid"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7be3539f6c128a931cf19dcee741c1af532c7fd387baa739c03dd2e96479338a"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "synstructure"
|
||||
version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "67656ea1dc1b41b1451851562ea232ec2e5a80242139f7e679ceccfb5d61f545"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"unicode-xid",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "textwrap"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thread_local"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-segmentation"
|
||||
version = "1.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e83e153d1053cbb5a118eeff7fd5be06ed99153f00dbcd8ae310c5fb2b22edc0"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5a972e5669d67ba988ce3dc826706fb0a8b01471c088cb0b6110b805cc36aed"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.9.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6"
|
||||
dependencies = [
|
||||
"winapi-i686-pc-windows-gnu",
|
||||
"winapi-x86_64-pc-windows-gnu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-i686-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6580539ad917b7c026220c4b3f2c08d52ce54d6ce0dc491e66002e35388fab46"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"zerocopy-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy-derive"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d498dbd1fd7beb83c86709ae1c33ca50942889473473d287d56ce4770a18edfb"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"syn",
|
||||
"synstructure",
|
||||
]
|
27
Cargo.toml
Normal file
27
Cargo.toml
Normal file
@ -0,0 +1,27 @@
|
||||
[package]
|
||||
name = "mega-mini-indexer"
|
||||
version = "0.1.0"
|
||||
authors = ["Kerollmops <clement@meilisearch.com>"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.28"
|
||||
bitpacking = "0.8.2"
|
||||
byteorder = "1.3.4"
|
||||
csv = "1.1.3"
|
||||
fst = "0.4.3"
|
||||
fxhash = "0.2.1"
|
||||
jemallocator = "0.3.2"
|
||||
rayon = "1.3.0"
|
||||
sdset = "0.4.0"
|
||||
sled = { git = "https://github.com/spacejam/sled.git", rev = "2fe05c9"}
|
||||
slice-group-by = "0.2.6"
|
||||
smallstr = "0.2.0"
|
||||
structopt = { version = "0.3.14", default-features = false }
|
||||
zerocopy = "0.3.0"
|
||||
|
||||
[dev-dependencies]
|
||||
quickcheck = "0.9.2"
|
||||
|
||||
[profile.release]
|
||||
debug = true
|
11
qc_loop.sh
Executable file
11
qc_loop.sh
Executable file
@ -0,0 +1,11 @@
|
||||
#!/usr/bin/env bash

# Runs the quickcheck tests (those whose name contains `qc_`) in release mode
# over and over, and stops at the first failing run, exiting with the status
# that `cargo test` reported.

export RUST_BACKTRACE=1

while true
do
    cargo test qc_ --release -- --nocapture
    # `$?` is overwritten by every command, including the `[[ ... ]]` test
    # below, so cargo's status must be captured immediately. Reading `$?`
    # again inside the `if` body would yield 0 and hide the failure.
    status=$?
    if [[ $status -ne 0 ]] ; then
        exit "$status"
    fi
done
|
197
src/bp_vec.rs
Normal file
197
src/bp_vec.rs
Normal file
@ -0,0 +1,197 @@
|
||||
use byteorder::{ByteOrder, NativeEndian};
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
|
||||
/// An append-only, bitpacked `u32` vector that does not preserve the order of insertion.
///
/// Values are buffered in `uncompressed` until a full block is available, at which
/// point the block is sorted, packed and appended to `compressed`.
#[derive(Debug, Clone, Default)]
pub struct BpVec {
    /// Packed blocks, stored back to back. Each block is a 4-byte initial value,
    /// a 1-byte bit width and then the packed payload.
    compressed: Vec<u8>,
    /// Values that have been pushed but do not yet fill a whole block.
    uncompressed: Vec<u32>,
}
|
||||
|
||||
impl BpVec {
|
||||
pub fn new() -> BpVec {
|
||||
BpVec::default()
|
||||
}
|
||||
|
||||
pub fn push(&mut self, elem: u32) {
|
||||
self.uncompressed.push(elem);
|
||||
if self.uncompressed.len() == BitPacker4x::BLOCK_LEN {
|
||||
encode(&mut self.uncompressed[..], &mut self.compressed);
|
||||
self.uncompressed.clear();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn extend_from_slice(&mut self, elems: &[u32]) {
|
||||
self.uncompressed.extend_from_slice(elems);
|
||||
let remaining = self.uncompressed.len() % BitPacker4x::BLOCK_LEN;
|
||||
for chunk in self.uncompressed[remaining..].chunks_exact_mut(BitPacker4x::BLOCK_LEN) {
|
||||
encode(chunk, &mut self.compressed);
|
||||
}
|
||||
self.uncompressed.truncate(remaining);
|
||||
self.uncompressed.shrink_to_fit();
|
||||
}
|
||||
|
||||
pub fn to_vec(self) -> Vec<u32> {
|
||||
let BpVec { compressed, mut uncompressed } = self;
|
||||
decode(&compressed, &mut uncompressed);
|
||||
uncompressed
|
||||
}
|
||||
|
||||
pub fn capacity(&self) -> usize {
|
||||
self.compressed.capacity() + self.uncompressed.capacity()
|
||||
}
|
||||
}
|
||||
|
||||
fn encode(items: &mut [u32], encoded: &mut Vec<u8>) {
|
||||
assert_eq!(items.len(), BitPacker4x::BLOCK_LEN);
|
||||
|
||||
let bitpacker = BitPacker4x::new();
|
||||
|
||||
// We reserve enough space in the output buffer, filled with zeroes.
|
||||
let len = encoded.len();
|
||||
// initial_value + num_bits + encoded numbers
|
||||
let max_possible_length = 4 + 1 + 4 * BitPacker4x::BLOCK_LEN;
|
||||
encoded.resize(len + max_possible_length, 0);
|
||||
|
||||
// We sort the items to be able to efficiently bitpack them.
|
||||
items.sort_unstable();
|
||||
// We save the initial value to us for this block, the lowest one.
|
||||
let initial_value = items[0];
|
||||
// We compute the number of bits necessary to encode this block
|
||||
let num_bits = bitpacker.num_bits_sorted(initial_value, items);
|
||||
|
||||
// We write the initial value for this block.
|
||||
let buffer = &mut encoded[len..];
|
||||
NativeEndian::write_u32(buffer, initial_value);
|
||||
// We write the num_bits that will be read to decode this block
|
||||
let buffer = &mut buffer[4..];
|
||||
buffer[0] = num_bits;
|
||||
// We encode the block numbers into the buffer using the num_bits
|
||||
let buffer = &mut buffer[1..];
|
||||
let compressed_len = bitpacker.compress_sorted(initial_value, items, buffer, num_bits);
|
||||
|
||||
// We truncate the buffer to the avoid leaking padding zeroes
|
||||
encoded.truncate(len + 4 + 1 + compressed_len);
|
||||
}
|
||||
|
||||
fn decode(mut encoded: &[u8], decoded: &mut Vec<u32>) {
|
||||
let bitpacker = BitPacker4x::new();
|
||||
|
||||
// initial_value + num_bits
|
||||
while let Some(header) = encoded.get(0..4 + 1) {
|
||||
// We extract the header informations
|
||||
let initial_value = NativeEndian::read_u32(header);
|
||||
let num_bits = header[4];
|
||||
let bytes = &encoded[4 + 1..];
|
||||
|
||||
// If the num_bits is equal to zero it means that all encoded numbers were zeroes
|
||||
if num_bits == 0 {
|
||||
decoded.resize(decoded.len() + BitPacker4x::BLOCK_LEN, initial_value);
|
||||
encoded = bytes;
|
||||
continue;
|
||||
}
|
||||
|
||||
// We guess the block size based on the num_bits used for this block
|
||||
let block_size = BitPacker4x::compressed_block_size(num_bits);
|
||||
|
||||
// We pad the decoded vector with zeroes
|
||||
let new_len = decoded.len() + BitPacker4x::BLOCK_LEN;
|
||||
decoded.resize(new_len, 0);
|
||||
|
||||
// Create a view into the decoded buffer and decode into it
|
||||
let to_decompress = &mut decoded[new_len - BitPacker4x::BLOCK_LEN..new_len];
|
||||
bitpacker.decompress_sorted(initial_value, &bytes[..block_size], to_decompress, num_bits);
|
||||
|
||||
// Advance the bytes offset to read the next block (+ num_bits)
|
||||
encoded = &bytes[block_size..];
|
||||
}
|
||||
}
|
||||
|
||||
impl sdset::Collection<u32> for BpVec {
|
||||
fn push(&mut self, elem: u32) {
|
||||
BpVec::push(self, elem);
|
||||
}
|
||||
|
||||
fn extend_from_slice(&mut self, elems: &[u32]) {
|
||||
BpVec::extend_from_slice(self, elems);
|
||||
}
|
||||
|
||||
fn extend<I>(&mut self, elems: I) where I: IntoIterator<Item=u32> {
|
||||
elems.into_iter().for_each(|x| BpVec::push(self, x));
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Number of values fed to the vector in the bulk tests.
    const SAMPLE_LEN: usize = 1300;

    /// Repeats `xs` until `SAMPLE_LEN` values are produced (none when `xs` is empty).
    fn cycled(xs: &[u32]) -> Vec<u32> {
        xs.iter().copied().cycle().take(SAMPLE_LEN).collect()
    }

    /// Returns `values` sorted, since a `BpVec` does not keep insertion order.
    fn sorted(mut values: Vec<u32>) -> Vec<u32> {
        values.sort_unstable();
        values
    }

    quickcheck! {
        // Pushing values one by one must give back the same multiset.
        fn qc_push(xs: Vec<u32>) -> bool {
            let input = cycled(&xs);

            let mut bpvec = BpVec::new();
            for value in &input {
                bpvec.push(*value);
            }
            let output = bpvec.to_vec();

            sorted(input) == sorted(output)
        }

        // Extending from a slice must give back the same multiset.
        fn qc_extend_from_slice(xs: Vec<u32>) -> bool {
            let input = cycled(&xs);

            let mut bpvec = BpVec::new();
            bpvec.extend_from_slice(&input);
            let output = bpvec.to_vec();

            sorted(input) == sorted(output)
        }
    }

    #[test]
    fn empty() {
        let mut bpvec = BpVec::new();
        bpvec.extend_from_slice(&[]);

        assert!(bpvec.to_vec().is_empty());
    }

    #[test]
    fn one_zero() {
        let mut bpvec = BpVec::new();
        bpvec.extend_from_slice(&[0]);

        let result = bpvec.to_vec();
        assert_eq!(&[0], &*result);
    }

    #[test]
    fn many_zeros() {
        let xs = vec![0u32; SAMPLE_LEN];

        let mut bpvec = BpVec::new();
        bpvec.extend_from_slice(&xs);

        assert_eq!(xs, bpvec.to_vec());
    }

    #[test]
    fn many_ones() {
        let xs = vec![1u32; SAMPLE_LEN];

        let mut bpvec = BpVec::new();
        bpvec.extend_from_slice(&xs);

        assert_eq!(xs, bpvec.to_vec());
    }
}
|
84
src/codec/bitpacker_sorted.rs
Normal file
84
src/codec/bitpacker_sorted.rs
Normal file
@ -0,0 +1,84 @@
|
||||
use bitpacking::{BitPacker, BitPacker4x};
|
||||
use byteorder::{ReadBytesExt, NativeEndian};
|
||||
use zerocopy::AsBytes;
|
||||
|
||||
pub struct CodecBitPacker4xSorted;
|
||||
|
||||
impl CodecBitPacker4xSorted {
|
||||
pub fn bytes_encode(item: &[u32]) -> Option<Vec<u8>> {
|
||||
// This is a hotfix to the SIGSEGV
|
||||
// https://github.com/tantivy-search/bitpacking/issues/23
|
||||
if item.is_empty() {
|
||||
return Some(Vec::default())
|
||||
}
|
||||
|
||||
let bitpacker = BitPacker4x::new();
|
||||
let mut compressed = Vec::new();
|
||||
let mut initial_value = 0;
|
||||
|
||||
// The number of remaining numbers that don't fit in the block size.
|
||||
compressed.push((item.len() % BitPacker4x::BLOCK_LEN) as u8);
|
||||
|
||||
// we cannot use a mut slice here because of #68630, TooGeneric error.
|
||||
// we can probably avoid this new allocation by directly using the compressed final Vec.
|
||||
let mut buffer = vec![0u8; 4 * BitPacker4x::BLOCK_LEN];
|
||||
|
||||
for chunk in item.chunks(BitPacker4x::BLOCK_LEN) {
|
||||
if chunk.len() == BitPacker4x::BLOCK_LEN {
|
||||
// compute the number of bits necessary to encode this block
|
||||
let num_bits = bitpacker.num_bits_sorted(initial_value, chunk);
|
||||
// Encode the block numbers into the buffer using the num_bits
|
||||
let compressed_len = bitpacker.compress_sorted(initial_value, chunk, &mut buffer, num_bits);
|
||||
// Write the num_bits that will be read to decode this block
|
||||
compressed.push(num_bits);
|
||||
// Wrtie the bytes of the compressed block numbers
|
||||
compressed.extend_from_slice(&buffer[..compressed_len]);
|
||||
// Save the initial_value, which is the last value of the n-1 used for the n block
|
||||
initial_value = *chunk.last().unwrap();
|
||||
} else {
|
||||
// Save the remaining numbers which don't fit inside of a BLOCK_LEN
|
||||
compressed.extend_from_slice(chunk.as_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
Some(compressed)
|
||||
}
|
||||
|
||||
pub fn bytes_decode(bytes: &[u8]) -> Option<Vec<u32>> {
|
||||
if bytes.is_empty() {
|
||||
return Some(Vec::new())
|
||||
}
|
||||
|
||||
let bitpacker = BitPacker4x::new();
|
||||
let (remaining, bytes) = bytes.split_first().unwrap();
|
||||
let remaining = *remaining as usize;
|
||||
|
||||
let (mut bytes, mut remaining_bytes) = bytes.split_at(bytes.len() - remaining * 4);
|
||||
let mut decompressed = Vec::new();
|
||||
let mut initial_value = 0;
|
||||
|
||||
while let Some(num_bits) = bytes.get(0) {
|
||||
let block_size = BitPacker4x::compressed_block_size(*num_bits);
|
||||
|
||||
let new_len = decompressed.len() + BitPacker4x::BLOCK_LEN;
|
||||
decompressed.resize(new_len, 0);
|
||||
|
||||
// Create a view into the decompressed buffer and decomress into it
|
||||
let to_decompress = &mut decompressed[new_len - BitPacker4x::BLOCK_LEN..new_len];
|
||||
bitpacker.decompress_sorted(initial_value, &bytes[1..block_size + 1], to_decompress, *num_bits);
|
||||
|
||||
// Set the new initial_value for the next block
|
||||
initial_value = *decompressed.last().unwrap();
|
||||
// Advance the bytes offset to read the next block (+ num_bits)
|
||||
bytes = &bytes[block_size + 1..];
|
||||
}
|
||||
|
||||
// We add the remaining uncompressed numbers.
|
||||
let new_len = decompressed.len() + remaining;
|
||||
decompressed.resize(new_len, 0);
|
||||
let to_decompress = &mut decompressed[new_len - remaining..new_len];
|
||||
remaining_bytes.read_u32_into::<NativeEndian>(to_decompress).ok()?;
|
||||
|
||||
Some(decompressed)
|
||||
}
|
||||
}
|
3
src/codec/mod.rs
Normal file
3
src/codec/mod.rs
Normal file
@ -0,0 +1,3 @@
|
||||
mod bitpacker_sorted;
|
||||
|
||||
pub use self::bitpacker_sorted::CodecBitPacker4xSorted;
|
186
src/main.rs
Normal file
186
src/main.rs
Normal file
@ -0,0 +1,186 @@
|
||||
#[cfg(test)]
|
||||
#[macro_use] extern crate quickcheck;
|
||||
|
||||
mod codec;
|
||||
mod bp_vec;
|
||||
|
||||
use std::collections::{HashMap, BTreeSet};
|
||||
use std::convert::TryFrom;
|
||||
use std::fs::File;
|
||||
use std::hash::BuildHasherDefault;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::{ensure, Context};
|
||||
use fst::IntoStreamer;
|
||||
use fxhash::FxHasher32;
|
||||
use rayon::prelude::*;
|
||||
use sdset::{SetOperation, SetBuf};
|
||||
use slice_group_by::StrGroupBy;
|
||||
use structopt::StructOpt;
|
||||
|
||||
use self::codec::CodecBitPacker4xSorted;
|
||||
use self::bp_vec::BpVec;
|
||||
|
||||
/// A `HashMap` whose hasher is built from `FxHasher32` through `BuildHasherDefault`.
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
|
||||
/// A `smallstr::SmallString` backed by a `[u8; 32]` array
/// (presumably keeps strings of up to 32 bytes inline; confirm in the smallstr docs).
pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
|
||||
|
||||
// Register jemalloc as the global allocator, only when compiling for Linux.
#[cfg(target_os = "linux")]
|
||||
#[global_allocator]
|
||||
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
|
||||
|
||||
// Command-line options of the indexer, parsed by StructOpt (`Opt::from_args()` in `main`).
// NOTE: the `///` comments on the fields and the `about` string below are picked up by
// StructOpt as help text, so they are part of the program output.
#[derive(Debug, StructOpt)]
|
||||
#[structopt(name = "mm-indexer", about = "The server side of the daugt project.")]
|
||||
struct Opt {
|
||||
// Given on the command line as `--db <path>`; `main` passes it to `sled::open`.
/// The database path where the database is located.
|
||||
/// It is created if it doesn't already exist.
|
||||
#[structopt(long = "db", parse(from_os_str))]
|
||||
database: PathBuf,
|
||||
|
||||
// Positional arguments; `main` opens each path as a CSV file and indexes them
// through a rayon parallel iterator.
/// Files to index in parallel.
|
||||
files_to_index: Vec<PathBuf>,
|
||||
}
|
||||
|
||||
fn union_bitpacked_postings_ids(_key: &[u8], old_value: Option<&[u8]>, new_value: &[u8]) -> Option<Vec<u8>> {
|
||||
if old_value.is_none() {
|
||||
return Some(new_value.to_vec())
|
||||
}
|
||||
|
||||
let old_value = old_value.unwrap_or_default();
|
||||
let old_value = CodecBitPacker4xSorted::bytes_decode(&old_value).unwrap();
|
||||
let new_value = CodecBitPacker4xSorted::bytes_decode(&new_value).unwrap();
|
||||
|
||||
let old_set = SetBuf::new(old_value).unwrap();
|
||||
let new_set = SetBuf::new(new_value).unwrap();
|
||||
|
||||
let result = sdset::duo::Union::new(&old_set, &new_set).into_set_buf();
|
||||
let compressed = CodecBitPacker4xSorted::bytes_encode(&result).unwrap();
|
||||
|
||||
Some(compressed)
|
||||
}
|
||||
|
||||
fn union_words_fst(key: &[u8], old_value: Option<&[u8]>, new_value: &[u8]) -> Option<Vec<u8>> {
|
||||
if key != b"words-fst" { unimplemented!() }
|
||||
|
||||
let old_value = match old_value {
|
||||
Some(old_value) => old_value,
|
||||
None => return Some(new_value.to_vec()),
|
||||
};
|
||||
|
||||
eprintln!("old_words size: {}", old_value.len());
|
||||
eprintln!("new_words size: {}", new_value.len());
|
||||
|
||||
let old_words = fst::Set::new(old_value).unwrap();
|
||||
let new_words = fst::Set::new(new_value).unwrap();
|
||||
|
||||
// Do an union of the old and the new set of words.
|
||||
let op = old_words.op().add(new_words.into_stream()).r#union();
|
||||
let mut build = fst::SetBuilder::memory();
|
||||
build.extend_stream(op.into_stream()).unwrap();
|
||||
|
||||
Some(build.into_inner().unwrap())
|
||||
}
|
||||
|
||||
fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
|
||||
let is_alphanumeric = |s: &&str| s.chars().next().map_or(false, char::is_alphanumeric);
|
||||
string.linear_group_by_key(|c| c.is_alphanumeric()).filter(is_alphanumeric)
|
||||
}
|
||||
|
||||
fn index_csv(tid: usize, db: sled::Db, mut rdr: csv::Reader<File>) -> anyhow::Result<usize> {
|
||||
const MAX_POSITION: usize = 1000;
|
||||
const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
|
||||
|
||||
let main = &*db;
|
||||
let postings_ids = db.open_tree("postings-ids")?;
|
||||
let documents = db.open_tree("documents")?;
|
||||
|
||||
let mut document = csv::StringRecord::new();
|
||||
let mut new_postings_ids = FastMap4::default();
|
||||
let mut new_words = BTreeSet::default();
|
||||
let mut number_of_documents = 0;
|
||||
|
||||
// Write the headers into a Vec of bytes.
|
||||
let headers = rdr.headers()?;
|
||||
let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
|
||||
writer.write_byte_record(headers.as_byte_record())?;
|
||||
let headers = writer.into_inner()?;
|
||||
|
||||
if let Some(old_headers) = main.insert("headers", headers.as_slice())? {
|
||||
ensure!(old_headers == headers, "headers differs from the previous ones");
|
||||
}
|
||||
|
||||
while rdr.read_record(&mut document)? {
|
||||
let document_id = db.generate_id()?;
|
||||
let document_id = u32::try_from(document_id).context("Generated id is too big")?;
|
||||
|
||||
for (_attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
|
||||
for (_pos, word) in alphanumeric_tokens(&content).enumerate().take(MAX_POSITION) {
|
||||
new_postings_ids.entry(SmallString32::from(word)).or_insert_with(BpVec::new).push(document_id);
|
||||
}
|
||||
}
|
||||
|
||||
// We write the document in the database.
|
||||
let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());
|
||||
writer.write_byte_record(document.as_byte_record())?;
|
||||
let document = writer.into_inner()?;
|
||||
documents.insert(document_id.to_be_bytes(), document)?;
|
||||
|
||||
number_of_documents += 1;
|
||||
if number_of_documents % 100000 == 0 {
|
||||
let postings_ids_size = new_postings_ids.iter().map(|(_, v)| v.capacity() * 4).sum::<usize>();
|
||||
eprintln!("{}, documents seen {}, postings size {}",
|
||||
tid, number_of_documents, postings_ids_size);
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!("Start collecting the postings lists and words");
|
||||
|
||||
// We compute and store the postings list into the DB.
|
||||
for (word, new_ids) in new_postings_ids {
|
||||
let new_ids = SetBuf::from_dirty(new_ids.to_vec());
|
||||
let compressed = CodecBitPacker4xSorted::bytes_encode(&new_ids)
|
||||
.context("error while compressing using CodecBitPacker4xSorted")?;
|
||||
|
||||
postings_ids.merge(word.as_bytes(), compressed)?;
|
||||
|
||||
new_words.insert(word);
|
||||
}
|
||||
|
||||
eprintln!("Finished collecting the postings lists and words");
|
||||
|
||||
eprintln!("Start merging the words-fst");
|
||||
|
||||
let new_words_fst = fst::Set::from_iter(new_words.iter().map(|s| s.as_str()))?;
|
||||
drop(new_words);
|
||||
main.merge("words-fst", new_words_fst.as_fst().as_bytes())?;
|
||||
|
||||
eprintln!("Finished merging the words-fst");
|
||||
|
||||
Ok(number_of_documents)
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let opt = Opt::from_args();
|
||||
|
||||
let db = sled::open(opt.database)?;
|
||||
let main = &*db;
|
||||
|
||||
// Setup the merge operators
|
||||
main.set_merge_operator(union_words_fst);
|
||||
let postings_ids = db.open_tree("postings-ids")?;
|
||||
postings_ids.set_merge_operator(union_bitpacked_postings_ids);
|
||||
// ...
|
||||
let _documents = db.open_tree("documents")?;
|
||||
|
||||
let res = opt.files_to_index
|
||||
.into_par_iter()
|
||||
.enumerate()
|
||||
.map(|(tid, path)| {
|
||||
let rdr = csv::Reader::from_path(path)?;
|
||||
index_csv(tid, db.clone(), rdr)
|
||||
})
|
||||
.try_reduce(|| 0, |a, b| Ok(a + b));
|
||||
|
||||
println!("{:?}", res);
|
||||
|
||||
Ok(())
|
||||
}
|
Loading…
Reference in New Issue
Block a user