feat: Introduce sst file dumping

Fixes #9
This commit is contained in:
Kerollmops 2018-08-19 13:40:07 +02:00 committed by Clément Renault
parent e5c54c4399
commit 2e0193a39e
7 changed files with 209 additions and 143 deletions

198
Cargo.lock generated
View File

@ -3,16 +3,21 @@ name = "base64"
version = "0.5.2" version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "bitflags"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "blob" name = "blob"
version = "0.2.0" version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"base64 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", "base64 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.71 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
@ -22,29 +27,29 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "byteorder" name = "byteorder"
version = "1.2.3" version = "1.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "bzip2-sys" name = "bzip2-sys"
version = "0.1.6" version = "0.1.6"
source = "git+https://github.com/alexcrichton/bzip2-rs.git#0ae38c2ccfea01625ae256e4fd483a15eb7ad62c" source = "git+https://github.com/alexcrichton/bzip2-rs.git#54aef43502f91de8fa3e205c1ec276054444cc34"
dependencies = [ dependencies = [
"cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.0.17" version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "cmake" name = "cmake"
version = "0.1.31" version = "0.1.33"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
@ -55,11 +60,6 @@ dependencies = [
"build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", "build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "dtoa"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "elapsed" name = "elapsed"
version = "0.1.2" version = "0.1.2"
@ -68,12 +68,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "fst" name = "fst"
version = "0.3.0" version = "0.3.0"
source = "git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state#6e0ab4e4ee5443cc55079996bf9f703086322c33" source = "git+https://github.com/Kerollmops/fst.git?branch=always-match-clone#56eb2221d1534883d4e10887d945a982b780fccd"
dependencies = [ dependencies = [
"byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "fuchsia-zircon"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bitflags 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
"fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "fuchsia-zircon-sys"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "gcc" name = "gcc"
version = "0.3.54" version = "0.3.54"
@ -97,25 +111,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "levenshtein_automata" name = "levenshtein_automata"
version = "0.1.1" version = "0.1.1"
source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#9d01a14e57ded8e7a9a8d2b4e790f7b364e710b4" source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#ed1244d1731b0f81e880f0c9daa860970d7752c3"
dependencies = [ dependencies = [
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)",
] ]
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.42" version = "0.2.43"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "librocksdb_sys" name = "librocksdb_sys"
version = "0.1.0" version = "0.1.0"
source = "git+https://github.com/pingcap/rust-rocksdb.git#9a1c83c5382fbaee8a5102213c711bbe52d71470" source = "git+https://github.com/pingcap/rust-rocksdb.git#d2fe0a96d18d8ba33da14e0feb49c57529ae28b6"
dependencies = [ dependencies = [
"bzip2-sys 0.1.6 (git+https://github.com/alexcrichton/bzip2-rs.git)", "bzip2-sys 0.1.6 (git+https://github.com/alexcrichton/bzip2-rs.git)",
"cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)",
"cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)", "cmake 0.1.33 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)",
"libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)", "libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)",
"lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)", "lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)",
"snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)", "snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)",
@ -127,10 +141,10 @@ name = "libz-sys"
version = "1.0.18" version = "1.0.18"
source = "git+https://github.com/busyjay/libz-sys.git?branch=static-link#bb77b618ffc5ca41efd7a89d282d96e35e79dae4" source = "git+https://github.com/busyjay/libz-sys.git?branch=static-link#bb77b618ffc5ca41efd7a89d282d96e35e79dae4"
dependencies = [ dependencies = [
"cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)",
"pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)", "pkg-config 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)",
"vcpkg 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)", "vcpkg 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
@ -138,8 +152,8 @@ name = "lz4-sys"
version = "1.8.0" version = "1.8.0"
source = "git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build#41509fea212e9ca55c1f6c53d4fd1ddf28cdf689" source = "git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build#41509fea212e9ca55c1f6c53d4fd1ddf28cdf689"
dependencies = [ dependencies = [
"cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", "cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
@ -147,18 +161,26 @@ name = "memmap"
version = "0.6.2" version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]]
name = "moby-name-gen"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rand 0.3.22 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]] [[package]]
name = "pkg-config" name = "pkg-config"
version = "0.3.11" version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "0.4.6" version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
@ -166,18 +188,38 @@ dependencies = [
[[package]] [[package]]
name = "quote" name = "quote"
version = "0.6.3" version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "proc-macro2 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand"
version = "0.3.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)",
"rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
name = "raptor" name = "raptor"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)",
"group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)", "group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)",
"levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)", "levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)",
"rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)", "rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)",
@ -187,11 +229,12 @@ dependencies = [
name = "raptor-indexer" name = "raptor-indexer"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"moby-name-gen 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"raptor 0.1.0", "raptor 0.1.0",
"rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)", "rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)",
"serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.71 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.71 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.22 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.26 (registry+https://github.com/rust-lang/crates.io-index)",
"unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
@ -200,7 +243,7 @@ name = "raptor-search"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", "elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)",
"raptor 0.1.0", "raptor 0.1.0",
"rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)", "rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)",
] ]
@ -208,36 +251,41 @@ dependencies = [
[[package]] [[package]]
name = "rocksdb" name = "rocksdb"
version = "0.3.0" version = "0.3.0"
source = "git+https://github.com/pingcap/rust-rocksdb.git#9a1c83c5382fbaee8a5102213c711bbe52d71470" source = "git+https://github.com/pingcap/rust-rocksdb.git#d2fe0a96d18d8ba33da14e0feb49c57529ae28b6"
dependencies = [ dependencies = [
"crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)", "crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)",
"librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)", "librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)",
] ]
[[package]]
name = "ryu"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "serde" name = "serde"
version = "1.0.70" version = "1.0.71"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "serde_derive" name = "serde_derive"
version = "1.0.70" version = "1.0.71"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "proc-macro2 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", "quote 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 0.14.4 (registry+https://github.com/rust-lang/crates.io-index)", "syn 0.14.8 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
name = "serde_json" name = "serde_json"
version = "1.0.22" version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"dtoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)",
"itoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", "itoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", "ryu 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.71 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
@ -245,18 +293,18 @@ name = "snappy-sys"
version = "0.1.0" version = "0.1.0"
source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#be02178330bb17648d6ac605af249eba18b32b71" source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#be02178330bb17648d6ac605af249eba18b32b71"
dependencies = [ dependencies = [
"cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)", "cmake 0.1.33 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)",
"pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)", "pkg-config 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
name = "syn" name = "syn"
version = "0.14.4" version = "0.14.8"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", "proc-macro2 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", "quote 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
@ -272,7 +320,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "vcpkg" name = "vcpkg"
version = "0.2.4" version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
@ -302,43 +350,49 @@ dependencies = [
"blob 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "blob 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)", "gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)",
"glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", "glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[metadata] [metadata]
"checksum base64 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "30e93c03064e7590d0466209155251b90c22e37fab1daf2771582598b5827557" "checksum base64 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "30e93c03064e7590d0466209155251b90c22e37fab1daf2771582598b5827557"
"checksum bitflags 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "d0c54bb8f454c567f21197eefcdbf5679d0bd99f2ddbe52e84c77061952e6789"
"checksum blob 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "122c3fa3949d822d2a51c648db9e8105d6e75b89dc628cc366901d3d396fa4f4" "checksum blob 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "122c3fa3949d822d2a51c648db9e8105d6e75b89dc628cc366901d3d396fa4f4"
"checksum build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39092a32794787acd8525ee150305ff051b0aa6cc2abaf193924f5ab05425f39" "checksum build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39092a32794787acd8525ee150305ff051b0aa6cc2abaf193924f5ab05425f39"
"checksum byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "74c0b906e9446b0a2e4f760cdb3fa4b2c48cdc6db8766a845c54b6ff063fd2e9" "checksum byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8389c509ec62b9fe8eca58c502a0acaf017737355615243496cde4994f8fa4f9"
"checksum bzip2-sys 0.1.6 (git+https://github.com/alexcrichton/bzip2-rs.git)" = "<none>" "checksum bzip2-sys 0.1.6 (git+https://github.com/alexcrichton/bzip2-rs.git)" = "<none>"
"checksum cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)" = "49ec142f5768efb5b7622aebc3fdbdbb8950a4b9ba996393cb76ef7466e8747d" "checksum cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)" = "2119ea4867bd2b8ed3aecab467709720b2d55b1bcfe09f772fd68066eaf15275"
"checksum cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)" = "95470235c31c726d72bf2e1f421adc1e65b9d561bf5529612cbe1a72da1467b3" "checksum cmake 0.1.33 (registry+https://github.com/rust-lang/crates.io-index)" = "704fbf3bb5149daab0afb255dbea24a1f08d2f4099cedb9baab6d470d4c5eefb"
"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb" "checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb"
"checksum dtoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6d301140eb411af13d3115f9a562c85cc6b541ade9dfa314132244aaee7489dd"
"checksum elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6f4e5af126dafd0741c2ad62d47f68b28602550102e5f0dd45c8a97fc8b49c29" "checksum elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6f4e5af126dafd0741c2ad62d47f68b28602550102e5f0dd45c8a97fc8b49c29"
"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)" = "<none>" "checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)" = "<none>"
"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
"checksum gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5e33ec290da0d127825013597dbdfc28bee4964690c7ce1166cbc2a7bd08b1bb" "checksum gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5e33ec290da0d127825013597dbdfc28bee4964690c7ce1166cbc2a7bd08b1bb"
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" "checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
"checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "<none>" "checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "<none>"
"checksum itoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5adb58558dcd1d786b5f0bd15f3226ee23486e24b7b58304b60f64dc68e62606" "checksum itoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5adb58558dcd1d786b5f0bd15f3226ee23486e24b7b58304b60f64dc68e62606"
"checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>" "checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>"
"checksum libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b685088df2b950fccadf07a7187c8ef846a959c142338a48f9dc0b94517eb5f1" "checksum libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)" = "76e3a3ef172f1a0b9a9ff0dd1491ae5e6c948b94479a3021819ba7d860c8645d"
"checksum librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "<none>" "checksum librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "<none>"
"checksum libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)" = "<none>" "checksum libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)" = "<none>"
"checksum lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)" = "<none>" "checksum lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)" = "<none>"
"checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff" "checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff"
"checksum pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)" = "110d5ee3593dbb73f56294327fe5668bcc997897097cbc76b51e7aed3f52452f" "checksum moby-name-gen 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d41abd7ae342e42e3a52953738f89eabde3ece58a4b9206384966976955bf7a2"
"checksum proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "effdb53b25cdad54f8f48843d67398f7ef2e14f12c1b4cb4effc549a6462a4d6" "checksum pkg-config 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)" = "104630aa1c83213cbc76db0703630fcb0421dac3585063be4ce9a8a2feeaa745"
"checksum quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e44651a0dc4cdd99f71c83b561e221f714912d11af1a4dff0631f923d53af035" "checksum proc-macro2 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)" = "ee5697238f0d893c7f0ecc59c0999f18d2af85e424de441178bcacc9f9e6cf67"
"checksum quote 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ed7d650913520df631972f21e104a4fa2f9c82a14afc65d17b388a2e29731e7c"
"checksum rand 0.3.22 (registry+https://github.com/rust-lang/crates.io-index)" = "15a732abf9d20f0ad8eeb6f909bf6868722d9a06e1e50802b6a70351f40b4eb1"
"checksum rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd"
"checksum rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "<none>" "checksum rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "<none>"
"checksum serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)" = "0c3adf19c07af6d186d91dae8927b83b0553d07ca56cbf7f2f32560455c91920" "checksum ryu 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0568787116e13c652377b6846f5931454a363a8fdf8ae50463ee40935b278b"
"checksum serde_derive 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)" = "3525a779832b08693031b8ecfb0de81cd71cfd3812088fafe9a7496789572124" "checksum serde 1.0.71 (registry+https://github.com/rust-lang/crates.io-index)" = "6dfad05c8854584e5f72fb859385ecdfa03af69c3fd0572f0da2d4c95f060bdb"
"checksum serde_json 1.0.22 (registry+https://github.com/rust-lang/crates.io-index)" = "84b8035cabe9b35878adec8ac5fe03d5f6bc97ff6edd7ccb96b44c1276ba390e" "checksum serde_derive 1.0.71 (registry+https://github.com/rust-lang/crates.io-index)" = "b719c6d5e9f73fbc37892246d5852333f040caa617b8873c6aced84bcb28e7bb"
"checksum serde_json 1.0.26 (registry+https://github.com/rust-lang/crates.io-index)" = "44dd2cfde475037451fa99b7e5df77aa3cfd1536575fa8e7a538ab36dcde49ae"
"checksum snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)" = "<none>" "checksum snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)" = "<none>"
"checksum syn 0.14.4 (registry+https://github.com/rust-lang/crates.io-index)" = "2beff8ebc3658f07512a413866875adddd20f4fd47b2a4e6c9da65cd281baaea" "checksum syn 0.14.8 (registry+https://github.com/rust-lang/crates.io-index)" = "b7bfcbb0c068d0f642a0ffbd5c604965a360a61f99e8add013cef23a838614f3"
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
"checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc" "checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc"
"checksum vcpkg 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "cbe533e138811704c0e3cbde65a818b35d3240409b4346256c5ede403e082474" "checksum vcpkg 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "a51475940ea5ed2f7ba8e7b867c42d6cb7f06fafb9c1673ed8e768c675c771cc"
"checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd" "checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd"
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"

View File

@ -9,6 +9,7 @@ serde = "1.0"
serde_derive = "1.0" serde_derive = "1.0"
serde_json = "1.0" serde_json = "1.0"
unidecode = "0.3" unidecode = "0.3"
moby-name-gen = "0.1"
[dependencies.rocksdb] [dependencies.rocksdb]
git = "https://github.com/pingcap/rust-rocksdb.git" git = "https://github.com/pingcap/rust-rocksdb.git"

View File

@ -6,15 +6,16 @@ extern crate rocksdb;
extern crate serde_json; extern crate serde_json;
#[macro_use] extern crate serde_derive; #[macro_use] extern crate serde_derive;
extern crate unidecode; extern crate unidecode;
extern crate moby_name_gen;
use std::path::Path; use std::path::Path;
use std::collections::HashSet; use std::collections::{HashSet, BTreeMap};
use std::fs::{self, File}; use std::fs::{self, File};
use std::io::{self, BufReader, BufRead}; use std::io::{self, BufReader, BufRead};
use std::iter; use std::iter;
use raptor::{MetadataBuilder, Metadata, DocIndex}; use raptor::{MetadataBuilder, Metadata, DocIndex};
use rocksdb::{DB, WriteBatch, Writable}; use rocksdb::{SstFileWriter, EnvOptions, ColumnFamilyOptions};
use serde_json::from_str; use serde_json::from_str;
use unidecode::unidecode; use unidecode::unidecode;
@ -33,9 +34,7 @@ where P: AsRef<Path>
fs::set_permissions(&path, perms) fs::set_permissions(&path, perms)
} }
fn is_readonly<P>(path: P) -> io::Result<bool> fn is_readonly<P: AsRef<Path>>(path: P) -> io::Result<bool> {
where P: AsRef<Path>
{
fs::metadata(&path).map(|m| m.permissions().readonly()) fs::metadata(&path).map(|m| m.permissions().readonly())
} }
@ -62,69 +61,85 @@ fn main() {
} }
}; };
let map_file = "map.meta"; // TODO add a subcommand to pack these files in a tar.xxx archive
let indexes_file = "indexes.meta"; let random_name = moby_name_gen::random_name();
let rocksdb_file = "rocksdb/storage"; let map_file = format!("{}.map", random_name);
let idx_file = format!("{}.idx", random_name);
let sst_file = format!("{}.sst", random_name);
for file in &[map_file, indexes_file, rocksdb_file] { for file in &[&map_file, &idx_file, &sst_file] {
match is_readonly(file) { match is_readonly(file) {
Ok(true) => panic!("the {:?} file is readonly, please make it writeable", file), Ok(true) => panic!("the {:?} file is readonly, please make it writeable", file),
Err(ref e) if e.kind() == io::ErrorKind::NotFound => (), Err(ref e) if e.kind() == io::ErrorKind::NotFound => (),
Err(e) => panic!("{:?}", e), Err(e) => panic!("{:?}", e),
_ => (), Ok(false) => (),
} }
} }
let db = DB::open_default(rocksdb_file).unwrap(); let env_options = EnvOptions::new();
let cf_options = ColumnFamilyOptions::new();
let mut sst_file_writer = SstFileWriter::new(env_options, cf_options);
sst_file_writer.open(&sst_file).expect("open the sst file");
let map = File::create(map_file).unwrap(); let map = File::create(&map_file).unwrap();
let indexes = File::create(indexes_file).unwrap(); let indexes = File::create(&idx_file).unwrap();
let mut builder = MetadataBuilder::new(map, indexes); let mut builder = MetadataBuilder::new(map, indexes);
let mut fields = BTreeMap::new();
for line in data.lines() { for line in data.lines() {
let line = line.unwrap(); let line = line.unwrap();
let product: Product = from_str(&line).unwrap(); let product: Product = from_str(&line).unwrap();
let title = iter::repeat(0).zip(product.title.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate(); {
let description = iter::repeat(1).zip(product.ft.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate(); let title = iter::repeat(0).zip(product.title.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate();
let description = iter::repeat(1).zip(product.ft.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate();
let mut batch = WriteBatch::new(); let words = title.chain(description);
for (i, (attr, word)) in words {
let doc_index = DocIndex {
document: product.product_id,
attribute: attr,
attribute_index: i as u32,
};
// insert the exact representation
let word_lower = word.to_lowercase();
let title_key = format!("{}-title", product.product_id); // and the unidecoded lowercased version
let _ = batch.put(title_key.as_bytes(), product.title.as_bytes()); let word_unidecoded = unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
builder.insert(word_unidecoded, doc_index);
}
let description_key = format!("{}-description", product.product_id); builder.insert(word_lower, doc_index);
let _ = batch.put(description_key.as_bytes(), product.ft.as_bytes());
db.write(batch).unwrap();
let words = title.chain(description);
for (i, (attr, word)) in words {
let doc_index = DocIndex {
document: product.product_id,
attribute: attr,
attribute_index: i as u32,
};
// insert the exact representation
let word_lower = word.to_lowercase();
// and the unidecoded lowercased version
let word_unidecoded = unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
builder.insert(word_unidecoded, doc_index);
} }
builder.insert(word_lower, doc_index);
} }
// TODO simplify this by using functions and
// use the MetadataBuilder internal BTreeMap ?
let key = format!("{}-title", product.product_id);
let value = product.title;
fields.insert(key, value);
let key = format!("{}-description", product.product_id);
let value = product.ft;
fields.insert(key, value);
} }
for (key, value) in fields {
sst_file_writer.put(key.as_bytes(), value.as_bytes()).unwrap();
}
let sst_file_info = sst_file_writer.finish().unwrap();
builder.finish().unwrap(); builder.finish().unwrap();
set_readonly(map_file, true).unwrap(); println!("Succesfully created files: {}, {}, {}", map_file, idx_file, sst_file);
set_readonly(indexes_file, true).unwrap();
set_readonly(rocksdb_file, true).unwrap(); set_readonly(&map_file, true).unwrap();
set_readonly(&idx_file, true).unwrap();
set_readonly(&sst_file, true).unwrap();
println!("Checking the dump consistency..."); println!("Checking the dump consistency...");
unsafe { Metadata::from_paths(map_file, indexes_file).unwrap() }; unsafe { Metadata::from_paths(map_file, idx_file).unwrap() };
// TODO do it better!
} }

View File

@ -9,7 +9,7 @@ elapsed = "0.1"
[dependencies.fst] [dependencies.fst]
git = "https://github.com/Kerollmops/fst.git" git = "https://github.com/Kerollmops/fst.git"
branch = "op-builder-with-state" branch = "always-match-clone"
[dependencies.rocksdb] [dependencies.rocksdb]
git = "https://github.com/pingcap/rust-rocksdb.git" git = "https://github.com/pingcap/rust-rocksdb.git"

View File

@ -8,7 +8,7 @@ use std::str::from_utf8_unchecked;
use std::io::{self, Write}; use std::io::{self, Write};
use elapsed::measure_time; use elapsed::measure_time;
use fst::Streamer; use fst::Streamer;
use rocksdb::{DB, DBOptions}; use rocksdb::{DB, IngestExternalFileOptions};
use raptor::{Metadata, RankedStream, LevBuilder}; use raptor::{Metadata, RankedStream, LevBuilder};
fn search(metadata: &Metadata, database: &DB, lev_builder: &LevBuilder, query: &str) { fn search(metadata: &Metadata, database: &DB, lev_builder: &LevBuilder, query: &str) {
@ -35,43 +35,39 @@ fn search(metadata: &Metadata, database: &DB, lev_builder: &LevBuilder, query: &
} }
fn main() { fn main() {
let map_file = "map.meta"; let name = env::args().nth(1).expect("Missing meta file name (e.g. lucid-ptolemy)");
let indexes_file = "indexes.meta"; let map_file = format!("{}.map", name);
let rocksdb_file = "rocksdb/storage"; let idx_file = format!("{}.idx", name);
let sst_file = format!("{}.sst", name);
let rocksdb = "rocksdb/storage";
let (elapsed, meta) = measure_time(|| unsafe { let (elapsed, meta) = measure_time(|| unsafe {
Metadata::from_paths(map_file, indexes_file).unwrap() Metadata::from_paths(map_file, idx_file).unwrap()
}); });
println!("{} to load metadata", elapsed); println!("{} to load metadata", elapsed);
let (elapsed, db) = measure_time(|| { let (elapsed, db) = measure_time(|| {
let options = DBOptions::new(); let db = DB::open_default(rocksdb).unwrap();
DB::open_for_read_only(options, rocksdb_file, false).unwrap() db.ingest_external_file(&IngestExternalFileOptions::new(), &[&sst_file]).unwrap();
db
}); });
println!("{} to load the RocksDB database", elapsed); println!("{} to load the RocksDB database", elapsed);
let (elapsed, lev_builder) = measure_time(|| LevBuilder::new()); let (elapsed, lev_builder) = measure_time(|| LevBuilder::new());
println!("{} to load the levenshtein automaton", elapsed); println!("{} to load the levenshtein automaton", elapsed);
match env::args().nth(1) { loop {
Some(query) => { print!("Searching for: ");
println!("Searching for: {:?}", query); io::stdout().flush().unwrap();
let query = query.to_lowercase();
let (elapsed, _) = measure_time(|| search(&meta, &db, &lev_builder, &query));
println!("Finished in {}", elapsed);
},
None => loop {
print!("Searching for: ");
io::stdout().flush().unwrap();
let mut query = String::new(); let mut query = String::new();
io::stdin().read_line(&mut query).unwrap(); io::stdin().read_line(&mut query).unwrap();
let query = query.trim().to_lowercase(); let query = query.trim().to_lowercase();
if query.is_empty() { break } if query.is_empty() { break }
let (elapsed, _) = measure_time(|| search(&meta, &db, &lev_builder, &query)); let (elapsed, _) = measure_time(|| search(&meta, &db, &lev_builder, &query));
println!("Finished in {}", elapsed); println!("Finished in {}", elapsed);
},
} }
} }

View File

@ -8,7 +8,7 @@ byteorder = "1.2"
[dependencies.fst] [dependencies.fst]
git = "https://github.com/Kerollmops/fst.git" git = "https://github.com/Kerollmops/fst.git"
branch = "op-builder-with-state" branch = "always-match-clone"
[dependencies.levenshtein_automata] [dependencies.levenshtein_automata]
git = "https://github.com/Kerollmops/levenshtein-automata.git" git = "https://github.com/Kerollmops/levenshtein-automata.git"

View File

@ -186,8 +186,8 @@ pub struct MetadataBuilder<W, X> {
indexes: X, indexes: X,
} }
impl<W: Write, X: Write> MetadataBuilder<W, X> { impl<W: Write, X: Write> MetadataBuilder<W, X>
{
pub fn new(map: W, indexes: X) -> Self { pub fn new(map: W, indexes: X) -> Self {
Self { inner: Inner::new(), map, indexes } Self { inner: Inner::new(), map, indexes }
} }