From 2e0193a39e70e3b8c815e077b558e2e3bfd84d41 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 19 Aug 2018 13:40:07 +0200 Subject: [PATCH] feat: Introduce sst file dumping Fixes #9 --- Cargo.lock | 198 +++++++++++++++++++++++-------------- raptor-indexer/Cargo.toml | 1 + raptor-indexer/src/main.rs | 101 +++++++++++-------- raptor-search/Cargo.toml | 2 +- raptor-search/src/main.rs | 44 ++++----- raptor/Cargo.toml | 2 +- raptor/src/metadata.rs | 4 +- 7 files changed, 209 insertions(+), 143 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 17c8a6b6c..148cbfacc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3,16 +3,21 @@ name = "base64" version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "bitflags" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "blob" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "base64 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.71 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -22,29 +27,29 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "byteorder" -version = "1.2.3" +version = "1.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "bzip2-sys" version = "0.1.6" -source = "git+https://github.com/alexcrichton/bzip2-rs.git#0ae38c2ccfea01625ae256e4fd483a15eb7ad62c" +source = "git+https://github.com/alexcrichton/bzip2-rs.git#54aef43502f91de8fa3e205c1ec276054444cc34" dependencies = [ - "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "cc" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "cmake" -version = "0.1.31" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", + "cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -55,11 +60,6 @@ dependencies = [ "build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", ] -[[package]] -name = "dtoa" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "elapsed" version = "0.1.2" @@ -68,12 +68,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "fst" version = "0.3.0" -source = "git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state#6e0ab4e4ee5443cc55079996bf9f703086322c33" +source = "git+https://github.com/Kerollmops/fst.git?branch=always-match-clone#56eb2221d1534883d4e10887d945a982b780fccd" dependencies = [ - "byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "fuchsia-zircon" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bitflags 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)", + "fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "fuchsia-zircon-sys" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "gcc" version = "0.3.54" @@ -97,25 +111,25 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "levenshtein_automata" version = "0.1.1" -source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#9d01a14e57ded8e7a9a8d2b4e790f7b364e710b4" +source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#ed1244d1731b0f81e880f0c9daa860970d7752c3" dependencies = [ - "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", + "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)", ] [[package]] name = "libc" -version = "0.2.42" +version = "0.2.43" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "librocksdb_sys" version = "0.1.0" -source = "git+https://github.com/pingcap/rust-rocksdb.git#9a1c83c5382fbaee8a5102213c711bbe52d71470" +source = "git+https://github.com/pingcap/rust-rocksdb.git#d2fe0a96d18d8ba33da14e0feb49c57529ae28b6" dependencies = [ "bzip2-sys 0.1.6 (git+https://github.com/alexcrichton/bzip2-rs.git)", - "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", - "cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "cmake 0.1.33 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", "libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)", "lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)", "snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)", @@ -127,10 +141,10 @@ name = "libz-sys" version = "1.0.18" source = "git+https://github.com/busyjay/libz-sys.git?branch=static-link#bb77b618ffc5ca41efd7a89d282d96e35e79dae4" dependencies = [ - "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", - "pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)", - "vcpkg 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)", + "cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", + "pkg-config 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)", + "vcpkg 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -138,8 +152,8 @@ name = "lz4-sys" version = "1.8.0" source = "git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build#41509fea212e9ca55c1f6c53d4fd1ddf28cdf689" dependencies = [ - "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -147,18 +161,26 @@ name = "memmap" version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "moby-name-gen" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rand 0.3.22 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "pkg-config" -version = "0.3.11" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "proc-macro2" -version = "0.4.6" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -166,18 +188,38 @@ dependencies = [ [[package]] name = "quote" -version = "0.6.3" +version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", + "rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rand" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", + "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "raptor" version = "0.1.0" dependencies = [ - "byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", - "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", + "byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)", + "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)", "group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)", "levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)", "rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)", @@ -187,11 +229,12 @@ dependencies = [ name = "raptor-indexer" version = "0.1.0" dependencies = [ + "moby-name-gen 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "raptor 0.1.0", "rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)", - "serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_derive 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", - "serde_json 1.0.22 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.71 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.71 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.26 (registry+https://github.com/rust-lang/crates.io-index)", "unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -200,7 +243,7 @@ name = "raptor-search" version = "0.1.0" dependencies = [ "elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", - "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", + "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)", "raptor 0.1.0", "rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)", ] @@ -208,36 +251,41 @@ dependencies = [ [[package]] name = "rocksdb" version = "0.3.0" -source = "git+https://github.com/pingcap/rust-rocksdb.git#9a1c83c5382fbaee8a5102213c711bbe52d71470" +source = "git+https://github.com/pingcap/rust-rocksdb.git#d2fe0a96d18d8ba33da14e0feb49c57529ae28b6" dependencies = [ "crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", "librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)", ] +[[package]] +name = "ryu" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "serde" -version = "1.0.70" +version = "1.0.71" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "serde_derive" -version = "1.0.70" +version = "1.0.71" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", - "quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 0.14.4 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.14.8 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "serde_json" -version = "1.0.22" +version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "dtoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", "itoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", - "serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", + "ryu 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.71 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -245,18 +293,18 @@ name = "snappy-sys" version = "0.1.0" source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#be02178330bb17648d6ac605af249eba18b32b71" dependencies = [ - "cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", - "pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)", + "cmake 0.1.33 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", + "pkg-config 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "syn" -version = "0.14.4" +version = "0.14.8" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)", - "quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "proc-macro2 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)", "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -272,7 +320,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "vcpkg" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -302,43 +350,49 @@ dependencies = [ "blob 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)", "glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)", ] [metadata] "checksum base64 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "30e93c03064e7590d0466209155251b90c22e37fab1daf2771582598b5827557" +"checksum bitflags 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "d0c54bb8f454c567f21197eefcdbf5679d0bd99f2ddbe52e84c77061952e6789" "checksum blob 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "122c3fa3949d822d2a51c648db9e8105d6e75b89dc628cc366901d3d396fa4f4" "checksum build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39092a32794787acd8525ee150305ff051b0aa6cc2abaf193924f5ab05425f39" -"checksum byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "74c0b906e9446b0a2e4f760cdb3fa4b2c48cdc6db8766a845c54b6ff063fd2e9" +"checksum byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8389c509ec62b9fe8eca58c502a0acaf017737355615243496cde4994f8fa4f9" "checksum bzip2-sys 0.1.6 (git+https://github.com/alexcrichton/bzip2-rs.git)" = "" -"checksum cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)" = "49ec142f5768efb5b7622aebc3fdbdbb8950a4b9ba996393cb76ef7466e8747d" -"checksum cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)" = "95470235c31c726d72bf2e1f421adc1e65b9d561bf5529612cbe1a72da1467b3" +"checksum cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)" = "2119ea4867bd2b8ed3aecab467709720b2d55b1bcfe09f772fd68066eaf15275" +"checksum cmake 0.1.33 (registry+https://github.com/rust-lang/crates.io-index)" = "704fbf3bb5149daab0afb255dbea24a1f08d2f4099cedb9baab6d470d4c5eefb" "checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb" -"checksum dtoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6d301140eb411af13d3115f9a562c85cc6b541ade9dfa314132244aaee7489dd" "checksum elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6f4e5af126dafd0741c2ad62d47f68b28602550102e5f0dd45c8a97fc8b49c29" -"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)" = "" +"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)" = "" +"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" +"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5e33ec290da0d127825013597dbdfc28bee4964690c7ce1166cbc2a7bd08b1bb" "checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" "checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "" "checksum itoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5adb58558dcd1d786b5f0bd15f3226ee23486e24b7b58304b60f64dc68e62606" "checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "" -"checksum libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b685088df2b950fccadf07a7187c8ef846a959c142338a48f9dc0b94517eb5f1" +"checksum libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)" = "76e3a3ef172f1a0b9a9ff0dd1491ae5e6c948b94479a3021819ba7d860c8645d" "checksum librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "" "checksum libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)" = "" "checksum lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)" = "" "checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff" -"checksum pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)" = "110d5ee3593dbb73f56294327fe5668bcc997897097cbc76b51e7aed3f52452f" -"checksum proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "effdb53b25cdad54f8f48843d67398f7ef2e14f12c1b4cb4effc549a6462a4d6" -"checksum quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e44651a0dc4cdd99f71c83b561e221f714912d11af1a4dff0631f923d53af035" +"checksum moby-name-gen 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d41abd7ae342e42e3a52953738f89eabde3ece58a4b9206384966976955bf7a2" +"checksum pkg-config 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)" = "104630aa1c83213cbc76db0703630fcb0421dac3585063be4ce9a8a2feeaa745" +"checksum proc-macro2 0.4.13 (registry+https://github.com/rust-lang/crates.io-index)" = "ee5697238f0d893c7f0ecc59c0999f18d2af85e424de441178bcacc9f9e6cf67" +"checksum quote 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "ed7d650913520df631972f21e104a4fa2f9c82a14afc65d17b388a2e29731e7c" +"checksum rand 0.3.22 (registry+https://github.com/rust-lang/crates.io-index)" = "15a732abf9d20f0ad8eeb6f909bf6868722d9a06e1e50802b6a70351f40b4eb1" +"checksum rand 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8356f47b32624fef5b3301c1be97e5944ecdd595409cc5da11d05f211db6cfbd" "checksum rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "" -"checksum serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)" = "0c3adf19c07af6d186d91dae8927b83b0553d07ca56cbf7f2f32560455c91920" -"checksum serde_derive 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)" = "3525a779832b08693031b8ecfb0de81cd71cfd3812088fafe9a7496789572124" -"checksum serde_json 1.0.22 (registry+https://github.com/rust-lang/crates.io-index)" = "84b8035cabe9b35878adec8ac5fe03d5f6bc97ff6edd7ccb96b44c1276ba390e" +"checksum ryu 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "fd0568787116e13c652377b6846f5931454a363a8fdf8ae50463ee40935b278b" +"checksum serde 1.0.71 (registry+https://github.com/rust-lang/crates.io-index)" = "6dfad05c8854584e5f72fb859385ecdfa03af69c3fd0572f0da2d4c95f060bdb" +"checksum serde_derive 1.0.71 (registry+https://github.com/rust-lang/crates.io-index)" = "b719c6d5e9f73fbc37892246d5852333f040caa617b8873c6aced84bcb28e7bb" +"checksum serde_json 1.0.26 (registry+https://github.com/rust-lang/crates.io-index)" = "44dd2cfde475037451fa99b7e5df77aa3cfd1536575fa8e7a538ab36dcde49ae" "checksum snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)" = "" -"checksum syn 0.14.4 (registry+https://github.com/rust-lang/crates.io-index)" = "2beff8ebc3658f07512a413866875adddd20f4fd47b2a4e6c9da65cd281baaea" +"checksum syn 0.14.8 (registry+https://github.com/rust-lang/crates.io-index)" = "b7bfcbb0c068d0f642a0ffbd5c604965a360a61f99e8add013cef23a838614f3" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" "checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc" -"checksum vcpkg 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "cbe533e138811704c0e3cbde65a818b35d3240409b4346256c5ede403e082474" +"checksum vcpkg 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "a51475940ea5ed2f7ba8e7b867c42d6cb7f06fafb9c1673ed8e768c675c771cc" "checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/raptor-indexer/Cargo.toml b/raptor-indexer/Cargo.toml index d369002af..d44f98698 100644 --- a/raptor-indexer/Cargo.toml +++ b/raptor-indexer/Cargo.toml @@ -9,6 +9,7 @@ serde = "1.0" serde_derive = "1.0" serde_json = "1.0" unidecode = "0.3" +moby-name-gen = "0.1" [dependencies.rocksdb] git = "https://github.com/pingcap/rust-rocksdb.git" diff --git a/raptor-indexer/src/main.rs b/raptor-indexer/src/main.rs index 332765bb7..76d383758 100644 --- a/raptor-indexer/src/main.rs +++ b/raptor-indexer/src/main.rs @@ -6,15 +6,16 @@ extern crate rocksdb; extern crate serde_json; #[macro_use] extern crate serde_derive; extern crate unidecode; +extern crate moby_name_gen; use std::path::Path; -use std::collections::HashSet; +use std::collections::{HashSet, BTreeMap}; use std::fs::{self, File}; use std::io::{self, BufReader, BufRead}; use std::iter; use raptor::{MetadataBuilder, Metadata, DocIndex}; -use rocksdb::{DB, WriteBatch, Writable}; +use rocksdb::{SstFileWriter, EnvOptions, ColumnFamilyOptions}; use serde_json::from_str; use unidecode::unidecode; @@ -33,9 +34,7 @@ where P: AsRef fs::set_permissions(&path, perms) } -fn is_readonly

(path: P) -> io::Result -where P: AsRef -{ +fn is_readonly>(path: P) -> io::Result { fs::metadata(&path).map(|m| m.permissions().readonly()) } @@ -62,69 +61,85 @@ fn main() { } }; - let map_file = "map.meta"; - let indexes_file = "indexes.meta"; - let rocksdb_file = "rocksdb/storage"; + // TODO add a subcommand to pack these files in a tar.xxx archive + let random_name = moby_name_gen::random_name(); + let map_file = format!("{}.map", random_name); + let idx_file = format!("{}.idx", random_name); + let sst_file = format!("{}.sst", random_name); - for file in &[map_file, indexes_file, rocksdb_file] { + for file in &[&map_file, &idx_file, &sst_file] { match is_readonly(file) { Ok(true) => panic!("the {:?} file is readonly, please make it writeable", file), Err(ref e) if e.kind() == io::ErrorKind::NotFound => (), Err(e) => panic!("{:?}", e), - _ => (), + Ok(false) => (), } } - let db = DB::open_default(rocksdb_file).unwrap(); + let env_options = EnvOptions::new(); + let cf_options = ColumnFamilyOptions::new(); + let mut sst_file_writer = SstFileWriter::new(env_options, cf_options); + sst_file_writer.open(&sst_file).expect("open the sst file"); - let map = File::create(map_file).unwrap(); - let indexes = File::create(indexes_file).unwrap(); + let map = File::create(&map_file).unwrap(); + let indexes = File::create(&idx_file).unwrap(); let mut builder = MetadataBuilder::new(map, indexes); + let mut fields = BTreeMap::new(); for line in data.lines() { let line = line.unwrap(); let product: Product = from_str(&line).unwrap(); - let title = iter::repeat(0).zip(product.title.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate(); - let description = iter::repeat(1).zip(product.ft.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate(); + { + let title = iter::repeat(0).zip(product.title.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate(); + let description = iter::repeat(1).zip(product.ft.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate(); - let mut batch = WriteBatch::new(); + let words = title.chain(description); + for (i, (attr, word)) in words { + let doc_index = DocIndex { + document: product.product_id, + attribute: attr, + attribute_index: i as u32, + }; + // insert the exact representation + let word_lower = word.to_lowercase(); - let title_key = format!("{}-title", product.product_id); - let _ = batch.put(title_key.as_bytes(), product.title.as_bytes()); + // and the unidecoded lowercased version + let word_unidecoded = unidecode(word).to_lowercase(); + if word_lower != word_unidecoded { + builder.insert(word_unidecoded, doc_index); + } - let description_key = format!("{}-description", product.product_id); - let _ = batch.put(description_key.as_bytes(), product.ft.as_bytes()); - - db.write(batch).unwrap(); - - let words = title.chain(description); - for (i, (attr, word)) in words { - let doc_index = DocIndex { - document: product.product_id, - attribute: attr, - attribute_index: i as u32, - }; - // insert the exact representation - let word_lower = word.to_lowercase(); - - // and the unidecoded lowercased version - let word_unidecoded = unidecode(word).to_lowercase(); - if word_lower != word_unidecoded { - builder.insert(word_unidecoded, doc_index); + builder.insert(word_lower, doc_index); } - - builder.insert(word_lower, doc_index); } + + // TODO simplify this by using functions and + // use the MetadataBuilder internal BTreeMap ? + let key = format!("{}-title", product.product_id); + let value = product.title; + fields.insert(key, value); + + let key = format!("{}-description", product.product_id); + let value = product.ft; + fields.insert(key, value); } + for (key, value) in fields { + sst_file_writer.put(key.as_bytes(), value.as_bytes()).unwrap(); + } + let sst_file_info = sst_file_writer.finish().unwrap(); + builder.finish().unwrap(); - set_readonly(map_file, true).unwrap(); - set_readonly(indexes_file, true).unwrap(); - set_readonly(rocksdb_file, true).unwrap(); + println!("Succesfully created files: {}, {}, {}", map_file, idx_file, sst_file); + + set_readonly(&map_file, true).unwrap(); + set_readonly(&idx_file, true).unwrap(); + set_readonly(&sst_file, true).unwrap(); println!("Checking the dump consistency..."); - unsafe { Metadata::from_paths(map_file, indexes_file).unwrap() }; + unsafe { Metadata::from_paths(map_file, idx_file).unwrap() }; + // TODO do it better! } diff --git a/raptor-search/Cargo.toml b/raptor-search/Cargo.toml index 6fc6ecd5c..db66a60a1 100644 --- a/raptor-search/Cargo.toml +++ b/raptor-search/Cargo.toml @@ -9,7 +9,7 @@ elapsed = "0.1" [dependencies.fst] git = "https://github.com/Kerollmops/fst.git" -branch = "op-builder-with-state" +branch = "always-match-clone" [dependencies.rocksdb] git = "https://github.com/pingcap/rust-rocksdb.git" diff --git a/raptor-search/src/main.rs b/raptor-search/src/main.rs index 33e20670a..9fa46390c 100644 --- a/raptor-search/src/main.rs +++ b/raptor-search/src/main.rs @@ -8,7 +8,7 @@ use std::str::from_utf8_unchecked; use std::io::{self, Write}; use elapsed::measure_time; use fst::Streamer; -use rocksdb::{DB, DBOptions}; +use rocksdb::{DB, IngestExternalFileOptions}; use raptor::{Metadata, RankedStream, LevBuilder}; fn search(metadata: &Metadata, database: &DB, lev_builder: &LevBuilder, query: &str) { @@ -35,43 +35,39 @@ fn search(metadata: &Metadata, database: &DB, lev_builder: &LevBuilder, query: & } fn main() { - let map_file = "map.meta"; - let indexes_file = "indexes.meta"; - let rocksdb_file = "rocksdb/storage"; + let name = env::args().nth(1).expect("Missing meta file name (e.g. lucid-ptolemy)"); + let map_file = format!("{}.map", name); + let idx_file = format!("{}.idx", name); + let sst_file = format!("{}.sst", name); + + let rocksdb = "rocksdb/storage"; let (elapsed, meta) = measure_time(|| unsafe { - Metadata::from_paths(map_file, indexes_file).unwrap() + Metadata::from_paths(map_file, idx_file).unwrap() }); println!("{} to load metadata", elapsed); let (elapsed, db) = measure_time(|| { - let options = DBOptions::new(); - DB::open_for_read_only(options, rocksdb_file, false).unwrap() + let db = DB::open_default(rocksdb).unwrap(); + db.ingest_external_file(&IngestExternalFileOptions::new(), &[&sst_file]).unwrap(); + db }); println!("{} to load the RocksDB database", elapsed); let (elapsed, lev_builder) = measure_time(|| LevBuilder::new()); println!("{} to load the levenshtein automaton", elapsed); - match env::args().nth(1) { - Some(query) => { - println!("Searching for: {:?}", query); - let query = query.to_lowercase(); - let (elapsed, _) = measure_time(|| search(&meta, &db, &lev_builder, &query)); - println!("Finished in {}", elapsed); - }, - None => loop { - print!("Searching for: "); - io::stdout().flush().unwrap(); + loop { + print!("Searching for: "); + io::stdout().flush().unwrap(); - let mut query = String::new(); - io::stdin().read_line(&mut query).unwrap(); - let query = query.trim().to_lowercase(); + let mut query = String::new(); + io::stdin().read_line(&mut query).unwrap(); + let query = query.trim().to_lowercase(); - if query.is_empty() { break } + if query.is_empty() { break } - let (elapsed, _) = measure_time(|| search(&meta, &db, &lev_builder, &query)); - println!("Finished in {}", elapsed); - }, + let (elapsed, _) = measure_time(|| search(&meta, &db, &lev_builder, &query)); + println!("Finished in {}", elapsed); } } diff --git a/raptor/Cargo.toml b/raptor/Cargo.toml index 666a6b125..63c39c66f 100644 --- a/raptor/Cargo.toml +++ b/raptor/Cargo.toml @@ -8,7 +8,7 @@ byteorder = "1.2" [dependencies.fst] git = "https://github.com/Kerollmops/fst.git" -branch = "op-builder-with-state" +branch = "always-match-clone" [dependencies.levenshtein_automata] git = "https://github.com/Kerollmops/levenshtein-automata.git" diff --git a/raptor/src/metadata.rs b/raptor/src/metadata.rs index 7f34eebf0..80d647f0b 100644 --- a/raptor/src/metadata.rs +++ b/raptor/src/metadata.rs @@ -186,8 +186,8 @@ pub struct MetadataBuilder { indexes: X, } -impl MetadataBuilder { - +impl MetadataBuilder +{ pub fn new(map: W, indexes: X) -> Self { Self { inner: Inner::new(), map, indexes } }