From d6e113c683c1239278cf82662015f0be74a956e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 6 Jul 2018 21:26:07 +0200 Subject: [PATCH] feat: Introduce RocksDB in this project in order to save fields of the products --- raptor-indexer/Cargo.lock | 156 +++++++++++++++++++++++++++++++++++++ raptor-indexer/Cargo.toml | 3 + raptor-indexer/src/main.rs | 19 ++++- raptor-search/Cargo.lock | 156 +++++++++++++++++++++++++++++++++++++ raptor-search/Cargo.toml | 3 + raptor-search/src/main.rs | 40 +++++----- 6 files changed, 354 insertions(+), 23 deletions(-) diff --git a/raptor-indexer/Cargo.lock b/raptor-indexer/Cargo.lock index 09a315df4..1929e71a7 100644 --- a/raptor-indexer/Cargo.lock +++ b/raptor-indexer/Cargo.lock @@ -1,3 +1,11 @@ +[[package]] +name = "base64" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "bincode" version = "1.0.1" @@ -7,11 +15,55 @@ dependencies = [ "serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "blob" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "base64 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "build_const" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "byteorder" version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "bzip2-sys" +version = "0.1.6" +source = "git+https://github.com/alexcrichton/bzip2-rs.git#0ae38c2ccfea01625ae256e4fd483a15eb7ad62c" +dependencies = [ + "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cc" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "cmake" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crc" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "dtoa" version = "0.4.2" @@ -26,6 +78,16 @@ dependencies = [ "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "gcc" +version = "0.3.54" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "glob" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "group-by" version = "0.1.0" @@ -49,6 +111,41 @@ name = "libc" version = "0.2.42" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "librocksdb_sys" +version = "0.1.0" +source = "git+https://github.com/pingcap/rust-rocksdb.git#9a1c83c5382fbaee8a5102213c711bbe52d71470" +dependencies = [ + "bzip2-sys 0.1.6 (git+https://github.com/alexcrichton/bzip2-rs.git)", + "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", + "cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)", + "lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)", + "snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)", + "zstd-sys 1.4.4+zstd.1.3.5 (git+https://github.com/gyscos/zstd-rs.git)", +] + +[[package]] +name = "libz-sys" +version = "1.0.18" +source = "git+https://github.com/busyjay/libz-sys.git?branch=static-link#bb77b618ffc5ca41efd7a89d282d96e35e79dae4" +dependencies = [ + "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)", + "vcpkg 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "lz4-sys" +version = "1.8.0" +source = "git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build#41509fea212e9ca55c1f6c53d4fd1ddf28cdf689" +dependencies = [ + "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "memmap" version = "0.6.2" @@ -58,6 +155,11 @@ dependencies = [ "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "pkg-config" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "proc-macro2" version = "0.4.6" @@ -91,12 +193,23 @@ name = "raptor-indexer" version = "0.1.0" dependencies = [ "raptor 0.1.0", + "rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)", "serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.22 (registry+https://github.com/rust-lang/crates.io-index)", "unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "rocksdb" +version = "0.3.0" +source = "git+https://github.com/pingcap/rust-rocksdb.git#9a1c83c5382fbaee8a5102213c711bbe52d71470" +dependencies = [ + "crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)", +] + [[package]] name = "serde" version = "1.0.68" @@ -122,6 +235,16 @@ dependencies = [ "serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "snappy-sys" +version = "0.1.0" +source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#be02178330bb17648d6ac605af249eba18b32b71" +dependencies = [ + "cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "syn" version = "0.14.2" @@ -142,6 +265,11 @@ name = "unidecode" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "vcpkg" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "winapi" version = "0.3.5" @@ -161,24 +289,52 @@ name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "zstd-sys" +version = "1.4.4+zstd.1.3.5" +source = "git+https://github.com/gyscos/zstd-rs.git#9ff4442c1977fad400f90d9c48e4f114c474117c" +dependencies = [ + "blob 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)", + "glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", +] + [metadata] +"checksum base64 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "30e93c03064e7590d0466209155251b90c22e37fab1daf2771582598b5827557" "checksum bincode 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9f2fb9e29e72fd6bc12071533d5dc7664cb01480c59406f656d7ac25c7bd8ff7" +"checksum blob 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "122c3fa3949d822d2a51c648db9e8105d6e75b89dc628cc366901d3d396fa4f4" +"checksum build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39092a32794787acd8525ee150305ff051b0aa6cc2abaf193924f5ab05425f39" "checksum byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "74c0b906e9446b0a2e4f760cdb3fa4b2c48cdc6db8766a845c54b6ff063fd2e9" +"checksum bzip2-sys 0.1.6 (git+https://github.com/alexcrichton/bzip2-rs.git)" = "" +"checksum cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)" = "49ec142f5768efb5b7622aebc3fdbdbb8950a4b9ba996393cb76ef7466e8747d" +"checksum cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)" = "95470235c31c726d72bf2e1f421adc1e65b9d561bf5529612cbe1a72da1467b3" +"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb" "checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab" "checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)" = "" +"checksum gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5e33ec290da0d127825013597dbdfc28bee4964690c7ce1166cbc2a7bd08b1bb" +"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" "checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "" "checksum itoa 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c069bbec61e1ca5a596166e55dfe4773ff745c3d16b700013bcaff9a6df2c682" "checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "" "checksum libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b685088df2b950fccadf07a7187c8ef846a959c142338a48f9dc0b94517eb5f1" +"checksum librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "" +"checksum libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)" = "" +"checksum lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)" = "" "checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff" +"checksum pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)" = "110d5ee3593dbb73f56294327fe5668bcc997897097cbc76b51e7aed3f52452f" "checksum proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "effdb53b25cdad54f8f48843d67398f7ef2e14f12c1b4cb4effc549a6462a4d6" "checksum quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e44651a0dc4cdd99f71c83b561e221f714912d11af1a4dff0631f923d53af035" +"checksum rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "" "checksum serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)" = "429fcc4efa8a11341b5422c2ace724daba276c1748467e869478f53c0ba4562e" "checksum serde_derive 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)" = "6a25ad0bf818ed2d180c89addbe29198d1de6c89ed08a48aa6a4d3d16a63cbfe" "checksum serde_json 1.0.22 (registry+https://github.com/rust-lang/crates.io-index)" = "84b8035cabe9b35878adec8ac5fe03d5f6bc97ff6edd7ccb96b44c1276ba390e" +"checksum snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)" = "" "checksum syn 0.14.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c67da57e61ebc7b7b6fff56bb34440ca3a83db037320b0507af4c10368deda7d" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" "checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc" +"checksum vcpkg 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "cbe533e138811704c0e3cbde65a818b35d3240409b4346256c5ede403e082474" "checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +"checksum zstd-sys 1.4.4+zstd.1.3.5 (git+https://github.com/gyscos/zstd-rs.git)" = "" diff --git a/raptor-indexer/Cargo.toml b/raptor-indexer/Cargo.toml index f42a0a2f4..df1b8d958 100644 --- a/raptor-indexer/Cargo.toml +++ b/raptor-indexer/Cargo.toml @@ -10,6 +10,9 @@ serde_derive = "1.0" serde_json = "1.0" unidecode = "0.3" +[dependencies.rocksdb] +git = "https://github.com/pingcap/rust-rocksdb.git" + [profile.release] debug = true lto = true diff --git a/raptor-indexer/src/main.rs b/raptor-indexer/src/main.rs index 2a2aec211..26e1338a6 100644 --- a/raptor-indexer/src/main.rs +++ b/raptor-indexer/src/main.rs @@ -2,6 +2,7 @@ // make only one binary extern crate raptor; +extern crate rocksdb; extern crate serde_json; #[macro_use] extern crate serde_derive; extern crate unidecode; @@ -13,6 +14,7 @@ use std::io::{self, BufReader, BufRead}; use std::iter; use raptor::{DocIndexMapBuilder, DocIndexMap, DocIndex}; +use rocksdb::{DB, WriteBatch, Writable}; use serde_json::from_str; use unidecode::unidecode; @@ -62,8 +64,9 @@ fn main() { let map_file = "map.fst"; let values_file = "values.vecs"; + let rocksdb_file = "rocksdb/storage"; - for file in &[map_file, values_file] { + for file in &[map_file, values_file, rocksdb_file] { match is_readonly(file) { Ok(true) => panic!("the {:?} file is readonly, please make it writeable", file), Err(ref e) if e.kind() == io::ErrorKind::NotFound => (), @@ -72,6 +75,9 @@ fn main() { } } + fs::remove_file(rocksdb_file); + let db = DB::open_default(rocksdb_file).unwrap(); + let mut builder = DocIndexMapBuilder::new(); for line in data.lines() { let line = line.unwrap(); @@ -81,6 +87,16 @@ fn main() { let title = iter::repeat(0).zip(product.title.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate(); let description = iter::repeat(1).zip(product.ft.split_whitespace()).filter(|&(_, w)| !common_words.contains(w)).enumerate(); + let mut batch = WriteBatch::new(); + + let title_key = format!("{}-title", product.product_id); + let _ = batch.put(title_key.as_bytes(), product.title.as_bytes()); + + let description_key = format!("{}-description", product.product_id); + let _ = batch.put(description_key.as_bytes(), product.ft.as_bytes()); + + db.write(batch).unwrap(); + let words = title.chain(description); for (i, (attr, word)) in words { let doc_index = DocIndex { @@ -108,6 +124,7 @@ fn main() { set_readonly(map_file, true).unwrap(); set_readonly(values_file, true).unwrap(); + set_readonly(rocksdb_file, true).unwrap(); println!("Checking the dump consistency..."); unsafe { DocIndexMap::from_paths("map.fst", "values.vecs").unwrap() }; diff --git a/raptor-search/Cargo.lock b/raptor-search/Cargo.lock index 25cd776dd..a2a59a8ba 100644 --- a/raptor-search/Cargo.lock +++ b/raptor-search/Cargo.lock @@ -1,3 +1,11 @@ +[[package]] +name = "base64" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "bincode" version = "1.0.1" @@ -7,16 +15,60 @@ dependencies = [ "serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "blob" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "base64 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "build_const" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "byteorder" version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "bzip2-sys" +version = "0.1.6" +source = "git+https://github.com/alexcrichton/bzip2-rs.git#0ae38c2ccfea01625ae256e4fd483a15eb7ad62c" +dependencies = [ + "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cc" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "cfg-if" version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "cmake" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crc" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "elapsed" version = "0.1.2" @@ -39,6 +91,16 @@ dependencies = [ "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "gcc" +version = "0.3.54" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "glob" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "group-by" version = "0.1.0" @@ -57,6 +119,32 @@ name = "libc" version = "0.2.42" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "librocksdb_sys" +version = "0.1.0" +source = "git+https://github.com/pingcap/rust-rocksdb.git#9a1c83c5382fbaee8a5102213c711bbe52d71470" +dependencies = [ + "bzip2-sys 0.1.6 (git+https://github.com/alexcrichton/bzip2-rs.git)", + "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", + "cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)", + "lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)", + "snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)", + "zstd-sys 1.4.4+zstd.1.3.5 (git+https://github.com/gyscos/zstd-rs.git)", +] + +[[package]] +name = "libz-sys" +version = "1.0.18" +source = "git+https://github.com/busyjay/libz-sys.git?branch=static-link#bb77b618ffc5ca41efd7a89d282d96e35e79dae4" +dependencies = [ + "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)", + "vcpkg 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "log" version = "0.3.9" @@ -73,6 +161,15 @@ dependencies = [ "cfg-if 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "lz4-sys" +version = "1.8.0" +source = "git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build#41509fea212e9ca55c1f6c53d4fd1ddf28cdf689" +dependencies = [ + "cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "memmap" version = "0.6.2" @@ -82,6 +179,11 @@ dependencies = [ "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "pkg-config" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "proc-macro2" version = "0.4.6" @@ -118,10 +220,21 @@ dependencies = [ "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)", "raptor 0.1.0", + "rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)", "serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "rocksdb" +version = "0.3.0" +source = "git+https://github.com/pingcap/rust-rocksdb.git#9a1c83c5382fbaee8a5102213c711bbe52d71470" +dependencies = [ + "crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)", +] + [[package]] name = "serde" version = "1.0.68" @@ -137,6 +250,16 @@ dependencies = [ "syn 0.14.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "snappy-sys" +version = "0.1.0" +source = "git+https://github.com/busyjay/rust-snappy.git?branch=static-link#be02178330bb17648d6ac605af249eba18b32b71" +dependencies = [ + "cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "syn" version = "0.14.2" @@ -152,6 +275,11 @@ name = "unicode-xid" version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "vcpkg" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "winapi" version = "0.3.5" @@ -171,25 +299,53 @@ name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "zstd-sys" +version = "1.4.4+zstd.1.3.5" +source = "git+https://github.com/gyscos/zstd-rs.git#9ff4442c1977fad400f90d9c48e4f114c474117c" +dependencies = [ + "blob 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)", + "glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", +] + [metadata] +"checksum base64 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "30e93c03064e7590d0466209155251b90c22e37fab1daf2771582598b5827557" "checksum bincode 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9f2fb9e29e72fd6bc12071533d5dc7664cb01480c59406f656d7ac25c7bd8ff7" +"checksum blob 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "122c3fa3949d822d2a51c648db9e8105d6e75b89dc628cc366901d3d396fa4f4" +"checksum build_const 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39092a32794787acd8525ee150305ff051b0aa6cc2abaf193924f5ab05425f39" "checksum byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "74c0b906e9446b0a2e4f760cdb3fa4b2c48cdc6db8766a845c54b6ff063fd2e9" +"checksum bzip2-sys 0.1.6 (git+https://github.com/alexcrichton/bzip2-rs.git)" = "" +"checksum cc 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)" = "49ec142f5768efb5b7622aebc3fdbdbb8950a4b9ba996393cb76ef7466e8747d" "checksum cfg-if 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "efe5c877e17a9c717a0bf3613b2709f723202c4e4675cc8f12926ded29bcb17e" +"checksum cmake 0.1.31 (registry+https://github.com/rust-lang/crates.io-index)" = "95470235c31c726d72bf2e1f421adc1e65b9d561bf5529612cbe1a72da1467b3" +"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb" "checksum elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6f4e5af126dafd0741c2ad62d47f68b28602550102e5f0dd45c8a97fc8b49c29" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" "checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=op-builder-with-state)" = "" +"checksum gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5e33ec290da0d127825013597dbdfc28bee4964690c7ce1166cbc2a7bd08b1bb" +"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" "checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "" "checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "" "checksum libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b685088df2b950fccadf07a7187c8ef846a959c142338a48f9dc0b94517eb5f1" +"checksum librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "" +"checksum libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)" = "" "checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" "checksum log 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6fddaa003a65722a7fb9e26b0ce95921fe4ba590542ced664d8ce2fa26f9f3ac" +"checksum lz4-sys 1.8.0 (git+https://github.com/busyjay/lz4-rs.git?branch=adjust-build)" = "" "checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff" +"checksum pkg-config 0.3.11 (registry+https://github.com/rust-lang/crates.io-index)" = "110d5ee3593dbb73f56294327fe5668bcc997897097cbc76b51e7aed3f52452f" "checksum proc-macro2 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "effdb53b25cdad54f8f48843d67398f7ef2e14f12c1b4cb4effc549a6462a4d6" "checksum quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e44651a0dc4cdd99f71c83b561e221f714912d11af1a4dff0631f923d53af035" +"checksum rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "" "checksum serde 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)" = "429fcc4efa8a11341b5422c2ace724daba276c1748467e869478f53c0ba4562e" "checksum serde_derive 1.0.68 (registry+https://github.com/rust-lang/crates.io-index)" = "6a25ad0bf818ed2d180c89addbe29198d1de6c89ed08a48aa6a4d3d16a63cbfe" +"checksum snappy-sys 0.1.0 (git+https://github.com/busyjay/rust-snappy.git?branch=static-link)" = "" "checksum syn 0.14.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c67da57e61ebc7b7b6fff56bb34440ca3a83db037320b0507af4c10368deda7d" "checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" +"checksum vcpkg 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "cbe533e138811704c0e3cbde65a818b35d3240409b4346256c5ede403e082474" "checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" "checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +"checksum zstd-sys 1.4.4+zstd.1.3.5 (git+https://github.com/gyscos/zstd-rs.git)" = "" diff --git a/raptor-search/Cargo.toml b/raptor-search/Cargo.toml index 983d95321..48c552aa3 100644 --- a/raptor-search/Cargo.toml +++ b/raptor-search/Cargo.toml @@ -14,6 +14,9 @@ elapsed = "0.1" git = "https://github.com/Kerollmops/fst.git" branch = "op-builder-with-state" +[dependencies.rocksdb] +git = "https://github.com/pingcap/rust-rocksdb.git" + [profile.release] lto = true debug = true diff --git a/raptor-search/src/main.rs b/raptor-search/src/main.rs index 0e66954fe..03910ac92 100644 --- a/raptor-search/src/main.rs +++ b/raptor-search/src/main.rs @@ -1,15 +1,18 @@ extern crate env_logger; +extern crate rocksdb; extern crate fst; extern crate raptor; extern crate elapsed; use std::env; +use std::str::from_utf8_unchecked; use std::io::{self, Write}; use elapsed::measure_time; use fst::Streamer; +use rocksdb::{DB, DBOptions}; use raptor::{load_map, DocIndexMap, RankedStream, LevBuilder}; -fn search(map: &DocIndexMap, lev_builder: &LevBuilder, query: &str) { +fn search(map: &DocIndexMap, lev_builder: &LevBuilder, db: &DB, query: &str) { let mut automatons = Vec::new(); for query in query.split_whitespace() { let lev = lev_builder.get_automaton(query); @@ -18,26 +21,12 @@ fn search(map: &DocIndexMap, lev_builder: &LevBuilder, query: &str) { let mut stream = RankedStream::new(&map, map.values(), automatons, 20); while let Some(document_id) = stream.next() { - print!("{:?}", document_id); + print!("{:?} ", document_id); - // /* only here to debug ! - use std::{fs, process::Command}; - if let Ok(_) = fs::File::open("products.json_lines") { - let output = Command::new("rg") - .arg(document_id.to_string()) - .arg("products.json_lines") - .output(); - if let Ok(Ok(output)) = output.map(|o| String::from_utf8(o.stdout)) { - if let Some(line) = output.lines().next() { - let pattern = "\"title\":"; - if let Some(index) = line.find(pattern) { - let line: String = line[index..].chars().skip(pattern.len()).take(100).collect(); - print!(" => {}", line); - } - } - } - } - // */ + let title_key = format!("{}-title", document_id); + let title = db.get(title_key.as_bytes()).unwrap().unwrap(); + let title = unsafe { from_utf8_unchecked(&title) }; + print!("{:?}", title); println!(); } @@ -52,11 +41,18 @@ fn main() { let (elapsed, lev_builder) = measure_time(|| LevBuilder::new()); println!("{} to load the levenshtein automaton", elapsed); + let (elapsed, db) = measure_time(|| { + let opts = DBOptions::new(); + let error_if_log_file_exist = false; + DB::open_for_read_only(opts, "rocksdb/storage", error_if_log_file_exist).unwrap() + }); + println!("{} to load the rocksdb DB", elapsed); + match env::args().nth(1) { Some(query) => { println!("Searching for: {:?}", query); let query = query.to_lowercase(); - let (elapsed, _) = measure_time(|| search(&map, &lev_builder, &query)); + let (elapsed, _) = measure_time(|| search(&map, &lev_builder, &db, &query)); println!("Finished in {}", elapsed); }, None => loop { @@ -69,7 +65,7 @@ fn main() { if query.is_empty() { break } - let (elapsed, _) = measure_time(|| search(&map, &lev_builder, &query)); + let (elapsed, _) = measure_time(|| search(&map, &lev_builder, &db, &query)); println!("Finished in {}", elapsed); }, }