mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
dump: Make the data less prone of memory indirections
This commit is contained in:
parent
d0919b2108
commit
a20405f786
38
Cargo.lock
generated
38
Cargo.lock
generated
@ -12,7 +12,7 @@ version = "1.0.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde 1.0.43 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -274,7 +274,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro2"
|
name = "proc-macro2"
|
||||||
version = "0.3.6"
|
version = "0.3.7"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -285,7 +285,7 @@ name = "quote"
|
|||||||
version = "0.5.2"
|
version = "0.5.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
"proc-macro2 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -318,10 +318,9 @@ dependencies = [
|
|||||||
"fst-levenshtein 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"fst-levenshtein 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)",
|
"futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde 1.0.43 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"serde_derive 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde_derive 1.0.43 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"serde_json 1.0.16 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde_json 1.0.16 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"smallvec 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
"tokio-minihttp 0.1.0 (git+https://github.com/tokio-rs/tokio-minihttp.git)",
|
"tokio-minihttp 0.1.0 (git+https://github.com/tokio-rs/tokio-minihttp.git)",
|
||||||
"tokio-proto 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"tokio-proto 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"tokio-service 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"tokio-service 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -345,15 +344,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde"
|
name = "serde"
|
||||||
version = "1.0.42"
|
version = "1.0.43"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_derive"
|
name = "serde_derive"
|
||||||
version = "1.0.42"
|
version = "1.0.43"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
"proc-macro2 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"serde_derive_internals 0.23.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde_derive_internals 0.23.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"syn 0.13.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"syn 0.13.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -364,7 +363,7 @@ name = "serde_derive_internals"
|
|||||||
version = "0.23.1"
|
version = "0.23.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
"proc-macro2 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"syn 0.13.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"syn 0.13.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -375,7 +374,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"itoa 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"itoa 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde 1.0.43 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -393,20 +392,12 @@ name = "smallvec"
|
|||||||
version = "0.2.1"
|
version = "0.2.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "smallvec"
|
|
||||||
version = "0.6.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
dependencies = [
|
|
||||||
"serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "syn"
|
name = "syn"
|
||||||
version = "0.13.1"
|
version = "0.13.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"proc-macro2 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
"proc-macro2 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
@ -689,21 +680,20 @@ dependencies = [
|
|||||||
"checksum nodrop 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)" = "9a2228dca57108069a5262f2ed8bd2e82496d2e074a06d1ccc7ce1687b6ae0a2"
|
"checksum nodrop 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)" = "9a2228dca57108069a5262f2ed8bd2e82496d2e074a06d1ccc7ce1687b6ae0a2"
|
||||||
"checksum num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c51a3322e4bca9d212ad9a158a02abc6934d005490c054a2778df73a70aa0a30"
|
"checksum num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c51a3322e4bca9d212ad9a158a02abc6934d005490c054a2778df73a70aa0a30"
|
||||||
"checksum percent-encoding 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "31010dd2e1ac33d5b46a5b413495239882813e0369f8ed8a5e266f173602f831"
|
"checksum percent-encoding 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "31010dd2e1ac33d5b46a5b413495239882813e0369f8ed8a5e266f173602f831"
|
||||||
"checksum proc-macro2 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "49b6a521dc81b643e9a51e0d1cf05df46d5a2f3c0280ea72bcb68276ba64a118"
|
"checksum proc-macro2 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "b16749538926f394755373f0dfec0852d79b3bd512a5906ceaeb72ee64a4eaa0"
|
||||||
"checksum quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9949cfe66888ffe1d53e6ec9d9f3b70714083854be20fd5e271b232a017401e8"
|
"checksum quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9949cfe66888ffe1d53e6ec9d9f3b70714083854be20fd5e271b232a017401e8"
|
||||||
"checksum rand 0.3.22 (registry+https://github.com/rust-lang/crates.io-index)" = "15a732abf9d20f0ad8eeb6f909bf6868722d9a06e1e50802b6a70351f40b4eb1"
|
"checksum rand 0.3.22 (registry+https://github.com/rust-lang/crates.io-index)" = "15a732abf9d20f0ad8eeb6f909bf6868722d9a06e1e50802b6a70351f40b4eb1"
|
||||||
"checksum rand 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "eba5f8cb59cc50ed56be8880a5c7b496bfd9bd26394e176bc67884094145c2c5"
|
"checksum rand 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "eba5f8cb59cc50ed56be8880a5c7b496bfd9bd26394e176bc67884094145c2c5"
|
||||||
"checksum redox_syscall 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = "0d92eecebad22b767915e4d529f89f28ee96dbbf5a4810d2b844373f136417fd"
|
"checksum redox_syscall 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = "0d92eecebad22b767915e4d529f89f28ee96dbbf5a4810d2b844373f136417fd"
|
||||||
"checksum scoped-tls 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "8674d439c964889e2476f474a3bf198cc9e199e77499960893bac5de7e9218a4"
|
"checksum scoped-tls 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "8674d439c964889e2476f474a3bf198cc9e199e77499960893bac5de7e9218a4"
|
||||||
"checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27"
|
"checksum scopeguard 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "94258f53601af11e6a49f722422f6e3425c52b06245a5cf9bc09908b174f5e27"
|
||||||
"checksum serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)" = "a73973861352c932ed1365ce22b32467ce260ac4c8db11cf750ce56334ff2dcf"
|
"checksum serde 1.0.43 (registry+https://github.com/rust-lang/crates.io-index)" = "0c855d888276f20d140223bd06515e5bf1647fd6d02593cb5792466d9a8ec2d0"
|
||||||
"checksum serde_derive 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b392c5a0cebb98121454531c50e60e2ffe0fbeb1a44da277da2d681d08d7dc0b"
|
"checksum serde_derive 1.0.43 (registry+https://github.com/rust-lang/crates.io-index)" = "aa113e5fc4b008a626ba2bbd41330b56c9987d667f79f7b243e5a2d03d91ed1c"
|
||||||
"checksum serde_derive_internals 0.23.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9d30c4596450fd7bbda79ef15559683f9a79ac0193ea819db90000d7e1cae794"
|
"checksum serde_derive_internals 0.23.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9d30c4596450fd7bbda79ef15559683f9a79ac0193ea819db90000d7e1cae794"
|
||||||
"checksum serde_json 1.0.16 (registry+https://github.com/rust-lang/crates.io-index)" = "8c6c4e049dc657a99e394bd85c22acbf97356feeec6dbf44150f2dcf79fb3118"
|
"checksum serde_json 1.0.16 (registry+https://github.com/rust-lang/crates.io-index)" = "8c6c4e049dc657a99e394bd85c22acbf97356feeec6dbf44150f2dcf79fb3118"
|
||||||
"checksum slab 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "17b4fcaed89ab08ef143da37bc52adbcc04d4a69014f4c1208d6b51f0c47bc23"
|
"checksum slab 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "17b4fcaed89ab08ef143da37bc52adbcc04d4a69014f4c1208d6b51f0c47bc23"
|
||||||
"checksum slab 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fdeff4cd9ecff59ec7e3744cbca73dfe5ac35c2aedb2cfba8a1c715a18912e9d"
|
"checksum slab 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fdeff4cd9ecff59ec7e3744cbca73dfe5ac35c2aedb2cfba8a1c715a18912e9d"
|
||||||
"checksum smallvec 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4c8cbcd6df1e117c2210e13ab5109635ad68a929fcbb8964dc965b76cb5ee013"
|
"checksum smallvec 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4c8cbcd6df1e117c2210e13ab5109635ad68a929fcbb8964dc965b76cb5ee013"
|
||||||
"checksum smallvec 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "44db0ecb22921ef790d17ae13a3f6d15784183ff5f2a01aa32098c7498d2b4b9"
|
|
||||||
"checksum syn 0.13.1 (registry+https://github.com/rust-lang/crates.io-index)" = "91b52877572087400e83d24b9178488541e3d535259e04ff17a63df1e5ceff59"
|
"checksum syn 0.13.1 (registry+https://github.com/rust-lang/crates.io-index)" = "91b52877572087400e83d24b9178488541e3d535259e04ff17a63df1e5ceff59"
|
||||||
"checksum take 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b157868d8ac1f56b64604539990685fa7611d8fa9e5476cf0c02cf34d32917c5"
|
"checksum take 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b157868d8ac1f56b64604539990685fa7611d8fa9e5476cf0c02cf34d32917c5"
|
||||||
"checksum time 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "a15375f1df02096fb3317256ce2cee6a1f42fc84ea5ad5fc8c421cfe40c73098"
|
"checksum time 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "a15375f1df02096fb3317256ce2cee6a1f42fc84ea5ad5fc8c421cfe40c73098"
|
||||||
|
@ -13,7 +13,6 @@ lazy_static = "1.0"
|
|||||||
serde = "1.0"
|
serde = "1.0"
|
||||||
serde_derive = "1.0"
|
serde_derive = "1.0"
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
smallvec = { version = "0.6", features = ["serde"] }
|
|
||||||
tokio-minihttp = { git = "https://github.com/tokio-rs/tokio-minihttp.git" }
|
tokio-minihttp = { git = "https://github.com/tokio-rs/tokio-minihttp.git" }
|
||||||
tokio-proto = "0.1"
|
tokio-proto = "0.1"
|
||||||
tokio-service = "0.1"
|
tokio-service = "0.1"
|
||||||
|
@ -13,7 +13,7 @@ use std::io::{BufReader, BufRead};
|
|||||||
use fst::Streamer;
|
use fst::Streamer;
|
||||||
use serde_json::from_str;
|
use serde_json::from_str;
|
||||||
|
|
||||||
use raptor::{MultiMapBuilder, MultiMap};
|
use raptor::{FstMapBuilder, FstMap};
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
struct Product {
|
struct Product {
|
||||||
@ -42,7 +42,7 @@ fn main() {
|
|||||||
set
|
set
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut builder = MultiMapBuilder::new();
|
let mut builder = FstMapBuilder::new();
|
||||||
for line in data.lines() {
|
for line in data.lines() {
|
||||||
let line = line.unwrap();
|
let line = line.unwrap();
|
||||||
|
|
||||||
@ -65,11 +65,6 @@ fn main() {
|
|||||||
let values = File::create("values.vecs").unwrap();
|
let values = File::create("values.vecs").unwrap();
|
||||||
let (map, values) = builder.build(map, values).unwrap();
|
let (map, values) = builder.build(map, values).unwrap();
|
||||||
|
|
||||||
// just to check if the dump is valid
|
eprintln!("Checking the dump consistency...");
|
||||||
let map = unsafe { MultiMap::from_paths("map.fst", "values.vecs").unwrap() };
|
unsafe { FstMap::<u64>::from_paths("map.fst", "values.vecs").unwrap() };
|
||||||
|
|
||||||
// let mut stream = map.stream();
|
|
||||||
// while let Some(x) = stream.next() {
|
|
||||||
// println!("{:?}", x);
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
@ -21,19 +21,19 @@ use tokio_minihttp::{Request, Response, Http};
|
|||||||
use tokio_proto::TcpServer;
|
use tokio_proto::TcpServer;
|
||||||
use tokio_service::Service;
|
use tokio_service::Service;
|
||||||
|
|
||||||
use raptor::MultiMap;
|
use raptor::FstMap;
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref MAP: MultiMap = {
|
static ref MAP: FstMap<u64> = {
|
||||||
let map = read_to_vec("map.fst").unwrap();
|
let map = read_to_vec("map.fst").unwrap();
|
||||||
let values = read_to_vec("values.vecs").unwrap();
|
let values = read_to_vec("values.vecs").unwrap();
|
||||||
|
|
||||||
MultiMap::from_bytes(map, &values).unwrap()
|
FstMap::from_bytes(map, &values).unwrap()
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
struct MainService {
|
struct MainService {
|
||||||
map: &'static MultiMap,
|
map: &'static FstMap<u64>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Service for MainService {
|
impl Service for MainService {
|
||||||
|
161
src/fst_map.rs
Normal file
161
src/fst_map.rs
Normal file
@ -0,0 +1,161 @@
|
|||||||
|
use bincode;
|
||||||
|
use fst::{self, Map, MapBuilder, Automaton};
|
||||||
|
use serde::de::DeserializeOwned;
|
||||||
|
use serde::ser::Serialize;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{Write, BufReader};
|
||||||
|
use std::ops::{Range, Deref, DerefMut};
|
||||||
|
use std::path::Path;
|
||||||
|
use {StreamBuilder, Stream};
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct FstMap<T> {
|
||||||
|
inner: Map,
|
||||||
|
values: Values<T>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> FstMap<T> {
|
||||||
|
pub unsafe fn from_paths<P, Q>(map: P, values: Q) -> fst::Result<Self>
|
||||||
|
where
|
||||||
|
T: DeserializeOwned,
|
||||||
|
P: AsRef<Path>,
|
||||||
|
Q: AsRef<Path>
|
||||||
|
{
|
||||||
|
let inner = Map::from_path(map)?;
|
||||||
|
|
||||||
|
// TODO handle errors !!!
|
||||||
|
let values = File::open(values).unwrap();
|
||||||
|
let values = BufReader::new(values);
|
||||||
|
let values = bincode::deserialize_from(values).unwrap();
|
||||||
|
|
||||||
|
Ok(Self { inner, values })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn from_bytes(map: Vec<u8>, values: &[u8]) -> fst::Result<Self>
|
||||||
|
where
|
||||||
|
T: DeserializeOwned
|
||||||
|
{
|
||||||
|
let inner = Map::from_bytes(map)?;
|
||||||
|
let values = bincode::deserialize(values).unwrap();
|
||||||
|
|
||||||
|
Ok(Self { inner, values })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn stream(&self) -> Stream<T> {
|
||||||
|
Stream {
|
||||||
|
inner: self.inner.stream(),
|
||||||
|
values: &self.values,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool {
|
||||||
|
self.inner.contains_key(key)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[T]> {
|
||||||
|
self.inner.get(key).map(|i| unsafe { self.values.get_unchecked(i as usize) })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<T, A> {
|
||||||
|
StreamBuilder {
|
||||||
|
inner: self.inner.search(aut),
|
||||||
|
values: &self.values,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
pub struct Values<T> {
|
||||||
|
ranges: Box<[Range<u64>]>,
|
||||||
|
values: Box<[T]>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> Values<T> {
|
||||||
|
fn new(raw: Vec<Vec<T>>) -> Self {
|
||||||
|
let cap = raw.len();
|
||||||
|
let mut ranges = Vec::with_capacity(cap);
|
||||||
|
let cap = raw.iter().map(Vec::len).sum();
|
||||||
|
let mut values = Vec::with_capacity(cap);
|
||||||
|
|
||||||
|
for v in &raw {
|
||||||
|
let len = v.len() as u64;
|
||||||
|
let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0);
|
||||||
|
|
||||||
|
let range = Range { start, end: start + len };
|
||||||
|
ranges.push(range);
|
||||||
|
}
|
||||||
|
|
||||||
|
values.extend(raw.into_iter().flat_map(IntoIterator::into_iter));
|
||||||
|
|
||||||
|
let ranges = ranges.into_boxed_slice();
|
||||||
|
let values = values.into_boxed_slice();
|
||||||
|
|
||||||
|
Self { ranges, values }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub unsafe fn get_unchecked(&self, index: usize) -> &[T] {
|
||||||
|
let range = self.ranges.get_unchecked(index);
|
||||||
|
let range = Range { start: range.start as usize, end: range.end as usize };
|
||||||
|
self.values.get_unchecked(range)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct FstMapBuilder<T> {
|
||||||
|
map: Vec<(String, u64)>,
|
||||||
|
// This makes many memory indirections but it is only used
|
||||||
|
// at index time, not kept for query time.
|
||||||
|
values: Vec<Vec<T>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<T> FstMapBuilder<T> {
|
||||||
|
pub fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
map: Vec::new(),
|
||||||
|
values: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn insert<S: Into<String>>(&mut self, key: S, value: T) {
|
||||||
|
let key = key.into();
|
||||||
|
match self.map.binary_search_by_key(&key.as_str(), |&(ref k, _)| k) {
|
||||||
|
Ok(index) => {
|
||||||
|
let (_, index) = self.map[index];
|
||||||
|
let values = &mut self.values[index as usize];
|
||||||
|
|
||||||
|
values.push(value);
|
||||||
|
},
|
||||||
|
Err(index) => {
|
||||||
|
self.values.push(vec![value]);
|
||||||
|
let values_index = (self.values.len() - 1) as u64;
|
||||||
|
|
||||||
|
let value = (key, values_index);
|
||||||
|
self.map.insert(index, value);
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn build_memory(self) -> fst::Result<FstMap<T>> {
|
||||||
|
Ok(FstMap {
|
||||||
|
inner: Map::from_iter(self.map)?,
|
||||||
|
values: Values::new(self.values),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn build<W, X>(self, map_wrt: W, mut values_wrt: X) -> fst::Result<(W, X)>
|
||||||
|
where
|
||||||
|
T: Serialize,
|
||||||
|
W: Write,
|
||||||
|
X: Write
|
||||||
|
{
|
||||||
|
let mut builder = MapBuilder::new(map_wrt)?;
|
||||||
|
builder.extend_iter(self.map)?;
|
||||||
|
let map = builder.into_inner()?;
|
||||||
|
let values = Values::new(self.values);
|
||||||
|
|
||||||
|
// TODO handle that error !!!
|
||||||
|
bincode::serialize_into(&mut values_wrt, &values).unwrap();
|
||||||
|
|
||||||
|
Ok((map, values_wrt))
|
||||||
|
}
|
||||||
|
}
|
153
src/lib.rs
153
src/lib.rs
@ -1,76 +1,26 @@
|
|||||||
|
#[macro_use] extern crate serde_derive;
|
||||||
extern crate bincode;
|
extern crate bincode;
|
||||||
extern crate fst;
|
extern crate fst;
|
||||||
extern crate smallvec;
|
extern crate serde;
|
||||||
|
|
||||||
use std::ops::{Deref, DerefMut};
|
mod fst_map;
|
||||||
|
|
||||||
|
use std::ops::{Range, Deref, DerefMut};
|
||||||
use std::io::{Write, BufReader};
|
use std::io::{Write, BufReader};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::str::from_utf8_unchecked;
|
use std::str::from_utf8_unchecked;
|
||||||
|
use fst::Automaton;
|
||||||
|
|
||||||
pub use fst::MapBuilder;
|
pub use self::fst_map::{FstMap, FstMapBuilder};
|
||||||
use smallvec::SmallVec;
|
use self::fst_map::Values;
|
||||||
|
|
||||||
type SmallVec32<T> = SmallVec<[T; 16]>;
|
pub struct StreamBuilder<'a, T: 'a, A: Automaton> {
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub struct MultiMap {
|
|
||||||
map: fst::Map,
|
|
||||||
values: Box<[SmallVec32<u64>]>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl MultiMap {
|
|
||||||
pub unsafe fn from_paths<P, Q>(map: P, values: Q) -> fst::Result<MultiMap>
|
|
||||||
where
|
|
||||||
P: AsRef<Path>,
|
|
||||||
Q: AsRef<Path>
|
|
||||||
{
|
|
||||||
let map = fst::Map::from_path(map)?;
|
|
||||||
|
|
||||||
// TODO handle errors !!!
|
|
||||||
let values = File::open(values).unwrap();
|
|
||||||
let values = BufReader::new(values);
|
|
||||||
let values = bincode::deserialize_from(values).unwrap();
|
|
||||||
|
|
||||||
Ok(MultiMap { map, values })
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn from_bytes(map: Vec<u8>, values: &[u8]) -> fst::Result<MultiMap> {
|
|
||||||
let map = fst::Map::from_bytes(map)?;
|
|
||||||
let values = bincode::deserialize(values).unwrap();
|
|
||||||
|
|
||||||
Ok(MultiMap { map, values })
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn stream(&self) -> Stream {
|
|
||||||
Stream {
|
|
||||||
inner: self.map.stream(),
|
|
||||||
values: &self.values,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool {
|
|
||||||
self.map.contains_key(key)
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[u64]> {
|
|
||||||
self.map.get(key).map(|i| &*self.values[i as usize])
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn search<A: fst::Automaton>(&self, aut: A) -> StreamBuilder<A> {
|
|
||||||
StreamBuilder {
|
|
||||||
inner: self.map.search(aut),
|
|
||||||
values: &self.values,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct StreamBuilder<'a, A: fst::Automaton> {
|
|
||||||
inner: fst::map::StreamBuilder<'a, A>,
|
inner: fst::map::StreamBuilder<'a, A>,
|
||||||
values: &'a [SmallVec32<u64>],
|
values: &'a Values<T>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, A: fst::Automaton> Deref for StreamBuilder<'a, A> {
|
impl<'a, T, A: Automaton> Deref for StreamBuilder<'a, T, A> {
|
||||||
type Target = fst::map::StreamBuilder<'a, A>;
|
type Target = fst::map::StreamBuilder<'a, A>;
|
||||||
|
|
||||||
fn deref(&self) -> &Self::Target {
|
fn deref(&self) -> &Self::Target {
|
||||||
@ -78,16 +28,16 @@ impl<'a, A: fst::Automaton> Deref for StreamBuilder<'a, A> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, A: fst::Automaton> DerefMut for StreamBuilder<'a, A> {
|
impl<'a, T, A: Automaton> DerefMut for StreamBuilder<'a, T, A> {
|
||||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||||
&mut self.inner
|
&mut self.inner
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, A: fst::Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, A> {
|
impl<'a, T: 'a, A: Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, T, A> {
|
||||||
type Item = (&'a str, &'a [u64]);
|
type Item = (&'a str, &'a [T]);
|
||||||
|
|
||||||
type Into = Stream<'a, A>;
|
type Into = Stream<'a, T, A>;
|
||||||
|
|
||||||
fn into_stream(self) -> Self::Into {
|
fn into_stream(self) -> Self::Into {
|
||||||
Stream {
|
Stream {
|
||||||
@ -97,84 +47,23 @@ impl<'a, A: fst::Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, A> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Stream<'a, A: fst::Automaton = fst::automaton::AlwaysMatch> {
|
pub struct Stream<'a, T: 'a, A: Automaton = fst::automaton::AlwaysMatch> {
|
||||||
inner: fst::map::Stream<'a, A>,
|
inner: fst::map::Stream<'a, A>,
|
||||||
values: &'a [SmallVec32<u64>],
|
values: &'a Values<T>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, 'm, A: fst::Automaton> fst::Streamer<'a> for Stream<'m, A> {
|
impl<'a, 'm, T: 'a, A: Automaton> fst::Streamer<'a> for Stream<'m, T, A> {
|
||||||
type Item = (&'a str, &'a [u64]);
|
type Item = (&'a str, &'a [T]);
|
||||||
|
|
||||||
fn next(&'a mut self) -> Option<Self::Item> {
|
fn next(&'a mut self) -> Option<Self::Item> {
|
||||||
// Here we can't just `map` because of some borrow rules
|
// Here we can't just `map` because of some borrow rules
|
||||||
match self.inner.next() {
|
match self.inner.next() {
|
||||||
Some((key, i)) => {
|
Some((key, i)) => {
|
||||||
let key = unsafe { from_utf8_unchecked(key) };
|
let key = unsafe { from_utf8_unchecked(key) };
|
||||||
Some((key, &*self.values[i as usize]))
|
let values = unsafe { self.values.get_unchecked(i as usize) };
|
||||||
|
Some((key, values))
|
||||||
},
|
},
|
||||||
None => None,
|
None => None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub struct MultiMapBuilder {
|
|
||||||
map: Vec<(String, u64)>,
|
|
||||||
values: Vec<SmallVec32<u64>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a> MultiMapBuilder {
|
|
||||||
pub fn new() -> MultiMapBuilder {
|
|
||||||
MultiMapBuilder {
|
|
||||||
map: Vec::new(),
|
|
||||||
values: Vec::new(),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn insert<S: Into<String>>(&mut self, key: S, value: u64) {
|
|
||||||
let key = key.into();
|
|
||||||
match self.map.binary_search_by_key(&key.as_str(), |&(ref k, _)| k) {
|
|
||||||
Ok(index) => {
|
|
||||||
let (_, index) = self.map[index];
|
|
||||||
let values = &mut self.values[index as usize];
|
|
||||||
if let Err(index) = values.binary_search(&value) {
|
|
||||||
values.insert(index, value)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Err(index) => {
|
|
||||||
let values = {
|
|
||||||
let mut vec = SmallVec32::new();
|
|
||||||
vec.push(value);
|
|
||||||
vec
|
|
||||||
};
|
|
||||||
self.values.push(values);
|
|
||||||
let values_index = (self.values.len() - 1) as u64;
|
|
||||||
|
|
||||||
let value = (key, values_index);
|
|
||||||
self.map.insert(index, value);
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn build_memory(self) -> fst::Result<MultiMap> {
|
|
||||||
Ok(MultiMap {
|
|
||||||
map: fst::Map::from_iter(self.map)?,
|
|
||||||
values: self.values.into_boxed_slice(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn build<W, X>(self, map_wrt: W, mut values_wrt: X) -> fst::Result<(W, X)>
|
|
||||||
where
|
|
||||||
W: Write,
|
|
||||||
X: Write
|
|
||||||
{
|
|
||||||
let mut builder = MapBuilder::new(map_wrt)?;
|
|
||||||
builder.extend_iter(self.map)?;
|
|
||||||
let map = builder.into_inner()?;
|
|
||||||
|
|
||||||
// TODO handle that !!!
|
|
||||||
bincode::serialize_into(&mut values_wrt, &self.values).unwrap();
|
|
||||||
|
|
||||||
Ok((map, values_wrt))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
Loading…
Reference in New Issue
Block a user