mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-02-17 00:00:11 +08:00
feat: Simplify the levenshtein construction
This commit is contained in:
parent
f0f5fc9891
commit
31e04f0120
34
Cargo.lock
generated
34
Cargo.lock
generated
@ -72,8 +72,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "fst"
|
name = "fst"
|
||||||
version = "0.3.0"
|
version = "0.3.2"
|
||||||
source = "git+https://github.com/Kerollmops/fst.git?branch=always-match-clone#56eb2221d1534883d4e10887d945a982b780fccd"
|
source = "git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref#ca3a1ebb60a6f9123f1284de380c7a5fc05d16bb"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
"byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -113,12 +113,20 @@ name = "itoa"
|
|||||||
version = "0.4.2"
|
version = "0.4.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lazy_static"
|
||||||
|
version = "1.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"version_check 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "levenshtein_automata"
|
name = "levenshtein_automata"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#ed1244d1731b0f81e880f0c9daa860970d7752c3"
|
source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=new-custom-fst#01400dfc181425a482cb6cad66f2a61b78b59e14"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)",
|
"fst 0.3.2 (git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref)",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -225,9 +233,10 @@ version = "0.1.0"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
"byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
"fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)",
|
"fst 0.3.2 (git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref)",
|
||||||
"group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)",
|
"group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)",
|
||||||
"levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)",
|
"lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=new-custom-fst)",
|
||||||
"rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)",
|
"rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -249,7 +258,7 @@ name = "raptor-search"
|
|||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)",
|
"fst 0.3.2 (git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref)",
|
||||||
"raptor 0.1.0",
|
"raptor 0.1.0",
|
||||||
"rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)",
|
"rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)",
|
||||||
]
|
]
|
||||||
@ -329,6 +338,11 @@ name = "vcpkg"
|
|||||||
version = "0.2.6"
|
version = "0.2.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "version_check"
|
||||||
|
version = "0.1.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "winapi"
|
name = "winapi"
|
||||||
version = "0.3.5"
|
version = "0.3.5"
|
||||||
@ -371,14 +385,15 @@ dependencies = [
|
|||||||
"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb"
|
"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb"
|
||||||
"checksum elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6f4e5af126dafd0741c2ad62d47f68b28602550102e5f0dd45c8a97fc8b49c29"
|
"checksum elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6f4e5af126dafd0741c2ad62d47f68b28602550102e5f0dd45c8a97fc8b49c29"
|
||||||
"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
|
"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
|
||||||
"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)" = "<none>"
|
"checksum fst 0.3.2 (git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref)" = "<none>"
|
||||||
"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
|
"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
|
||||||
"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
|
"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
|
||||||
"checksum gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5e33ec290da0d127825013597dbdfc28bee4964690c7ce1166cbc2a7bd08b1bb"
|
"checksum gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5e33ec290da0d127825013597dbdfc28bee4964690c7ce1166cbc2a7bd08b1bb"
|
||||||
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
|
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
|
||||||
"checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "<none>"
|
"checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "<none>"
|
||||||
"checksum itoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5adb58558dcd1d786b5f0bd15f3226ee23486e24b7b58304b60f64dc68e62606"
|
"checksum itoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5adb58558dcd1d786b5f0bd15f3226ee23486e24b7b58304b60f64dc68e62606"
|
||||||
"checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>"
|
"checksum lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ca488b89a5657b0a2ecd45b95609b3e848cf1755da332a0da46e2b2b1cb371a7"
|
||||||
|
"checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=new-custom-fst)" = "<none>"
|
||||||
"checksum libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)" = "76e3a3ef172f1a0b9a9ff0dd1491ae5e6c948b94479a3021819ba7d860c8645d"
|
"checksum libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)" = "76e3a3ef172f1a0b9a9ff0dd1491ae5e6c948b94479a3021819ba7d860c8645d"
|
||||||
"checksum librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "<none>"
|
"checksum librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "<none>"
|
||||||
"checksum libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)" = "<none>"
|
"checksum libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)" = "<none>"
|
||||||
@ -400,6 +415,7 @@ dependencies = [
|
|||||||
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
|
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
|
||||||
"checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc"
|
"checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc"
|
||||||
"checksum vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "def296d3eb3b12371b2c7d0e83bfe1403e4db2d7a0bba324a12b21c4ee13143d"
|
"checksum vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "def296d3eb3b12371b2c7d0e83bfe1403e4db2d7a0bba324a12b21c4ee13143d"
|
||||||
|
"checksum version_check 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7716c242968ee87e5542f8021178248f267f295a5c4803beae8b8b7fd9bc6051"
|
||||||
"checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd"
|
"checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd"
|
||||||
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||||
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||||
|
@ -1,4 +1,7 @@
|
|||||||
|
cargo-features = ["edition"]
|
||||||
|
|
||||||
[package]
|
[package]
|
||||||
|
edition = "2018"
|
||||||
name = "raptor-indexer"
|
name = "raptor-indexer"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||||
|
@ -1,12 +1,7 @@
|
|||||||
// TODO make the raptor binary expose multiple subcommand
|
// TODO make the raptor binary expose multiple subcommand
|
||||||
// make only one binary
|
// make only one binary
|
||||||
|
|
||||||
extern crate raptor;
|
|
||||||
extern crate rocksdb;
|
|
||||||
extern crate serde_json;
|
|
||||||
#[macro_use] extern crate serde_derive;
|
#[macro_use] extern crate serde_derive;
|
||||||
extern crate unidecode;
|
|
||||||
extern crate moby_name_gen;
|
|
||||||
|
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::collections::{HashSet, BTreeMap};
|
use std::collections::{HashSet, BTreeMap};
|
||||||
@ -129,7 +124,7 @@ fn main() {
|
|||||||
for (key, value) in fields {
|
for (key, value) in fields {
|
||||||
sst_file_writer.put(key.as_bytes(), value.as_bytes()).unwrap();
|
sst_file_writer.put(key.as_bytes(), value.as_bytes()).unwrap();
|
||||||
}
|
}
|
||||||
let sst_file_info = sst_file_writer.finish().unwrap();
|
let _sst_file_info = sst_file_writer.finish().unwrap();
|
||||||
|
|
||||||
builder.finish().unwrap();
|
builder.finish().unwrap();
|
||||||
|
|
||||||
|
@ -1,4 +1,7 @@
|
|||||||
|
cargo-features = ["edition"]
|
||||||
|
|
||||||
[package]
|
[package]
|
||||||
|
edition = "2018"
|
||||||
name = "raptor-search"
|
name = "raptor-search"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||||
@ -9,7 +12,7 @@ elapsed = "0.1"
|
|||||||
|
|
||||||
[dependencies.fst]
|
[dependencies.fst]
|
||||||
git = "https://github.com/Kerollmops/fst.git"
|
git = "https://github.com/Kerollmops/fst.git"
|
||||||
branch = "always-match-clone"
|
branch = "automaton-for-deref"
|
||||||
|
|
||||||
[dependencies.rocksdb]
|
[dependencies.rocksdb]
|
||||||
git = "https://github.com/pingcap/rust-rocksdb.git"
|
git = "https://github.com/pingcap/rust-rocksdb.git"
|
||||||
|
@ -1,20 +1,15 @@
|
|||||||
extern crate rocksdb;
|
|
||||||
extern crate fst;
|
|
||||||
extern crate raptor;
|
|
||||||
extern crate elapsed;
|
|
||||||
|
|
||||||
use std::env;
|
use std::env;
|
||||||
use std::str::from_utf8_unchecked;
|
use std::str::from_utf8_unchecked;
|
||||||
use std::io::{self, Write};
|
use std::io::{self, Write};
|
||||||
use elapsed::measure_time;
|
use elapsed::measure_time;
|
||||||
use fst::Streamer;
|
use fst::Streamer;
|
||||||
use rocksdb::{DB, DBOptions, IngestExternalFileOptions};
|
use rocksdb::{DB, DBOptions, IngestExternalFileOptions};
|
||||||
use raptor::{Metadata, RankedStream, LevBuilder};
|
use raptor::{automaton, Metadata, RankedStream};
|
||||||
|
|
||||||
fn search(metadata: &Metadata, database: &DB, lev_builder: &LevBuilder, query: &str) {
|
fn search(metadata: &Metadata, database: &DB, query: &str) {
|
||||||
let mut automatons = Vec::new();
|
let mut automatons = Vec::new();
|
||||||
for query in query.split_whitespace() {
|
for query in query.split_whitespace() {
|
||||||
let lev = lev_builder.get_automaton(query);
|
let lev = automaton::build(query);
|
||||||
automatons.push(lev);
|
automatons.push(lev);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -55,9 +50,6 @@ fn main() {
|
|||||||
});
|
});
|
||||||
println!("{} to load the SST file in RocksDB and reopen it for read-only", elapsed);
|
println!("{} to load the SST file in RocksDB and reopen it for read-only", elapsed);
|
||||||
|
|
||||||
let (elapsed, lev_builder) = measure_time(|| LevBuilder::new());
|
|
||||||
println!("{} to load the levenshtein automaton", elapsed);
|
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
print!("Searching for: ");
|
print!("Searching for: ");
|
||||||
io::stdout().flush().unwrap();
|
io::stdout().flush().unwrap();
|
||||||
@ -68,7 +60,7 @@ fn main() {
|
|||||||
|
|
||||||
if query.is_empty() { break }
|
if query.is_empty() { break }
|
||||||
|
|
||||||
let (elapsed, _) = measure_time(|| search(&meta, &db, &lev_builder, &query));
|
let (elapsed, _) = measure_time(|| search(&meta, &db, &query));
|
||||||
println!("Finished in {}", elapsed);
|
println!("Finished in {}", elapsed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,7 @@
|
|||||||
|
cargo-features = ["edition"]
|
||||||
|
|
||||||
[package]
|
[package]
|
||||||
|
edition = "2018"
|
||||||
name = "raptor"
|
name = "raptor"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||||
@ -6,14 +9,15 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
|
|||||||
[dependencies]
|
[dependencies]
|
||||||
byteorder = "1.2"
|
byteorder = "1.2"
|
||||||
fnv = "1.0"
|
fnv = "1.0"
|
||||||
|
lazy_static = "1.1"
|
||||||
|
|
||||||
[dependencies.fst]
|
[dependencies.fst]
|
||||||
git = "https://github.com/Kerollmops/fst.git"
|
git = "https://github.com/Kerollmops/fst.git"
|
||||||
branch = "always-match-clone"
|
branch = "automaton-for-deref"
|
||||||
|
|
||||||
[dependencies.levenshtein_automata]
|
[dependencies.levenshtein_automata]
|
||||||
git = "https://github.com/Kerollmops/levenshtein-automata.git"
|
git = "https://github.com/Kerollmops/levenshtein-automata.git"
|
||||||
branch = "custom-fst"
|
branch = "new-custom-fst"
|
||||||
features = ["fst_automaton"]
|
features = ["fst_automaton"]
|
||||||
|
|
||||||
[dependencies.rocksdb]
|
[dependencies.rocksdb]
|
||||||
|
50
raptor/src/automaton.rs
Normal file
50
raptor/src/automaton.rs
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
use std::ops::Deref;
|
||||||
|
use fst::Automaton;
|
||||||
|
use levenshtein_automata::{
|
||||||
|
LevenshteinAutomatonBuilder as LevBuilder,
|
||||||
|
DFA, Distance,
|
||||||
|
};
|
||||||
|
|
||||||
|
lazy_static! {
|
||||||
|
static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
|
||||||
|
static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
|
||||||
|
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct DfaExt {
|
||||||
|
query_len: usize,
|
||||||
|
automaton: DFA,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Deref for DfaExt {
|
||||||
|
type Target = DFA;
|
||||||
|
|
||||||
|
fn deref(&self) -> &Self::Target {
|
||||||
|
&self.automaton
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn build(query: &str) -> DfaExt {
|
||||||
|
let dfa = match query.len() {
|
||||||
|
0 ..= 4 => LEVDIST0.build_prefix_dfa(query),
|
||||||
|
5 ..= 8 => LEVDIST1.build_prefix_dfa(query),
|
||||||
|
_ => LEVDIST2.build_prefix_dfa(query),
|
||||||
|
};
|
||||||
|
|
||||||
|
DfaExt { query_len: query.len(), automaton: dfa }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub trait AutomatonExt: Automaton {
|
||||||
|
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
|
||||||
|
fn query_len(&self) -> usize;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AutomatonExt for DfaExt {
|
||||||
|
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
|
||||||
|
self.automaton.eval(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn query_len(&self) -> usize {
|
||||||
|
self.query_len
|
||||||
|
}
|
||||||
|
}
|
@ -1,37 +0,0 @@
|
|||||||
use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
|
|
||||||
|
|
||||||
pub struct LevBuilder {
|
|
||||||
automatons: [LevenshteinAutomatonBuilder; 3],
|
|
||||||
}
|
|
||||||
|
|
||||||
impl LevBuilder {
|
|
||||||
pub fn new() -> Self {
|
|
||||||
Self {
|
|
||||||
automatons: [
|
|
||||||
LevenshteinAutomatonBuilder::new(0, false),
|
|
||||||
LevenshteinAutomatonBuilder::new(1, false),
|
|
||||||
LevenshteinAutomatonBuilder::new(2, false),
|
|
||||||
],
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_automaton(&self, query: &str) -> Levenshtein {
|
|
||||||
assert!(!query.is_empty());
|
|
||||||
|
|
||||||
let dfa = if query.len() <= 4 {
|
|
||||||
self.automatons[0].build_prefix_dfa(query)
|
|
||||||
} else if query.len() <= 8 {
|
|
||||||
self.automatons[1].build_prefix_dfa(query)
|
|
||||||
} else {
|
|
||||||
self.automatons[2].build_prefix_dfa(query)
|
|
||||||
};
|
|
||||||
|
|
||||||
Levenshtein { dfa, query_len: query.len() }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Clone)]
|
|
||||||
pub struct Levenshtein {
|
|
||||||
pub dfa: DFA,
|
|
||||||
pub query_len: usize,
|
|
||||||
}
|
|
@ -1,24 +1,16 @@
|
|||||||
#![feature(nll)]
|
#[macro_use] extern crate lazy_static;
|
||||||
|
|
||||||
extern crate fst;
|
|
||||||
extern crate fnv;
|
|
||||||
extern crate group_by;
|
|
||||||
extern crate levenshtein_automata;
|
|
||||||
extern crate byteorder;
|
|
||||||
extern crate rocksdb;
|
|
||||||
|
|
||||||
pub mod rank;
|
pub mod rank;
|
||||||
pub mod metadata;
|
pub mod metadata;
|
||||||
pub mod levenshtein;
|
pub mod automaton;
|
||||||
|
|
||||||
pub use self::metadata::{
|
pub use self::metadata::{
|
||||||
Metadata, MetadataBuilder,
|
Metadata, MetadataBuilder,
|
||||||
StreamWithState, StreamWithStateBuilder,
|
Stream, StreamBuilder,
|
||||||
UnionWithState, OpWithStateBuilder,
|
Union, OpBuilder,
|
||||||
IndexedValuesWithState,
|
IndexedValues,
|
||||||
};
|
};
|
||||||
pub use self::rank::{RankedStream};
|
pub use self::rank::RankedStream;
|
||||||
pub use self::levenshtein::LevBuilder;
|
|
||||||
|
|
||||||
pub type DocumentId = u64;
|
pub type DocumentId = u64;
|
||||||
|
|
||||||
|
@ -9,7 +9,7 @@ use std::mem;
|
|||||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||||
use fst::{self, Map, MapBuilder, Automaton};
|
use fst::{self, Map, MapBuilder, Automaton};
|
||||||
use fst::raw::MmapReadOnly;
|
use fst::raw::MmapReadOnly;
|
||||||
use DocIndex;
|
use crate::DocIndex;
|
||||||
|
|
||||||
#[repr(C)]
|
#[repr(C)]
|
||||||
struct Range {
|
struct Range {
|
||||||
@ -256,23 +256,23 @@ unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
|
|||||||
from_raw_parts(ptr, len)
|
from_raw_parts(ptr, len)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct OpWithStateBuilder<'m, 'v, U> {
|
pub struct OpBuilder<'m, 'v> {
|
||||||
inner: fst::map::OpWithStateBuilder<'m, U>,
|
inner: fst::map::OpBuilder<'m>,
|
||||||
indexes: &'v DocIndexes,
|
indexes: &'v DocIndexes,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
|
impl<'m, 'v> OpBuilder<'m, 'v> {
|
||||||
pub fn new(indexes: &'v DocIndexes) -> Self {
|
pub fn new(indexes: &'v DocIndexes) -> Self {
|
||||||
Self {
|
Self {
|
||||||
inner: fst::map::OpWithStateBuilder::new(),
|
inner: fst::map::OpBuilder::new(),
|
||||||
indexes: indexes,
|
indexes: indexes,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn add<I, S>(mut self, streamable: I) -> Self
|
pub fn add<I, S>(mut self, streamable: I) -> Self
|
||||||
where
|
where
|
||||||
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
|
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
|
||||||
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
|
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
|
||||||
{
|
{
|
||||||
self.push(streamable);
|
self.push(streamable);
|
||||||
self
|
self
|
||||||
@ -280,14 +280,14 @@ impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
|
|||||||
|
|
||||||
pub fn push<I, S>(&mut self, streamable: I)
|
pub fn push<I, S>(&mut self, streamable: I)
|
||||||
where
|
where
|
||||||
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
|
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
|
||||||
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
|
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
|
||||||
{
|
{
|
||||||
self.inner.push(streamable);
|
self.inner.push(streamable);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn union(self) -> UnionWithState<'m, 'v, U> {
|
pub fn union(self) -> Union<'m, 'v> {
|
||||||
UnionWithState {
|
Union {
|
||||||
inner: self.inner.union(),
|
inner: self.inner.union(),
|
||||||
outs: Vec::new(),
|
outs: Vec::new(),
|
||||||
indexes: self.indexes,
|
indexes: self.indexes,
|
||||||
@ -296,23 +296,19 @@ impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
|
#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
|
||||||
pub struct IndexedValuesWithState<'a, U> {
|
pub struct IndexedValues<'a> {
|
||||||
pub index: usize,
|
pub index: usize,
|
||||||
pub values: &'a [DocIndex],
|
pub values: &'a [DocIndex],
|
||||||
pub state: U,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct UnionWithState<'m, 'v, U> {
|
pub struct Union<'m, 'v> {
|
||||||
inner: fst::map::UnionWithState<'m, U>,
|
inner: fst::map::Union<'m>,
|
||||||
outs: Vec<IndexedValuesWithState<'v, U>>,
|
outs: Vec<IndexedValues<'v>>,
|
||||||
indexes: &'v DocIndexes,
|
indexes: &'v DocIndexes,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, 'm, 'v, U: 'a> fst::Streamer<'a> for UnionWithState<'m, 'v, U>
|
impl<'a, 'm, 'v> fst::Streamer<'a> for Union<'m, 'v> {
|
||||||
where
|
type Item = (&'a [u8], &'a [IndexedValues<'a>]);
|
||||||
U: Clone,
|
|
||||||
{
|
|
||||||
type Item = (&'a [u8], &'a [IndexedValuesWithState<'a, U>]);
|
|
||||||
|
|
||||||
fn next(&'a mut self) -> Option<Self::Item> {
|
fn next(&'a mut self) -> Option<Self::Item> {
|
||||||
match self.inner.next() {
|
match self.inner.next() {
|
||||||
@ -322,8 +318,7 @@ where
|
|||||||
for ivalue in ivalues {
|
for ivalue in ivalues {
|
||||||
if let Some(values) = self.indexes.get(ivalue.value) {
|
if let Some(values) = self.indexes.get(ivalue.value) {
|
||||||
let index = ivalue.index;
|
let index = ivalue.index;
|
||||||
let state = ivalue.state.clone();
|
self.outs.push(IndexedValues { index, values })
|
||||||
self.outs.push(IndexedValuesWithState { index, values, state })
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Some((s, &self.outs))
|
Some((s, &self.outs))
|
||||||
@ -333,44 +328,43 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct StreamWithStateBuilder<'m, 'v, A> {
|
pub struct StreamBuilder<'m, 'v, A> {
|
||||||
inner: fst::map::StreamWithStateBuilder<'m, A>,
|
inner: fst::map::StreamBuilder<'m, A>,
|
||||||
indexes: &'v DocIndexes,
|
indexes: &'v DocIndexes,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamWithStateBuilder<'m, 'v, A>
|
impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamBuilder<'m, 'v, A>
|
||||||
where
|
where
|
||||||
A: Automaton,
|
A: Automaton,
|
||||||
A::State: Clone,
|
A::State: Clone,
|
||||||
{
|
{
|
||||||
type Item = <Self::Into as fst::Streamer<'a>>::Item;
|
type Item = <Self::Into as fst::Streamer<'a>>::Item;
|
||||||
type Into = StreamWithState<'m, 'v, A>;
|
type Into = Stream<'m, 'v, A>;
|
||||||
|
|
||||||
fn into_stream(self) -> Self::Into {
|
fn into_stream(self) -> Self::Into {
|
||||||
StreamWithState {
|
Stream {
|
||||||
inner: self.inner.into_stream(),
|
inner: self.inner.into_stream(),
|
||||||
indexes: self.indexes,
|
indexes: self.indexes,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct StreamWithState<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> {
|
pub struct Stream<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> {
|
||||||
inner: fst::map::StreamWithState<'m, A>,
|
inner: fst::map::Stream<'m, A>,
|
||||||
indexes: &'v DocIndexes,
|
indexes: &'v DocIndexes,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for StreamWithState<'m, 'v, A>
|
impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for Stream<'m, 'v, A>
|
||||||
where
|
where
|
||||||
A: Automaton,
|
A: Automaton,
|
||||||
A::State: Clone,
|
|
||||||
{
|
{
|
||||||
type Item = (&'a [u8], &'a [DocIndex], A::State);
|
type Item = (&'a [u8], &'a [DocIndex]);
|
||||||
|
|
||||||
fn next(&'a mut self) -> Option<Self::Item> {
|
fn next(&'a mut self) -> Option<Self::Item> {
|
||||||
match self.inner.next() {
|
match self.inner.next() {
|
||||||
Some((key, i, state)) => {
|
Some((key, i)) => {
|
||||||
match self.indexes.get(i) {
|
match self.indexes.get(i) {
|
||||||
Some(values) => Some((key, values, state)),
|
Some(values) => Some((key, values)),
|
||||||
None => None,
|
None => None,
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use Match;
|
|
||||||
use rank::{match_query_index, Document};
|
|
||||||
use group_by::GroupBy;
|
use group_by::GroupBy;
|
||||||
|
use crate::Match;
|
||||||
|
use crate::rank::{match_query_index, Document};
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn contains_exact(matches: &[Match]) -> bool {
|
fn contains_exact(matches: &[Match]) -> bool {
|
||||||
|
@ -6,13 +6,14 @@ mod sum_of_words_position;
|
|||||||
mod exact;
|
mod exact;
|
||||||
|
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
|
use std::rc::Rc;
|
||||||
use std::{mem, vec};
|
use std::{mem, vec};
|
||||||
use fst;
|
use fst;
|
||||||
use fnv::FnvHashMap;
|
use fnv::FnvHashMap;
|
||||||
use levenshtein::Levenshtein;
|
|
||||||
use metadata::{DocIndexes, OpWithStateBuilder, UnionWithState};
|
|
||||||
use {Match, DocumentId};
|
|
||||||
use group_by::GroupByMut;
|
use group_by::GroupByMut;
|
||||||
|
use crate::automaton::{DfaExt, AutomatonExt};
|
||||||
|
use crate::metadata::{DocIndexes, OpBuilder, Union};
|
||||||
|
use crate::{Match, DocumentId};
|
||||||
|
|
||||||
use self::{
|
use self::{
|
||||||
sum_of_typos::sum_of_typos,
|
sum_of_typos::sum_of_typos,
|
||||||
@ -85,11 +86,12 @@ fn matches_into_iter(matches: FnvHashMap<DocumentId, Vec<Match>>, limit: usize)
|
|||||||
pub struct RankedStream<'m, 'v>(RankedStreamInner<'m, 'v>);
|
pub struct RankedStream<'m, 'v>(RankedStreamInner<'m, 'v>);
|
||||||
|
|
||||||
impl<'m, 'v> RankedStream<'m, 'v> {
|
impl<'m, 'v> RankedStream<'m, 'v> {
|
||||||
pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec<Levenshtein>, limit: usize) -> Self {
|
pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec<DfaExt>, limit: usize) -> Self {
|
||||||
let mut op = OpWithStateBuilder::new(indexes);
|
let mut op = OpBuilder::new(indexes);
|
||||||
|
|
||||||
for automaton in automatons.iter().map(|l| l.dfa.clone()) {
|
let automatons: Vec<_> = automatons.into_iter().map(Rc::new).collect();
|
||||||
let stream = map.search(automaton).with_state();
|
for automaton in automatons.iter().cloned() {
|
||||||
|
let stream = map.search(automaton);
|
||||||
op.push(stream);
|
op.push(stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -114,8 +116,8 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> {
|
|||||||
|
|
||||||
enum RankedStreamInner<'m, 'v> {
|
enum RankedStreamInner<'m, 'v> {
|
||||||
Fed {
|
Fed {
|
||||||
inner: UnionWithState<'m, 'v, u32>,
|
inner: Union<'m, 'v>,
|
||||||
automatons: Vec<Levenshtein>,
|
automatons: Vec<Rc<DfaExt>>,
|
||||||
limit: usize,
|
limit: usize,
|
||||||
matches: FnvHashMap<DocumentId, Vec<Match>>,
|
matches: FnvHashMap<DocumentId, Vec<Match>>,
|
||||||
},
|
},
|
||||||
@ -136,7 +138,8 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> {
|
|||||||
for iv in indexed_values {
|
for iv in indexed_values {
|
||||||
|
|
||||||
let automaton = &automatons[iv.index];
|
let automaton = &automatons[iv.index];
|
||||||
let distance = automaton.dfa.distance(iv.state).to_u8();
|
let distance = automaton.eval(string).to_u8();
|
||||||
|
let same_length = string.len() == automaton.query_len();
|
||||||
|
|
||||||
for di in iv.values {
|
for di in iv.values {
|
||||||
let match_ = Match {
|
let match_ = Match {
|
||||||
@ -144,11 +147,11 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> {
|
|||||||
distance: distance,
|
distance: distance,
|
||||||
attribute: di.attribute,
|
attribute: di.attribute,
|
||||||
attribute_index: di.attribute_index,
|
attribute_index: di.attribute_index,
|
||||||
is_exact: distance == 0 && string.len() == automaton.query_len,
|
is_exact: distance == 0 && same_length,
|
||||||
};
|
};
|
||||||
matches.entry(di.document)
|
matches.entry(di.document)
|
||||||
.or_insert_with(Vec::new)
|
.or_insert_with(Vec::new)
|
||||||
.push(match_);
|
.push(match_);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use Match;
|
|
||||||
use rank::{match_query_index, Document};
|
|
||||||
use group_by::GroupBy;
|
use group_by::GroupBy;
|
||||||
|
use crate::Match;
|
||||||
|
use crate::rank::{match_query_index, Document};
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn number_of_query_words(matches: &[Match]) -> usize {
|
fn number_of_query_words(matches: &[Match]) -> usize {
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use Match;
|
|
||||||
use rank::{match_query_index, Document};
|
|
||||||
use group_by::GroupBy;
|
use group_by::GroupBy;
|
||||||
|
use crate::Match;
|
||||||
|
use crate::rank::{match_query_index, Document};
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn sum_matches_typos(matches: &[Match]) -> u8 {
|
fn sum_matches_typos(matches: &[Match]) -> u8 {
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use Match;
|
|
||||||
use rank::{match_query_index, Document};
|
|
||||||
use group_by::GroupBy;
|
use group_by::GroupBy;
|
||||||
|
use crate::Match;
|
||||||
|
use crate::rank::{match_query_index, Document};
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn sum_matches_attributes(matches: &[Match]) -> u8 {
|
fn sum_matches_attributes(matches: &[Match]) -> u8 {
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use Match;
|
|
||||||
use rank::{match_query_index, Document};
|
|
||||||
use group_by::GroupBy;
|
use group_by::GroupBy;
|
||||||
|
use crate::Match;
|
||||||
|
use crate::rank::{match_query_index, Document};
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn sum_matches_attribute_index(matches: &[Match]) -> u32 {
|
fn sum_matches_attribute_index(matches: &[Match]) -> u32 {
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
use std::cmp::{self, Ordering};
|
use std::cmp::{self, Ordering};
|
||||||
use Match;
|
|
||||||
use rank::{match_query_index, Document};
|
|
||||||
use group_by::GroupBy;
|
use group_by::GroupBy;
|
||||||
|
use crate::Match;
|
||||||
|
use crate::rank::{match_query_index, Document};
|
||||||
|
|
||||||
const MAX_DISTANCE: u32 = 8;
|
const MAX_DISTANCE: u32 = 8;
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user