mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
feat: Simplify the levenshtein construction
This commit is contained in:
parent
f0f5fc9891
commit
31e04f0120
34
Cargo.lock
generated
34
Cargo.lock
generated
@ -72,8 +72,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "fst"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/Kerollmops/fst.git?branch=always-match-clone#56eb2221d1534883d4e10887d945a982b780fccd"
|
||||
version = "0.3.2"
|
||||
source = "git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref#ca3a1ebb60a6f9123f1284de380c7a5fc05d16bb"
|
||||
dependencies = [
|
||||
"byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
@ -113,12 +113,20 @@ name = "itoa"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"version_check 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "levenshtein_automata"
|
||||
version = "0.1.1"
|
||||
source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#ed1244d1731b0f81e880f0c9daa860970d7752c3"
|
||||
source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=new-custom-fst#01400dfc181425a482cb6cad66f2a61b78b59e14"
|
||||
dependencies = [
|
||||
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)",
|
||||
"fst 0.3.2 (git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -225,9 +233,10 @@ version = "0.1.0"
|
||||
dependencies = [
|
||||
"byteorder 1.2.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)",
|
||||
"fst 0.3.2 (git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref)",
|
||||
"group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)",
|
||||
"levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)",
|
||||
"lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=new-custom-fst)",
|
||||
"rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)",
|
||||
]
|
||||
|
||||
@ -249,7 +258,7 @@ name = "raptor-search"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)",
|
||||
"fst 0.3.2 (git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref)",
|
||||
"raptor 0.1.0",
|
||||
"rocksdb 0.3.0 (git+https://github.com/pingcap/rust-rocksdb.git)",
|
||||
]
|
||||
@ -329,6 +338,11 @@ name = "vcpkg"
|
||||
version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.5"
|
||||
@ -371,14 +385,15 @@ dependencies = [
|
||||
"checksum crc 1.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb"
|
||||
"checksum elapsed 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6f4e5af126dafd0741c2ad62d47f68b28602550102e5f0dd45c8a97fc8b49c29"
|
||||
"checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
|
||||
"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=always-match-clone)" = "<none>"
|
||||
"checksum fst 0.3.2 (git+https://github.com/Kerollmops/fst.git?branch=automaton-for-deref)" = "<none>"
|
||||
"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
|
||||
"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
|
||||
"checksum gcc 0.3.54 (registry+https://github.com/rust-lang/crates.io-index)" = "5e33ec290da0d127825013597dbdfc28bee4964690c7ce1166cbc2a7bd08b1bb"
|
||||
"checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb"
|
||||
"checksum group-by 0.1.0 (git+https://github.com/Kerollmops/group-by.git)" = "<none>"
|
||||
"checksum itoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5adb58558dcd1d786b5f0bd15f3226ee23486e24b7b58304b60f64dc68e62606"
|
||||
"checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>"
|
||||
"checksum lazy_static 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ca488b89a5657b0a2ecd45b95609b3e848cf1755da332a0da46e2b2b1cb371a7"
|
||||
"checksum levenshtein_automata 0.1.1 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=new-custom-fst)" = "<none>"
|
||||
"checksum libc 0.2.43 (registry+https://github.com/rust-lang/crates.io-index)" = "76e3a3ef172f1a0b9a9ff0dd1491ae5e6c948b94479a3021819ba7d860c8645d"
|
||||
"checksum librocksdb_sys 0.1.0 (git+https://github.com/pingcap/rust-rocksdb.git)" = "<none>"
|
||||
"checksum libz-sys 1.0.18 (git+https://github.com/busyjay/libz-sys.git?branch=static-link)" = "<none>"
|
||||
@ -400,6 +415,7 @@ dependencies = [
|
||||
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
|
||||
"checksum unidecode 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "402bb19d8e03f1d1a7450e2bd613980869438e0666331be3e073089124aa1adc"
|
||||
"checksum vcpkg 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "def296d3eb3b12371b2c7d0e83bfe1403e4db2d7a0bba324a12b21c4ee13143d"
|
||||
"checksum version_check 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7716c242968ee87e5542f8021178248f267f295a5c4803beae8b8b7fd9bc6051"
|
||||
"checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd"
|
||||
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
@ -1,4 +1,7 @@
|
||||
cargo-features = ["edition"]
|
||||
|
||||
[package]
|
||||
edition = "2018"
|
||||
name = "raptor-indexer"
|
||||
version = "0.1.0"
|
||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||
|
@ -1,12 +1,7 @@
|
||||
// TODO make the raptor binary expose multiple subcommand
|
||||
// make only one binary
|
||||
|
||||
extern crate raptor;
|
||||
extern crate rocksdb;
|
||||
extern crate serde_json;
|
||||
#[macro_use] extern crate serde_derive;
|
||||
extern crate unidecode;
|
||||
extern crate moby_name_gen;
|
||||
|
||||
use std::path::Path;
|
||||
use std::collections::{HashSet, BTreeMap};
|
||||
@ -129,7 +124,7 @@ fn main() {
|
||||
for (key, value) in fields {
|
||||
sst_file_writer.put(key.as_bytes(), value.as_bytes()).unwrap();
|
||||
}
|
||||
let sst_file_info = sst_file_writer.finish().unwrap();
|
||||
let _sst_file_info = sst_file_writer.finish().unwrap();
|
||||
|
||||
builder.finish().unwrap();
|
||||
|
||||
|
@ -1,4 +1,7 @@
|
||||
cargo-features = ["edition"]
|
||||
|
||||
[package]
|
||||
edition = "2018"
|
||||
name = "raptor-search"
|
||||
version = "0.1.0"
|
||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||
@ -9,7 +12,7 @@ elapsed = "0.1"
|
||||
|
||||
[dependencies.fst]
|
||||
git = "https://github.com/Kerollmops/fst.git"
|
||||
branch = "always-match-clone"
|
||||
branch = "automaton-for-deref"
|
||||
|
||||
[dependencies.rocksdb]
|
||||
git = "https://github.com/pingcap/rust-rocksdb.git"
|
||||
|
@ -1,20 +1,15 @@
|
||||
extern crate rocksdb;
|
||||
extern crate fst;
|
||||
extern crate raptor;
|
||||
extern crate elapsed;
|
||||
|
||||
use std::env;
|
||||
use std::str::from_utf8_unchecked;
|
||||
use std::io::{self, Write};
|
||||
use elapsed::measure_time;
|
||||
use fst::Streamer;
|
||||
use rocksdb::{DB, DBOptions, IngestExternalFileOptions};
|
||||
use raptor::{Metadata, RankedStream, LevBuilder};
|
||||
use raptor::{automaton, Metadata, RankedStream};
|
||||
|
||||
fn search(metadata: &Metadata, database: &DB, lev_builder: &LevBuilder, query: &str) {
|
||||
fn search(metadata: &Metadata, database: &DB, query: &str) {
|
||||
let mut automatons = Vec::new();
|
||||
for query in query.split_whitespace() {
|
||||
let lev = lev_builder.get_automaton(query);
|
||||
let lev = automaton::build(query);
|
||||
automatons.push(lev);
|
||||
}
|
||||
|
||||
@ -55,9 +50,6 @@ fn main() {
|
||||
});
|
||||
println!("{} to load the SST file in RocksDB and reopen it for read-only", elapsed);
|
||||
|
||||
let (elapsed, lev_builder) = measure_time(|| LevBuilder::new());
|
||||
println!("{} to load the levenshtein automaton", elapsed);
|
||||
|
||||
loop {
|
||||
print!("Searching for: ");
|
||||
io::stdout().flush().unwrap();
|
||||
@ -68,7 +60,7 @@ fn main() {
|
||||
|
||||
if query.is_empty() { break }
|
||||
|
||||
let (elapsed, _) = measure_time(|| search(&meta, &db, &lev_builder, &query));
|
||||
let (elapsed, _) = measure_time(|| search(&meta, &db, &query));
|
||||
println!("Finished in {}", elapsed);
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,7 @@
|
||||
cargo-features = ["edition"]
|
||||
|
||||
[package]
|
||||
edition = "2018"
|
||||
name = "raptor"
|
||||
version = "0.1.0"
|
||||
authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||
@ -6,14 +9,15 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
|
||||
[dependencies]
|
||||
byteorder = "1.2"
|
||||
fnv = "1.0"
|
||||
lazy_static = "1.1"
|
||||
|
||||
[dependencies.fst]
|
||||
git = "https://github.com/Kerollmops/fst.git"
|
||||
branch = "always-match-clone"
|
||||
branch = "automaton-for-deref"
|
||||
|
||||
[dependencies.levenshtein_automata]
|
||||
git = "https://github.com/Kerollmops/levenshtein-automata.git"
|
||||
branch = "custom-fst"
|
||||
branch = "new-custom-fst"
|
||||
features = ["fst_automaton"]
|
||||
|
||||
[dependencies.rocksdb]
|
||||
|
50
raptor/src/automaton.rs
Normal file
50
raptor/src/automaton.rs
Normal file
@ -0,0 +1,50 @@
|
||||
use std::ops::Deref;
|
||||
use fst::Automaton;
|
||||
use levenshtein_automata::{
|
||||
LevenshteinAutomatonBuilder as LevBuilder,
|
||||
DFA, Distance,
|
||||
};
|
||||
|
||||
lazy_static! {
|
||||
static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
|
||||
static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
|
||||
static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
|
||||
}
|
||||
|
||||
pub struct DfaExt {
|
||||
query_len: usize,
|
||||
automaton: DFA,
|
||||
}
|
||||
|
||||
impl Deref for DfaExt {
|
||||
type Target = DFA;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.automaton
|
||||
}
|
||||
}
|
||||
|
||||
pub fn build(query: &str) -> DfaExt {
|
||||
let dfa = match query.len() {
|
||||
0 ..= 4 => LEVDIST0.build_prefix_dfa(query),
|
||||
5 ..= 8 => LEVDIST1.build_prefix_dfa(query),
|
||||
_ => LEVDIST2.build_prefix_dfa(query),
|
||||
};
|
||||
|
||||
DfaExt { query_len: query.len(), automaton: dfa }
|
||||
}
|
||||
|
||||
pub trait AutomatonExt: Automaton {
|
||||
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
|
||||
fn query_len(&self) -> usize;
|
||||
}
|
||||
|
||||
impl AutomatonExt for DfaExt {
|
||||
fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance {
|
||||
self.automaton.eval(s)
|
||||
}
|
||||
|
||||
fn query_len(&self) -> usize {
|
||||
self.query_len
|
||||
}
|
||||
}
|
@ -1,37 +0,0 @@
|
||||
use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
|
||||
|
||||
pub struct LevBuilder {
|
||||
automatons: [LevenshteinAutomatonBuilder; 3],
|
||||
}
|
||||
|
||||
impl LevBuilder {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
automatons: [
|
||||
LevenshteinAutomatonBuilder::new(0, false),
|
||||
LevenshteinAutomatonBuilder::new(1, false),
|
||||
LevenshteinAutomatonBuilder::new(2, false),
|
||||
],
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_automaton(&self, query: &str) -> Levenshtein {
|
||||
assert!(!query.is_empty());
|
||||
|
||||
let dfa = if query.len() <= 4 {
|
||||
self.automatons[0].build_prefix_dfa(query)
|
||||
} else if query.len() <= 8 {
|
||||
self.automatons[1].build_prefix_dfa(query)
|
||||
} else {
|
||||
self.automatons[2].build_prefix_dfa(query)
|
||||
};
|
||||
|
||||
Levenshtein { dfa, query_len: query.len() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Levenshtein {
|
||||
pub dfa: DFA,
|
||||
pub query_len: usize,
|
||||
}
|
@ -1,24 +1,16 @@
|
||||
#![feature(nll)]
|
||||
|
||||
extern crate fst;
|
||||
extern crate fnv;
|
||||
extern crate group_by;
|
||||
extern crate levenshtein_automata;
|
||||
extern crate byteorder;
|
||||
extern crate rocksdb;
|
||||
#[macro_use] extern crate lazy_static;
|
||||
|
||||
pub mod rank;
|
||||
pub mod metadata;
|
||||
pub mod levenshtein;
|
||||
pub mod automaton;
|
||||
|
||||
pub use self::metadata::{
|
||||
Metadata, MetadataBuilder,
|
||||
StreamWithState, StreamWithStateBuilder,
|
||||
UnionWithState, OpWithStateBuilder,
|
||||
IndexedValuesWithState,
|
||||
Stream, StreamBuilder,
|
||||
Union, OpBuilder,
|
||||
IndexedValues,
|
||||
};
|
||||
pub use self::rank::{RankedStream};
|
||||
pub use self::levenshtein::LevBuilder;
|
||||
pub use self::rank::RankedStream;
|
||||
|
||||
pub type DocumentId = u64;
|
||||
|
||||
|
@ -9,7 +9,7 @@ use std::mem;
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use fst::{self, Map, MapBuilder, Automaton};
|
||||
use fst::raw::MmapReadOnly;
|
||||
use DocIndex;
|
||||
use crate::DocIndex;
|
||||
|
||||
#[repr(C)]
|
||||
struct Range {
|
||||
@ -256,23 +256,23 @@ unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
|
||||
from_raw_parts(ptr, len)
|
||||
}
|
||||
|
||||
pub struct OpWithStateBuilder<'m, 'v, U> {
|
||||
inner: fst::map::OpWithStateBuilder<'m, U>,
|
||||
pub struct OpBuilder<'m, 'v> {
|
||||
inner: fst::map::OpBuilder<'m>,
|
||||
indexes: &'v DocIndexes,
|
||||
}
|
||||
|
||||
impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
|
||||
impl<'m, 'v> OpBuilder<'m, 'v> {
|
||||
pub fn new(indexes: &'v DocIndexes) -> Self {
|
||||
Self {
|
||||
inner: fst::map::OpWithStateBuilder::new(),
|
||||
inner: fst::map::OpBuilder::new(),
|
||||
indexes: indexes,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add<I, S>(mut self, streamable: I) -> Self
|
||||
where
|
||||
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
|
||||
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
|
||||
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
|
||||
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
|
||||
{
|
||||
self.push(streamable);
|
||||
self
|
||||
@ -280,14 +280,14 @@ impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
|
||||
|
||||
pub fn push<I, S>(&mut self, streamable: I)
|
||||
where
|
||||
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
|
||||
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
|
||||
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
|
||||
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
|
||||
{
|
||||
self.inner.push(streamable);
|
||||
}
|
||||
|
||||
pub fn union(self) -> UnionWithState<'m, 'v, U> {
|
||||
UnionWithState {
|
||||
pub fn union(self) -> Union<'m, 'v> {
|
||||
Union {
|
||||
inner: self.inner.union(),
|
||||
outs: Vec::new(),
|
||||
indexes: self.indexes,
|
||||
@ -296,23 +296,19 @@ impl<'m, 'v, U: 'static> OpWithStateBuilder<'m, 'v, U> {
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
|
||||
pub struct IndexedValuesWithState<'a, U> {
|
||||
pub struct IndexedValues<'a> {
|
||||
pub index: usize,
|
||||
pub values: &'a [DocIndex],
|
||||
pub state: U,
|
||||
}
|
||||
|
||||
pub struct UnionWithState<'m, 'v, U> {
|
||||
inner: fst::map::UnionWithState<'m, U>,
|
||||
outs: Vec<IndexedValuesWithState<'v, U>>,
|
||||
pub struct Union<'m, 'v> {
|
||||
inner: fst::map::Union<'m>,
|
||||
outs: Vec<IndexedValues<'v>>,
|
||||
indexes: &'v DocIndexes,
|
||||
}
|
||||
|
||||
impl<'a, 'm, 'v, U: 'a> fst::Streamer<'a> for UnionWithState<'m, 'v, U>
|
||||
where
|
||||
U: Clone,
|
||||
{
|
||||
type Item = (&'a [u8], &'a [IndexedValuesWithState<'a, U>]);
|
||||
impl<'a, 'm, 'v> fst::Streamer<'a> for Union<'m, 'v> {
|
||||
type Item = (&'a [u8], &'a [IndexedValues<'a>]);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
match self.inner.next() {
|
||||
@ -322,8 +318,7 @@ where
|
||||
for ivalue in ivalues {
|
||||
if let Some(values) = self.indexes.get(ivalue.value) {
|
||||
let index = ivalue.index;
|
||||
let state = ivalue.state.clone();
|
||||
self.outs.push(IndexedValuesWithState { index, values, state })
|
||||
self.outs.push(IndexedValues { index, values })
|
||||
}
|
||||
}
|
||||
Some((s, &self.outs))
|
||||
@ -333,44 +328,43 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StreamWithStateBuilder<'m, 'v, A> {
|
||||
inner: fst::map::StreamWithStateBuilder<'m, A>,
|
||||
pub struct StreamBuilder<'m, 'v, A> {
|
||||
inner: fst::map::StreamBuilder<'m, A>,
|
||||
indexes: &'v DocIndexes,
|
||||
}
|
||||
|
||||
impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamWithStateBuilder<'m, 'v, A>
|
||||
impl<'m, 'v, 'a, A: 'a> fst::IntoStreamer<'a> for StreamBuilder<'m, 'v, A>
|
||||
where
|
||||
A: Automaton,
|
||||
A::State: Clone,
|
||||
{
|
||||
type Item = <Self::Into as fst::Streamer<'a>>::Item;
|
||||
type Into = StreamWithState<'m, 'v, A>;
|
||||
type Into = Stream<'m, 'v, A>;
|
||||
|
||||
fn into_stream(self) -> Self::Into {
|
||||
StreamWithState {
|
||||
Stream {
|
||||
inner: self.inner.into_stream(),
|
||||
indexes: self.indexes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StreamWithState<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> {
|
||||
inner: fst::map::StreamWithState<'m, A>,
|
||||
pub struct Stream<'m, 'v, A: Automaton = fst::automaton::AlwaysMatch> {
|
||||
inner: fst::map::Stream<'m, A>,
|
||||
indexes: &'v DocIndexes,
|
||||
}
|
||||
|
||||
impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for StreamWithState<'m, 'v, A>
|
||||
impl<'m, 'v, 'a, A: 'a> fst::Streamer<'a> for Stream<'m, 'v, A>
|
||||
where
|
||||
A: Automaton,
|
||||
A::State: Clone,
|
||||
{
|
||||
type Item = (&'a [u8], &'a [DocIndex], A::State);
|
||||
type Item = (&'a [u8], &'a [DocIndex]);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
match self.inner.next() {
|
||||
Some((key, i, state)) => {
|
||||
Some((key, i)) => {
|
||||
match self.indexes.get(i) {
|
||||
Some(values) => Some((key, values, state)),
|
||||
Some(values) => Some((key, values)),
|
||||
None => None,
|
||||
}
|
||||
},
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use Match;
|
||||
use rank::{match_query_index, Document};
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn contains_exact(matches: &[Match]) -> bool {
|
||||
|
@ -6,13 +6,14 @@ mod sum_of_words_position;
|
||||
mod exact;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use std::rc::Rc;
|
||||
use std::{mem, vec};
|
||||
use fst;
|
||||
use fnv::FnvHashMap;
|
||||
use levenshtein::Levenshtein;
|
||||
use metadata::{DocIndexes, OpWithStateBuilder, UnionWithState};
|
||||
use {Match, DocumentId};
|
||||
use group_by::GroupByMut;
|
||||
use crate::automaton::{DfaExt, AutomatonExt};
|
||||
use crate::metadata::{DocIndexes, OpBuilder, Union};
|
||||
use crate::{Match, DocumentId};
|
||||
|
||||
use self::{
|
||||
sum_of_typos::sum_of_typos,
|
||||
@ -85,11 +86,12 @@ fn matches_into_iter(matches: FnvHashMap<DocumentId, Vec<Match>>, limit: usize)
|
||||
pub struct RankedStream<'m, 'v>(RankedStreamInner<'m, 'v>);
|
||||
|
||||
impl<'m, 'v> RankedStream<'m, 'v> {
|
||||
pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec<Levenshtein>, limit: usize) -> Self {
|
||||
let mut op = OpWithStateBuilder::new(indexes);
|
||||
pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec<DfaExt>, limit: usize) -> Self {
|
||||
let mut op = OpBuilder::new(indexes);
|
||||
|
||||
for automaton in automatons.iter().map(|l| l.dfa.clone()) {
|
||||
let stream = map.search(automaton).with_state();
|
||||
let automatons: Vec<_> = automatons.into_iter().map(Rc::new).collect();
|
||||
for automaton in automatons.iter().cloned() {
|
||||
let stream = map.search(automaton);
|
||||
op.push(stream);
|
||||
}
|
||||
|
||||
@ -114,8 +116,8 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> {
|
||||
|
||||
enum RankedStreamInner<'m, 'v> {
|
||||
Fed {
|
||||
inner: UnionWithState<'m, 'v, u32>,
|
||||
automatons: Vec<Levenshtein>,
|
||||
inner: Union<'m, 'v>,
|
||||
automatons: Vec<Rc<DfaExt>>,
|
||||
limit: usize,
|
||||
matches: FnvHashMap<DocumentId, Vec<Match>>,
|
||||
},
|
||||
@ -136,7 +138,8 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> {
|
||||
for iv in indexed_values {
|
||||
|
||||
let automaton = &automatons[iv.index];
|
||||
let distance = automaton.dfa.distance(iv.state).to_u8();
|
||||
let distance = automaton.eval(string).to_u8();
|
||||
let same_length = string.len() == automaton.query_len();
|
||||
|
||||
for di in iv.values {
|
||||
let match_ = Match {
|
||||
@ -144,11 +147,11 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> {
|
||||
distance: distance,
|
||||
attribute: di.attribute,
|
||||
attribute_index: di.attribute_index,
|
||||
is_exact: distance == 0 && string.len() == automaton.query_len,
|
||||
is_exact: distance == 0 && same_length,
|
||||
};
|
||||
matches.entry(di.document)
|
||||
.or_insert_with(Vec::new)
|
||||
.push(match_);
|
||||
.or_insert_with(Vec::new)
|
||||
.push(match_);
|
||||
}
|
||||
}
|
||||
},
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use Match;
|
||||
use rank::{match_query_index, Document};
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn number_of_query_words(matches: &[Match]) -> usize {
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use Match;
|
||||
use rank::{match_query_index, Document};
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_typos(matches: &[Match]) -> u8 {
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use Match;
|
||||
use rank::{match_query_index, Document};
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attributes(matches: &[Match]) -> u8 {
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use Match;
|
||||
use rank::{match_query_index, Document};
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
#[inline]
|
||||
fn sum_matches_attribute_index(matches: &[Match]) -> u32 {
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::cmp::{self, Ordering};
|
||||
use Match;
|
||||
use rank::{match_query_index, Document};
|
||||
use group_by::GroupBy;
|
||||
use crate::Match;
|
||||
use crate::rank::{match_query_index, Document};
|
||||
|
||||
const MAX_DISTANCE: u32 = 8;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user