Use OnceCell to cache levenshtein builders

This commit is contained in:
Kerollmops 2020-05-31 18:21:24 +02:00
parent dde3e01a59
commit dff68a339a
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
4 changed files with 18 additions and 14 deletions

1
Cargo.lock generated
View File

@@ -793,6 +793,7 @@ dependencies = [
 "jemallocator",
 "levenshtein_automata",
 "memmap",
+"once_cell",
 "oxidized-mtbl",
 "rayon",
 "roaring",

View File

@@ -17,6 +17,7 @@ heed = { version = "0.8.0", default-features = false, features = ["lmdb"] }
 jemallocator = "0.3.2"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 memmap = "0.7.0"
+once_cell = "1.4.0"
 oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "8918476" }
 rayon = "1.3.0"
 roaring = "0.5.2"

View File

@@ -190,13 +190,15 @@ fn index_csv(mut rdr: csv::Reader<File>) -> anyhow::Result<MtblKvStore> {
 .or_insert_with(RoaringBitmap::new)
 .insert(document_id);
 if let Some(prefix) = word.as_bytes().get(0..word.len().min(4)) {
-prefix_postings_ids.entry(SmallVec32::from(prefix))
+for i in 0..prefix.len() {
+prefix_postings_ids.entry(SmallVec32::from(&prefix[..i]))
 .or_insert_with(RoaringBitmap::new)
 .insert(document_id);
+}
 }
 }
 }
 }
 // We write the document in the database.
 let mut writer = csv::WriterBuilder::new().has_headers(false).from_writer(Vec::new());

View File

@@ -7,10 +7,15 @@ use fst::{IntoStreamer, Streamer};
 use fxhash::FxHasher32;
 use heed::types::*;
 use heed::{PolyDatabase, Database};
-use levenshtein_automata::LevenshteinAutomatonBuilder;
+use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
+use once_cell::sync::OnceCell;
 use roaring::RoaringBitmap;
 use slice_group_by::StrGroupBy;
+static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
+static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
+static LEVDIST2: OnceCell<LevBuilder> = OnceCell::new();
 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
 pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
 pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>;
@@ -37,12 +42,7 @@ impl Index {
 let prefix_postings_ids = env.create_database(Some("prefix-postings-ids"))?;
 let documents = env.create_database(Some("documents"))?;
-Ok(Index {
-main,
-postings_ids,
-prefix_postings_ids,
-documents,
-})
+Ok(Index { main, postings_ids, prefix_postings_ids, documents })
 }
 pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result<Option<&'t [u8]>> {
@@ -56,9 +56,9 @@ impl Index {
 };
 // Building these factories is not free.
-let lev0 = LevenshteinAutomatonBuilder::new(0, true);
-let lev1 = LevenshteinAutomatonBuilder::new(1, true);
-let lev2 = LevenshteinAutomatonBuilder::new(2, true);
+let lev0 = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
+let lev1 = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
+let lev2 = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
 let words: Vec<_> = alphanumeric_tokens(query).collect();
 let number_of_words = words.len();
@@ -91,8 +91,8 @@ impl Index {
union_result.union_with(&right);
}
}
eprintln!("union for {:?} took {:.02?}", word, before.elapsed());
}
eprintln!("union for {:?} took {:.02?}", word, before.elapsed());
intersect_result = match intersect_result.take() {
Some(mut left) => {