diff --git a/Cargo.lock b/Cargo.lock
index 6acef59fe..be5640e22 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -793,6 +793,7 @@ dependencies = [
  "jemallocator",
  "levenshtein_automata",
  "memmap",
+ "once_cell",
  "oxidized-mtbl",
  "rayon",
  "roaring",
diff --git a/Cargo.toml b/Cargo.toml
index 5087f59bc..f3b09dc56 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,6 +17,7 @@ heed = { version = "0.8.0", default-features = false, features = ["lmdb"] }
 jemallocator = "0.3.2"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 memmap = "0.7.0"
+once_cell = "1.4.0"
 oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "8918476" }
 rayon = "1.3.0"
 roaring = "0.5.2"
diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index 17689823c..c410e7e99 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -190,9 +190,12 @@ fn index_csv(mut rdr: csv::Reader) -> anyhow::Result {
                         .or_insert_with(RoaringBitmap::new)
                         .insert(document_id);
                     if let Some(prefix) = word.as_bytes().get(0..word.len().min(4)) {
-                        prefix_postings_ids.entry(SmallVec32::from(prefix))
-                            .or_insert_with(RoaringBitmap::new)
-                            .insert(document_id);
+                        // Inclusive upper bound: the full (at most 4-byte) prefix must be indexed too.
+                        for i in 0..=prefix.len() {
+                            prefix_postings_ids.entry(SmallVec32::from(&prefix[..i]))
+                                .or_insert_with(RoaringBitmap::new)
+                                .insert(document_id);
+                        }
                     }
                 }
             }
diff --git a/src/lib.rs b/src/lib.rs
index 248e360cf..f5de08980 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,10 +7,15 @@ use fst::{IntoStreamer, Streamer};
 use fxhash::FxHasher32;
 use heed::types::*;
 use heed::{PolyDatabase, Database};
-use levenshtein_automata::LevenshteinAutomatonBuilder;
+use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
+use once_cell::sync::OnceCell;
 use roaring::RoaringBitmap;
 use slice_group_by::StrGroupBy;
 
+static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
+static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
+static LEVDIST2: OnceCell<LevBuilder> = OnceCell::new();
+
 pub type FastMap4 = HashMap>;
 pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
 pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>;
@@ -37,12 +42,7 @@ impl Index {
         let prefix_postings_ids = env.create_database(Some("prefix-postings-ids"))?;
         let documents = env.create_database(Some("documents"))?;
 
-        Ok(Index {
-            main,
-            postings_ids,
-            prefix_postings_ids,
-            documents,
-        })
+        Ok(Index { main, postings_ids, prefix_postings_ids, documents })
     }
 
     pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result> {
@@ -56,9 +56,9 @@ impl Index {
         };
 
         // Building these factories is not free.
-        let lev0 = LevenshteinAutomatonBuilder::new(0, true);
-        let lev1 = LevenshteinAutomatonBuilder::new(1, true);
-        let lev2 = LevenshteinAutomatonBuilder::new(2, true);
+        let lev0 = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
+        let lev1 = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
+        let lev2 = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));
 
         let words: Vec<_> = alphanumeric_tokens(query).collect();
         let number_of_words = words.len();
@@ -91,8 +91,8 @@ impl Index {
                         union_result.union_with(&right);
                     }
                 }
-                eprintln!("union for {:?} took {:.02?}", word, before.elapsed());
             }
+            eprintln!("union for {:?} took {:.02?}", word, before.elapsed());
 
             intersect_result = match intersect_result.take() {
                 Some(mut left) => {