From 2ae3f4097130646880c0d58733dce11a76deceee Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 1 Jul 2020 17:49:46 +0200 Subject: [PATCH] Make the indexer ignore certain words This is preparation for making the indexing fully parallel: each thread's indexer will only be aware of certain words, which avoids postings list conflicts on any given word. --- Cargo.lock | 10 ++++++---- Cargo.toml | 1 + src/bin/indexer.rs | 16 +++++++++++++--- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e3c89585f..519a2b791 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -817,6 +817,7 @@ dependencies = [ "linked-hash-map", "memmap", "once_cell", + "rayon", "roaring", "serde", "slice-group-by", @@ -1342,10 +1343,11 @@ dependencies = [ [[package]] name = "rayon" -version = "1.3.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db6ce3297f9c85e16621bb8cca38a06779ffc31bb8184e1be4bed2be4678a098" +checksum = "62f02856753d04e03e26929f820d0a0a337ebe71f849801eea335d464b349080" dependencies = [ + "autocfg 1.0.0", "crossbeam-deque", "either", "rayon-core", @@ -1353,9 +1355,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.7.0" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08a89b46efaf957e52b18062fb2f4660f8b8a4dde1807ca002690868ef2c85a9" +checksum = "e92e15d89083484e11353891f1af602cc661426deb9564c298b270c726973280" dependencies = [ "crossbeam-deque", "crossbeam-queue", diff --git a/Cargo.toml b/Cargo.toml index 714b1b158..db45da11e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.3" memmap = "0.7.0" once_cell = "1.4.0" +rayon = "1.3.1" roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" } slice-group-by = "0.2.6" smallstr = "0.2.0" diff --git a/src/bin/indexer.rs 
b/src/bin/indexer.rs index 5a2a2ccfb..af45d27d5 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -77,7 +77,14 @@ where a } -fn index_csv(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader, index: &Index) -> anyhow::Result<()> { +fn index_csv( + wtxn: &mut heed::RwTxn, + mut rdr: csv::Reader, + index: &Index, + num_threads: usize, + thread_index: usize, +) -> anyhow::Result<()> +{ eprintln!("Indexing into LMDB..."); let mut words_cache = ArcCache::<_, (RoaringBitmap, FastMap4<_, RoaringBitmap>)>::new(100_000); @@ -100,6 +107,9 @@ fn index_csv(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader, index let word = word.to_lowercase(); // TODO cow_to_lowercase let position = (attr * 1000 + pos) as u32; + // If this indexing process is not concerned by this word, then ignore it. + if fxhash::hash32(&word) as usize % num_threads != thread_index { continue; } + match words_cache.get_mut(&word) { (Some(entry), evicted) => { let (ids, positions) = entry; @@ -214,11 +224,11 @@ fn main() -> anyhow::Result<()> { match opt.csv_file { Some(path) => { let rdr = csv::Reader::from_path(path)?; - index_csv(&mut wtxn, rdr, &index)?; + index_csv(&mut wtxn, rdr, &index, 1, 0)?; }, None => { let rdr = csv::Reader::from_reader(io::stdin()); - index_csv(&mut wtxn, rdr, &index)?; + index_csv(&mut wtxn, rdr, &index, 1, 0)?; } };