Make the indexer ignore certain words

This is preparation for making the indexing fully parallel: each thread's
indexer is only aware of certain words, which avoids postings-list
conflicts on any given word.
This commit is contained in:
Kerollmops 2020-07-01 17:49:46 +02:00
parent a3ac2623d5
commit 2ae3f40971
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
3 changed files with 20 additions and 7 deletions

10
Cargo.lock generated
View File

@ -817,6 +817,7 @@ dependencies = [
"linked-hash-map", "linked-hash-map",
"memmap", "memmap",
"once_cell", "once_cell",
"rayon",
"roaring", "roaring",
"serde", "serde",
"slice-group-by", "slice-group-by",
@ -1342,10 +1343,11 @@ dependencies = [
[[package]] [[package]]
name = "rayon" name = "rayon"
version = "1.3.0" version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db6ce3297f9c85e16621bb8cca38a06779ffc31bb8184e1be4bed2be4678a098" checksum = "62f02856753d04e03e26929f820d0a0a337ebe71f849801eea335d464b349080"
dependencies = [ dependencies = [
"autocfg 1.0.0",
"crossbeam-deque", "crossbeam-deque",
"either", "either",
"rayon-core", "rayon-core",
@ -1353,9 +1355,9 @@ dependencies = [
[[package]] [[package]]
name = "rayon-core" name = "rayon-core"
version = "1.7.0" version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08a89b46efaf957e52b18062fb2f4660f8b8a4dde1807ca002690868ef2c85a9" checksum = "e92e15d89083484e11353891f1af602cc661426deb9564c298b270c726973280"
dependencies = [ dependencies = [
"crossbeam-deque", "crossbeam-deque",
"crossbeam-queue", "crossbeam-queue",

View File

@ -19,6 +19,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
linked-hash-map = "0.5.3" linked-hash-map = "0.5.3"
memmap = "0.7.0" memmap = "0.7.0"
once_cell = "1.4.0" once_cell = "1.4.0"
rayon = "1.3.1"
roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" } roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "deserialize-from-slice" }
slice-group-by = "0.2.6" slice-group-by = "0.2.6"
smallstr = "0.2.0" smallstr = "0.2.0"

View File

@ -77,7 +77,14 @@ where
a a
} }
fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index: &Index) -> anyhow::Result<()> { fn index_csv<R: io::Read>(
wtxn: &mut heed::RwTxn,
mut rdr: csv::Reader<R>,
index: &Index,
num_threads: usize,
thread_index: usize,
) -> anyhow::Result<()>
{
eprintln!("Indexing into LMDB..."); eprintln!("Indexing into LMDB...");
let mut words_cache = ArcCache::<_, (RoaringBitmap, FastMap4<_, RoaringBitmap>)>::new(100_000); let mut words_cache = ArcCache::<_, (RoaringBitmap, FastMap4<_, RoaringBitmap>)>::new(100_000);
@ -100,6 +107,9 @@ fn index_csv<R: io::Read>(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader<R>, index
let word = word.to_lowercase(); // TODO cow_to_lowercase let word = word.to_lowercase(); // TODO cow_to_lowercase
let position = (attr * 1000 + pos) as u32; let position = (attr * 1000 + pos) as u32;
// If this indexing process is not concerned by this word, then ignore it.
if fxhash::hash32(&word) as usize % num_threads != thread_index { continue; }
match words_cache.get_mut(&word) { match words_cache.get_mut(&word) {
(Some(entry), evicted) => { (Some(entry), evicted) => {
let (ids, positions) = entry; let (ids, positions) = entry;
@ -214,11 +224,11 @@ fn main() -> anyhow::Result<()> {
match opt.csv_file { match opt.csv_file {
Some(path) => { Some(path) => {
let rdr = csv::Reader::from_path(path)?; let rdr = csv::Reader::from_path(path)?;
index_csv(&mut wtxn, rdr, &index)?; index_csv(&mut wtxn, rdr, &index, 1, 0)?;
}, },
None => { None => {
let rdr = csv::Reader::from_reader(io::stdin()); let rdr = csv::Reader::from_reader(io::stdin());
index_csv(&mut wtxn, rdr, &index)?; index_csv(&mut wtxn, rdr, &index, 1, 0)?;
} }
}; };