From a3ac2623d5b9ef59b2099761d6eed4df921dd196 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 1 Jul 2020 17:24:55 +0200 Subject: [PATCH] Introduce multiple functions to clean up the code --- src/bin/indexer.rs | 75 ++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 32 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index d6ef50567..5a2a2ccfb 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -1,6 +1,7 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, BTreeSet}; use std::convert::{TryFrom, TryInto}; +use std::hash::{Hash, BuildHasher}; use std::io; use std::path::PathBuf; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -43,6 +44,39 @@ struct Opt { csv_file: Option, } +fn put_evicted_into_heed(wtxn: &mut heed::RwTxn, index: &Index, iter: I) -> anyhow::Result<()> +where + I: IntoIterator))> +{ + for (word, (positions, positions_docids)) in iter { + index.word_positions.put(wtxn, &word, &positions)?; + + for (position, docids) in positions_docids { + let mut key = word.as_bytes().to_vec(); + key.extend_from_slice(&position.to_be_bytes()); + index.word_position_docids.put(wtxn, &key, &docids)?; + } + } + Ok(()) +} + +fn merge_hashmaps(mut a: HashMap, mut b: HashMap, mut merge: F) -> HashMap +where + K: Hash + Eq, + S: BuildHasher, + F: FnMut(&K, &mut V, V) +{ + for (k, v) in a.iter_mut() { + if let Some(vb) = b.remove(k) { + (merge)(k, v, vb) + } + } + + a.extend(b); + + a +} + fn index_csv(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader, index: &Index) -> anyhow::Result<()> { eprintln!("Indexing into LMDB..."); @@ -67,11 +101,13 @@ fn index_csv(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader, index let position = (attr * 1000 + pos) as u32; match words_cache.get_mut(&word) { - Some((ids, positions)) => { + (Some(entry), evicted) => { + let (ids, positions) = entry; ids.insert(position); positions.entry(position).or_default().insert(document_id); + put_evicted_into_heed(wtxn, index, evicted)?; }, - None => { + (None, _evicted) => { let mut key = word.as_bytes().to_vec(); key.extend_from_slice(&position.to_be_bytes()); @@ -85,28 +121,11 @@ fn index_csv(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader, index map.insert(position, words_position_docids); let value = (words_positions, map); - // TODO clean this up - let merge = |(pa, mut pda): (_, FastMap4<_, RoaringBitmap>), (pb, mut pdb): (_, FastMap4<_, RoaringBitmap>)| { - for (pos, dia) in &mut pda { - if let Some(dib) = pdb.remove(pos) { - dia.union_with(&dib); - } - } + let evicted = words_cache.insert(word.clone(), value, |(pa, pda), (pb, pdb)| { + (pa | pb, merge_hashmaps(pda, pdb, |_, a, b| RoaringBitmap::union_with(a, &b))) + }); - pdb.into_iter().for_each(|(pos, map)| drop(pda.insert(pos, map))); - - (pa | pb, pda) - }; - - for (word, (positions, positions_docids)) in words_cache.insert(word.clone(), value, merge) { - index.word_positions.put(wtxn, &word, &positions)?; - - for (position, docids) in positions_docids { - let mut key = word.as_bytes().to_vec(); - key.extend_from_slice(&position.to_be_bytes()); - index.word_position_docids.put(wtxn, &key, &docids)?; - } - } + put_evicted_into_heed(wtxn, index, evicted)?; } } } @@ -120,15 +139,7 @@ fn index_csv(wtxn: &mut heed::RwTxn, mut rdr: csv::Reader, index index.documents.put(wtxn, &BEU32::new(document_id), &document)?; } - for (word, (positions, positions_docids)) in &words_cache { - index.word_positions.put(wtxn, &word, &positions)?; - - for (position, docids) in positions_docids { - let mut key = word.as_bytes().to_vec(); - key.extend_from_slice(&position.to_be_bytes()); - index.word_position_docids.put(wtxn, &key, &docids)?; - } - } + put_evicted_into_heed(wtxn, index, words_cache)?; // We store the words from the postings. let mut new_words = BTreeSet::default();