Add a little bit more debug logging

This commit is contained in:
Kerollmops 2020-06-01 17:52:43 +02:00
parent dff68a339a
commit 5404776f7a
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 28 additions and 14 deletions

View File

@ -48,6 +48,8 @@ struct MtblKvStore(Option<File>);
impl MtblKvStore { impl MtblKvStore {
fn from_indexed(mut indexed: Indexed) -> anyhow::Result<MtblKvStore> { fn from_indexed(mut indexed: Indexed) -> anyhow::Result<MtblKvStore> {
eprintln!("{:?}: Creating an MTBL store from an Indexed...", rayon::current_thread_index());
let outfile = tempfile::tempfile()?; let outfile = tempfile::tempfile()?;
let mut out = Writer::new(outfile, None)?; let mut out = Writer::new(outfile, None)?;
@ -73,10 +75,10 @@ impl MtblKvStore {
// We must write the prefix postings ids // We must write the prefix postings ids
key[0] = 2; key[0] = 2;
let mut stream = indexed.fst.stream(); let mut stream = indexed.fst.stream();
while let Some(word) = stream.next() { while let Some(prefix) = stream.next() {
key.truncate(1); key.truncate(1);
key.extend_from_slice(word); key.extend_from_slice(prefix);
if let Some(ids) = indexed.prefix_postings_ids.remove(word) { if let Some(ids) = indexed.prefix_postings_ids.remove(prefix) {
buffer.clear(); buffer.clear();
ids.serialize_into(&mut buffer)?; ids.serialize_into(&mut buffer)?;
out.add(&key, &buffer).unwrap(); out.add(&key, &buffer).unwrap();
@ -93,10 +95,14 @@ impl MtblKvStore {
} }
let out = out.into_inner()?; let out = out.into_inner()?;
eprintln!("{:?}: MTBL store created!", rayon::current_thread_index());
Ok(MtblKvStore(Some(out))) Ok(MtblKvStore(Some(out)))
} }
fn merge_with(self, other: MtblKvStore) -> anyhow::Result<MtblKvStore> { fn merge_with(self, other: MtblKvStore) -> anyhow::Result<MtblKvStore> {
eprintln!("{:?}: Merging two MTBL stores...", rayon::current_thread_index());
let (left, right) = match (self.0, other.0) { let (left, right) = match (self.0, other.0) {
(Some(left), Some(right)) => (left, right), (Some(left), Some(right)) => (left, right),
(Some(left), None) => return Ok(MtblKvStore(Some(left))), (Some(left), None) => return Ok(MtblKvStore(Some(left))),
@ -159,11 +165,15 @@ impl MtblKvStore {
} }
let out = out.into_inner()?; let out = out.into_inner()?;
eprintln!("{:?}: MTBL stores merged!", rayon::current_thread_index());
Ok(MtblKvStore(Some(out))) Ok(MtblKvStore(Some(out)))
} }
} }
fn index_csv(mut rdr: csv::Reader<File>) -> anyhow::Result<MtblKvStore> { fn index_csv(mut rdr: csv::Reader<File>) -> anyhow::Result<MtblKvStore> {
eprintln!("{:?}: Indexing into an Indexed...", rayon::current_thread_index());
const MAX_POSITION: usize = 1000; const MAX_POSITION: usize = 1000;
const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION; const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
@ -189,8 +199,8 @@ fn index_csv(mut rdr: csv::Reader<File>) -> anyhow::Result<MtblKvStore> {
postings_ids.entry(SmallVec32::from(word.as_bytes())) postings_ids.entry(SmallVec32::from(word.as_bytes()))
.or_insert_with(RoaringBitmap::new) .or_insert_with(RoaringBitmap::new)
.insert(document_id); .insert(document_id);
if let Some(prefix) = word.as_bytes().get(0..word.len().min(4)) { if let Some(prefix) = word.as_bytes().get(0..word.len().min(5)) {
for i in 0..prefix.len() { for i in 0..=prefix.len() {
prefix_postings_ids.entry(SmallVec32::from(&prefix[..i])) prefix_postings_ids.entry(SmallVec32::from(&prefix[..i]))
.or_insert_with(RoaringBitmap::new) .or_insert_with(RoaringBitmap::new)
.insert(document_id); .insert(document_id);
@ -216,6 +226,7 @@ fn index_csv(mut rdr: csv::Reader<File>) -> anyhow::Result<MtblKvStore> {
let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallVec32::as_ref))?; let new_words_fst = fst::Set::from_iter(new_words.iter().map(SmallVec32::as_ref))?;
let indexed = Indexed { fst: new_words_fst, headers, postings_ids, prefix_postings_ids, documents }; let indexed = Indexed { fst: new_words_fst, headers, postings_ids, prefix_postings_ids, documents };
eprintln!("{:?}: Indexed created!", rayon::current_thread_index());
MtblKvStore::from_indexed(indexed) MtblKvStore::from_indexed(indexed)
} }
@ -274,19 +285,17 @@ fn main() -> anyhow::Result<()> {
.open(opt.database)?; .open(opt.database)?;
let index = Index::new(&env)?; let index = Index::new(&env)?;
let res = opt.files_to_index let mtbl_store = opt.files_to_index
.into_par_iter() .into_par_iter()
.try_fold(MtblKvStore::default, |acc, path| { .try_fold(MtblKvStore::default, |acc, path| {
let rdr = csv::Reader::from_path(path)?; let rdr = csv::Reader::from_path(path)?;
let mtbl_store = index_csv(rdr)?; let store = index_csv(rdr)?;
acc.merge_with(mtbl_store) acc.merge_with(store)
}) })
.inspect(|_| { .inspect(|_| {
eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed)) eprintln!("Total number of documents seen so far is {}", ID_GENERATOR.load(Ordering::Relaxed))
}) })
.try_reduce(MtblKvStore::default, MtblKvStore::merge_with); .try_reduce(MtblKvStore::default, MtblKvStore::merge_with)?;
let mtbl_store = res?;
eprintln!("We are writing into LMDB..."); eprintln!("We are writing into LMDB...");
let mut wtxn = env.write_txn()?; let mut wtxn = env.write_txn()?;

View File

@ -79,18 +79,21 @@ impl Index {
let mut union_result = RoaringBitmap::default(); let mut union_result = RoaringBitmap::default();
if word.len() <= 4 { if word.len() <= 4 {
if let Some(ids) = self.prefix_postings_ids.get(rtxn, &word[..word.len().min(4)])? { if let Some(ids) = self.prefix_postings_ids.get(rtxn, &word[..word.len().min(5)])? {
union_result = RoaringBitmap::deserialize_from(ids)?; union_result = RoaringBitmap::deserialize_from(ids)?;
} }
} else { } else {
let mut count = 0;
let mut stream = fst.search(dfa).into_stream(); let mut stream = fst.search(dfa).into_stream();
while let Some(word) = stream.next() { while let Some(word) = stream.next() {
count += 1;
let word = std::str::from_utf8(word)?; let word = std::str::from_utf8(word)?;
if let Some(ids) = self.postings_ids.get(rtxn, word)? { if let Some(ids) = self.postings_ids.get(rtxn, word)? {
let right = RoaringBitmap::deserialize_from(ids)?; let right = RoaringBitmap::deserialize_from(ids)?;
union_result.union_with(&right); union_result.union_with(&right);
} }
} }
eprint!("with {:?} words ", count);
} }
eprintln!("union for {:?} took {:.02?}", word, before.elapsed()); eprintln!("union for {:?} took {:.02?}", word, before.elapsed());
@ -99,14 +102,16 @@ impl Index {
let before = Instant::now(); let before = Instant::now();
let left_len = left.len(); let left_len = left.len();
left.intersect_with(&union_result); left.intersect_with(&union_result);
eprintln!("intersect between {:?} and {:?} took {:.02?}", eprintln!("intersect between {:?} and {:?} gives {:?} took {:.02?}",
left_len, union_result.len(), before.elapsed()); left_len, union_result.len(), left.len(), before.elapsed());
Some(left) Some(left)
}, },
None => Some(union_result), None => Some(union_result),
}; };
} }
eprintln!("{} candidates", intersect_result.as_ref().map_or(0, |r| r.len()));
Ok(intersect_result.unwrap_or_default().iter().take(20).collect()) Ok(intersect_result.unwrap_or_default().iter().take(20).collect())
} }
} }