From 374ec6773fa58a2a88eb5ad45ed8b5c6eb76ae42 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Mon, 22 Jun 2020 19:04:10 +0200
Subject: [PATCH] Introduce a database to store all docids for a word and attribute

---
 src/bin/indexer.rs | 49 ++++++++++++++++++++++++++++++++++++++++++----
 src/bin/search.rs | 2 +-
 src/lib.rs | 16 ++++++++++++---
 3 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index 877dfd39c..0b1cb5977 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -1,5 +1,6 @@
-use std::collections::{BTreeSet, BTreeMap};
-use std::convert::TryFrom;
+use std::collections::hash_map::Entry;
+use std::collections::{HashMap, BTreeSet, BTreeMap};
+use std::convert::{TryFrom, TryInto};
 use std::fs::File;
 use std::path::PathBuf;
 use std::sync::atomic::{AtomicUsize, Ordering};
@@ -344,6 +345,46 @@ fn writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> anyh
     Ok(())
 }
 
+fn compute_words_attributes_docids(wtxn: &mut heed::RwTxn, index: &Index) -> anyhow::Result<()> {
+    eprintln!("Computing the attributes documents ids...");
+
+    let fst = match index.fst(&wtxn)? {
+        Some(fst) => fst.map_data(|s| s.to_vec())?,
+        None => return Ok(()),
+    };
+
+    let mut word_attributes = HashMap::new();
+    let mut stream = fst.stream();
+    while let Some(word) = stream.next() {
+        word_attributes.clear();
+
+        // Loop on the word attributes and unions all the documents ids by attribute.
+        for result in index.word_position_docids.prefix_iter(wtxn, word)? {
+            let (key, docids) = result?;
+            let (_key_word, key_pos) = key.split_at(key.len() - 4);
+            let key_pos = key_pos.try_into().map(u32::from_be_bytes)?;
+            // If the key corresponds to the word (minus the attribute)
+            if key.len() == word.len() + 4 {
+                let attribute = key_pos / 1000;
+                match word_attributes.entry(attribute) {
+                    Entry::Vacant(entry) => { entry.insert(docids); },
+                    Entry::Occupied(mut entry) => entry.get_mut().union_with(&docids),
+                }
+            }
+        }
+
+        // Write this word attributes unions into LMDB.
+        let mut key = word.to_vec();
+        for (attribute, docids) in word_attributes.drain() {
+            key.truncate(word.len());
+            key.extend_from_slice(&attribute.to_be_bytes());
+            index.word_attribute_docids.put(wtxn, &key, &docids)?;
+        }
+    }
+
+    Ok(())
+}
+
 fn main() -> anyhow::Result<()> {
     let opt = Opt::from_args();
 
@@ -351,7 +392,7 @@ fn main() -> anyhow::Result<()> {
     let env = EnvOpenOptions::new()
         .map_size(100 * 1024 * 1024 * 1024) // 100 GB
         .max_readers(10)
-        .max_dbs(5)
+        .max_dbs(10)
         .open(opt.database)?;
 
     let index = Index::new(&env)?;
@@ -370,7 +411,7 @@ fn main() -> anyhow::Result<()> {
     eprintln!("We are writing into LMDB...");
     let mut wtxn = env.write_txn()?;
     MtblKvStore::from_many(stores, |k, v| writer(&mut wtxn, &index, k, v))?;
-    // FIXME Why is this count wrong? (indicates 99 when must return 100)
+    compute_words_attributes_docids(&mut wtxn, &index)?;
     let count = index.documents.len(&wtxn)?;
     wtxn.commit()?;
     eprintln!("Wrote {} documents into LMDB", count);
diff --git a/src/bin/search.rs b/src/bin/search.rs
index b365b3025..22015860a 100644
--- a/src/bin/search.rs
+++ b/src/bin/search.rs
@@ -30,7 +30,7 @@ fn main() -> anyhow::Result<()> {
     let env = EnvOpenOptions::new()
         .map_size(100 * 1024 * 1024 * 1024) // 100 GB
         .max_readers(10)
-        .max_dbs(5)
+        .max_dbs(10)
         .open(opt.database)?;
 
     let index = Index::new(&env)?;
diff --git a/src/lib.rs b/src/lib.rs
index 554790578..67d7adb58 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -44,6 +44,8 @@ pub struct Index {
     /// Maps a word at a position (u32) and all the documents ids where it appears.
     pub word_position_docids: Database,
     pub prefix_word_position_docids: Database,
+    /// Maps a word and an attribute (u32) to all the documents ids that it appears in.
+    pub word_attribute_docids: Database,
     /// Maps an internal document to the content of the document in CSV.
     pub documents: Database, ByteSlice>,
 }
@@ -56,6 +58,7 @@ impl Index {
             prefix_word_positions: env.create_database(Some("prefix-word-positions"))?,
             word_position_docids: env.create_database(Some("word-position-docids"))?,
             prefix_word_position_docids: env.create_database(Some("prefix-word-position-docids"))?,
+            word_attribute_docids: env.create_database(Some("word-attribute-docids"))?,
             documents: env.create_database(Some("documents"))?,
         })
     }
@@ -64,10 +67,17 @@ impl Index {
         self.main.get::<_, Str, ByteSlice>(rtxn, "headers")
     }
 
+    pub fn fst<'t>(&self, rtxn: &'t heed::RoTxn) -> anyhow::Result>> {
+        match self.main.get::<_, Str, ByteSlice>(rtxn, "words-fst")? {
+            Some(bytes) => Ok(Some(fst::Set::new(bytes)?)),
+            None => Ok(None),
+        }
+    }
+
     pub fn search(&self, rtxn: &heed::RoTxn, query: &str) -> anyhow::Result> {
-        let fst = match self.main.get::<_, Str, ByteSlice>(rtxn, "words-fst")? {
-            Some(bytes) => fst::Set::new(bytes)?,
-            None => return Ok(Vec::new()),
+        let fst = match self.fst(rtxn)? {
+            Some(fst) => fst,
+            None => return Ok(vec![]),
         };
 
         let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2);