From ba3e805981a3e835ad51072637bc85fcafd28352 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 22 Jun 2020 18:02:22 +0200 Subject: [PATCH] Document the Index types and the internal LMDB databases --- src/bin/indexer.rs | 8 ++++---- src/lib.rs | 32 ++++++++++++++++++-------------- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 5b44b3734..f20b3187c 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -329,22 +329,22 @@ fn writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> anyh } else if key.starts_with(&[1]) { // Write the postings lists - index.postings_attrs.as_polymorph() + index.word_positions.as_polymorph() .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; } else if key.starts_with(&[2]) { // Write the prefix postings lists - index.prefix_postings_attrs.as_polymorph() + index.prefix_word_positions.as_polymorph() .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; } else if key.starts_with(&[3]) { // Write the postings lists - index.postings_ids.as_polymorph() + index.word_position_docids.as_polymorph() .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; } else if key.starts_with(&[4]) { // Write the prefix postings lists - index.prefix_postings_ids.as_polymorph() + index.prefix_word_position_docids.as_polymorph() .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; } else if key.starts_with(&[5]) { diff --git a/src/lib.rs b/src/lib.rs index 90fe6e541..554790578 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -36,24 +36,28 @@ pub type AttributeId = u32; #[derive(Clone)] pub struct Index { + /// Contains many different types (e.g. the documents CSV headers). pub main: PolyDatabase, - pub postings_attrs: Database, - pub prefix_postings_attrs: Database, - pub postings_ids: Database, - pub prefix_postings_ids: Database, + /// A word and all the positions where it appears in the whole dataset. 
+ pub word_positions: Database, pub prefix_word_positions: Database, + /// Maps a word at a position (u32) to all the document ids where it appears. + pub word_position_docids: Database, pub prefix_word_position_docids: Database, + /// Maps an internal document to the content of the document in CSV. pub documents: Database, ByteSlice>, } impl Index { pub fn new(env: &heed::Env) -> heed::Result { - let main = env.create_poly_database(None)?; - let postings_attrs = env.create_database(Some("postings-attrs"))?; - let prefix_postings_attrs = env.create_database(Some("prefix-postings-attrs"))?; - let postings_ids = env.create_database(Some("postings-ids"))?; - let prefix_postings_ids = env.create_database(Some("prefix-postings-ids"))?; - let documents = env.create_database(Some("documents"))?; - - Ok(Index { main, postings_attrs, prefix_postings_attrs, postings_ids, prefix_postings_ids, documents }) + Ok(Index { + main: env.create_poly_database(None)?, + word_positions: env.create_database(Some("word-positions"))?, + prefix_word_positions: env.create_database(Some("prefix-word-positions"))?, + word_position_docids: env.create_database(Some("word-position-docids"))?, + prefix_word_position_docids: env.create_database(Some("prefix-word-position-docids"))?, + documents: env.create_database(Some("documents"))?, + }) } pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result> { @@ -107,7 +111,7 @@ impl Index { let mut stream = fst.search(&dfa).into_stream(); while let Some(word) = stream.next() { let word = std::str::from_utf8(word)?; - if let Some(right) = self.postings_attrs.get(rtxn, word)? { + if let Some(right) = self.word_positions.get(rtxn, word)? 
{ union_positions.union_with(&right); derived_words.push((word.as_bytes().to_vec(), right)); count += 1; @@ -131,7 +135,7 @@ impl Index { if attrs.contains(pos) { let mut key = word.clone(); key.extend_from_slice(&pos.to_be_bytes()); - if let Some(right) = self.postings_ids.get(rtxn, &key).unwrap() { + if let Some(right) = self.word_position_docids.get(rtxn, &key).unwrap() { union_docids.union_with(&right); } }