diff --git a/src/index.rs b/src/index.rs new file mode 100644 index 000000000..e628807a2 --- /dev/null +++ b/src/index.rs @@ -0,0 +1,110 @@ +use anyhow::Context; +use csv::StringRecord; +use heed::types::*; +use heed::{PolyDatabase, Database}; +use roaring::RoaringBitmap; + +use crate::Search; +use crate::{BEU32, DocumentId}; +use crate::{ + RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, + CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, +}; + +pub const WORDS_FST_KEY: &str = "words-fst"; +pub const HEADERS_KEY: &str = "headers"; +pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; + +#[derive(Clone)] +pub struct Index { + /// Contains many different types (e.g. the documents CSV headers). + pub main: PolyDatabase, + /// A word and all the documents ids containing the word. + pub word_docids: Database, + /// Maps a word and a document id (u32) to all the positions where the given word appears. + pub docid_word_positions: Database, + /// Maps the proximity between a pair of words with all the docids where this relation appears. + pub word_pair_proximity_docids: Database, + /// Maps the document id to the document as a CSV line. + pub documents: Database, ByteSlice>, +} + +impl Index { + pub fn new(env: &heed::Env) -> anyhow::Result { + Ok(Index { + main: env.create_poly_database(Some("main"))?, + word_docids: env.create_database(Some("word-docids"))?, + docid_word_positions: env.create_database(Some("docid-word-positions"))?, + word_pair_proximity_docids: env.create_database(Some("word-pair-proximity-docids"))?, + documents: env.create_database(Some("documents"))?, + }) + } + + pub fn documents_ids(&self, rtxn: &heed::RoTxn) -> anyhow::Result> { + Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?) + } + + pub fn put_headers(&self, wtxn: &mut heed::RwTxn, headers: &StringRecord) -> heed::Result<()> { + self.main.put::<_, Str, CsvStringRecordCodec>(wtxn, HEADERS_KEY, headers) + } + + pub fn headers(&self, rtxn: &heed::RoTxn) -> heed::Result> { + self.main.get::<_, Str, CsvStringRecordCodec>(rtxn, HEADERS_KEY) + } + + pub fn number_of_attributes(&self, rtxn: &heed::RoTxn) -> anyhow::Result> { + match self.headers(rtxn)? { + Some(headers) => Ok(Some(headers.len())), + None => Ok(None), + } + } + + pub fn put_fst>(&self, wtxn: &mut heed::RwTxn, fst: &fst::Set) -> anyhow::Result<()> { + Ok(self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_FST_KEY, fst.as_fst().as_bytes())?) + } + + pub fn fst<'t>(&self, rtxn: &'t heed::RoTxn) -> anyhow::Result>> { + match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_FST_KEY)? { + Some(bytes) => Ok(Some(fst::Set::new(bytes)?)), + None => Ok(None), + } + } + + /// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing. + pub fn documents<'t>( + &self, + rtxn: &'t heed::RoTxn, + iter: impl IntoIterator, + ) -> anyhow::Result> + { + let ids: Vec<_> = iter.into_iter().collect(); + let mut content = Vec::new(); + + for id in ids.iter().cloned() { + let document_content = self.documents.get(rtxn, &BEU32::new(id))? + .with_context(|| format!("Could not find document {}", id))?; + content.extend_from_slice(document_content); + } + + let mut rdr = csv::ReaderBuilder::new().has_headers(false).from_reader(&content[..]); + + let mut documents = Vec::with_capacity(ids.len()); + for (id, result) in ids.into_iter().zip(rdr.records()) { + documents.push((id, result?)); + } + + Ok(documents) + } + + /// Returns the number of documents indexed in the database. + pub fn number_of_documents(&self, rtxn: &heed::RoTxn) -> anyhow::Result { + match self.documents_ids(rtxn)? { + Some(docids) => Ok(docids.len() as usize), + None => Ok(0), + } + } + + pub fn search<'a>(&'a self, rtxn: &'a heed::RoTxn) -> Search<'a> { + Search::new(rtxn, self) + } +} diff --git a/src/indexing/merge_function.rs b/src/indexing/merge_function.rs index 68ea53ac4..ac55d62ee 100644 --- a/src/indexing/merge_function.rs +++ b/src/indexing/merge_function.rs @@ -7,9 +7,9 @@ use roaring::RoaringBitmap; use crate::heed_codec::CboRoaringBitmapCodec; -const WORDS_FST_KEY: &[u8] = crate::WORDS_FST_KEY.as_bytes(); -const HEADERS_KEY: &[u8] = crate::HEADERS_KEY.as_bytes(); -const DOCUMENTS_IDS_KEY: &[u8] = crate::DOCUMENTS_IDS_KEY.as_bytes(); +const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes(); +const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes(); +const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes(); pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { match key { diff --git a/src/indexing/store.rs b/src/indexing/store.rs index 3f96880a3..5be54f2eb 100644 --- a/src/indexing/store.rs +++ b/src/indexing/store.rs @@ -29,9 +29,9 @@ const ONE_KILOBYTE: usize = 1024 * 1024; const MAX_POSITION: usize = 1000; const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION; -const WORDS_FST_KEY: &[u8] = crate::WORDS_FST_KEY.as_bytes(); -const HEADERS_KEY: &[u8] = crate::HEADERS_KEY.as_bytes(); -const DOCUMENTS_IDS_KEY: &[u8] = crate::DOCUMENTS_IDS_KEY.as_bytes(); +const WORDS_FST_KEY: &[u8] = crate::index::WORDS_FST_KEY.as_bytes(); +const HEADERS_KEY: &[u8] = crate::index::HEADERS_KEY.as_bytes(); +const DOCUMENTS_IDS_KEY: &[u8] = crate::index::DOCUMENTS_IDS_KEY.as_bytes(); pub struct Readers { pub main: Reader, diff --git a/src/lib.rs b/src/lib.rs index 1124f84e0..d35a5674f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ mod criterion; +mod index; mod indexing; mod mdfs; mod query_tokens; @@ -11,19 +12,14 @@ pub mod tokenizer; use std::collections::HashMap; use std::hash::BuildHasherDefault; - -use anyhow::Context; -use csv::StringRecord; use fxhash::{FxHasher32, FxHasher64}; -use heed::types::*; -use heed::{PolyDatabase, Database}; -use roaring::RoaringBitmap; -pub use self::update_store::UpdateStore; -pub use self::search::{Search, SearchResult}; pub use self::criterion::{Criterion, default_criteria}; +pub use self::index::Index; +pub use self::search::{Search, SearchResult}; +pub use self::update_store::UpdateStore; pub use self::heed_codec::{ - RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, + RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, CsvStringRecordCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, }; @@ -38,100 +34,3 @@ pub type DocumentId = u32; pub type Attribute = u32; pub type Position = u32; -pub const WORDS_FST_KEY: &str = "words-fst"; -pub const HEADERS_KEY: &str = "headers"; -pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; - -#[derive(Clone)] -pub struct Index { - /// Contains many different types (e.g. the documents CSV headers). - pub main: PolyDatabase, - /// A word and all the documents ids containing the word. - pub word_docids: Database, - /// Maps a word and a document id (u32) to all the positions where the given word appears. - pub docid_word_positions: Database, - /// Maps the proximity between a pair of words with all the docids where this relation appears. - pub word_pair_proximity_docids: Database, - /// Maps the document id to the document as a CSV line. - pub documents: Database, ByteSlice>, -} - -impl Index { - pub fn new(env: &heed::Env) -> anyhow::Result { - Ok(Index { - main: env.create_poly_database(Some("main"))?, - word_docids: env.create_database(Some("word-docids"))?, - docid_word_positions: env.create_database(Some("docid-word-positions"))?, - word_pair_proximity_docids: env.create_database(Some("word-pair-proximity-docids"))?, - documents: env.create_database(Some("documents"))?, - }) - } - - pub fn documents_ids(&self, rtxn: &heed::RoTxn) -> anyhow::Result> { - Ok(self.main.get::<_, Str, RoaringBitmapCodec>(rtxn, DOCUMENTS_IDS_KEY)?) - } - - pub fn put_headers(&self, wtxn: &mut heed::RwTxn, headers: &StringRecord) -> heed::Result<()> { - self.main.put::<_, Str, CsvStringRecordCodec>(wtxn, HEADERS_KEY, headers) - } - - pub fn headers(&self, rtxn: &heed::RoTxn) -> heed::Result> { - self.main.get::<_, Str, CsvStringRecordCodec>(rtxn, HEADERS_KEY) - } - - pub fn number_of_attributes(&self, rtxn: &heed::RoTxn) -> anyhow::Result> { - match self.headers(rtxn)? { - Some(headers) => Ok(Some(headers.len())), - None => Ok(None), - } - } - - pub fn put_fst>(&self, wtxn: &mut heed::RwTxn, fst: &fst::Set) -> anyhow::Result<()> { - Ok(self.main.put::<_, Str, ByteSlice>(wtxn, WORDS_FST_KEY, fst.as_fst().as_bytes())?) - } - - pub fn fst<'t>(&self, rtxn: &'t heed::RoTxn) -> anyhow::Result>> { - match self.main.get::<_, Str, ByteSlice>(rtxn, WORDS_FST_KEY)? { - Some(bytes) => Ok(Some(fst::Set::new(bytes)?)), - None => Ok(None), - } - } - - /// Returns a [`Vec`] of the requested documents. Returns an error if a document is missing. - pub fn documents<'t>( - &self, - rtxn: &'t heed::RoTxn, - iter: impl IntoIterator, - ) -> anyhow::Result> - { - let ids: Vec<_> = iter.into_iter().collect(); - let mut content = Vec::new(); - - for id in ids.iter().cloned() { - let document_content = self.documents.get(rtxn, &BEU32::new(id))? - .with_context(|| format!("Could not find document {}", id))?; - content.extend_from_slice(document_content); - } - - let mut rdr = csv::ReaderBuilder::new().has_headers(false).from_reader(&content[..]); - - let mut documents = Vec::with_capacity(ids.len()); - for (id, result) in ids.into_iter().zip(rdr.records()) { - documents.push((id, result?)); - } - - Ok(documents) - } - - /// Returns the number of documents indexed in the database. - pub fn number_of_documents(&self, rtxn: &heed::RoTxn) -> anyhow::Result { - match self.documents_ids(rtxn)? { - Some(docids) => Ok(docids.len() as usize), - None => Ok(0), - } - } - - pub fn search<'a>(&'a self, rtxn: &'a heed::RoTxn) -> Search<'a> { - Search::new(rtxn, self) - } -}