Introduce the size_of_database infos subcommand

This commit is contained in:
Clément Renault 2020-10-02 16:46:05 +02:00
parent c6b883289c
commit bc35c9a598
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 51 additions and 3 deletions

View File

@ -12,6 +12,20 @@ use Command::*;
#[global_allocator] #[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
// Names under which each database of the index can be designated on the
// command line. They are matched against in `size_of_database` to select
// the corresponding field of the `Index`.
const MAIN_DB_NAME: &str = "main";
const WORD_DOCIDS_DB_NAME: &str = "word-docids";
const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions";
const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids";
const DOCUMENTS_DB_NAME: &str = "documents";
// Every database name accepted by the `SizeOfDatabase` subcommand; given to
// structopt as the `possible_values` of its `database` argument.
const DATABASE_NAMES: &[&str] = &[
MAIN_DB_NAME,
WORD_DOCIDS_DB_NAME,
DOCID_WORD_POSITIONS_DB_NAME,
WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME,
DOCUMENTS_DB_NAME,
];
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]
#[structopt(name = "milli-info", about = "A stats crawler for milli.")] #[structopt(name = "milli-info", about = "A stats crawler for milli.")]
struct Opt { struct Opt {
@ -74,6 +88,12 @@ enum Command {
/// Outputs the average number of documents for each words pair. /// Outputs the average number of documents for each words pair.
AverageNumberOfDocumentByWordPairProximity, AverageNumberOfDocumentByWordPairProximity,
/// Outputs the size in bytes of the specified database.
SizeOfDatabase {
#[structopt(possible_values = DATABASE_NAMES)]
database: String,
},
/// Outputs a CSV with the proximities for the two specified words and /// Outputs a CSV with the proximities for the two specified words and
/// the documents ids where these relations appear. /// the documents ids where these relations appear.
/// ///
@ -130,6 +150,7 @@ fn main() -> anyhow::Result<()> {
AverageNumberOfPositionsByWord => { AverageNumberOfPositionsByWord => {
average_number_of_positions_by_word(&index, &rtxn) average_number_of_positions_by_word(&index, &rtxn)
}, },
SizeOfDatabase { database } => size_of_database(&index, &rtxn, &database),
AverageNumberOfDocumentByWordPairProximity => { AverageNumberOfDocumentByWordPairProximity => {
average_number_of_document_by_word_pair_proximity(&index, &rtxn) average_number_of_document_by_word_pair_proximity(&index, &rtxn)
} }
@ -336,6 +357,33 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any
Ok(()) Ok(())
} }
/// Prints the cumulated size, in bytes, of all the keys and of all the values
/// stored in the database designated by `name`.
///
/// `name` must be one of the `*_DB_NAME` constants. The sizes are obtained by
/// iterating over every entry of the database and summing the byte lengths of
/// the raw keys and values; this is not a measure of the space used on disk.
///
/// # Errors
///
/// Returns an error when `name` does not designate a known database, when the
/// iterator cannot be opened on the database, or when reading an entry fails.
fn size_of_database(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Result<()> {
    use heed::types::ByteSlice;

    // Every typed database is viewed as a polymorphic one so that all the
    // arms have the same type and entries can be read back as raw bytes.
    let database = match name {
        MAIN_DB_NAME => &index.main,
        WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(),
        DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(),
        WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(),
        DOCUMENTS_DB_NAME => index.documents.as_polymorph(),
        unknown => anyhow::bail!("unknown database {:?}", unknown),
    };

    let mut key_size: u64 = 0;
    let mut val_size: u64 = 0;
    for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? {
        let (key, value) = result?;
        key_size += key.len() as u64;
        val_size += value.len() as u64;
    }

    // The sizes are the actual result of the subcommand, not a diagnostic:
    // they go to stdout so that they can be piped or captured.
    println!(
        "The {} database weighs {} bytes in terms of keys and {} bytes in terms of values.",
        name, key_size, val_size,
    );

    Ok(())
}
fn average_number_of_document_by_word_pair_proximity( fn average_number_of_document_by_word_pair_proximity(
index: &Index, index: &Index,
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,

View File

@ -43,10 +43,10 @@ pub struct Index {
pub word_docids: Database<Str, RoaringBitmapCodec>, pub word_docids: Database<Str, RoaringBitmapCodec>,
/// Maps a word and a document id (u32) to all the positions where the given word appears. /// Maps a word and a document id (u32) to all the positions where the given word appears.
pub docid_word_positions: Database<BEU32StrCodec, ByteorderXRoaringBitmapCodec>, pub docid_word_positions: Database<BEU32StrCodec, ByteorderXRoaringBitmapCodec>,
/// Maps the document id to the document as a CSV line.
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
/// Maps the proximity between a pair of words with all the docids where this relation appears. /// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<StrStrU8Codec, RoaringBitmapCodec>, pub word_pair_proximity_docids: Database<StrStrU8Codec, RoaringBitmapCodec>,
/// Maps the document id to the document as a CSV line.
pub documents: Database<OwnedType<BEU32>, ByteSlice>,
} }
impl Index { impl Index {
@ -55,8 +55,8 @@ impl Index {
main: env.create_poly_database(None)?, main: env.create_poly_database(None)?,
word_docids: env.create_database(Some("word-docids"))?, word_docids: env.create_database(Some("word-docids"))?,
docid_word_positions: env.create_database(Some("docid-word-positions"))?, docid_word_positions: env.create_database(Some("docid-word-positions"))?,
documents: env.create_database(Some("documents"))?,
word_pair_proximity_docids: env.create_database(Some("word-pair-proximity-docids"))?, word_pair_proximity_docids: env.create_database(Some("word-pair-proximity-docids"))?,
documents: env.create_database(Some("documents"))?,
}) })
} }