diff --git a/src/bin/infos.rs b/src/bin/infos.rs new file mode 100644 index 000000000..a4513cce2 --- /dev/null +++ b/src/bin/infos.rs @@ -0,0 +1,112 @@ +use std::path::PathBuf; +use std::{str, io}; + +use heed::EnvOpenOptions; +use milli::Index; +use structopt::StructOpt; + +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +#[derive(Debug, StructOpt)] +#[structopt(name = "milli-info", about = "A stats crawler for milli.")] +struct Opt { + /// The database path where the database is located. + /// It is created if it doesn't already exist. + #[structopt(long = "db", parse(from_os_str))] + database: PathBuf, + + /// The maximum size the database can take on disk. It is recommended to specify + /// the whole disk space (value must be a multiple of a page size). + #[structopt(long = "db-size", default_value = "107374182400")] // 100 GB + database_size: usize, + + /// Verbose mode (-v, -vv, -vvv, etc.) + #[structopt(short, long, parse(from_occurrences))] + verbose: usize, + + #[structopt(subcommand)] + command: Command, +} + +#[derive(Debug, StructOpt)] +enum Command { + /// Outputs a CSV of the most frequent words of this index. + /// + /// `word` are displayed and ordered by frequency. + /// `document_frequency` defines the number of documents which contains the word. + /// `frequency` defines the number times the word appears in all the documents. + MostCommonWords { + /// The maximum number of frequencies to return. + #[structopt(default_value = "10")] + limit: usize, + } +} + +fn main() -> anyhow::Result<()> { + let opt = Opt::from_args(); + + stderrlog::new() + .verbosity(opt.verbose) + .show_level(false) + .timestamp(stderrlog::Timestamp::Off) + .init()?; + + let env = EnvOpenOptions::new() + .map_size(opt.database_size) + .max_dbs(10) + .open(&opt.database)?; + + // Open the LMDB database. + let index = Index::new(&env, opt.database)?; + let rtxn = env.read_txn()?; + + match opt.command { + Command::MostCommonWords { limit } => most_common_words(&index, &rtxn, limit), + } +} + +fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { + use std::collections::BinaryHeap; + use std::cmp::Reverse; + use roaring::RoaringBitmap; + + let mut heap = BinaryHeap::with_capacity(limit + 1); + let mut prev = None as Option<(String, u64, RoaringBitmap)>; + for result in index.word_position_docids.iter(rtxn)? { + if limit == 0 { break } + + let (bytes, postings) = result?; + let (word, _position) = bytes.split_at(bytes.len() - 4); + let word = str::from_utf8(word)?; + + match prev.as_mut() { + Some((prev_word, freq, docids)) if prev_word == word => { + *freq += docids.len(); + docids.union_with(&postings); + }, + Some((prev_word, freq, docids)) => { + heap.push(Reverse((docids.len(), *freq, prev_word.to_string()))); + if heap.len() > limit { heap.pop(); } + prev = Some((word.to_string(), postings.len(), postings)) + }, + None => prev = Some((word.to_string(), postings.len(), postings)), + } + } + + if let Some((prev_word, freq, docids)) = prev { + heap.push(Reverse((docids.len(), freq, prev_word.to_string()))); + if heap.len() > limit { heap.pop(); } + } + + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["word", "document_frequency", "frequency"])?; + + for Reverse((document_frequency, frequency, word)) in heap.into_sorted_vec() { + wtr.write_record(&[word, document_frequency.to_string(), frequency.to_string()])?; + } + + Ok(wtr.flush()?) +}