From 32da07ccee19174121493902e240421b98a3b1f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 23 Aug 2020 10:52:47 +0200 Subject: [PATCH] Introduce the word-positions-doc-ids and words-positions infos commands --- src/bin/infos.rs | 87 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 4 deletions(-) diff --git a/src/bin/infos.rs b/src/bin/infos.rs index fcf14acfc..ac7695778 100644 --- a/src/bin/infos.rs +++ b/src/bin/infos.rs @@ -1,10 +1,13 @@ use std::path::PathBuf; use std::{str, io}; +use anyhow::Context; use heed::EnvOpenOptions; use milli::Index; use structopt::StructOpt; +use Command::*; + #[cfg(target_os = "linux")] #[global_allocator] static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; @@ -58,6 +61,24 @@ enum Command { #[structopt(default_value = "10")] limit: usize, }, + + /// Outputs a CSV with the document ids for all the positions of the given words. + WordPositionDocIds { + /// Show the value entirely, not just the debug version. + #[structopt(long)] + full_display: bool, + /// The words you want to display the values of. + words: Vec, + }, + + /// Outputs a CSV with all the positions of the given words. + WordPositions { + /// Show the value entirely, not just the debug version. + #[structopt(long)] + full_display: bool, + /// The words you want to display the values of. + words: Vec, + } } fn main() -> anyhow::Result<()> { @@ -79,9 +100,11 @@ fn main() -> anyhow::Result<()> { let rtxn = env.read_txn()?; match opt.command { - Command::MostCommonWords { limit } => most_common_words(&index, &rtxn, limit), - Command::WordsFrequencies { words } => words_frequencies(&index, &rtxn, words), - Command::BiggestValueSizes { limit } => biggest_value_sizes(&index, &rtxn, limit), + MostCommonWords { limit } => most_common_words(&index, &rtxn, limit), + WordsFrequencies { words } => words_frequencies(&index, &rtxn, words), + BiggestValueSizes { limit } => biggest_value_sizes(&index, &rtxn, limit), + WordPositionDocIds { full_display, words } => word_position_doc_ids(&index, &rtxn, !full_display, words), + WordPositions { full_display, words } => word_positions(&index, &rtxn, !full_display, words), } } @@ -207,5 +230,61 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho } Ok(wtr.flush()?) - +} + +fn word_position_doc_ids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec) -> anyhow::Result<()> { + use std::convert::TryInto; + + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["word", "position", "document_ids"])?; + + let mut non_debug = Vec::new(); + for word in words { + for result in index.word_position_docids.prefix_iter(rtxn, word.as_bytes())? { + let (bytes, postings) = result?; + let (w, position) = bytes.split_at(bytes.len() - 4); + let position = position.try_into().map(u32::from_be_bytes)?; + + // if the word is not exactly the word we requested then it means + // we found a word that *starts with* the requested word and we must stop. + if word.as_bytes() != w { break } + + let postings_string = if debug { + format!("{:?}", postings) + } else { + non_debug.clear(); + non_debug.extend(postings); + format!("{:?}", non_debug) + }; + + wtr.write_record(&[&word, &position.to_string(), &postings_string])?; + } + } + + Ok(wtr.flush()?) +} + +fn word_positions(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec) -> anyhow::Result<()> { + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["word", "positions"])?; + + let mut non_debug = Vec::new(); + for word in words { + let postings = index.word_positions.get(rtxn, &word)? + .with_context(|| format!("could not find word {:?}", &word))?; + + let postings_string = if debug { + format!("{:?}", postings) + } else { + non_debug.clear(); + non_debug.extend(postings); + format!("{:?}", non_debug) + }; + + wtr.write_record(&[word, postings_string])?; + } + + Ok(wtr.flush()?) }