Mirror of https://github.com/meilisearch/meilisearch.git, synced 2024-11-27 04:25:06 +08:00
Introduce two new infos subcommands
parent bb1ab428db
commit ea605b499c
@@ -52,6 +52,12 @@ enum Command {
         limit: usize,
     },
 
+    /// Outputs the total size of all the docid-word-positions keys and values.
+    TotalDocidWordPositionsSize,
+
+    /// Outputs the average number of *different* words by document.
+    AverageNumberOfWordsByDoc,
+
     /// Outputs the words FST to disk.
     ///
     /// One can use the FST binary helper to dissect and analyze it,
@@ -84,6 +90,8 @@ fn main() -> anyhow::Result<()> {
     match opt.command {
         MostCommonWords { limit } => most_common_words(&index, &rtxn, limit),
         BiggestValues { limit } => biggest_value_sizes(&index, &rtxn, limit),
+        TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn),
+        AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
         ExportWordsFst { output } => export_words_fst(&index, &rtxn, output),
     }
 }
@@ -181,3 +189,64 @@ fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyho
 
     Ok(())
 }
+
+fn total_docid_word_positions_size(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
+    use heed::types::ByteSlice;
+
+    let mut total_key_size = 0;
+    let mut total_val_size = 0;
+    let mut count = 0;
+
+    let iter = index.docid_word_positions.as_polymorph().iter::<_, ByteSlice, ByteSlice>(rtxn)?;
+    for result in iter {
+        let (key, val) = result?;
+        total_key_size += key.len();
+        total_val_size += val.len();
+        count += 1;
+    }
+
+    println!("number of keys: {}", count);
+    println!("total key size: {}", total_key_size);
+    println!("total value size: {}", total_val_size);
+
+    Ok(())
+}
+
+fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
+    use heed::types::DecodeIgnore;
+    use milli::{DocumentId, BEU32StrCodec};
+
+    let mut words_counts = Vec::new();
+    let mut count = 0;
+    let mut prev = None as Option<(DocumentId, u32)>;
+
+    let iter = index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?;
+    for result in iter {
+        let ((docid, _word), ()) = result?;
+
+        match prev.as_mut() {
+            Some((prev_docid, prev_count)) if docid == *prev_docid => {
+                *prev_count += 1;
+            },
+            Some((prev_docid, prev_count)) => {
+                words_counts.push(*prev_count);
+                *prev_docid = docid;
+                *prev_count = 0;
+                count += 1;
+            },
+            None => prev = Some((docid, 1)),
+        }
+    }
+
+    if let Some((_, prev_count)) = prev.take() {
+        words_counts.push(prev_count);
+        count += 1;
+    }
+
+    let words_count = words_counts.into_iter().map(|c| c as usize).sum::<usize>() as f64;
+    let count = count as f64;
+
+    println!("average number of different words by document: {}", words_count / count);
+
+    Ok(())
+}
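
The new average_number_of_words_by_doc subcommand relies on LMDB returning the docid-word-positions keys in sorted order, so all entries belonging to one document are adjacent and can be counted in a single pass. Below is a rough, self-contained sketch of that consecutive-grouping idea over an in-memory, pre-sorted list of (docid, word) pairs; the sample data, the usize counters, and the standalone main are illustrative assumptions, not part of the commit.

```rust
// Sketch only: counts how many *different* words each document has by
// grouping adjacent entries with the same docid, then averages the counts.
// Assumes the entries are already sorted by docid, as LMDB keys would be.
fn main() {
    let entries: Vec<(u32, &str)> = vec![
        (0, "hello"), (0, "world"),
        (1, "hello"),
        (2, "average"), (2, "number"), (2, "words"),
    ];

    let mut words_counts: Vec<usize> = Vec::new();
    let mut prev: Option<(u32, usize)> = None;

    for (docid, _word) in entries {
        match prev.as_mut() {
            // Same document as the previous entry: one more distinct word.
            Some((prev_docid, prev_count)) if docid == *prev_docid => *prev_count += 1,
            // New document: record the finished group and start counting at 1.
            Some((prev_docid, prev_count)) => {
                words_counts.push(*prev_count);
                *prev_docid = docid;
                *prev_count = 1;
            },
            // Very first entry.
            None => prev = Some((docid, 1)),
        }
    }

    // Don't forget the last group.
    if let Some((_, prev_count)) = prev {
        words_counts.push(prev_count);
    }

    let total: usize = words_counts.iter().sum();
    let average = total as f64 / words_counts.len() as f64;
    // With the sample data above, the counts are 2, 1 and 3, so this prints 2.
    println!("average number of different words by document: {}", average);
}
```

Assuming structopt's default kebab-case naming for subcommands, the two new variants would presumably be invoked as total-docid-word-positions-size and average-number-of-words-by-doc.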