mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-23 18:45:06 +08:00
Introduce two new infos subcommands
This commit is contained in:
parent
bb1ab428db
commit
ea605b499c
@ -52,6 +52,12 @@ enum Command {
|
|||||||
limit: usize,
|
limit: usize,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/// Outputs the total size of all the docid-word-positions keys and values.
|
||||||
|
TotalDocidWordPositionsSize,
|
||||||
|
|
||||||
|
/// Outputs the average number of *different* words by document.
|
||||||
|
AverageNumberOfWordsByDoc,
|
||||||
|
|
||||||
/// Outputs the words FST to disk.
|
/// Outputs the words FST to disk.
|
||||||
///
|
///
|
||||||
/// One can use the FST binary helper to dissect and analyze it,
|
/// One can use the FST binary helper to dissect and analyze it,
|
||||||
@ -84,6 +90,8 @@ fn main() -> anyhow::Result<()> {
|
|||||||
match opt.command {
|
match opt.command {
|
||||||
MostCommonWords { limit } => most_common_words(&index, &rtxn, limit),
|
MostCommonWords { limit } => most_common_words(&index, &rtxn, limit),
|
||||||
BiggestValues { limit } => biggest_value_sizes(&index, &rtxn, limit),
|
BiggestValues { limit } => biggest_value_sizes(&index, &rtxn, limit),
|
||||||
|
TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn),
|
||||||
|
AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn),
|
||||||
ExportWordsFst { output } => export_words_fst(&index, &rtxn, output),
|
ExportWordsFst { output } => export_words_fst(&index, &rtxn, output),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -181,3 +189,64 @@ fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyho
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Prints the number of entries stored in the docid-word-positions database,
/// followed by the cumulated byte length of all the keys and of all the values.
///
/// Keys and values are read as raw byte slices, so the reported sizes are the
/// lengths of the stored bytes, not of any decoded representation.
///
/// Returns an error if the iterator cannot be created or if reading an entry fails.
fn total_docid_word_positions_size(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
    use heed::types::ByteSlice;

    let entries = index.docid_word_positions.as_polymorph().iter::<_, ByteSlice, ByteSlice>(rtxn)?;

    let mut entries_count = 0;
    let mut keys_bytes = 0;
    let mut values_bytes = 0;

    for entry in entries {
        let (raw_key, raw_value) = entry?;
        entries_count += 1;
        keys_bytes += raw_key.len();
        values_bytes += raw_value.len();
    }

    println!("number of keys: {}", entries_count);
    println!("total key size: {}", keys_bytes);
    println!("total value size: {}", values_bytes);

    Ok(())
}
|
||||||
|
|
||||||
|
fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
|
||||||
|
use heed::types::DecodeIgnore;
|
||||||
|
use milli::{DocumentId, BEU32StrCodec};
|
||||||
|
|
||||||
|
let mut words_counts = Vec::new();
|
||||||
|
let mut count = 0;
|
||||||
|
let mut prev = None as Option<(DocumentId, u32)>;
|
||||||
|
|
||||||
|
let iter = index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?;
|
||||||
|
for result in iter {
|
||||||
|
let ((docid, _word), ()) = result?;
|
||||||
|
|
||||||
|
match prev.as_mut() {
|
||||||
|
Some((prev_docid, prev_count)) if docid == *prev_docid => {
|
||||||
|
*prev_count += 1;
|
||||||
|
},
|
||||||
|
Some((prev_docid, prev_count)) => {
|
||||||
|
words_counts.push(*prev_count);
|
||||||
|
*prev_docid = docid;
|
||||||
|
*prev_count = 0;
|
||||||
|
count += 1;
|
||||||
|
},
|
||||||
|
None => prev = Some((docid, 1)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some((_, prev_count)) = prev.take() {
|
||||||
|
words_counts.push(prev_count);
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
let words_count = words_counts.into_iter().map(|c| c as usize).sum::<usize>() as f64;
|
||||||
|
let count = count as f64;
|
||||||
|
|
||||||
|
println!("average number of different words by document: {}", words_count / count);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user