diff --git a/infos/src/main.rs b/infos/src/main.rs index ee2060d38..a00c882b7 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -23,6 +23,7 @@ const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids"; +const FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME: &str = "field-id-word-count-docids"; const FACET_ID_F64_DOCIDS_DB_NAME: &str = "facet-id-f64-docids"; const FACET_ID_STRING_DOCIDS_DB_NAME: &str = "facet-id-string-docids"; const FIELD_ID_DOCID_FACET_F64S_DB_NAME: &str = "field-id-docid-facet-f64s"; @@ -39,6 +40,7 @@ const ALL_DATABASE_NAMES: &[&str] = &[ WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_LEVEL_POSITION_DOCIDS_DB_NAME, WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME, + FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME, FACET_ID_F64_DOCIDS_DB_NAME, FACET_ID_STRING_DOCIDS_DB_NAME, FIELD_ID_DOCID_FACET_F64S_DB_NAME, @@ -155,6 +157,15 @@ enum Command { prefixes: Vec, }, + FieldIdWordCountDocids { + /// Display the whole documents ids in details. + #[structopt(long)] + full_display: bool, + + /// The field name in the document. + field_name: String, + }, + /// Outputs a CSV with the documents ids, words and the positions where this word appears. DocidsWordsPositions { /// Display the whole positions in detail. @@ -271,6 +282,9 @@ fn main() -> anyhow::Result<()> { WordPrefixesLevelPositionsDocids { full_display, prefixes } => { word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes) }, + FieldIdWordCountDocids { full_display, field_name } => { + field_id_word_count_docids(&index, &rtxn, !full_display, field_name) + }, DocidsWordsPositions { full_display, internal_documents_ids } => { docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) }, @@ -357,6 +371,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, + field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, field_id_docid_facet_f64s: _, @@ -372,6 +387,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let word_pair_proximity_docids_name = "word_pair_proximity_docids"; let word_level_position_docids_name = "word_level_position_docids"; let word_prefix_level_position_docids_name = "word_prefix_level_position_docids"; + let field_id_word_count_docids_name = "field_id_word_count_docids"; let facet_id_f64_docids_name = "facet_id_f64_docids"; let facet_id_string_docids_name = "facet_id_string_docids"; let documents_name = "documents"; @@ -443,6 +459,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho if heap.len() > limit { heap.pop(); } } + for result in field_id_word_count_docids.remap_data_type::().iter(rtxn)? { + let ((field_id, word_count), docids) = result?; + let key = format!("{} {}", field_id, word_count); + heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name))); + if heap.len() > limit { heap.pop(); } + } + let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; @@ -676,6 +699,39 @@ fn word_prefixes_level_positions_docids( Ok(wtr.flush()?) } +fn field_id_word_count_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + field_name: String +) -> anyhow::Result<()> +{ + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["field_name", "word_count", "docids"])?; + + let field_id = index.fields_ids_map(rtxn)? + .id(&field_name) + .with_context(|| format!("unknown field name: {}", &field_name))?; + + let left = (field_id, 1); + let right = (field_id, 11); + let iter = index.field_id_word_count_docids + .range(rtxn, &(left..=right))?; + + for result in iter { + let ((_, word_count), docids) = result?; + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + wtr.write_record(&[&field_name, &format!("{}", word_count), &docids])?; + } + + Ok(wtr.flush()?) +} + fn docids_words_positions( index: &Index, rtxn: &heed::RoTxn, @@ -870,6 +926,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, + field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, field_id_docid_facet_f64s, @@ -893,6 +950,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(), WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(), WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(), + FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => field_id_word_count_docids.as_polymorph(), FACET_ID_F64_DOCIDS_DB_NAME => facet_id_f64_docids.as_polymorph(), FACET_ID_STRING_DOCIDS_DB_NAME => facet_id_string_docids.as_polymorph(), FIELD_ID_DOCID_FACET_F64S_DB_NAME => field_id_docid_facet_f64s.as_polymorph(), @@ -999,6 +1057,10 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu let db = index.word_prefix_pair_proximity_docids.as_polymorph(); compute_stats::(*db, rtxn, name) }, + FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => { + let db = index.field_id_word_count_docids.as_polymorph(); + compute_stats::(*db, rtxn, name) + }, unknown => anyhow::bail!("unknown database {:?}", unknown), } }