diff --git a/infos/src/main.rs b/infos/src/main.rs index c219c5758..2c11d3783 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -346,6 +346,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let docid_word_positions_name = "docid_word_positions"; let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids"; let word_pair_proximity_docids_name = "word_pair_proximity_docids"; + let word_level_position_docids_name = "word_level_position_docids"; let facet_field_id_value_docids_name = "facet_field_id_value_docids"; let documents_name = "documents"; @@ -402,6 +403,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho if heap.len() > limit { heap.pop(); } } + for result in word_level_position_docids.remap_data_type::().iter(rtxn)? { + let ((word, level, left, right), value) = result?; + let key = format!("{} {} {:?}", word, level, left..=right); + heap.push(Reverse((value.len(), key, word_level_position_docids_name))); + if heap.len() > limit { heap.pop(); } + } + let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; for (field_id, field_type) in faceted_fields { @@ -549,7 +557,7 @@ fn words_level_positions_docids( { let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["word", "level", "position_range", "documents_count", "documents_ids"])?; + wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?; for word in words.iter().map(AsRef::as_ref) { let range = { @@ -561,14 +569,18 @@ fn words_level_positions_docids( let ((w, level, left, right), docids) = result?; if word != w { break } - let level = level.to_string(); let count = docids.len().to_string(); let docids = if debug { format!("{:?}", docids) } else { format!("{:?}", docids.iter().collect::>()) }; - let position_range = format!("{:?}", left..=right); + let position_range = if level == 0 { + format!("{:?}", left) + } else { + format!("{:?}", left..=right) + }; + let level = level.to_string(); wtr.write_record(&[w, &level, &position_range, &count, &docids])?; } } diff --git a/milli/src/update/words_level_positions.rs b/milli/src/update/words_level_positions.rs index 77cec246a..a7be248b6 100644 --- a/milli/src/update/words_level_positions.rs +++ b/milli/src/update/words_level_positions.rs @@ -104,16 +104,16 @@ fn compute_positions_levels( for result in words_db.iter(rtxn)? { let (word, ()) = result?; - let first_level_size = words_positions_db.remap_data_type::() - .prefix_iter(rtxn, &(word, 0, u32::min_value(), u32::min_value()))? - .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - let level_0_range = { let left = (word, 0, u32::min_value(), u32::min_value()); let right = (word, 0, u32::max_value(), u32::max_value()); left..=right }; + let first_level_size = words_positions_db.remap_data_type::() + .range(rtxn, &level_0_range)? + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. let group_size_iter = (1u8..) @@ -132,7 +132,7 @@ fn compute_positions_levels( let mut group_docids = RoaringBitmap::new(); for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() { - let ((_field_id, _level, value, _right), docids) = result?; + let ((_word, _level, value, _right), docids) = result?; if i == 0 { left = value;