mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-23 10:37:41 +08:00
Compute the biggest values of the words_level_positions_docids
This commit is contained in:
parent
f713828406
commit
8bd4f5d93e
@ -346,6 +346,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
|||||||
let docid_word_positions_name = "docid_word_positions";
|
let docid_word_positions_name = "docid_word_positions";
|
||||||
let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids";
|
let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids";
|
||||||
let word_pair_proximity_docids_name = "word_pair_proximity_docids";
|
let word_pair_proximity_docids_name = "word_pair_proximity_docids";
|
||||||
|
let word_level_position_docids_name = "word_level_position_docids";
|
||||||
let facet_field_id_value_docids_name = "facet_field_id_value_docids";
|
let facet_field_id_value_docids_name = "facet_field_id_value_docids";
|
||||||
let documents_name = "documents";
|
let documents_name = "documents";
|
||||||
|
|
||||||
@ -402,6 +403,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
|||||||
if heap.len() > limit { heap.pop(); }
|
if heap.len() > limit { heap.pop(); }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for result in word_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||||
|
let ((word, level, left, right), value) = result?;
|
||||||
|
let key = format!("{} {} {:?}", word, level, left..=right);
|
||||||
|
heap.push(Reverse((value.len(), key, word_level_position_docids_name)));
|
||||||
|
if heap.len() > limit { heap.pop(); }
|
||||||
|
}
|
||||||
|
|
||||||
let faceted_fields = index.faceted_fields_ids(rtxn)?;
|
let faceted_fields = index.faceted_fields_ids(rtxn)?;
|
||||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||||
for (field_id, field_type) in faceted_fields {
|
for (field_id, field_type) in faceted_fields {
|
||||||
@ -549,7 +557,7 @@ fn words_level_positions_docids(
|
|||||||
{
|
{
|
||||||
let stdout = io::stdout();
|
let stdout = io::stdout();
|
||||||
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
||||||
wtr.write_record(&["word", "level", "position_range", "documents_count", "documents_ids"])?;
|
wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?;
|
||||||
|
|
||||||
for word in words.iter().map(AsRef::as_ref) {
|
for word in words.iter().map(AsRef::as_ref) {
|
||||||
let range = {
|
let range = {
|
||||||
@ -561,14 +569,18 @@ fn words_level_positions_docids(
|
|||||||
let ((w, level, left, right), docids) = result?;
|
let ((w, level, left, right), docids) = result?;
|
||||||
if word != w { break }
|
if word != w { break }
|
||||||
|
|
||||||
let level = level.to_string();
|
|
||||||
let count = docids.len().to_string();
|
let count = docids.len().to_string();
|
||||||
let docids = if debug {
|
let docids = if debug {
|
||||||
format!("{:?}", docids)
|
format!("{:?}", docids)
|
||||||
} else {
|
} else {
|
||||||
format!("{:?}", docids.iter().collect::<Vec<_>>())
|
format!("{:?}", docids.iter().collect::<Vec<_>>())
|
||||||
};
|
};
|
||||||
let position_range = format!("{:?}", left..=right);
|
let position_range = if level == 0 {
|
||||||
|
format!("{:?}", left)
|
||||||
|
} else {
|
||||||
|
format!("{:?}", left..=right)
|
||||||
|
};
|
||||||
|
let level = level.to_string();
|
||||||
wtr.write_record(&[w, &level, &position_range, &count, &docids])?;
|
wtr.write_record(&[w, &level, &position_range, &count, &docids])?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -104,16 +104,16 @@ fn compute_positions_levels(
|
|||||||
for result in words_db.iter(rtxn)? {
|
for result in words_db.iter(rtxn)? {
|
||||||
let (word, ()) = result?;
|
let (word, ()) = result?;
|
||||||
|
|
||||||
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
|
|
||||||
.prefix_iter(rtxn, &(word, 0, u32::min_value(), u32::min_value()))?
|
|
||||||
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
|
|
||||||
|
|
||||||
let level_0_range = {
|
let level_0_range = {
|
||||||
let left = (word, 0, u32::min_value(), u32::min_value());
|
let left = (word, 0, u32::min_value(), u32::min_value());
|
||||||
let right = (word, 0, u32::max_value(), u32::max_value());
|
let right = (word, 0, u32::max_value(), u32::max_value());
|
||||||
left..=right
|
left..=right
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
|
||||||
|
.range(rtxn, &level_0_range)?
|
||||||
|
.fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?;
|
||||||
|
|
||||||
// Groups sizes are always a power of the original level_group_size and therefore a group
|
// Groups sizes are always a power of the original level_group_size and therefore a group
|
||||||
// always maps groups of the previous level and never splits previous levels groups in half.
|
// always maps groups of the previous level and never splits previous levels groups in half.
|
||||||
let group_size_iter = (1u8..)
|
let group_size_iter = (1u8..)
|
||||||
@ -132,7 +132,7 @@ fn compute_positions_levels(
|
|||||||
let mut group_docids = RoaringBitmap::new();
|
let mut group_docids = RoaringBitmap::new();
|
||||||
|
|
||||||
for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() {
|
for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() {
|
||||||
let ((_field_id, _level, value, _right), docids) = result?;
|
let ((_word, _level, value, _right), docids) = result?;
|
||||||
|
|
||||||
if i == 0 {
|
if i == 0 {
|
||||||
left = value;
|
left = value;
|
||||||
|
Loading…
Reference in New Issue
Block a user