mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-27 04:25:06 +08:00
Compute stats about the word prefix level positions database in the infos crate
This commit is contained in:
parent
e65bad16cc
commit
1aad66bdaa
@ -21,6 +21,7 @@ const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions";
|
|||||||
const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids";
|
const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids";
|
||||||
const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids";
|
const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids";
|
||||||
const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids";
|
const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids";
|
||||||
|
const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids";
|
||||||
const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids";
|
const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids";
|
||||||
const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values";
|
const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values";
|
||||||
const DOCUMENTS_DB_NAME: &str = "documents";
|
const DOCUMENTS_DB_NAME: &str = "documents";
|
||||||
@ -33,6 +34,7 @@ const ALL_DATABASE_NAMES: &[&str] = &[
|
|||||||
WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME,
|
WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME,
|
||||||
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME,
|
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME,
|
||||||
WORD_LEVEL_POSITION_DOCIDS_DB_NAME,
|
WORD_LEVEL_POSITION_DOCIDS_DB_NAME,
|
||||||
|
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME,
|
||||||
FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME,
|
FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME,
|
||||||
FIELD_ID_DOCID_FACET_VALUES_DB_NAME,
|
FIELD_ID_DOCID_FACET_VALUES_DB_NAME,
|
||||||
DOCUMENTS_DB_NAME,
|
DOCUMENTS_DB_NAME,
|
||||||
@ -122,10 +124,21 @@ enum Command {
|
|||||||
#[structopt(long)]
|
#[structopt(long)]
|
||||||
full_display: bool,
|
full_display: bool,
|
||||||
|
|
||||||
/// The field name in the document.
|
/// Words appearing in the documents.
|
||||||
words: Vec<String>,
|
words: Vec<String>,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
/// Outputs a CSV with the documents ids along with
|
||||||
|
/// the word prefix level positions where it appears.
|
||||||
|
WordPrefixesLevelPositionsDocids {
|
||||||
|
/// Display the whole documents ids in details.
|
||||||
|
#[structopt(long)]
|
||||||
|
full_display: bool,
|
||||||
|
|
||||||
|
/// Prefixes of words appearing in the documents.
|
||||||
|
prefixes: Vec<String>,
|
||||||
|
},
|
||||||
|
|
||||||
/// Outputs a CSV with the documents ids, words and the positions where this word appears.
|
/// Outputs a CSV with the documents ids, words and the positions where this word appears.
|
||||||
DocidsWordsPositions {
|
DocidsWordsPositions {
|
||||||
/// Display the whole positions in detail.
|
/// Display the whole positions in detail.
|
||||||
@ -236,6 +249,9 @@ fn main() -> anyhow::Result<()> {
|
|||||||
WordsLevelPositionsDocids { full_display, words } => {
|
WordsLevelPositionsDocids { full_display, words } => {
|
||||||
words_level_positions_docids(&index, &rtxn, !full_display, words)
|
words_level_positions_docids(&index, &rtxn, !full_display, words)
|
||||||
},
|
},
|
||||||
|
WordPrefixesLevelPositionsDocids { full_display, prefixes } => {
|
||||||
|
word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes)
|
||||||
|
},
|
||||||
DocidsWordsPositions { full_display, internal_documents_ids } => {
|
DocidsWordsPositions { full_display, internal_documents_ids } => {
|
||||||
docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids)
|
docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids)
|
||||||
},
|
},
|
||||||
@ -335,6 +351,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
|||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
word_level_position_docids,
|
word_level_position_docids,
|
||||||
|
word_prefix_level_position_docids,
|
||||||
facet_field_id_value_docids,
|
facet_field_id_value_docids,
|
||||||
field_id_docid_facet_values: _,
|
field_id_docid_facet_values: _,
|
||||||
documents,
|
documents,
|
||||||
@ -348,6 +365,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
|||||||
let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids";
|
let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids";
|
||||||
let word_pair_proximity_docids_name = "word_pair_proximity_docids";
|
let word_pair_proximity_docids_name = "word_pair_proximity_docids";
|
||||||
let word_level_position_docids_name = "word_level_position_docids";
|
let word_level_position_docids_name = "word_level_position_docids";
|
||||||
|
let word_prefix_level_position_docids_name = "word_prefix_level_position_docids";
|
||||||
let facet_field_id_value_docids_name = "facet_field_id_value_docids";
|
let facet_field_id_value_docids_name = "facet_field_id_value_docids";
|
||||||
let documents_name = "documents";
|
let documents_name = "documents";
|
||||||
|
|
||||||
@ -411,6 +429,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
|||||||
if heap.len() > limit { heap.pop(); }
|
if heap.len() > limit { heap.pop(); }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for result in word_prefix_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? {
|
||||||
|
let ((word, level, left, right), value) = result?;
|
||||||
|
let key = format!("{} {} {:?}", word, level, left..=right);
|
||||||
|
heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name)));
|
||||||
|
if heap.len() > limit { heap.pop(); }
|
||||||
|
}
|
||||||
|
|
||||||
let faceted_fields = index.faceted_fields_ids(rtxn)?;
|
let faceted_fields = index.faceted_fields_ids(rtxn)?;
|
||||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||||
for (field_id, field_type) in faceted_fields {
|
for (field_id, field_type) in faceted_fields {
|
||||||
@ -588,6 +613,45 @@ fn words_level_positions_docids(
|
|||||||
Ok(wtr.flush()?)
|
Ok(wtr.flush()?)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn word_prefixes_level_positions_docids(
|
||||||
|
index: &Index,
|
||||||
|
rtxn: &heed::RoTxn,
|
||||||
|
debug: bool,
|
||||||
|
prefixes: Vec<String>,
|
||||||
|
) -> anyhow::Result<()>
|
||||||
|
{
|
||||||
|
let stdout = io::stdout();
|
||||||
|
let mut wtr = csv::Writer::from_writer(stdout.lock());
|
||||||
|
wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?;
|
||||||
|
|
||||||
|
for word in prefixes.iter().map(AsRef::as_ref) {
|
||||||
|
let range = {
|
||||||
|
let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value());
|
||||||
|
let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value());
|
||||||
|
left..=right
|
||||||
|
};
|
||||||
|
for result in index.word_prefix_level_position_docids.range(rtxn, &range)? {
|
||||||
|
let ((w, level, left, right), docids) = result?;
|
||||||
|
|
||||||
|
let count = docids.len().to_string();
|
||||||
|
let docids = if debug {
|
||||||
|
format!("{:?}", docids)
|
||||||
|
} else {
|
||||||
|
format!("{:?}", docids.iter().collect::<Vec<_>>())
|
||||||
|
};
|
||||||
|
let position_range = if level == TreeLevel::min_value() {
|
||||||
|
format!("{:?}", left)
|
||||||
|
} else {
|
||||||
|
format!("{:?}", left..=right)
|
||||||
|
};
|
||||||
|
let level = level.to_string();
|
||||||
|
wtr.write_record(&[w, &level, &position_range, &count, &docids])?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(wtr.flush()?)
|
||||||
|
}
|
||||||
|
|
||||||
fn docids_words_positions(
|
fn docids_words_positions(
|
||||||
index: &Index,
|
index: &Index,
|
||||||
rtxn: &heed::RoTxn,
|
rtxn: &heed::RoTxn,
|
||||||
@ -779,6 +843,21 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any
|
|||||||
fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> anyhow::Result<()> {
|
fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> anyhow::Result<()> {
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
|
|
||||||
|
let Index {
|
||||||
|
env: _,
|
||||||
|
main,
|
||||||
|
word_docids,
|
||||||
|
word_prefix_docids,
|
||||||
|
docid_word_positions,
|
||||||
|
word_pair_proximity_docids,
|
||||||
|
word_prefix_pair_proximity_docids,
|
||||||
|
word_level_position_docids,
|
||||||
|
word_prefix_level_position_docids,
|
||||||
|
facet_field_id_value_docids,
|
||||||
|
field_id_docid_facet_values,
|
||||||
|
documents,
|
||||||
|
} = index;
|
||||||
|
|
||||||
let names = if names.is_empty() {
|
let names = if names.is_empty() {
|
||||||
ALL_DATABASE_NAMES.iter().map(|s| s.to_string()).collect()
|
ALL_DATABASE_NAMES.iter().map(|s| s.to_string()).collect()
|
||||||
} else {
|
} else {
|
||||||
@ -787,15 +866,17 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a
|
|||||||
|
|
||||||
for name in names {
|
for name in names {
|
||||||
let database = match name.as_str() {
|
let database = match name.as_str() {
|
||||||
MAIN_DB_NAME => &index.main,
|
MAIN_DB_NAME => &main,
|
||||||
WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(),
|
WORD_PREFIX_DOCIDS_DB_NAME => word_prefix_docids.as_polymorph(),
|
||||||
WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(),
|
WORD_DOCIDS_DB_NAME => word_docids.as_polymorph(),
|
||||||
DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(),
|
DOCID_WORD_POSITIONS_DB_NAME => docid_word_positions.as_polymorph(),
|
||||||
WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(),
|
WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_pair_proximity_docids.as_polymorph(),
|
||||||
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(),
|
WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(),
|
||||||
FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => index.facet_field_id_value_docids.as_polymorph(),
|
WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(),
|
||||||
FIELD_ID_DOCID_FACET_VALUES_DB_NAME => index.field_id_docid_facet_values.as_polymorph(),
|
WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(),
|
||||||
DOCUMENTS_DB_NAME => index.documents.as_polymorph(),
|
FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => facet_field_id_value_docids.as_polymorph(),
|
||||||
|
FIELD_ID_DOCID_FACET_VALUES_DB_NAME => field_id_docid_facet_values.as_polymorph(),
|
||||||
|
DOCUMENTS_DB_NAME => documents.as_polymorph(),
|
||||||
unknown => anyhow::bail!("unknown database {:?}", unknown),
|
unknown => anyhow::bail!("unknown database {:?}", unknown),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user