last PR fixes

This commit is contained in:
ManyTheFish 2023-03-09 15:34:36 +01:00
parent dea101e3d9
commit 2f8eb4f54a

View File

@ -79,7 +79,11 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
// if we detect a potential mistake in the language detection, // if we detect a potential mistake in the language detection,
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages. // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
// context: https://github.com/meilisearch/meilisearch/issues/3565 // context: https://github.com/meilisearch/meilisearch/issues/3565
if script_language_word_count.values().any(potential_language_detection_error) { if script_language_word_count
.values()
.map(Vec::as_slice)
.any(potential_language_detection_error)
{
// build an allow list with the most frequent detected languages in the document. // build an allow list with the most frequent detected languages in the document.
let script_language: HashMap<_, _> = let script_language: HashMap<_, _> =
script_language_word_count.iter().filter_map(most_frequent_languages).collect(); script_language_word_count.iter().filter_map(most_frequent_languages).collect();
@ -254,7 +258,7 @@ fn process_tokens<'a>(
.filter(|(_, t)| t.is_word()) .filter(|(_, t)| t.is_word())
} }
fn potential_language_detection_error(languages_frequency: &Vec<(Language, usize)>) -> bool { fn potential_language_detection_error(languages_frequency: &[(Language, usize)]) -> bool {
if languages_frequency.len() > 1 { if languages_frequency.len() > 1 {
let threshold = compute_language_frequency_threshold(languages_frequency); let threshold = compute_language_frequency_threshold(languages_frequency);
languages_frequency.iter().any(|(_, c)| *c <= threshold) languages_frequency.iter().any(|(_, c)| *c <= threshold)
@ -289,6 +293,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)
#[derive(Default)] #[derive(Default)]
struct Buffers { struct Buffers {
// the key buffer is the concatenation of the internal document id with the field id.
// The buffer has to be completely cleared between documents,
// and the field id part must be cleared between each field.
key_buffer: Vec<u8>, key_buffer: Vec<u8>,
// the field buffer is used for each field's deserialization, and must be cleared between each field.
field_buffer: String, field_buffer: String,
} }