From 37d4551e8e196ad849a91eeef9b86c73bb5c88f8 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 7 Mar 2023 19:38:01 +0100 Subject: [PATCH] Add a threshold filtering the Languages allowed to be detected at search time --- milli/src/index.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index a4048dfb0..7a473c0b4 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1211,11 +1211,22 @@ impl Index { let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; let mut script_language: HashMap<Script, Vec<Language>> = HashMap::new(); + let mut script_language_doc_count: Vec<(Script, Language, u64)> = Vec::new(); + let mut total = 0; for sl in self.script_language_docids.iter(rtxn)? { let ((script, language), docids) = sl?; // keep only Languages that contains at least 1 document. - if !soft_deleted_documents.is_superset(&docids) { + let remaining_documents_count = (docids - &soft_deleted_documents).len(); + total += remaining_documents_count; + if remaining_documents_count > 0 { + script_language_doc_count.push((script, language, remaining_documents_count)); + } + } + + let threshold = total / 20; // 5% (arbitrary) + for (script, language, count) in script_language_doc_count { + if count > threshold { if let Some(languages) = script_language.get_mut(&script) { (*languages).push(language); } else {