diff --git a/milli/src/index.rs b/milli/src/index.rs
index 76bc273d9..03f16a126 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -4,6 +4,7 @@ use std::fs::File;
 use std::mem::size_of;
 use std::path::Path;
 
+use charabia::{Language, Script};
 use heed::flags::Flags;
 use heed::types::*;
 use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn};
@@ -1194,6 +1195,18 @@ impl Index {
     pub(crate) fn delete_pagination_max_total_hits(&self, txn: &mut RwTxn) -> heed::Result<bool> {
         self.main.delete::<_, Str>(txn, main_key::PAGINATION_MAX_TOTAL_HITS)
     }
+
+    /* script language docids */
+    /// Retrieves the ids of all the documents stored under the given `(Script, Language)` key.
+    ///
+    /// Returns `Ok(None)` when the database holds no entry for that key.
+    pub fn script_language_documents_ids(
+        &self,
+        rtxn: &RoTxn,
+        key: &(Script, Language),
+    ) -> heed::Result<Option<RoaringBitmap>> {
+        self.script_language_docids.get(rtxn, key)
+    }
 }
 
 #[cfg(test)]
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index be97defbd..f736cb55e 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -1907,4 +1907,46 @@ mod tests {
 
         index.add_documents(doc1).unwrap();
     }
+
+    /// Indexes documents written in several scripts/languages and checks which document ids are
+    /// stored under the `(Script, Language)` keys queried below.
+    #[cfg(feature = "default")]
+    #[test]
+    fn store_detected_script_and_language_per_document_during_indexing() {
+        use charabia::{Language, Script};
+        let index = TempIndex::new();
+        index
+            .add_documents(documents!([
+                { "id": 1, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
+                { "id": 2, "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
+                { "id": 3, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
+                { "id": 4, "title": "関西国際空港限定トートバッグ すもももももももものうち" },
+                { "id": 5, "title": "ภาษาไทยง่ายนิดเดียว" },
+                { "id": 6, "title": "The quick 在尊嚴和權利上一律平等。" },
+            ]))
+            .unwrap();
+
+        // The expected ids below are each one less than the `"id"` field of the matching document,
+        // i.e. they are not the external ids (presumably zero-based internal ids in insertion order).
+        let rtxn = index.read_txn().unwrap();
+        let key_thai = (Script::Thai, Language::Other);
+        let key_jpn = (Script::Cj, Language::Jpn);
+        let key_cmn = (Script::Cj, Language::Cmn);
+        let thai_docs = index.script_language_documents_ids(&rtxn, &key_thai).unwrap().unwrap();
+        let cj_jpn_docs = index.script_language_documents_ids(&rtxn, &key_jpn).unwrap().unwrap();
+        let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap();
+        // Thai: the document with "id": 5.
+        let mut expected_thai_docids = RoaringBitmap::new();
+        expected_thai_docids.push(4);
+        assert_eq!(thai_docs, expected_thai_docids);
+        // Japanese: the document with "id": 4.
+        let mut expected_cj_jpn_docids = RoaringBitmap::new();
+        expected_cj_jpn_docids.push(3);
+        assert_eq!(cj_jpn_docs, expected_cj_jpn_docids);
+        // Mandarin: the documents with "id": 2 and "id": 6 (the mixed English/Chinese title).
+        let mut expected_cj_cmn_docids = RoaringBitmap::new();
+        expected_cj_cmn_docids.push(1);
+        expected_cj_cmn_docids.push(5);
+        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
+    }
 }