diff --git a/Cargo.lock b/Cargo.lock index 3f27d12a7..2b0657da8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -517,12 +517,6 @@ dependencies = [ "serde", ] -[[package]] -name = "build_const" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ae4235e6dac0694637c763029ecea1a2ec9e4e06ec2729bd21ba4d9c863eb7" - [[package]] name = "bumpalo" version = "3.11.1" @@ -659,16 +653,19 @@ dependencies = [ [[package]] name = "charabia" -version = "0.7.0" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b57f9571f611796ea38e5a9c12e5ce37476f70397b032757f8dfe0c7b9bc5637" +checksum = "1ad3d9667a6b4e03813162c22c4d58235c2dc25d580d60837ce29199038341c9" dependencies = [ "cow-utils", "csv", "deunicode", "fst", + "irg-kvariants", "jieba-rs", "lindera", + "lindera-ipadic", + "lindera-ko-dic", "once_cell", "pinyin", "serde", @@ -721,14 +718,9 @@ version = "3.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71655c45cb9845d3270c9d6df84ebe72b4dad3c2ba3f7023ad47c144e4e473a5" dependencies = [ - "atty", "bitflags", - "clap_derive 3.2.18", "clap_lex 0.2.4", "indexmap", - "once_cell", - "strsim", - "termcolor", "textwrap", ] @@ -739,7 +731,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7db700bc935f9e43e88d00b0850dae18a63773cfbec6d8e070fccf7fef89a39" dependencies = [ "bitflags", - "clap_derive 4.0.21", + "clap_derive", "clap_lex 0.3.0", "is-terminal", "once_cell", @@ -747,19 +739,6 @@ dependencies = [ "termcolor", ] -[[package]] -name = "clap_derive" -version = "3.2.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea0c8bce528c4be4da13ea6fead8965e95b6073585a2f05204bd8f4119f82a65" -dependencies = [ - "heck", - "proc-macro-error", - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "clap_derive" version = "4.0.21" @@ -873,15 +852,6 @@ dependencies = [ "libc", ] -[[package]] -name = "crc" -version = 
"1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d663548de7f5cca343f1e0a48d14dcfb0e9eb4e079ec58883b7251539fa10aeb" -dependencies = [ - "build_const", -] - [[package]] name = "crc32fast" version = "1.3.2" @@ -1333,6 +1303,19 @@ dependencies = [ "termcolor", ] +[[package]] +name = "env_logger" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0" +dependencies = [ + "humantime", + "is-terminal", + "log", + "regex", + "termcolor", +] + [[package]] name = "errno" version = "0.2.8" @@ -1986,6 +1969,17 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30e22bd8629359895450b59ea7a776c850561b96a3b1d31321c1949d9e6c9146" +[[package]] +name = "irg-kvariants" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c73214298363629cf9dbfc93b426808865ee3c121029778cb31b1284104fdf78" +dependencies = [ + "csv", + "once_cell", + "serde", +] + [[package]] name = "is-terminal" version = "0.4.2" @@ -2074,6 +2068,15 @@ dependencies = [ "simple_asn1", ] +[[package]] +name = "kanaria" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" +dependencies = [ + "bitflags", +] + [[package]] name = "language-tags" version = "0.3.2" @@ -2143,14 +2146,15 @@ dependencies = [ [[package]] name = "lindera" -version = "0.17.0" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "082ca91ac4d1557028ace9bfb8cee1500d156a4574dda93cfcdcf4caaebb9bd7" +checksum = "0f33a20bb9cbf95572b2d2f40d7040c8d8c7ad09ae20e1f6513db6ef2564dfc5" dependencies = [ "anyhow", "bincode", "byteorder", "encoding", + "kanaria", "lindera-cc-cedict-builder", "lindera-core", "lindera-dictionary", @@ -2159,24 +2163,27 @@ dependencies = [ 
"lindera-ko-dic", "lindera-ko-dic-builder", "lindera-unidic-builder", + "regex", "serde", "serde_json", "thiserror", + "unicode-blocks", + "unicode-normalization", + "yada", ] [[package]] name = "lindera-cc-cedict-builder" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8967615a6d85320ec2755e1435c36165467ba01a79026adc3f86dad1b668df3" +checksum = "60c3b379251edadbac7a5fdb31e482274e11dae6ab6cc789d0d86cf34369cf49" dependencies = [ "anyhow", "bincode", "byteorder", - "clap 3.2.23", "csv", "encoding", - "env_logger", + "env_logger 0.10.0", "glob", "lindera-core", "lindera-decompress", @@ -2185,16 +2192,28 @@ dependencies = [ ] [[package]] -name = "lindera-core" -version = "0.17.0" +name = "lindera-compress" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e8ed3cea13f73557a4574a179b1518670a3b70bfdad120521313b03cc89380e" +checksum = "a8d0ea3de5625e2381cac94e518d3b56103fde56bc0dce840fe875c1e871b125" +dependencies = [ + "anyhow", + "flate2", + "lindera-decompress", +] + +[[package]] +name = "lindera-core" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2281747b98fdd46bcc54ce7fdb6870dad9f67ddb3dc086c47b6704f3e1178cd5" dependencies = [ "anyhow", "bincode", "byteorder", "encoding_rs", "log", + "once_cell", "serde", "thiserror", "yada", @@ -2202,20 +2221,20 @@ dependencies = [ [[package]] name = "lindera-decompress" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2badb41828f89cfa6452db0a66da77897c0a04478304de26c8b2b36613e08d43" +checksum = "52101bd454754c506305ab897af5ac2ae41fe91e3272c1ff5c6a02a089dfaefd" dependencies = [ "anyhow", - "lzma-rs", + "flate2", "serde", ] [[package]] name = "lindera-dictionary" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e219722c9f56b920c231210e7c25d8b5d35b508e7a2fd69d368916c4b1c926f6" +checksum = "af1c6668848f1d30d216c99093a3ed3fe125c105fa12a4aeed5a1861dc01dd52" dependencies = [ "anyhow", "bincode", @@ -2225,15 +2244,16 @@ dependencies = [ [[package]] name = "lindera-ipadic" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c8e87c8362c724e8188fb7d9b6d184cac15d01369295e9bff7812b630d57e3b" +checksum = "693098007200fa43fd5cdc9ca8740f371327369672ce812cd87a1f6344971e31" dependencies = [ "bincode", "byteorder", "encoding", "flate2", "lindera-core", + "lindera-decompress", "lindera-ipadic-builder", "once_cell", "tar", @@ -2241,19 +2261,19 @@ dependencies = [ [[package]] name = "lindera-ipadic-builder" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1439e95852e444a116424086dc64d709c90e8af269ff7d2c2c4020f666f8dfab" +checksum = "7b6b7240d097a8fc37ee8f90ebff02c4db0ba5325ecb0dacb6da3724596798c9" dependencies = [ "anyhow", "bincode", "byteorder", - "clap 3.2.23", "csv", "encoding_rs", "encoding_rs_io", - "env_logger", + "env_logger 0.10.0", "glob", + "lindera-compress", "lindera-core", "lindera-decompress", "log", @@ -2263,15 +2283,16 @@ dependencies = [ [[package]] name = "lindera-ko-dic" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb15f949220da45872d774b7831bb030855ec083435c907499782f8558c8a203" +checksum = "abd3c5a4addeb61ca66788a3dd1fd51093e6cd8fea1d997042ada5aa60e8cc5e" dependencies = [ "bincode", "byteorder", "encoding", "flate2", "lindera-core", + "lindera-decompress", "lindera-ko-dic-builder", "once_cell", "tar", @@ -2279,18 +2300,18 @@ dependencies = [ [[package]] name = "lindera-ko-dic-builder" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fde5a7352f4754be4f741e90bf4dff38a12a6572ab3880d0cf688e1166b8d82b" +checksum = "512bb1393a9281e0b13704319d1343b7931416865852d9d7b7c0178431518326" dependencies = [ "anyhow", "bincode", "byteorder", - "clap 3.2.23", "csv", "encoding", - "env_logger", + "env_logger 0.10.0", "glob", + "lindera-compress", "lindera-core", "lindera-decompress", "log", @@ -2299,17 +2320,16 @@ dependencies = [ [[package]] name = "lindera-unidic-builder" -version = "0.17.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1451b2ed8a7184a5f815d84f99d358c1d67297305831453dfdc0eb5d08e22b5" +checksum = "7f575a27f8ba67c15fe16ebf7d277a0ac04e8c8a0f72670ebc2443da9d41c450" dependencies = [ "anyhow", "bincode", "byteorder", - "clap 3.2.23", "csv", "encoding", - "env_logger", + "env_logger 0.10.0", "glob", "lindera-core", "lindera-decompress", @@ -2398,16 +2418,6 @@ dependencies = [ "syn", ] -[[package]] -name = "lzma-rs" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba8ecb0450dfabce4ad72085eed0a75dffe8f21f7ada05638564ea9db2d7fb1" -dependencies = [ - "byteorder", - "crc", -] - [[package]] name = "manifest-dir-macros" version = "0.1.16" @@ -2466,7 +2476,7 @@ dependencies = [ "deserr", "dump", "either", - "env_logger", + "env_logger 0.9.3", "file-store", "flate2", "fst", @@ -4011,6 +4021,12 @@ version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "099b7128301d285f79ddd55b9a83d5e6b9e97c92e0ea0daebee7263e932de992" +[[package]] +name = "unicode-blocks" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9de2be6bad6f56ce8373d377e611cbb2265de3a656138065609ce82e217aad70" + [[package]] name = "unicode-ident" version = "1.0.6" diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 60ffa6cee..91ff64d37 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -148,6 
+148,28 @@ async fn simple_search() { .await; } +#[cfg(feature = "default")] +#[actix_rt::test] +async fn test_kanji_language_detection() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = json!([ + { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, + { "id": 1, "title": "東京のお寿司。" }, + { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" } + ]); + index.add_documents(documents, None).await; + index.wait_task(0).await; + + index + .search(json!({"q": "東京"}), |response, code| { + assert_eq!(code, 200, "{}", response); + assert_eq!(response["hits"].as_array().unwrap().len(), 1); + }) + .await; +} + #[actix_rt::test] async fn search_multiple_params() { let server = Server::new().await; diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 5e359dddc..04591e8fd 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -16,7 +16,7 @@ bimap = { version = "0.6.2", features = ["serde"] } bincode = "1.3.3" bstr = "1.0.1" byteorder = "1.4.3" -charabia = { version = "0.7.0", default-features = false } +charabia = { version = "0.7.1", default-features = false } concat-arrays = "0.1.2" crossbeam-channel = "0.5.6" deserr = "0.5.0" diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index 702dcf661..a4df63e22 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -5,6 +5,7 @@ mod field_id_word_count_codec; mod obkv_codec; mod roaring_bitmap; mod roaring_bitmap_length; +mod script_language_codec; mod str_beu32_codec; mod str_ref; mod str_str_u8_codec; @@ -19,5 +20,6 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar pub use self::roaring_bitmap_length::{ BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, }; +pub use self::script_language_codec::ScriptLanguageCodec; pub use self::str_beu32_codec::StrBEU32Codec; pub use 
self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs new file mode 100644 index 000000000..83e8a7241 --- /dev/null +++ b/milli/src/heed_codec/script_language_codec.rs @@ -0,0 +1,38 @@ +use std::borrow::Cow; +use std::str; + +use charabia::{Language, Script}; + +pub struct ScriptLanguageCodec; + +impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec { + type DItem = (Script, Language); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let sep = bytes.iter().position(|b| *b == 0)?; + let (s_bytes, l_bytes) = bytes.split_at(sep); + let script = str::from_utf8(s_bytes).ok()?; + let script_name = Script::from_name(script); + let lan = str::from_utf8(l_bytes).ok()?; + // skip '\0' byte between the two strings. + let lan_name = Language::from_name(&lan[1..]); + + Some((script_name, lan_name)) + } +} + +impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec { + type EItem = (Script, Language); + + fn bytes_encode((script, lan): &Self::EItem) -> Option> { + let script_name = script.name().as_bytes(); + let lan_name = lan.name().as_bytes(); + + let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len() + 1); + bytes.extend_from_slice(script_name); + bytes.push(0); + bytes.extend_from_slice(lan_name); + + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 47bf54e2d..a4048dfb0 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -4,6 +4,7 @@ use std::fs::File; use std::mem::size_of; use std::path::Path; +use charabia::{Language, Script}; use heed::flags::Flags; use heed::types::*; use heed::{CompactionOption, Database, PolyDatabase, RoTxn, RwTxn}; @@ -18,7 +19,7 @@ use crate::heed_codec::facet::{ FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec, OrderedF64Codec, }; -use crate::heed_codec::StrRefCodec; +use 
crate::heed_codec::{ScriptLanguageCodec, StrRefCodec}; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, @@ -83,6 +84,7 @@ pub mod db_name { pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; pub const DOCUMENTS: &str = "documents"; + pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids"; } #[derive(Clone)] @@ -122,6 +124,9 @@ pub struct Index { /// Maps the position of a word prefix with all the docids where this prefix appears. pub word_prefix_position_docids: Database, + /// Maps the script and language with all the docids that correspond to it. + pub script_language_docids: Database, + /// Maps the facet field id and the docids for which this field exists pub facet_id_exists_docids: Database, @@ -148,7 +153,7 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(18); + options.max_dbs(19); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -159,6 +164,7 @@ impl Index { let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?; let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; + let script_language_docids = env.create_database(Some(SCRIPT_LANGUAGE_DOCIDS))?; let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; let prefix_word_pair_proximity_docids = @@ -186,6 +192,7 @@ impl Index { exact_word_prefix_docids, docid_word_positions, word_pair_proximity_docids, + script_language_docids, word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids, word_position_docids, @@ -1187,6 +1194,38 @@ impl Index { pub(crate) fn delete_pagination_max_total_hits(&self, txn: &mut RwTxn) -> 
heed::Result { self.main.delete::<_, Str>(txn, main_key::PAGINATION_MAX_TOTAL_HITS) } + + /* script language docids */ + /// Retrieve all the documents ids that correspond to the (Script, Language) key, `None` if the key has no entry. + pub fn script_language_documents_ids( + &self, + rtxn: &RoTxn, + key: &(Script, Language), + ) -> heed::Result> { + let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; + let doc_ids = self.script_language_docids.get(rtxn, key)?; + Ok(doc_ids.map(|ids| ids - soft_deleted_documents)) + } + + pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result>> { + let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?; + + let mut script_language: HashMap> = HashMap::new(); + for sl in self.script_language_docids.iter(rtxn)? { + let ((script, language), docids) = sl?; + + // keep only Languages that contain at least 1 document. + if !soft_deleted_documents.is_superset(&docids) { + if let Some(languages) = script_language.get_mut(&script) { + (*languages).push(language); + } else { + script_language.insert(script, vec![language]); + } + } + } + + Ok(script_language) + } } #[cfg(test)] diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index dc48e04a8..bd140284d 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -152,6 +152,11 @@ impl<'a> Search<'a> { tokbuilder.stop_words(stop_words); } + let script_lang_map = self.index.script_language(self.rtxn)?; + if !script_lang_map.is_empty() { + tokbuilder.allow_list(&script_lang_map); + } + let tokenizer = tokbuilder.build(); let tokens = tokenizer.tokenize(query); builder @@ -446,6 +451,28 @@ mod test { use super::*; use crate::index::tests::TempIndex; + #[cfg(feature = "default")] + #[test] + fn test_kanji_language_detection() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" 
}, + { "id": 1, "title": "東京のお寿司。" }, + { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" } + ])) + .unwrap(); + + let txn = index.write_txn().unwrap(); + let mut search = Search::new(&txn, &index); + + search.query("東京"); + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + + assert_eq!(documents_ids, vec![1]); + } + #[test] fn test_is_authorized_typos() { let index = TempIndex::new(); diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index adeea11fa..0296bc192 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -30,6 +30,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_position_docids, field_id_word_count_docids, word_prefix_position_docids, + script_language_docids, facet_id_f64_docids, facet_id_string_docids, facet_id_exists_docids, @@ -82,6 +83,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_position_docids.clear(self.wtxn)?; field_id_word_count_docids.clear(self.wtxn)?; word_prefix_position_docids.clear(self.wtxn)?; + script_language_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?; facet_id_exists_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index daf186683..eeb67b829 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -243,6 +243,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { facet_id_string_docids: _, field_id_docid_facet_f64s: _, field_id_docid_facet_strings: _, + script_language_docids, facet_id_exists_docids, documents, } = self.index; @@ -499,6 +500,22 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { .execute(self.wtxn)?; } + // Remove the documents ids from the script language database. 
+ let mut iter = script_language_docids.iter_mut(self.wtxn)?; + while let Some((key, mut docids)) = iter.next().transpose()? { + let previous_len = docids.len(); + docids -= &self.to_delete_docids; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let key = key.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; + } + } + + drop(iter); // We delete the documents ids that are under the facet field id values. remove_docids_from_facet_id_exists_docids( self.wtxn, @@ -1166,4 +1183,52 @@ mod tests { stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard); stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft); } + + fn stored_detected_script_and_language_should_not_return_deleted_documents_( + deletion_strategy: DeletionStrategy, + ) { + use charabia::{Language, Script}; + let index = TempIndex::new(); + let mut wtxn = index.write_txn().unwrap(); + index + .add_documents_using_wtxn( + &mut wtxn, + documents!([ + { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, + { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" }, + { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" 
}, + { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" }, + { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" }, + { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" }, + ])) + .unwrap(); + + let key_cmn = (Script::Cj, Language::Cmn); + let cj_cmn_docs = + index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default(); + let mut expected_cj_cmn_docids = RoaringBitmap::new(); + expected_cj_cmn_docids.push(1); + expected_cj_cmn_docids.push(5); + assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); + + delete_documents(&mut wtxn, &index, &["1"], deletion_strategy); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let cj_cmn_docs = + index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default(); + let mut expected_cj_cmn_docids = RoaringBitmap::new(); + expected_cj_cmn_docids.push(5); + assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); + } + + #[test] + fn stored_detected_script_and_language_should_not_return_deleted_documents() { + stored_detected_script_and_language_should_not_return_deleted_documents_( + DeletionStrategy::AlwaysHard, + ); + stored_detected_script_and_language_should_not_return_deleted_documents_( + DeletionStrategy::AlwaysSoft, + ); + } } diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index be9b479bb..2d51fcc1a 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -1,9 +1,9 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::convert::TryInto; use std::fs::File; use std::{io, mem, str}; -use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder}; +use charabia::{Language, Script, SeparatorKind, Token, TokenKind, TokenizerBuilder}; use roaring::RoaringBitmap; use serde_json::Value; @@ -13,6 +13,8 @@ use crate::{ 
absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, }; +pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>; + /// Extracts the word and positions where this word appear and /// prefixes it by the document id. /// @@ -25,12 +27,13 @@ pub fn extract_docid_word_positions( searchable_fields: &Option>, stop_words: Option<&fst::Set<&[u8]>>, max_positions_per_attributes: Option, -) -> Result<(RoaringBitmap, grenad::Reader)> { +) -> Result<(RoaringBitmap, grenad::Reader, ScriptLanguageDocidsMap)> { let max_positions_per_attributes = max_positions_per_attributes .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); let max_memory = indexer.max_memory_by_thread(); let mut documents_ids = RoaringBitmap::new(); + let mut script_language_pair = HashMap::new(); let mut docid_word_positions_sorter = create_sorter( grenad::SortAlgorithm::Stable, concat_u32s_array, @@ -70,6 +73,13 @@ pub fn extract_docid_word_positions( .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); for (index, token) in tokens { + if let Some(language) = token.language { + let script = token.script; + let entry = script_language_pair + .entry((script, language)) + .or_insert_with(RoaringBitmap::new); + entry.push(document_id); + } let token = token.lemma().trim(); if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { key_buffer.truncate(mem::size_of::()); @@ -88,7 +98,8 @@ pub fn extract_docid_word_positions( } } - sorter_into_reader(docid_word_positions_sorter, indexer).map(|reader| (documents_ids, reader)) + sorter_into_reader(docid_word_positions_sorter, indexer) + .map(|reader| (documents_ids, reader, script_language_pair)) } /// Transform a JSON value into a string that can be indexed. 
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index e696ed44b..c0f07cf79 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -257,13 +257,14 @@ fn send_and_extract_flattened_documents_data( let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = rayon::join( || { - let (documents_ids, docid_word_positions_chunk) = extract_docid_word_positions( - flattened_documents_chunk.clone(), - indexer, - searchable_fields, - stop_words.as_ref(), - max_positions_per_attributes, - )?; + let (documents_ids, docid_word_positions_chunk, script_language_pair) = + extract_docid_word_positions( + flattened_documents_chunk.clone(), + indexer, + searchable_fields, + stop_words.as_ref(), + max_positions_per_attributes, + )?; // send documents_ids to DB writer let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))); @@ -274,6 +275,9 @@ fn send_and_extract_flattened_documents_data( let _ = lmdb_writer_sx .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))); + let _ = + lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair))); + Ok(docid_word_positions_chunk) }, || { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 5e547a049..2a7930f84 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1906,6 +1906,33 @@ mod tests { index.add_documents(doc1).unwrap(); } + #[cfg(feature = "default")] + #[test] + fn store_detected_script_and_language_per_document_during_indexing() { + use charabia::{Language, Script}; + let index = TempIndex::new(); + index + .add_documents(documents!([ + { "id": 1, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" 
}, + { "id": 2, "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" }, + { "id": 3, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }, + { "id": 4, "title": "関西国際空港限定トートバッグ すもももももももものうち" }, + { "id": 5, "title": "ภาษาไทยง่ายนิดเดียว" }, + { "id": 6, "title": "The quick 在尊嚴和權利上一律平等。" }, + ])) + .unwrap(); + + let rtxn = index.read_txn().unwrap(); + let key_jpn = (Script::Cj, Language::Jpn); + let key_cmn = (Script::Cj, Language::Cmn); + let cj_jpn_docs = index.script_language_documents_ids(&rtxn, &key_jpn).unwrap().unwrap(); + let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap(); + let expected_cj_jpn_docids = [3].iter().collect(); + assert_eq!(cj_jpn_docs, expected_cj_jpn_docids); + let expected_cj_cmn_docids = [1, 5].iter().collect(); + assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); + } + #[test] fn add_and_delete_documents_in_single_transform() { let mut index = TempIndex::new(); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 16784bd92..b9b11cfa8 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,8 +1,10 @@ use std::borrow::Cow; +use std::collections::HashMap; use std::convert::TryInto; use std::fs::File; use std::io; +use charabia::{Language, Script}; use grenad::MergerBuilder; use heed::types::ByteSlice; use heed::{BytesDecode, RwTxn}; @@ -38,6 +40,7 @@ pub(crate) enum TypedChunk { FieldIdFacetNumberDocids(grenad::Reader), FieldIdFacetExistsDocids(grenad::Reader), GeoPoints(grenad::Reader), + ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), } /// Write typed chunk in the corresponding LMDB database of the provided index. 
@@ -210,6 +213,24 @@ pub(crate) fn write_typed_chunk_into_index( index.put_geo_rtree(wtxn, &rtree)?; index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; } + TypedChunk::ScriptLanguageDocids(hash_pair) => { + let mut buffer = Vec::new(); + for (key, value) in hash_pair { + buffer.clear(); + let final_value = match index.script_language_docids.get(wtxn, &key)? { + Some(db_values) => { + let mut db_value_buffer = Vec::new(); + serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?; + let mut new_value_buffer = Vec::new(); + serialize_roaring_bitmap(&value, &mut new_value_buffer)?; + merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?; + RoaringBitmap::deserialize_from(&buffer[..])? + } + None => value, + }; + index.script_language_docids.put(wtxn, &key, &final_value)?; + } + } } Ok((RoaringBitmap::new(), is_merged_database))