diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
index 702dcf661..2ac130f48 100644
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@@ -8,6 +8,7 @@ mod roaring_bitmap_length;
 mod str_beu32_codec;
 mod str_ref;
 mod str_str_u8_codec;
+mod script_language_codec;
 
 pub use byte_slice_ref::ByteSliceRefCodec;
 pub use str_ref::StrRefCodec;
@@ -21,3 +22,4 @@ pub use self::roaring_bitmap_length::{
 };
 pub use self::str_beu32_codec::StrBEU32Codec;
 pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
+pub use self::script_language_codec::ScriptLanguageCodec;
diff --git a/milli/src/heed_codec/script_language_codec.rs b/milli/src/heed_codec/script_language_codec.rs
new file mode 100644
index 000000000..af15990ea
--- /dev/null
+++ b/milli/src/heed_codec/script_language_codec.rs
@@ -0,0 +1,49 @@
+use std::borrow::Cow;
+use std::str;
+
+use charabia::{Language, Script};
+
+/// Codec for `(Script, Language)` database keys.
+///
+/// The key is stored as `<script name> 0x00 <language name>`: both names are
+/// written as their UTF-8 bytes and a single NUL byte marks the boundary.
+/// NOTE(review): this assumes neither name contains a NUL byte; confirm against
+/// the names returned by charabia.
+pub struct ScriptLanguageCodec;
+
+impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec {
+    type DItem = (Script, Language);
+
+    /// Splits `bytes` at the first NUL byte and resolves both halves by name.
+    ///
+    /// Returns `None` when there is no separator or when either half is not
+    /// valid UTF-8.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        // The names have variable lengths, so the boundary is located through
+        // the separator written by `bytes_encode` rather than a fixed offset.
+        let sep = bytes.iter().position(|&b| b == 0)?;
+        let script = str::from_utf8(&bytes[..sep]).ok()?;
+        // `sep + 1` skips the separator itself; it is at most `bytes.len()`.
+        let lan = str::from_utf8(&bytes[sep + 1..]).ok()?;
+
+        Some((Script::from_name(script), Language::from_name(lan)))
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec {
+    type EItem = (Script, Language);
+
+    /// Writes the script name, a NUL separator, then the language name.
+    fn bytes_encode((script, lan): &Self::EItem) -> Option<Cow<[u8]>> {
+        let script_name = script.name().as_bytes();
+        let lan_name = lan.name().as_bytes();
+
+        // One extra byte for the separator.
+        let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len() + 1);
+        bytes.extend_from_slice(script_name);
+        bytes.push(0);
+        bytes.extend_from_slice(lan_name);
+
+        Some(Cow::Owned(bytes))
+    }
+}
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 
3f7ef14e6..76bc273d9 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -14,6 +14,7 @@ use time::OffsetDateTime;
 use crate::error::{InternalError, UserError};
 use crate::facet::FacetType;
 use crate::fields_ids_map::FieldsIdsMap;
+use crate::heed_codec::ScriptLanguageCodec;
 use crate::heed_codec::facet::{
     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
     FieldIdCodec, OrderedF64Codec,
@@ -83,6 +84,7 @@ pub mod db_name {
     pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
     pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
     pub const DOCUMENTS: &str = "documents";
+    pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script-language-docids";
 }
 
 #[derive(Clone)]
@@ -122,6 +124,9 @@ pub struct Index {
     /// Maps the position of a word prefix with all the docids where this prefix appears.
     pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
 
+    /// Maps the script and language with all the docids that correspond to it.
+    pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
+
     /// Maps the facet field id and the docids for which this field exists
     pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
 
@@ -159,6 +164,7 @@ impl Index {
         let exact_word_prefix_docids = env.create_database(Some(EXACT_WORD_PREFIX_DOCIDS))?;
         let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
         let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
+        let script_language_docids = env.create_database(Some(SCRIPT_LANGUAGE_DOCIDS))?;
         let word_prefix_pair_proximity_docids =
             env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
         let prefix_word_pair_proximity_docids =
@@ -186,6 +192,7 @@ impl Index {
             exact_word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
+            script_language_docids,
             word_prefix_pair_proximity_docids,
             prefix_word_pair_proximity_docids,
             word_position_docids,