diff --git a/Cargo.lock b/Cargo.lock index 70128cfa9..fcca06546 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6,6 +6,15 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10" +[[package]] +name = "aho-corasick" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" +dependencies = [ + "memchr", +] + [[package]] name = "anyhow" version = "1.0.31" @@ -345,6 +354,16 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "form_urlencoded" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ece68d15c92e84fa4f19d3780f1294e5ca82a78a6d515f1efaabcc144688be00" +dependencies = [ + "matches", + "percent-encoding", +] + [[package]] name = "fs_extra" version = "1.1.0" @@ -429,9 +448,9 @@ dependencies = [ [[package]] name = "heed" -version = "0.10.4" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616" +checksum = "2eaba3b0edee6a9cd551f24caca2027922b03259f7203a15f0b86af4c1348fcc" dependencies = [ "byteorder", "heed-traits", @@ -453,9 +472,9 @@ checksum = "b328f6260a7e51bdb0ca6b68e6ea27ee3d11fba5dee930896ee7ff6ad5fc072c" [[package]] name = "heed-types" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72fc61caee13e85ea330eabf0c6c7098c511ff173bcb57a760b1eda3bba9f6eb" +checksum = "e628efb08beaee58355f80dc4adba79d644940ea9eef60175ea17dc218aab405" dependencies = [ "bincode", "heed-traits", @@ -571,9 +590,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.79" +version = "0.2.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2448f6066e80e3bfc792e9c98bf705b4b0fc6e8ef5b43e5889aff0eaa9c58743" +checksum = "4d58d1b70b004888f764dfbf6a26a3b0342a1632d33968e4a179d8011c760614" [[package]] name = "linked-hash-map" @@ -654,6 +673,7 @@ dependencies = [ "criterion", "crossbeam-channel", "csv", + "either", "flate2", "fst", "fxhash", @@ -675,6 +695,7 @@ dependencies = [ "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", "pest_derive", "rayon", + "regex", "ringtail", "roaring", "serde", @@ -760,9 +781,9 @@ checksum = "ce30a214135d83e7250f2e8fad781f7cb987e3a3f1b4529712d891594bda311c" [[package]] name = "once_cell" -version = "1.4.0" +version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b631f7e854af39a1739f401cf34a8a013dfe09eac4fa4dba91e9768bd28168d" +checksum = "13bd41f508810a131401606d54ac32a467c97172d74ba7662562ebba5ad07fa0" [[package]] name = "oorandom" @@ -1003,11 +1024,14 @@ dependencies = [ [[package]] name = "regex" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8963b85b8ce3074fecffde43b4b0dded83ce2f367dc8d363afc56679f3ee820b" +checksum = "38cf2c13ed4745de91a5eb834e11c00bcc3709e773173b2ce4c56c9fbde04b9c" dependencies = [ + "aho-corasick", + "memchr", "regex-syntax", + "thread_local", ] [[package]] @@ -1021,9 +1045,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.6.20" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8cab7a364d15cde1e505267766a2d3c4e22a843e1a601f0fa7564c0f82ced11c" +checksum = "3b181ba2dcf07aaccad5448e8ead58db5b742cf85dfe035e2227f137a539a189" [[package]] name = "remove_dir_all" @@ -1096,9 +1120,9 @@ checksum = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" [[package]] name = "serde" -version = "1.0.110" +version = "1.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99e7b308464d16b56eba9964e4972a3eee817760ab60d88c3f86e1fecb08204c" +checksum = "b88fa983de7720629c9387e9f517353ed404164b1e482c970a90c1a4aaf7dc1a" dependencies = [ "serde_derive", ] @@ -1115,9 +1139,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.110" +version = "1.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "818fbf6bfa9a42d3bfcaca148547aa00c7b915bec71d1757aa2d44ca68771984" +checksum = "cbd1ae72adb44aab48f325a02444a5fc079349a8d804c1fc922aed3f7454c74e" dependencies = [ "proc-macro2", "quote", @@ -1406,10 +1430,11 @@ checksum = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c" [[package]] name = "url" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "829d4a8476c35c9bf0bbce5a3b23f4106f79728039b726d292bb93bc106787cb" +checksum = "5909f2b0817350449ed73e8bcd81c8c3c8d9a7a5d8acba4b27db277f1868976e" dependencies = [ + "form_urlencoded", "idna", "matches", "percent-encoding", diff --git a/Cargo.toml b/Cargo.toml index 37c83b4f0..3fcdfff03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,11 +10,12 @@ bstr = "0.2.13" byteorder = "1.3.4" crossbeam-channel = "0.5.0" csv = "1.1.3" +either = "1.6.1" flate2 = "1.0.17" fst = "0.4.4" fxhash = "0.2.1" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } -heed = { version = "0.10.4", default-features = false, features = ["lmdb", "sync-read-txn"] } +heed = { version = "0.10.5", default-features = false, features = ["lmdb", "sync-read-txn"] } human_format = "1.0.3" jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } @@ -26,6 +27,7 @@ obkv = "0.1.0" once_cell = "1.4.0" ordered-float = "2.0.0" rayon = "1.3.1" +regex = "1.4.2" ringtail = "0.3.0" roaring = "0.6.1" serde = { version = "1.0", features = ["derive"] } diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock index b15700ce5..662225c77 100644 --- a/http-ui/Cargo.lock +++ b/http-ui/Cargo.lock @@ -6,6 +6,15 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e" +[[package]] +name = "aho-corasick" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7404febffaa47dac81aa44dba71523c9d069b1bdc50a77db41195149e17f68e5" +dependencies = [ + "memchr", +] + [[package]] name = "anyhow" version = "1.0.34" @@ -404,6 +413,16 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "form_urlencoded" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ece68d15c92e84fa4f19d3780f1294e5ca82a78a6d515f1efaabcc144688be00" +dependencies = [ + "matches", + "percent-encoding", +] + [[package]] name = "fs_extra" version = "1.2.0" @@ -654,9 +673,9 @@ dependencies = [ [[package]] name = "heed" -version = "0.10.4" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616" +checksum = "2eaba3b0edee6a9cd551f24caca2027922b03259f7203a15f0b86af4c1348fcc" dependencies = [ "byteorder", "heed-traits", @@ -678,9 +697,9 @@ checksum = "b328f6260a7e51bdb0ca6b68e6ea27ee3d11fba5dee930896ee7ff6ad5fc072c" [[package]] name = "heed-types" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72fc61caee13e85ea330eabf0c6c7098c511ff173bcb57a760b1eda3bba9f6eb" +checksum = "e628efb08beaee58355f80dc4adba79d644940ea9eef60175ea17dc218aab405" dependencies = [ "bincode", "heed-traits", @@ -980,6 +999,7 @@ dependencies = [ "byteorder", "crossbeam-channel", "csv", + "either", "flate2", "fst", "fxhash", @@ -1000,6 +1020,7 @@ dependencies = [ "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", "pest_derive", "rayon", + "regex", "ringtail", "roaring", "serde", @@ -1199,9 +1220,9 @@ checksum = "ddd8a5a0aa2f3adafe349259a5b3e21a19c388b792414c1161d60a69c1fa48e8" [[package]] name = "once_cell" -version = "1.4.1" +version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "260e51e7efe62b592207e9e13a68e43692a7a279171d6ba57abd208bf23645ad" +checksum = "13bd41f508810a131401606d54ac32a467c97172d74ba7662562ebba5ad07fa0" [[package]] name = "opaque-debug" @@ -1602,6 +1623,18 @@ version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" +[[package]] +name = "regex" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38cf2c13ed4745de91a5eb834e11c00bcc3709e773173b2ce4c56c9fbde04b9c" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", + "thread_local", +] + [[package]] name = "regex-automata" version = "0.1.9" @@ -1611,6 +1644,12 @@ dependencies = [ "byteorder", ] +[[package]] +name = "regex-syntax" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b181ba2dcf07aaccad5448e8ead58db5b742cf85dfe035e2227f137a539a189" + [[package]] name = "remove_dir_all" version = "0.5.3" @@ -2137,10 +2176,11 @@ checksum = "f7fe0bb3479651439c9112f72b6c505038574c9fbb575ed1bf3b797fa39dd564" [[package]] name = "url" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "829d4a8476c35c9bf0bbce5a3b23f4106f79728039b726d292bb93bc106787cb" +checksum = "5909f2b0817350449ed73e8bcd81c8c3c8d9a7a5d8acba4b27db277f1868976e" dependencies = [ + "form_urlencoded", "idna", "matches", "percent-encoding", diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index b30fb95c2..bfe39dcf6 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -8,7 +8,7 @@ edition = "2018" [dependencies] anyhow = "1.0.28" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } -heed = "0.10.4" +heed = "0.10.5" memmap = "0.7.0" milli = { path = ".." } once_cell = "1.4.1" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 62d3d75bd..b14e7f892 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -243,6 +243,13 @@ struct Settings { #[serde(default)] faceted_attributes: Option>, + + #[serde( + default, + deserialize_with = "deserialize_some", + skip_serializing_if = "Option::is_none", + )] + criteria: Option>>, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -399,6 +406,14 @@ async fn main() -> anyhow::Result<()> { builder.set_faceted_fields(facet_types); } + // We transpose the settings JSON struct into a real setting update. + if let Some(criteria) = settings.criteria { + match criteria { + Some(criteria) => builder.set_criteria(criteria), + None => builder.reset_criteria(), + } + } + let result = builder.execute(|indexing_step| { let (current, total) = match indexing_step { TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), diff --git a/src/criterion.rs b/src/criterion.rs index fd334f7d9..ea82055f2 100644 --- a/src/criterion.rs +++ b/src/criterion.rs @@ -1,3 +1,10 @@ +use crate::{FieldsIdsMap, FieldId}; + +use anyhow::{Context, bail}; +use regex::Regex; +use serde::{Serialize, Deserialize}; + +#[derive(Debug, Serialize, Deserialize, Copy, Clone, PartialEq, Eq)] pub enum Criterion { /// Sorted by increasing number of typos. Typo, @@ -14,9 +21,41 @@ pub enum Criterion { /// Sorted by the similarity of the matched words with the query words. Exactness, /// Sorted by the increasing value of the field specified. - CustomAsc(String), + Asc(FieldId), /// Sorted by the decreasing value of the field specified. - CustomDesc(String), + Desc(FieldId), +} + +impl Criterion { + pub fn from_str(fields_ids_map: &mut FieldsIdsMap, txt: &str) -> anyhow::Result { + match txt { + "typo" => Ok(Criterion::Typo), + "words" => Ok(Criterion::Words), + "proximity" => Ok(Criterion::Proximity), + "attribute" => Ok(Criterion::Attribute), + "wordsposition" => Ok(Criterion::WordsPosition), + "exactness" => Ok(Criterion::Exactness), + text => { + let re = Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#)?; + let caps = re.captures(text).with_context(|| format!("unknown criterion name: {}", text))?; + let order = caps.get(1).unwrap().as_str(); + let field_name = caps.get(2).unwrap().as_str(); + let field_id = fields_ids_map.insert(field_name).context("field id limit reached")?; + match order { + "asc" => Ok(Criterion::Asc(field_id)), + "desc" => Ok(Criterion::Desc(field_id)), + otherwise => bail!("unknown criterion name: {}", otherwise), + } + }, + } + } + + pub fn field_id(&self) -> Option { + match *self { + Criterion::Asc(fid) | Criterion::Desc(fid) => Some(fid), + _ => None, + } + } } pub fn default_criteria() -> Vec { diff --git a/src/fields_ids_map.rs b/src/fields_ids_map.rs index 82d06e818..ce79e6e04 100644 --- a/src/fields_ids_map.rs +++ b/src/fields_ids_map.rs @@ -1,11 +1,12 @@ use std::collections::BTreeMap; use serde::{Serialize, Deserialize}; +use crate::FieldId; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct FieldsIdsMap { - names_ids: BTreeMap, - ids_names: BTreeMap, - next_id: Option, + names_ids: BTreeMap, + ids_names: BTreeMap, + next_id: Option, } impl FieldsIdsMap { @@ -29,7 +30,7 @@ impl FieldsIdsMap { /// Returns the field id related to a field name, it will create a new field id if the /// name is not already known. Returns `None` if the maximum field id as been reached. - pub fn insert(&mut self, name: &str) -> Option { + pub fn insert(&mut self, name: &str) -> Option { match self.names_ids.get(name) { Some(id) => Some(*id), None => { @@ -43,17 +44,17 @@ impl FieldsIdsMap { } /// Get the id of a field based on its name. - pub fn id(&self, name: &str) -> Option { + pub fn id(&self, name: &str) -> Option { self.names_ids.get(name).copied() } /// Get the name of a field based on its id. - pub fn name(&self, id: u8) -> Option<&str> { + pub fn name(&self, id: FieldId) -> Option<&str> { self.ids_names.get(&id).map(String::as_str) } /// Remove a field name and id based on its name. - pub fn remove(&mut self, name: &str) -> Option { + pub fn remove(&mut self, name: &str) -> Option { match self.names_ids.remove(name) { Some(id) => self.ids_names.remove_entry(&id).map(|(id, _)| id), None => None, @@ -61,7 +62,7 @@ impl FieldsIdsMap { } /// Iterate over the ids and names in the ids order. - pub fn iter(&self) -> impl Iterator { + pub fn iter(&self) -> impl Iterator { self.ids_names.iter().map(|(id, name)| (*id, name.as_str())) } } diff --git a/src/heed_codec/facet/facet_level_value_f64_codec.rs b/src/heed_codec/facet/facet_level_value_f64_codec.rs index 1ee8e6bf3..a4642f961 100644 --- a/src/heed_codec/facet/facet_level_value_f64_codec.rs +++ b/src/heed_codec/facet/facet_level_value_f64_codec.rs @@ -2,12 +2,13 @@ use std::borrow::Cow; use std::convert::TryInto; use crate::facet::value_encoding::f64_into_bytes; +use crate::FieldId; // TODO do not de/serialize right bound when level = 0 pub struct FacetLevelValueF64Codec; impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { - type DItem = (u8, u8, f64, f64); + type DItem = (FieldId, u8, f64, f64); fn bytes_decode(bytes: &'a [u8]) -> Option { let (field_id, bytes) = bytes.split_first()?; @@ -27,7 +28,7 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { } impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { - type EItem = (u8, u8, f64, f64); + type EItem = (FieldId, u8, f64, f64); fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { let mut buffer = [0u8; 32]; diff --git a/src/heed_codec/facet/facet_level_value_i64_codec.rs b/src/heed_codec/facet/facet_level_value_i64_codec.rs index 7cf9a714b..cc0d3120d 100644 --- a/src/heed_codec/facet/facet_level_value_i64_codec.rs +++ b/src/heed_codec/facet/facet_level_value_i64_codec.rs @@ -2,11 +2,12 @@ use std::borrow::Cow; use std::convert::TryInto; use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; +use crate::FieldId; pub struct FacetLevelValueI64Codec; impl<'a> heed::BytesDecode<'a> for FacetLevelValueI64Codec { - type DItem = (u8, u8, i64, i64); + type DItem = (FieldId, u8, i64, i64); fn bytes_decode(bytes: &'a [u8]) -> Option { let (field_id, bytes) = bytes.split_first()?; @@ -24,7 +25,7 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueI64Codec { } impl heed::BytesEncode<'_> for FacetLevelValueI64Codec { - type EItem = (u8, u8, i64, i64); + type EItem = (FieldId, u8, i64, i64); fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { let left = i64_into_bytes(*left); diff --git a/src/heed_codec/facet/facet_value_string_codec.rs b/src/heed_codec/facet/facet_value_string_codec.rs index faa8b407b..350efc450 100644 --- a/src/heed_codec/facet/facet_value_string_codec.rs +++ b/src/heed_codec/facet/facet_value_string_codec.rs @@ -1,10 +1,12 @@ use std::borrow::Cow; use std::str; +use crate::FieldId; + pub struct FacetValueStringCodec; impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec { - type DItem = (u8, &'a str); + type DItem = (FieldId, &'a str); fn bytes_decode(bytes: &'a [u8]) -> Option { let (field_id, bytes) = bytes.split_first()?; @@ -14,7 +16,7 @@ impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec { } impl<'a> heed::BytesEncode<'a> for FacetValueStringCodec { - type EItem = (u8, &'a str); + type EItem = (FieldId, &'a str); fn bytes_encode((field_id, value): &Self::EItem) -> Option> { let mut bytes = Vec::with_capacity(value.len() + 1); diff --git a/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs b/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs new file mode 100644 index 000000000..e9b5abeb8 --- /dev/null +++ b/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs @@ -0,0 +1,36 @@ +use std::borrow::Cow; +use std::convert::TryInto; + +use crate::{FieldId, DocumentId}; +use crate::facet::value_encoding::f64_into_bytes; + +pub struct FieldDocIdFacetF64Codec; + +impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec { + type DItem = (FieldId, DocumentId, f64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + + let (document_id_bytes, bytes) = bytes.split_at(4); + let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?; + + let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?; + + Some((*field_id, document_id, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec { + type EItem = (FieldId, DocumentId, f64); + + fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(1 + 4 + 8 + 8); + bytes.push(*field_id); + bytes.extend_from_slice(&document_id.to_be_bytes()); + let value_bytes = f64_into_bytes(*value)?; + bytes.extend_from_slice(&value_bytes); + bytes.extend_from_slice(&value.to_be_bytes()); + Some(Cow::Owned(bytes)) + } +} diff --git a/src/heed_codec/facet/field_doc_id_facet_i64_codec.rs b/src/heed_codec/facet/field_doc_id_facet_i64_codec.rs new file mode 100644 index 000000000..a9eaf188c --- /dev/null +++ b/src/heed_codec/facet/field_doc_id_facet_i64_codec.rs @@ -0,0 +1,34 @@ +use std::borrow::Cow; +use std::convert::TryInto; + +use crate::facet::value_encoding::{i64_into_bytes, i64_from_bytes}; +use crate::{FieldId, DocumentId}; + +pub struct FieldDocIdFacetI64Codec; + +impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetI64Codec { + type DItem = (FieldId, DocumentId, i64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + + let (document_id_bytes, bytes) = bytes.split_at(4); + let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?; + + let value = bytes[..8].try_into().map(i64_from_bytes).ok()?; + + Some((*field_id, document_id, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetI64Codec { + type EItem = (FieldId, DocumentId, i64); + + fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(1 + 4 + 8); + bytes.push(*field_id); + bytes.extend_from_slice(&document_id.to_be_bytes()); + bytes.extend_from_slice(&i64_into_bytes(*value)); + Some(Cow::Owned(bytes)) + } +} diff --git a/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/src/heed_codec/facet/field_doc_id_facet_string_codec.rs new file mode 100644 index 000000000..2e282b2a0 --- /dev/null +++ b/src/heed_codec/facet/field_doc_id_facet_string_codec.rs @@ -0,0 +1,31 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::str; + +use crate::{FieldId, DocumentId}; + +pub struct FieldDocIdFacetStringCodec; + +impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec { + type DItem = (FieldId, DocumentId, &'a str); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + let (document_id_bytes, bytes) = bytes.split_at(4); + let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?; + let value = str::from_utf8(bytes).ok()?; + Some((*field_id, document_id, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec { + type EItem = (FieldId, DocumentId, &'a str); + + fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(1 + 4 + value.len()); + bytes.push(*field_id); + bytes.extend_from_slice(&document_id.to_be_bytes()); + bytes.extend_from_slice(value.as_bytes()); + Some(Cow::Owned(bytes)) + } +} diff --git a/src/heed_codec/facet/mod.rs b/src/heed_codec/facet/mod.rs index ef97e6add..d8ce936e0 100644 --- a/src/heed_codec/facet/mod.rs +++ b/src/heed_codec/facet/mod.rs @@ -1,7 +1,13 @@ mod facet_level_value_f64_codec; mod facet_level_value_i64_codec; mod facet_value_string_codec; +mod field_doc_id_facet_f64_codec; +mod field_doc_id_facet_i64_codec; +mod field_doc_id_facet_string_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_i64_codec::FacetLevelValueI64Codec; pub use self::facet_value_string_codec::FacetValueStringCodec; +pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; +pub use self::field_doc_id_facet_i64_codec::FieldDocIdFacetI64Codec; +pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; diff --git a/src/index.rs b/src/index.rs index b21c7d39b..26ca4ae29 100644 --- a/src/index.rs +++ b/src/index.rs @@ -9,13 +9,14 @@ use roaring::RoaringBitmap; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; -use crate::Search; -use crate::{BEU32, DocumentId, ExternalDocumentsIds}; +use crate::{default_criteria, Criterion, Search}; +use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds}; use crate::{ RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, }; +pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; @@ -41,13 +42,15 @@ pub struct Index { pub word_pair_proximity_docids: Database, /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. pub facet_field_id_value_docids: Database, + /// Maps the document id, the facet field id and the globally ordered value. + pub field_id_docid_facet_values: Database, /// Maps the document id to the document as an obkv store. pub documents: Database, ObkvCodec>, } impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(6); + options.max_dbs(7); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -55,6 +58,7 @@ impl Index { let docid_word_positions = env.create_database(Some("docid-word-positions"))?; let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; + let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; Ok(Index { @@ -64,6 +68,7 @@ impl Index { docid_word_positions, word_pair_proximity_docids, facet_field_id_value_docids, + field_id_docid_facet_values, documents, }) } @@ -107,8 +112,8 @@ impl Index { /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. - pub fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: u8) -> heed::Result<()> { - self.main.put::<_, Str, OwnedType>(wtxn, PRIMARY_KEY_KEY, &primary_key) + pub fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: FieldId) -> heed::Result<()> { + self.main.put::<_, Str, OwnedType>(wtxn, PRIMARY_KEY_KEY, &primary_key) } /// Deletes the primary key of the documents, this can be done to reset indexes settings. @@ -117,8 +122,8 @@ impl Index { } /// Returns the documents primary key, `None` if it hasn't been defined. - pub fn primary_key(&self, rtxn: &RoTxn) -> heed::Result> { - self.main.get::<_, Str, OwnedType>(rtxn, PRIMARY_KEY_KEY) + pub fn primary_key(&self, rtxn: &RoTxn) -> heed::Result> { + self.main.get::<_, Str, OwnedType>(rtxn, PRIMARY_KEY_KEY) } /* external documents ids */ @@ -172,7 +177,7 @@ impl Index { /// Writes the fields ids that must be displayed in the defined order. /// There must be not be any duplicate field id. - pub fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[u8]) -> heed::Result<()> { + pub fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[FieldId]) -> heed::Result<()> { self.main.put::<_, Str, ByteSlice>(wtxn, DISPLAYED_FIELDS_KEY, fields) } @@ -184,14 +189,14 @@ impl Index { /// Returns the displayed fields ids in the order they must be returned. If it returns /// `None` it means that all the attributes are displayed in the order of the `FieldsIdsMap`. - pub fn displayed_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result> { + pub fn displayed_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result> { self.main.get::<_, Str, ByteSlice>(rtxn, DISPLAYED_FIELDS_KEY) } /* searchable fields */ /// Writes the searchable fields, when this list is specified, only these are indexed. - pub fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[u8]) -> heed::Result<()> { + pub fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[FieldId]) -> heed::Result<()> { assert!(fields.windows(2).all(|win| win[0] < win[1])); // is sorted self.main.put::<_, Str, ByteSlice>(wtxn, SEARCHABLE_FIELDS_KEY, fields) } @@ -203,7 +208,7 @@ impl Index { /// Returns the searchable fields ids, those are the fields that are indexed, /// if the searchable fields aren't there it means that **all** the fields are indexed. - pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result> { + pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result> { self.main.get::<_, Str, ByteSlice>(rtxn, SEARCHABLE_FIELDS_KEY) } @@ -211,7 +216,7 @@ impl Index { /// Writes the facet fields ids associated with their facet type or `None` if /// the facet type is currently unknown. - pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap) -> heed::Result<()> { + pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types) } @@ -221,14 +226,14 @@ impl Index { } /// Returns the facet fields ids associated with their facet type. - pub fn faceted_fields(&self, wtxn: &RoTxn) -> heed::Result> { + pub fn faceted_fields(&self, wtxn: &RoTxn) -> heed::Result> { Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) } /* faceted documents ids */ /// Writes the documents ids that are faceted under this field id. - pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: u8, docids: &RoaringBitmap) -> heed::Result<()> { + pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: FieldId, docids: &RoaringBitmap) -> heed::Result<()> { let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); *buffer.last_mut().unwrap() = field_id; @@ -236,7 +241,7 @@ impl Index { } /// Retrieve all the documents ids that faceted under this field id. - pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: u8) -> heed::Result { + pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: FieldId) -> heed::Result { let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); *buffer.last_mut().unwrap() = field_id; @@ -246,6 +251,23 @@ impl Index { } } + /* criteria */ + + pub fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, CRITERIA_KEY, &criteria) + } + + pub fn delete_criteria(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, CRITERIA_KEY) + } + + pub fn criteria(&self, rtxn: &RoTxn) -> heed::Result> { + match self.main.get::<_, Str, SerdeJson>>(rtxn, CRITERIA_KEY)? { + Some(criteria) => Ok(criteria), + None => Ok(default_criteria()), + } + } + /* words fst */ /// Writes the FST which is the words dictionnary of the engine. diff --git a/src/lib.rs b/src/lib.rs index 93f9cc0df..9fa19c68c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -40,15 +40,16 @@ pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; pub type BEU32 = heed::zerocopy::U32; pub type BEU64 = heed::zerocopy::U64; -pub type DocumentId = u32; pub type Attribute = u32; +pub type DocumentId = u32; +pub type FieldId = u8; pub type Position = u32; type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> anyhow::Result>; /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( - displayed_fields: &[u8], + displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, obkv: obkv::KvReader, ) -> anyhow::Result> diff --git a/src/search/facet/facet_condition.rs b/src/search/facet/facet_condition.rs new file mode 100644 index 000000000..77e1de5ad --- /dev/null +++ b/src/search/facet/facet_condition.rs @@ -0,0 +1,643 @@ +use std::collections::HashMap; +use std::fmt::Debug; +use std::ops::Bound::{self, Included, Excluded}; +use std::str::FromStr; + +use heed::types::{ByteSlice, DecodeIgnore}; +use log::debug; +use num_traits::Bounded; +use pest::error::{Error as PestError, ErrorVariant}; +use pest::iterators::{Pair, Pairs}; +use pest::Parser; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::facet::FacetValueStringCodec; +use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; +use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec}; + +use super::FacetRange; +use super::parser::Rule; +use super::parser::{PREC_CLIMBER, FilterParser}; + +use self::FacetCondition::*; +use self::FacetNumberOperator::*; + +#[derive(Debug, Copy, Clone, PartialEq)] +pub enum FacetNumberOperator { + GreaterThan(T), + GreaterThanOrEqual(T), + Equal(T), + NotEqual(T), + LowerThan(T), + LowerThanOrEqual(T), + Between(T, T), +} + +impl FacetNumberOperator { + /// This method can return two operations in case it must express + /// an OR operation for the between case (i.e. `TO`). + fn negate(self) -> (Self, Option) { + match self { + GreaterThan(x) => (LowerThanOrEqual(x), None), + GreaterThanOrEqual(x) => (LowerThan(x), None), + Equal(x) => (NotEqual(x), None), + NotEqual(x) => (Equal(x), None), + LowerThan(x) => (GreaterThanOrEqual(x), None), + LowerThanOrEqual(x) => (GreaterThan(x), None), + Between(x, y) => (LowerThan(x), Some(GreaterThan(y))), + } + } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum FacetStringOperator { + Equal(String), + NotEqual(String), +} + +impl FacetStringOperator { + fn equal(s: &str) -> Self { + FacetStringOperator::Equal(s.to_lowercase()) + } + + fn not_equal(s: &str) -> Self { + FacetStringOperator::equal(s).negate() + } + + fn negate(self) -> Self { + match self { + FacetStringOperator::Equal(x) => FacetStringOperator::NotEqual(x), + FacetStringOperator::NotEqual(x) => FacetStringOperator::Equal(x), + } + } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum FacetCondition { + OperatorI64(FieldId, FacetNumberOperator), + OperatorF64(FieldId, FacetNumberOperator), + OperatorString(FieldId, FacetStringOperator), + Or(Box, Box), + And(Box, Box), +} + +fn get_field_id_facet_type<'a>( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + items: &mut Pairs<'a, Rule>, +) -> Result<(FieldId, FacetType), PestError> +{ + // lexing ensures that we at least have a key + let key = items.next().unwrap(); + let field_id = fields_ids_map + .id(key.as_str()) + .ok_or_else(|| { + PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` not found, available attributes are: {}", + key.as_str(), + fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", ") + ), + }, + key.as_span(), + ) + })?; + + let facet_type = faceted_fields + .get(&field_id) + .copied() + .ok_or_else(|| { + PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` is not faceted, available faceted attributes are: {}", + key.as_str(), + faceted_fields.keys().flat_map(|id| fields_ids_map.name(*id)).collect::>().join(", ") + ), + }, + key.as_span(), + ) + })?; + + Ok((field_id, facet_type)) +} + +fn pest_parse(pair: Pair) -> Result> +where T: FromStr, + T::Err: ToString, +{ + match pair.as_str().parse() { + Ok(value) => Ok(value), + Err(e) => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { message: e.to_string() }, + pair.as_span(), + )) + } + } +} + +impl FacetCondition { + pub fn from_str( + rtxn: &heed::RoTxn, + index: &Index, + expression: &str, + ) -> anyhow::Result + { + let fields_ids_map = index.fields_ids_map(rtxn)?; + let faceted_fields = index.faceted_fields(rtxn)?; + let lexed = FilterParser::parse(Rule::prgm, expression)?; + FacetCondition::from_pairs(&fields_ids_map, &faceted_fields, lexed) + } + + fn from_pairs( + fim: &FieldsIdsMap, + ff: &HashMap, + expression: Pairs, + ) -> anyhow::Result + { + PREC_CLIMBER.climb( + expression, + |pair: Pair| match pair.as_rule() { + Rule::greater => Ok(Self::greater_than(fim, ff, pair)?), + Rule::geq => Ok(Self::greater_than_or_equal(fim, ff, pair)?), + Rule::eq => Ok(Self::equal(fim, ff, pair)?), + Rule::neq => Ok(Self::equal(fim, ff, pair)?.negate()), + Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), + Rule::less => Ok(Self::lower_than(fim, ff, pair)?), + Rule::between => Ok(Self::between(fim, ff, pair)?), + Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), + Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), + Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), + _ => unreachable!(), + }, + |lhs: anyhow::Result, op: Pair, rhs: anyhow::Result| { + match op.as_rule() { + Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), + Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), + _ => unreachable!(), + } + }, + ) + } + + fn negate(self) -> FacetCondition { + match self { + OperatorI64(fid, op) => match op.negate() { + (op, None) => OperatorI64(fid, op), + (a, Some(b)) => Or(Box::new(OperatorI64(fid, a)), Box::new(OperatorI64(fid, b))), + }, + OperatorF64(fid, op) => match op.negate() { + (op, None) => OperatorF64(fid, op), + (a, Some(b)) => Or(Box::new(OperatorF64(fid, a)), Box::new(OperatorF64(fid, b))), + }, + OperatorString(fid, op) => OperatorString(fid, op.negate()), + Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())), + And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())), + } + } + + fn between( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let lvalue = items.next().unwrap(); + let rvalue = items.next().unwrap(); + match ftype { + FacetType::Integer => { + let lvalue = pest_parse(lvalue)?; + let rvalue = pest_parse(rvalue)?; + Ok(OperatorI64(fid, Between(lvalue, rvalue))) + }, + FacetType::Float => { + let lvalue = pest_parse(lvalue)?; + let rvalue = pest_parse(rvalue)?; + Ok(OperatorF64(fid, Between(lvalue, rvalue))) + }, + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: "invalid operator on a faceted string".to_string(), + }, + item_span, + ).into()) + }, + } + } + + fn equal( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, Equal(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, Equal(pest_parse(value)?))), + FacetType::String => Ok(OperatorString(fid, FacetStringOperator::equal(value.as_str()))), + } + } + + fn greater_than( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, GreaterThan(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, GreaterThan(pest_parse(value)?))), + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: "invalid operator on a faceted string".to_string(), + }, + item_span, + ).into()) + }, + } + } + + fn greater_than_or_equal( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, GreaterThanOrEqual(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, GreaterThanOrEqual(pest_parse(value)?))), + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: "invalid operator on a faceted string".to_string(), + }, + item_span, + ).into()) + }, + } + } + + fn lower_than( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, LowerThan(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, LowerThan(pest_parse(value)?))), + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: "invalid operator on a faceted string".to_string(), + }, + item_span, + ).into()) + }, + } + } + + fn lower_than_or_equal( + fields_ids_map: &FieldsIdsMap, + faceted_fields: &HashMap, + item: Pair, + ) -> anyhow::Result + { + let item_span = item.as_span(); + let mut items = item.into_inner(); + let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); + match ftype { + FacetType::Integer => Ok(OperatorI64(fid, LowerThanOrEqual(pest_parse(value)?))), + FacetType::Float => Ok(OperatorF64(fid, LowerThanOrEqual(pest_parse(value)?))), + FacetType::String => { + Err(PestError::::new_from_span( + ErrorVariant::CustomError { + message: "invalid operator on a faceted string".to_string(), + }, + item_span, + ).into()) + }, + } + } +} + +impl FacetCondition { + /// Aggregates the documents ids that are part of the specified range automatically + /// going deeper through the levels. + fn explore_facet_levels<'t, T: 't, KC>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + field_id: FieldId, + level: u8, + left: Bound, + right: Bound, + output: &mut RoaringBitmap, + ) -> anyhow::Result<()> + where + T: Copy + PartialEq + PartialOrd + Bounded + Debug, + KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, + { + match (left, right) { + // If the request is an exact value we must go directly to the deepest level. + (Included(l), Included(r)) if l == r && level > 0 => { + return Self::explore_facet_levels::(rtxn, db, field_id, 0, left, right, output); + }, + // lower TO upper when lower > upper must return no result + (Included(l), Included(r)) if l > r => return Ok(()), + (Included(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Included(r)) if l >= r => return Ok(()), + (_, _) => (), + } + + let mut left_found = None; + let mut right_found = None; + + // We must create a custom iterator to be able to iterate over the + // requested range as the range iterator cannot express some conditions. + let iter = FacetRange::new(rtxn, db.remap_key_type::(), field_id, level, left, right)?; + + debug!("Iterating between {:?} and {:?} (level {})", left, right, level); + + for (i, result) in iter.enumerate() { + let ((_fid, level, l, r), docids) = result?; + debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); + output.union_with(&docids); + // We save the leftest and rightest bounds we actually found at this level. + if i == 0 { left_found = Some(l); } + right_found = Some(r); + } + + // Can we go deeper? + let deeper_level = match level.checked_sub(1) { + Some(level) => level, + None => return Ok(()), + }; + + // We must refine the left and right bounds of this range by retrieving the + // missing part in a deeper level. + match left_found.zip(right_found) { + Some((left_found, right_found)) => { + // If the bound is satisfied we avoid calling this function again. + if !matches!(left, Included(l) if l == left_found) { + let sub_right = Excluded(left_found); + debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); + Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, sub_right, output)?; + } + if !matches!(right, Included(r) if r == right_found) { + let sub_left = Excluded(right_found); + debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); + Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, sub_left, right, output)?; + } + }, + None => { + // If we found nothing at this level it means that we must find + // the same bounds but at a deeper, more precise level. + Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, right, output)?; + }, + } + + Ok(()) + } + + fn evaluate_number_operator<'t, T: 't, KC>( + rtxn: &'t heed::RoTxn, + index: &Index, + db: heed::Database, + field_id: FieldId, + operator: FacetNumberOperator, + ) -> anyhow::Result + where + T: Copy + PartialEq + PartialOrd + Bounded + Debug, + KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, + KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, + { + // Make sure we always bound the ranges with the field id and the level, + // as the facets values are all in the same database and prefixed by the + // field id and the level. + let (left, right) = match operator { + GreaterThan(val) => (Excluded(val), Included(T::max_value())), + GreaterThanOrEqual(val) => (Included(val), Included(T::max_value())), + Equal(val) => (Included(val), Included(val)), + NotEqual(val) => { + let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; + let docids = Self::evaluate_number_operator::(rtxn, index, db, field_id, Equal(val))?; + return Ok(all_documents_ids - docids); + }, + LowerThan(val) => (Included(T::min_value()), Excluded(val)), + LowerThanOrEqual(val) => (Included(T::min_value()), Included(val)), + Between(left, right) => (Included(left), Included(right)), + }; + + // Ask for the biggest value that can exist for this specific field, if it exists + // that's fine if it don't, the value just before will be returned instead. + let biggest_level = db + .remap_types::() + .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, T::max_value(), T::max_value()))? + .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); + + match biggest_level { + Some(level) => { + let mut output = RoaringBitmap::new(); + Self::explore_facet_levels::(rtxn, db, field_id, level, left, right, &mut output)?; + Ok(output) + }, + None => Ok(RoaringBitmap::new()), + } + } + + fn evaluate_string_operator( + rtxn: &heed::RoTxn, + index: &Index, + db: heed::Database, + field_id: FieldId, + operator: &FacetStringOperator, + ) -> anyhow::Result + { + match operator { + FacetStringOperator::Equal(string) => { + match db.get(rtxn, &(field_id, string))? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()) + } + }, + FacetStringOperator::NotEqual(string) => { + let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; + let op = FacetStringOperator::Equal(string.clone()); + let docids = Self::evaluate_string_operator(rtxn, index, db, field_id, &op)?; + Ok(all_documents_ids - docids) + }, + } + } + + pub fn evaluate( + &self, + rtxn: &heed::RoTxn, + index: &Index, + ) -> anyhow::Result + { + let db = index.facet_field_id_value_docids; + match self { + OperatorI64(fid, op) => { + Self::evaluate_number_operator::(rtxn, index, db, *fid, *op) + }, + OperatorF64(fid, op) => { + Self::evaluate_number_operator::(rtxn, index, db, *fid, *op) + }, + OperatorString(fid, op) => { + let db = db.remap_key_type::(); + Self::evaluate_string_operator(rtxn, index, db, *fid, op) + }, + Or(lhs, rhs) => { + let lhs = lhs.evaluate(rtxn, index)?; + let rhs = rhs.evaluate(rtxn, index)?; + Ok(lhs | rhs) + }, + And(lhs, rhs) => { + let lhs = lhs.evaluate(rtxn, index)?; + let rhs = rhs.evaluate(rtxn, index)?; + Ok(lhs & rhs) + }, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::update::Settings; + use heed::EnvOpenOptions; + use maplit::hashmap; + + #[test] + fn string() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the faceted fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() }); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap(); + let expected = OperatorString(1, FacetStringOperator::equal("Ponce")); + assert_eq!(condition, expected); + + let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); + let expected = OperatorString(1, FacetStringOperator::not_equal("ponce")); + assert_eq!(condition, expected); + + let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); + let expected = OperatorString(1, FacetStringOperator::not_equal("ponce")); + assert_eq!(condition, expected); + } + + #[test] + fn i64() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the faceted fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_faceted_fields(hashmap!{ "timestamp".into() => "integer".into() }); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); + let expected = OperatorI64(1, Between(22, 44)); + assert_eq!(condition, expected); + + let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); + let expected = Or( + Box::new(OperatorI64(1, LowerThan(22))), + Box::new(OperatorI64(1, GreaterThan(44))), + ); + assert_eq!(condition, expected); + } + + #[test] + fn parentheses() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the faceted fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order + builder.set_faceted_fields(hashmap!{ + "channel".into() => "string".into(), + "timestamp".into() => "integer".into(), + }); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FacetCondition::from_str( + &rtxn, &index, + "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", + ).unwrap(); + let expected = Or( + Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), + Box::new(And( + Box::new(OperatorI64(1, Between(22, 44))), + Box::new(OperatorString(0, FacetStringOperator::not_equal("ponce"))), + )) + ); + assert_eq!(condition, expected); + + let condition = FacetCondition::from_str( + &rtxn, &index, + "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", + ).unwrap(); + let expected = Or( + Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), + Box::new(Or( + Box::new(Or( + Box::new(OperatorI64(1, LowerThan(22))), + Box::new(OperatorI64(1, GreaterThan(44))), + )), + Box::new(OperatorString(0, FacetStringOperator::equal("ponce"))), + )), + ); + assert_eq!(condition, expected); + } +} diff --git a/src/search/facet/mod.rs b/src/search/facet/mod.rs index c47f290e0..41212e83e 100644 --- a/src/search/facet/mod.rs +++ b/src/search/facet/mod.rs @@ -1,661 +1,255 @@ -use std::collections::HashMap; use std::fmt::Debug; -use std::ops::Bound::{self, Unbounded, Included, Excluded}; -use std::str::FromStr; +use std::ops::Bound::{self, Included, Excluded, Unbounded}; -use heed::types::{ByteSlice, DecodeIgnore}; +use either::Either::{self, Left, Right}; +use heed::types::{DecodeIgnore, ByteSlice}; +use heed::{BytesEncode, BytesDecode}; +use heed::{Database, RoRange, RoRevRange, LazyDecode}; use log::debug; use num_traits::Bounded; -use parser::{PREC_CLIMBER, FilterParser}; -use pest::error::{Error as PestError, ErrorVariant}; -use pest::iterators::{Pair, Pairs}; -use pest::Parser; use roaring::RoaringBitmap; -use crate::facet::FacetType; -use crate::heed_codec::facet::FacetValueStringCodec; -use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; -use crate::{Index, FieldsIdsMap, CboRoaringBitmapCodec}; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::{Index, FieldId}; -use self::FacetCondition::*; -use self::FacetNumberOperator::*; -use self::parser::Rule; +pub use self::facet_condition::{FacetCondition, FacetNumberOperator, FacetStringOperator}; +mod facet_condition; mod parser; -#[derive(Debug, Copy, Clone, PartialEq)] -pub enum FacetNumberOperator { - GreaterThan(T), - GreaterThanOrEqual(T), - Equal(T), - NotEqual(T), - LowerThan(T), - LowerThanOrEqual(T), - Between(T, T), +struct FacetRange<'t, T: 't, KC> { + iter: RoRange<'t, KC, LazyDecode>, + end: Bound, } -impl FacetNumberOperator { - /// This method can return two operations in case it must express - /// an OR operation for the between case (i.e. `TO`). - fn negate(self) -> (Self, Option) { - match self { - GreaterThan(x) => (LowerThanOrEqual(x), None), - GreaterThanOrEqual(x) => (LowerThan(x), None), - Equal(x) => (NotEqual(x), None), - NotEqual(x) => (Equal(x), None), - LowerThan(x) => (GreaterThanOrEqual(x), None), - LowerThanOrEqual(x) => (GreaterThan(x), None), - Between(x, y) => (LowerThan(x), Some(GreaterThan(y))), - } - } -} - -#[derive(Debug, Clone, PartialEq)] -pub enum FacetStringOperator { - Equal(String), - NotEqual(String), -} - -impl FacetStringOperator { - fn equal(s: &str) -> Self { - FacetStringOperator::Equal(s.to_lowercase()) - } - - fn not_equal(s: &str) -> Self { - FacetStringOperator::equal(s).negate() - } - - fn negate(self) -> Self { - match self { - FacetStringOperator::Equal(x) => FacetStringOperator::NotEqual(x), - FacetStringOperator::NotEqual(x) => FacetStringOperator::Equal(x), - } - } -} - -#[derive(Debug, Clone, PartialEq)] -pub enum FacetCondition { - OperatorI64(u8, FacetNumberOperator), - OperatorF64(u8, FacetNumberOperator), - OperatorString(u8, FacetStringOperator), - Or(Box, Box), - And(Box, Box), -} - -fn get_field_id_facet_type<'a>( - fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, - items: &mut Pairs<'a, Rule>, -) -> Result<(u8, FacetType), PestError> +impl<'t, T: 't, KC> FacetRange<'t, T, KC> +where + KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, + T: PartialOrd + Copy + Bounded, { - // lexing ensures that we at least have a key - let key = items.next().unwrap(); - let field_id = fields_ids_map - .id(key.as_str()) - .ok_or_else(|| { - PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `{}` not found, available attributes are: {}", - key.as_str(), - fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", ") - ), - }, - key.as_span(), - ) - })?; - - let facet_type = faceted_fields - .get(&field_id) - .copied() - .ok_or_else(|| { - PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `{}` is not faceted, available faceted attributes are: {}", - key.as_str(), - faceted_fields.keys().flat_map(|id| fields_ids_map.name(*id)).collect::>().join(", ") - ), - }, - key.as_span(), - ) - })?; - - Ok((field_id, facet_type)) -} - -fn pest_parse(pair: Pair) -> Result> -where T: FromStr, - T::Err: ToString, -{ - match pair.as_str().parse() { - Ok(value) => Ok(value), - Err(e) => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { message: e.to_string() }, - pair.as_span(), - )) - } - } -} - -impl FacetCondition { - pub fn from_str( - rtxn: &heed::RoTxn, - index: &Index, - expression: &str, - ) -> anyhow::Result - { - let fields_ids_map = index.fields_ids_map(rtxn)?; - let faceted_fields = index.faceted_fields(rtxn)?; - let lexed = FilterParser::parse(Rule::prgm, expression)?; - FacetCondition::from_pairs(&fields_ids_map, &faceted_fields, lexed) - } - - fn from_pairs( - fim: &FieldsIdsMap, - ff: &HashMap, - expression: Pairs, - ) -> anyhow::Result - { - PREC_CLIMBER.climb( - expression, - |pair: Pair| match pair.as_rule() { - Rule::greater => Ok(Self::greater_than(fim, ff, pair)?), - Rule::geq => Ok(Self::greater_than_or_equal(fim, ff, pair)?), - Rule::eq => Ok(Self::equal(fim, ff, pair)?), - Rule::neq => Ok(Self::equal(fim, ff, pair)?.negate()), - Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), - Rule::less => Ok(Self::lower_than(fim, ff, pair)?), - Rule::between => Ok(Self::between(fim, ff, pair)?), - Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), - Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), - Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), - _ => unreachable!(), - }, - |lhs: anyhow::Result, op: Pair, rhs: anyhow::Result| { - match op.as_rule() { - Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), - Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), - _ => unreachable!(), - } - }, - ) - } - - fn negate(self) -> FacetCondition { - match self { - OperatorI64(fid, op) => match op.negate() { - (op, None) => OperatorI64(fid, op), - (a, Some(b)) => Or(Box::new(OperatorI64(fid, a)), Box::new(OperatorI64(fid, b))), - }, - OperatorF64(fid, op) => match op.negate() { - (op, None) => OperatorF64(fid, op), - (a, Some(b)) => Or(Box::new(OperatorF64(fid, a)), Box::new(OperatorF64(fid, b))), - }, - OperatorString(fid, op) => OperatorString(fid, op.negate()), - Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())), - And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())), - } - } - - fn between( - fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, - item: Pair, - ) -> anyhow::Result - { - let item_span = item.as_span(); - let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; - let lvalue = items.next().unwrap(); - let rvalue = items.next().unwrap(); - match ftype { - FacetType::Integer => { - let lvalue = pest_parse(lvalue)?; - let rvalue = pest_parse(rvalue)?; - Ok(OperatorI64(fid, Between(lvalue, rvalue))) - }, - FacetType::Float => { - let lvalue = pest_parse(lvalue)?; - let rvalue = pest_parse(rvalue)?; - Ok(OperatorF64(fid, Between(lvalue, rvalue))) - }, - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - } - } - - fn equal( - fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, - item: Pair, - ) -> anyhow::Result - { - let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; - let value = items.next().unwrap(); - match ftype { - FacetType::Integer => Ok(OperatorI64(fid, Equal(pest_parse(value)?))), - FacetType::Float => Ok(OperatorF64(fid, Equal(pest_parse(value)?))), - FacetType::String => Ok(OperatorString(fid, FacetStringOperator::equal(value.as_str()))), - } - } - - fn greater_than( - fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, - item: Pair, - ) -> anyhow::Result - { - let item_span = item.as_span(); - let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; - let value = items.next().unwrap(); - match ftype { - FacetType::Integer => Ok(OperatorI64(fid, GreaterThan(pest_parse(value)?))), - FacetType::Float => Ok(OperatorF64(fid, GreaterThan(pest_parse(value)?))), - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - } - } - - fn greater_than_or_equal( - fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, - item: Pair, - ) -> anyhow::Result - { - let item_span = item.as_span(); - let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; - let value = items.next().unwrap(); - match ftype { - FacetType::Integer => Ok(OperatorI64(fid, GreaterThanOrEqual(pest_parse(value)?))), - FacetType::Float => Ok(OperatorF64(fid, GreaterThanOrEqual(pest_parse(value)?))), - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - } - } - - fn lower_than( - fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, - item: Pair, - ) -> anyhow::Result - { - let item_span = item.as_span(); - let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; - let value = items.next().unwrap(); - match ftype { - FacetType::Integer => Ok(OperatorI64(fid, LowerThan(pest_parse(value)?))), - FacetType::Float => Ok(OperatorF64(fid, LowerThan(pest_parse(value)?))), - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - } - } - - fn lower_than_or_equal( - fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, - item: Pair, - ) -> anyhow::Result - { - let item_span = item.as_span(); - let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; - let value = items.next().unwrap(); - match ftype { - FacetType::Integer => Ok(OperatorI64(fid, LowerThanOrEqual(pest_parse(value)?))), - FacetType::Float => Ok(OperatorF64(fid, LowerThanOrEqual(pest_parse(value)?))), - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - } - } -} - -impl FacetCondition { - /// Aggregates the documents ids that are part of the specified range automatically - /// going deeper through the levels. - fn explore_facet_levels<'t, T: 't, KC>( + fn new( rtxn: &'t heed::RoTxn, - db: heed::Database, - field_id: u8, + db: Database, + field_id: FieldId, level: u8, left: Bound, right: Bound, - output: &mut RoaringBitmap, - ) -> anyhow::Result<()> - where - T: Copy + PartialEq + PartialOrd + Bounded + Debug, - KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, + ) -> heed::Result> { - match (left, right) { - // If the request is an exact value we must go directly to the deepest level. - (Included(l), Included(r)) if l == r && level > 0 => { - return Self::explore_facet_levels::(rtxn, db, field_id, 0, left, right, output); - }, - // lower TO upper when lower > upper must return no result - (Included(l), Included(r)) if l > r => return Ok(()), - (Included(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Included(r)) if l >= r => return Ok(()), - (_, _) => (), - } - - let mut left_found = None; - let mut right_found = None; - - // We must create a custom iterator to be able to iterate over the - // requested range as the range iterator cannot express some conditions. let left_bound = match left { Included(left) => Included((field_id, level, left, T::min_value())), Excluded(left) => Excluded((field_id, level, left, T::min_value())), - Unbounded => Unbounded, + Unbounded => Included((field_id, level, T::min_value(), T::min_value())), }; let right_bound = Included((field_id, level, T::max_value(), T::max_value())); - // We also make sure that we don't decode the data before we are sure we must return it. - let iter = db - .remap_key_type::() - .lazily_decode_data() - .range(rtxn, &(left_bound, right_bound))? - .take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { - match right { - Included(right) => *r <= right, - Excluded(right) => *r < right, + let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; + Ok(FacetRange { iter, end: right }) + } +} + +impl<'t, T, KC> Iterator for FacetRange<'t, T, KC> +where + KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, + KC: BytesDecode<'t, DItem = (FieldId, u8, T, T)>, + T: PartialOrd + Copy, +{ + type Item = heed::Result<((FieldId, u8, T, T), RoaringBitmap)>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok(((fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, Unbounded => true, - } - })) - .map(|r| r.and_then(|(key, lazy)| lazy.decode().map(|data| (key, data)))); - - debug!("Iterating between {:?} and {:?} (level {})", left, right, level); - - for (i, result) in iter.enumerate() { - let ((_fid, level, l, r), docids) = result?; - debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); - output.union_with(&docids); - // We save the leftest and rightest bounds we actually found at this level. - if i == 0 { left_found = Some(l); } - right_found = Some(r); - } - - // Can we go deeper? - let deeper_level = match level.checked_sub(1) { - Some(level) => level, - None => return Ok(()), - }; - - // We must refine the left and right bounds of this range by retrieving the - // missing part in a deeper level. - match left_found.zip(right_found) { - Some((left_found, right_found)) => { - // If the bound is satisfied we avoid calling this function again. - if !matches!(left, Included(l) if l == left_found) { - let sub_right = Excluded(left_found); - debug!("calling left with {:?} to {:?} (level {})", left, sub_right, deeper_level); - Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, sub_right, output)?; - } - if !matches!(right, Included(r) if r == right_found) { - let sub_left = Excluded(right_found); - debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); - Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, sub_left, right, output)?; + }; + if must_be_returned { + match docids.decode() { + Ok(docids) => Some(Ok(((fid, level, left, right), docids))), + Err(e) => Some(Err(e)), + } + } else { + None } }, - None => { - // If we found nothing at this level it means that we must find - // the same bounds but at a deeper, more precise level. - Self::explore_facet_levels::(rtxn, db, field_id, deeper_level, left, right, output)?; - }, + Some(Err(e)) => Some(Err(e)), + None => None, } - - Ok(()) } +} - fn evaluate_number_operator<'t, T: 't, KC>( +struct FacetRevRange<'t, T: 't, KC> { + iter: RoRevRange<'t, KC, LazyDecode>, + end: Bound, +} + +impl<'t, T: 't, KC> FacetRevRange<'t, T, KC> +where + KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, + T: PartialOrd + Copy + Bounded, +{ + fn new( rtxn: &'t heed::RoTxn, - index: &Index, - db: heed::Database, - field_id: u8, - operator: FacetNumberOperator, - ) -> anyhow::Result - where - T: Copy + PartialEq + PartialOrd + Bounded + Debug, - KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, - KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, + db: Database, + field_id: FieldId, + level: u8, + left: Bound, + right: Bound, + ) -> heed::Result> { - // Make sure we always bound the ranges with the field id and the level, - // as the facets values are all in the same database and prefixed by the - // field id and the level. - let (left, right) = match operator { - GreaterThan(val) => (Excluded(val), Included(T::max_value())), - GreaterThanOrEqual(val) => (Included(val), Included(T::max_value())), - Equal(val) => (Included(val), Included(val)), - NotEqual(val) => { - let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; - let docids = Self::evaluate_number_operator::(rtxn, index, db, field_id, Equal(val))?; - return Ok(all_documents_ids - docids); - }, - LowerThan(val) => (Included(T::min_value()), Excluded(val)), - LowerThanOrEqual(val) => (Included(T::min_value()), Included(val)), - Between(left, right) => (Included(left), Included(right)), + let left_bound = match left { + Included(left) => Included((field_id, level, left, T::min_value())), + Excluded(left) => Excluded((field_id, level, left, T::min_value())), + Unbounded => Included((field_id, level, T::min_value(), T::min_value())), }; + let right_bound = Included((field_id, level, T::max_value(), T::max_value())); + let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; + Ok(FacetRevRange { iter, end: right }) + } +} - // Ask for the biggest value that can exist for this specific field, if it exists - // that's fine if it don't, the value just before will be returned instead. - let biggest_level = db - .remap_types::() - .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, T::max_value(), T::max_value()))? - .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); +impl<'t, T, KC> Iterator for FacetRevRange<'t, T, KC> +where + KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, + KC: BytesDecode<'t, DItem = (FieldId, u8, T, T)>, + T: PartialOrd + Copy, +{ + type Item = heed::Result<((FieldId, u8, T, T), RoaringBitmap)>; - match biggest_level { - Some(level) => { - let mut output = RoaringBitmap::new(); - Self::explore_facet_levels::(rtxn, db, field_id, level, left, right, &mut output)?; - Ok(output) - }, - None => Ok(RoaringBitmap::new()), + fn next(&mut self) -> Option { + loop { + match self.iter.next() { + Some(Ok(((fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok(docids) => return Some(Ok(((fid, level, left, right), docids))), + Err(e) => return Some(Err(e)), + } + } + continue; + }, + Some(Err(e)) => return Some(Err(e)), + None => return None, + } } } +} - fn evaluate_string_operator( - rtxn: &heed::RoTxn, - index: &Index, - db: heed::Database, - field_id: u8, - operator: &FacetStringOperator, - ) -> anyhow::Result +pub struct FacetIter<'t, T: 't, KC> { + rtxn: &'t heed::RoTxn<'t>, + db: Database, + field_id: FieldId, + level_iters: Vec<(RoaringBitmap, Either, FacetRevRange<'t, T, KC>>)>, +} + +impl<'t, T, KC> FacetIter<'t, T, KC> +where + KC: heed::BytesDecode<'t, DItem = (FieldId, u8, T, T)>, + KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, + T: PartialOrd + Copy + Bounded, +{ + pub fn new( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { - match operator { - FacetStringOperator::Equal(string) => { - match db.get(rtxn, &(field_id, string))? { - Some(docids) => Ok(docids), - None => Ok(RoaringBitmap::new()) + let db = index.facet_field_id_value_docids.remap_key_type::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + Ok(FacetIter { rtxn, db, field_id, level_iters: vec![(documents_ids, Left(highest_iter))] }) + } + + pub fn new_reverse( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> + { + let db = index.facet_field_id_value_docids.remap_key_type::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + Ok(FacetIter { rtxn, db, field_id, level_iters: vec![(documents_ids, Right(highest_iter))] }) + } + + fn highest_level(rtxn: &'t heed::RoTxn, db: Database, fid: FieldId) -> heed::Result> { + let level = db.remap_types::() + .prefix_iter(rtxn, &[fid][..])? + .remap_key_type::() + .last().transpose()? + .map(|((_, level, _, _), _)| level); + Ok(level) + } +} + +impl<'t, T: 't, KC> Iterator for FacetIter<'t, T, KC> +where + KC: heed::BytesDecode<'t, DItem = (FieldId, u8, T, T)>, + KC: for<'x> heed::BytesEncode<'x, EItem = (FieldId, u8, T, T)>, + T: PartialOrd + Copy + Bounded + Debug, +{ + type Item = heed::Result<(T, RoaringBitmap)>; + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, last) = self.level_iters.last_mut()?; + let is_ascending = last.is_left(); + for result in last { + // If the last iterator must find an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. + if documents_ids.is_empty() { + break; } - }, - FacetStringOperator::NotEqual(string) => { - let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; - let op = FacetStringOperator::Equal(string.clone()); - let docids = Self::evaluate_string_operator(rtxn, index, db, field_id, &op)?; - Ok(all_documents_ids - docids) - }, - } - } - pub fn evaluate( - &self, - rtxn: &heed::RoTxn, - index: &Index, - ) -> anyhow::Result - { - let db = index.facet_field_id_value_docids; - match self { - OperatorI64(fid, op) => { - Self::evaluate_number_operator::(rtxn, index, db, *fid, *op) - }, - OperatorF64(fid, op) => { - Self::evaluate_number_operator::(rtxn, index, db, *fid, *op) - }, - OperatorString(fid, op) => { - let db = db.remap_key_type::(); - Self::evaluate_string_operator(rtxn, index, db, *fid, op) - }, - Or(lhs, rhs) => { - let lhs = lhs.evaluate(rtxn, index)?; - let rhs = rhs.evaluate(rtxn, index)?; - Ok(lhs | rhs) - }, - And(lhs, rhs) => { - let lhs = lhs.evaluate(rtxn, index)?; - let rhs = rhs.evaluate(rtxn, index)?; - Ok(lhs & rhs) - }, + match result { + Ok(((_fid, level, left, right), mut docids)) => { + + docids.intersect_with(&documents_ids); + if !docids.is_empty() { + documents_ids.difference_with(&docids); + + if level == 0 { + debug!("found {:?} at {:?}", docids, left); + return Some(Ok((left, docids))); + } + + let rtxn = self.rtxn; + let db = self.db; + let fid = self.field_id; + let left = Included(left); + let right = Included(right); + + debug!("calling with {:?} to {:?} (level {}) to find {:?}", + left, right, level - 1, docids, + ); + + let result = if is_ascending { + FacetRange::new(rtxn, db, fid, level - 1, left, right).map(Left) + } else { + FacetRevRange::new(rtxn, db, fid, level - 1, left, right).map(Right) + }; + + match result { + Ok(iter) => { + self.level_iters.push((docids, iter)); + continue 'outer; + }, + Err(e) => return Some(Err(e)), + } + } + }, + Err(e) => return Some(Err(e)), + } + } + self.level_iters.pop(); } } } - -#[cfg(test)] -mod tests { - use super::*; - use crate::update::Settings; - use heed::EnvOpenOptions; - use maplit::hashmap; - - #[test] - fn string() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the faceted fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); - builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() }); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap(); - let expected = OperatorString(1, FacetStringOperator::equal("Ponce")); - assert_eq!(condition, expected); - - let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); - let expected = OperatorString(1, FacetStringOperator::not_equal("ponce")); - assert_eq!(condition, expected); - - let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); - let expected = OperatorString(1, FacetStringOperator::not_equal("ponce")); - assert_eq!(condition, expected); - } - - #[test] - fn i64() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the faceted fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); - builder.set_faceted_fields(hashmap!{ "timestamp".into() => "integer".into() }); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); - let expected = OperatorI64(1, Between(22, 44)); - assert_eq!(condition, expected); - - let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); - let expected = Or( - Box::new(OperatorI64(1, LowerThan(22))), - Box::new(OperatorI64(1, GreaterThan(44))), - ); - assert_eq!(condition, expected); - } - - #[test] - fn parentheses() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the faceted fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index); - builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order - builder.set_faceted_fields(hashmap!{ - "channel".into() => "string".into(), - "timestamp".into() => "integer".into(), - }); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FacetCondition::from_str( - &rtxn, &index, - "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", - ).unwrap(); - let expected = Or( - Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), - Box::new(And( - Box::new(OperatorI64(1, Between(22, 44))), - Box::new(OperatorString(0, FacetStringOperator::not_equal("ponce"))), - )) - ); - assert_eq!(condition, expected); - - let condition = FacetCondition::from_str( - &rtxn, &index, - "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", - ).unwrap(); - let expected = Or( - Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), - Box::new(Or( - Box::new(Or( - Box::new(OperatorI64(1, LowerThan(22))), - Box::new(OperatorI64(1, GreaterThan(44))), - )), - Box::new(OperatorString(0, FacetStringOperator::equal("ponce"))), - )), - ); - assert_eq!(condition, expected); - } -} diff --git a/src/search/mod.rs b/src/search/mod.rs index af6ccaf26..d1850871d 100644 --- a/src/search/mod.rs +++ b/src/search/mod.rs @@ -1,19 +1,26 @@ use std::borrow::Cow; use std::collections::{HashMap, HashSet}; use std::fmt; +use std::time::Instant; +use anyhow::{bail, Context}; use fst::{IntoStreamer, Streamer}; use levenshtein_automata::DFA; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use log::debug; use once_cell::sync::Lazy; +use ordered_float::OrderedFloat; use roaring::bitmap::RoaringBitmap; +use crate::facet::FacetType; +use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}; +use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use crate::mdfs::Mdfs; use crate::query_tokens::{QueryTokens, QueryToken}; -use crate::{Index, DocumentId}; +use crate::{Index, FieldId, DocumentId, Criterion}; -pub use self::facet::FacetCondition; +pub use self::facet::{FacetCondition, FacetNumberOperator, FacetStringOperator}; +pub use self::facet::{FacetIter}; // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); @@ -144,6 +151,84 @@ impl<'a> Search<'a> { candidates } + fn facet_ordered( + &self, + field_id: FieldId, + facet_type: FacetType, + ascending: bool, + documents_ids: RoaringBitmap, + limit: usize, + ) -> anyhow::Result> + { + let mut limit_tmp = limit; + let mut output = Vec::new(); + match facet_type { + FacetType::Float => { + if documents_ids.len() <= 1000 { + let db = self.index.field_id_docid_facet_values.remap_key_type::(); + let mut docids_values = Vec::with_capacity(documents_ids.len() as usize); + for docid in documents_ids { + let left = (field_id, docid, f64::MIN); + let right = (field_id, docid, f64::MAX); + let mut iter = db.range(self.rtxn, &(left..=right))?; + let entry = if ascending { iter.next() } else { iter.last() }; + if let Some(((_, _, value), ())) = entry.transpose()? { + docids_values.push((docid, OrderedFloat(value))); + } + } + docids_values.sort_unstable_by_key(|(_, value)| *value); + let iter = docids_values.into_iter().map(|(id, _)| id).take(limit); + if ascending { Ok(iter.collect()) } else { Ok(iter.rev().collect()) } + } else { + let facet_fn = if ascending { + FacetIter::::new + } else { + FacetIter::::new_reverse + }; + for result in facet_fn(self.rtxn, self.index, field_id, documents_ids)? { + let (_val, docids) = result?; + limit_tmp = limit_tmp.saturating_sub(docids.len() as usize); + output.push(docids); + if limit_tmp == 0 { break } + } + Ok(output.into_iter().flatten().take(limit).collect()) + } + }, + FacetType::Integer => { + if documents_ids.len() <= 1000 { + let db = self.index.field_id_docid_facet_values.remap_key_type::(); + let mut docids_values = Vec::with_capacity(documents_ids.len() as usize); + for docid in documents_ids { + let left = (field_id, docid, i64::MIN); + let right = (field_id, docid, i64::MAX); + let mut iter = db.range(self.rtxn, &(left..=right))?; + let entry = if ascending { iter.next() } else { iter.last() }; + if let Some(((_, _, value), ())) = entry.transpose()? { + docids_values.push((docid, value)); + } + } + docids_values.sort_unstable_by_key(|(_, value)| *value); + let iter = docids_values.into_iter().map(|(id, _)| id).take(limit); + if ascending { Ok(iter.collect()) } else { Ok(iter.rev().collect()) } + } else { + let facet_fn = if ascending { + FacetIter::::new + } else { + FacetIter::::new_reverse + }; + for result in facet_fn(self.rtxn, self.index, field_id, documents_ids)? { + let (_val, docids) = result?; + limit_tmp = limit_tmp.saturating_sub(docids.len() as usize); + output.push(docids); + if limit_tmp == 0 { break } + } + Ok(output.into_iter().flatten().take(limit).collect()) + } + }, + FacetType::String => bail!("criteria facet type must be a number"), + } + } + pub fn execute(&self) -> anyhow::Result { let limit = self.limit; let fst = self.index.words_fst(self.rtxn)?; @@ -155,13 +240,34 @@ impl<'a> Search<'a> { }; // We create the original candidates with the facet conditions results. + let before = Instant::now(); let facet_candidates = match &self.facet_condition { Some(condition) => Some(condition.evaluate(self.rtxn, self.index)?), None => None, }; - debug!("facet candidates: {:?}", facet_candidates); + debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed()); + let order_by_facet = { + let criteria = self.index.criteria(self.rtxn)?; + let result = criteria.into_iter().flat_map(|criterion| { + match criterion { + Criterion::Asc(fid) => Some((fid, true)), + Criterion::Desc(fid) => Some((fid, false)), + _ => None + } + }).next(); + match result { + Some((fid, is_ascending)) => { + let faceted_fields = self.index.faceted_fields(self.rtxn)?; + let ftype = *faceted_fields.get(&fid).context("unknown field id")?; + Some((fid, ftype, is_ascending)) + }, + None => None, + } + }; + + let before = Instant::now(); let (candidates, derived_words) = match (facet_candidates, derived_words) { (Some(mut facet_candidates), Some(derived_words)) => { let words_candidates = Self::compute_candidates(&derived_words); @@ -174,17 +280,28 @@ impl<'a> Search<'a> { (Some(facet_candidates), None) => { // If the query is not set or results in no DFAs but // there is some facet conditions we return a placeholder. - let documents_ids = facet_candidates.iter().take(limit).collect(); + let documents_ids = match order_by_facet { + Some((fid, ftype, is_ascending)) => { + self.facet_ordered(fid, ftype, is_ascending, facet_candidates, limit)? + }, + None => facet_candidates.iter().take(limit).collect(), + }; return Ok(SearchResult { documents_ids, ..Default::default() }) }, (None, None) => { // If the query is not set or results in no DFAs we return a placeholder. - let documents_ids = self.index.documents_ids(self.rtxn)?.iter().take(limit).collect(); + let documents_ids = self.index.documents_ids(self.rtxn)?; + let documents_ids = match order_by_facet { + Some((fid, ftype, is_ascending)) => { + self.facet_ordered(fid, ftype, is_ascending, documents_ids, limit)? + }, + None => documents_ids.iter().take(limit).collect(), + }; return Ok(SearchResult { documents_ids, ..Default::default() }) }, }; - debug!("candidates: {:?}", candidates); + debug!("candidates: {:?} took {:.02?}", candidates, before.elapsed()); // The mana depth first search is a revised DFS that explore // solutions in the order of their proximities. @@ -203,7 +320,19 @@ impl<'a> Search<'a> { } let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); - let documents_ids = documents.into_iter().flatten().take(limit).collect(); + let documents_ids = match order_by_facet { + Some((fid, ftype, order)) => { + let mut ordered_documents = Vec::new(); + for documents_ids in documents { + let docids = self.facet_ordered(fid, ftype, order, documents_ids, limit)?; + ordered_documents.push(docids); + if ordered_documents.iter().map(Vec::len).sum::() >= limit { break } + } + ordered_documents.into_iter().flatten().take(limit).collect() + }, + None => documents.into_iter().flatten().take(limit).collect(), + }; + Ok(SearchResult { found_words, documents_ids }) } } diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index 54ce620cb..ccac80101 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -297,6 +297,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho docid_word_positions, word_pair_proximity_docids, facet_field_id_value_docids, + field_id_docid_facet_values: _, documents, } = index; diff --git a/src/update/clear_documents.rs b/src/update/clear_documents.rs index 5dc14f97d..a6e54c5e8 100644 --- a/src/update/clear_documents.rs +++ b/src/update/clear_documents.rs @@ -19,6 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, facet_field_id_value_docids, + field_id_docid_facet_values, documents, } = self.index; @@ -41,6 +42,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; facet_field_id_value_docids.clear(self.wtxn)?; + field_id_docid_facet_values.clear(self.wtxn)?; documents.clear(self.wtxn)?; Ok(number_of_documents) diff --git a/src/update/delete_documents.rs b/src/update/delete_documents.rs index b1db4f94c..a1d00de43 100644 --- a/src/update/delete_documents.rs +++ b/src/update/delete_documents.rs @@ -2,7 +2,9 @@ use fst::IntoStreamer; use heed::types::ByteSlice; use roaring::RoaringBitmap; +use crate::facet::FacetType; use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds}; +use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use super::ClearDocuments; pub struct DeleteDocuments<'t, 'u, 'i> { @@ -75,6 +77,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, facet_field_id_value_docids, + field_id_docid_facet_values, documents, } = self.index; @@ -186,10 +189,42 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // Remove the documents ids from the faceted documents ids. let faceted_fields = self.index.faceted_fields(self.wtxn)?; - for (field_id, _) in faceted_fields { + for (field_id, facet_type) in faceted_fields { let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; docids.difference_with(&self.documents_ids); self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?; + + // We delete the entries that are part of the documents ids. + let iter = field_id_docid_facet_values.prefix_iter_mut(self.wtxn, &[field_id])?; + match facet_type { + FacetType::String => { + let mut iter = iter.remap_key_type::(); + while let Some(result) = iter.next() { + let ((_fid, docid, _value), ()) = result?; + if self.documents_ids.contains(docid) { + iter.del_current()?; + } + } + }, + FacetType::Float => { + let mut iter = iter.remap_key_type::(); + while let Some(result) = iter.next() { + let ((_fid, docid, _value), ()) = result?; + if self.documents_ids.contains(docid) { + iter.del_current()?; + } + } + }, + FacetType::Integer => { + let mut iter = iter.remap_key_type::(); + while let Some(result) = iter.next() { + let ((_fid, docid, _value), ()) = result?; + if self.documents_ids.contains(docid) { + iter.del_current()?; + } + } + }, + } } // We delete the documents ids that are under the facet field id values. @@ -205,6 +240,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } + drop(iter); + Ok(self.documents_ids.len() as usize) } } diff --git a/src/update/index_documents/merge_function.rs b/src/update/index_documents/merge_function.rs index fb785fd11..6f24fcad9 100644 --- a/src/update/index_documents/merge_function.rs +++ b/src/update/index_documents/merge_function.rs @@ -1,6 +1,6 @@ use std::borrow::Cow; -use anyhow::{bail, ensure}; +use anyhow::{bail, ensure, Context}; use bstr::ByteSlice as _; use fst::IntoStreamer; use roaring::RoaringBitmap; @@ -42,6 +42,12 @@ pub fn docid_word_positions_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow:: bail!("merging docid word positions is an error ({:?})", key.as_bstr()) } +pub fn field_id_docid_facet_values_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + let first = values.first().context("no value to merge")?; + ensure!(values.iter().all(|v| v == first), "invalid field id docid facet value merging"); + Ok(first.to_vec()) +} + pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { cbo_roaring_bitmap_merge(values) } diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index 796a0910a..8b538b03d 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -21,6 +21,7 @@ use self::store::{Store, Readers}; use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, docid_word_positions_merge, documents_merge, facet_field_value_docids_merge, + field_id_docid_facet_values_merge, }; pub use self::transform::{Transform, TransformOutput}; @@ -395,6 +396,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len()); + let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len()); let mut documents_readers = Vec::with_capacity(readers.len()); readers.into_iter().for_each(|readers| { let Readers { @@ -403,6 +405,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions, words_pairs_proximities_docids, facet_field_value_docids, + field_id_docid_facet_values, documents } = readers; main_readers.push(main); @@ -410,6 +413,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers.push(docid_word_positions); words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); facet_field_value_docids_readers.push(facet_field_value_docids); + field_id_docid_facet_values_readers.push(field_id_docid_facet_values); documents_readers.push(documents); }); @@ -453,6 +457,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, + field_id_docid_facet_values_readers, )) as anyhow::Result<_> })?; @@ -461,6 +466,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, + field_id_docid_facet_values_readers, ) = readers; let mut documents_ids = self.index.documents_ids(self.wtxn)?; @@ -488,7 +494,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.put_documents_ids(self.wtxn, &documents_ids)?; let mut database_count = 0; - let total_databases = 6; + let total_databases = 7; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: 0, @@ -525,6 +531,21 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { total_databases, }); + debug!("Writing the field id docid facet values into LMDB on disk..."); + merge_into_lmdb_database( + self.wtxn, + *self.index.field_id_docid_facet_values.as_polymorph(), + field_id_docid_facet_values_readers, + field_id_docid_facet_values_merge, + write_method, + )?; + + database_count += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: database_count, + total_databases, + }); + debug!("Writing the words pairs proximities docids into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index 57f99c908..b107d4be6 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -20,14 +20,15 @@ use tempfile::tempfile; use crate::facet::FacetType; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; +use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use crate::tokenizer::{simple_tokenizer, only_token}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId}; +use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - facet_field_value_docids_merge, + facet_field_value_docids_merge, field_id_docid_facet_values_merge, }; const LMDB_MAX_KEY_LENGTH: usize = 511; @@ -42,13 +43,14 @@ pub struct Readers { pub docid_word_positions: Reader, pub words_pairs_proximities_docids: Reader, pub facet_field_value_docids: Reader, + pub field_id_docid_facet_values: Reader, pub documents: Reader, } pub struct Store { // Indexing parameters - searchable_fields: HashSet, - faceted_fields: HashMap, + searchable_fields: HashSet, + faceted_fields: HashMap, // Caches word_docids: LinkedHashMap, RoaringBitmap>, word_docids_limit: usize, @@ -65,6 +67,7 @@ pub struct Store { word_docids_sorter: Sorter, words_pairs_proximities_docids_sorter: Sorter, facet_field_value_docids_sorter: Sorter, + field_id_docid_facet_values_sorter: Sorter, // MTBL writers docid_word_positions_writer: Writer, documents_writer: Writer, @@ -72,8 +75,8 @@ pub struct Store { impl Store { pub fn new( - searchable_fields: HashSet, - faceted_fields: HashMap, + searchable_fields: HashSet, + faceted_fields: HashMap, linked_hash_map_size: Option, max_nb_chunks: Option, max_memory: Option, @@ -118,6 +121,14 @@ impl Store { max_nb_chunks, max_memory, ); + let field_id_docid_facet_values_sorter = create_sorter( + field_id_docid_facet_values_merge, + chunk_compression_type, + chunk_compression_level, + chunk_fusing_shrink_size, + max_nb_chunks, + Some(1024 * 1024 * 1024), // 1MB + ); let documents_writer = tempfile().and_then(|f| { create_writer(chunk_compression_type, chunk_compression_level, f) @@ -146,6 +157,7 @@ impl Store { word_docids_sorter, words_pairs_proximities_docids_sorter, facet_field_value_docids_sorter, + field_id_docid_facet_values_sorter, // MTBL writers docid_word_positions_writer, documents_writer, @@ -176,11 +188,13 @@ impl Store { // Save the documents ids under the facet field id and value we have seen it. fn insert_facet_values_docid( &mut self, - field_id: u8, + field_id: FieldId, field_value: FacetValue, id: DocumentId, ) -> anyhow::Result<()> { + Self::write_field_id_docid_facet_value(&mut self.field_id_docid_facet_values_sorter, field_id, id, &field_value)?; + let key = (field_id, field_value); // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.facet_field_value_docids.get_refresh(&key) { @@ -192,7 +206,7 @@ impl Store { // one element, this way next time we insert we doesn't grow the capacity. if self.facet_field_value_docids.len() == self.facet_field_value_docids_limit { // Removing the front element is equivalent to removing the LRU element. - Self::write_docid_facet_field_values( + Self::write_facet_field_value_docids( &mut self.facet_field_value_docids_sorter, self.facet_field_value_docids.pop_front(), )?; @@ -243,7 +257,7 @@ impl Store { &mut self, document_id: DocumentId, words_positions: &mut HashMap>, - facet_values: &mut HashMap>, + facet_values: &mut HashMap>, record: &[u8], ) -> anyhow::Result<()> { @@ -326,11 +340,11 @@ impl Store { Ok(()) } - fn write_docid_facet_field_values( + fn write_facet_field_value_docids( sorter: &mut Sorter, iter: I, ) -> anyhow::Result<()> - where I: IntoIterator + where I: IntoIterator { use FacetValue::*; @@ -351,6 +365,29 @@ impl Store { Ok(()) } + fn write_field_id_docid_facet_value( + sorter: &mut Sorter, + field_id: FieldId, + document_id: DocumentId, + value: &FacetValue, + ) -> anyhow::Result<()> + { + use FacetValue::*; + + let result = match value { + String(s) => FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, s)).map(Cow::into_owned), + Float(f) => FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, **f)).map(Cow::into_owned), + Integer(i) => FieldDocIdFacetI64Codec::bytes_encode(&(field_id, document_id, *i)).map(Cow::into_owned), + }; + + let key = result.context("could not serialize facet key")?; + if lmdb_key_valid_size(&key) { + sorter.insert(&key, &[])?; + } + + Ok(()) + } + fn write_word_docids(sorter: &mut Sorter, iter: I) -> anyhow::Result<()> where I: IntoIterator, RoaringBitmap)> { @@ -463,7 +500,7 @@ impl Store { &mut self.words_pairs_proximities_docids_sorter, self.words_pairs_proximities_docids, )?; - Self::write_docid_facet_field_values( + Self::write_facet_field_value_docids( &mut self.facet_field_value_docids_sorter, self.facet_field_value_docids, )?; @@ -491,10 +528,14 @@ impl Store { let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?; + let mut field_id_docid_facet_values_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.field_id_docid_facet_values_sorter.write_into(&mut field_id_docid_facet_values_wtr)?; + let main = writer_into_reader(main_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?; + let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?; let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; let documents = writer_into_reader(self.documents_writer, shrink_size)?; @@ -504,6 +545,7 @@ impl Store { docid_word_positions, words_pairs_proximities_docids, facet_field_value_docids, + field_id_docid_facet_values, documents, }) } diff --git a/src/update/index_documents/transform.rs b/src/update/index_documents/transform.rs index a42da45f1..f44593c05 100644 --- a/src/update/index_documents/transform.rs +++ b/src/update/index_documents/transform.rs @@ -10,13 +10,13 @@ use log::info; use roaring::RoaringBitmap; use serde_json::{Map, Value}; -use crate::{BEU32, MergeFn, Index, FieldsIdsMap, ExternalDocumentsIds}; +use crate::{BEU32, MergeFn, Index, FieldId, FieldsIdsMap, ExternalDocumentsIds}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use super::merge_function::merge_two_obkvs; use super::{create_writer, create_sorter, IndexDocumentsMethod}; pub struct TransformOutput { - pub primary_key: u8, + pub primary_key: FieldId, pub fields_ids_map: FieldsIdsMap, pub external_documents_ids: ExternalDocumentsIds<'static>, pub new_documents_ids: RoaringBitmap, @@ -365,7 +365,7 @@ impl Transform<'_, '_> { fn output_from_sorter( self, sorter: grenad::Sorter, - primary_key: u8, + primary_key: FieldId, fields_ids_map: FieldsIdsMap, approximate_number_of_documents: usize, mut external_documents_ids: ExternalDocumentsIds<'_>, @@ -477,7 +477,7 @@ impl Transform<'_, '_> { // TODO this can be done in parallel by using the rayon `ThreadPool`. pub fn remap_index_documents( self, - primary_key: u8, + primary_key: FieldId, fields_ids_map: FieldsIdsMap, ) -> anyhow::Result { diff --git a/src/update/settings.rs b/src/update/settings.rs index cddd68ca3..ea0f9b5be 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -8,7 +8,7 @@ use rayon::ThreadPool; use crate::update::index_documents::{Transform, IndexDocumentsMethod}; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::facet::FacetType; -use crate::{Index, FieldsIdsMap}; +use crate::{Index, FieldsIdsMap, Criterion}; pub struct Settings<'a, 't, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -27,6 +27,7 @@ pub struct Settings<'a, 't, 'u, 'i> { searchable_fields: Option>>, displayed_fields: Option>>, faceted_fields: Option>, + criteria: Option>>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -45,6 +46,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { searchable_fields: None, displayed_fields: None, faceted_fields: None, + criteria: None, } } @@ -68,6 +70,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.faceted_fields = Some(names_facet_types); } + pub fn reset_criteria(&mut self) { + self.criteria = Some(None); + } + + pub fn set_criteria(&mut self, criteria: Vec) { + self.criteria = Some(Some(criteria)); + } + pub fn execute(self, progress_callback: F) -> anyhow::Result<()> where F: Fn(UpdateIndexingStep) + Sync @@ -75,6 +85,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let mut updated_searchable_fields = None; let mut updated_faceted_fields = None; let mut updated_displayed_fields = None; + let mut updated_criteria = None; // Construct the new FieldsIdsMap based on the searchable fields order. let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; @@ -113,9 +124,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { None => fields_ids_map.insert("id").context("field id limit reached")?, }; + let current_faceted_fields = self.index.faceted_fields(self.wtxn)?; if let Some(fields_names_facet_types) = self.faceted_fields { - let current_faceted_fields = self.index.faceted_fields(self.wtxn)?; - let mut faceted_fields = HashMap::new(); for (name, sftype) in fields_names_facet_types { let ftype = FacetType::from_str(&sftype).with_context(|| format!("parsing facet type {:?}", sftype))?; @@ -147,6 +157,25 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + if let Some(criteria) = self.criteria { + match criteria { + Some(criteria_names) => { + let mut new_criteria = Vec::new(); + for name in criteria_names { + let criterion = Criterion::from_str(&mut fields_ids_map, &name)?; + if let Some(fid) = criterion.field_id() { + let name = fields_ids_map.name(fid).unwrap(); + let faceted_fields = updated_faceted_fields.as_ref().unwrap_or(¤t_faceted_fields); + ensure!(faceted_fields.contains_key(&fid), "criterion field {} must be faceted", name); + } + new_criteria.push(criterion); + } + updated_criteria = Some(Some(new_criteria)); + }, + None => updated_criteria = Some(None), + } + } + // If any setting have modified any of the datastructures it means that we need // to retrieve the documents and then reindex then with the new settings. if updated_searchable_fields.is_some() || updated_faceted_fields.is_some() { @@ -202,14 +231,19 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } if let Some(displayed_fields) = updated_displayed_fields { - // We write the displayed fields into the database here - // to make sure that the right fields are displayed. match displayed_fields { Some(fields) => self.index.put_displayed_fields(self.wtxn, &fields)?, None => self.index.delete_displayed_fields(self.wtxn).map(drop)?, } } + if let Some(criteria) = updated_criteria { + match criteria { + Some(criteria) => self.index.put_criteria(self.wtxn, &criteria)?, + None => self.index.delete_criteria(self.wtxn).map(drop)?, + } + } + Ok(()) } }