From df7a32e3d0589b72c82a323968a5e72edf34f912 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 21 Apr 2021 11:48:23 +0200 Subject: [PATCH 01/14] Move the creation date initialization into a function --- milli/src/index.rs | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index f222069f6..07af6286b 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -81,16 +81,7 @@ impl Index { let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; - { - let mut txn = env.write_txn()?; - // The db was just created, we update its metadata with the relevant information. - if main.get::<_, Str, SerdeJson>>(&txn, CREATED_AT_KEY)?.is_none() { - let now = Utc::now(); - main.put::<_, Str, SerdeJson>>(&mut txn, UPDATED_AT_KEY, &now)?; - main.put::<_, Str, SerdeJson>>(&mut txn, CREATED_AT_KEY, &now)?; - txn.commit()?; - } - } + Index::initialize_creation_dates(&env, main)?; Ok(Index { env, From a56c46b6f1dbfe22bef9b33535d5e4ae236030ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 21 Apr 2021 11:49:26 +0200 Subject: [PATCH 02/14] Explode the string and f64 facet databases into two --- milli/src/index.rs | 50 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 07af6286b..305d95cc7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -14,6 +14,10 @@ use crate::{ BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, }; +use crate::heed_codec::facet::{ + FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, + FacetValueStringCodec, FacetLevelValueF64Codec, +}; use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; @@ -40,33 +44,45 @@ const UPDATED_AT_KEY: &str = "updated-at"; pub struct Index { /// The LMDB environment which this index is associated with. pub env: heed::Env, + /// Contains many different types (e.g. the fields ids map). pub main: PolyDatabase, + /// A word and all the documents ids containing the word. pub word_docids: Database, /// A prefix of word and all the documents ids containing this prefix. pub word_prefix_docids: Database, + /// Maps a word and a document id (u32) to all the positions where the given word appears. pub docid_word_positions: Database, + /// Maps the proximity between a pair of words with all the docids where this relation appears. pub word_pair_proximity_docids: Database, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. pub word_prefix_pair_proximity_docids: Database, + /// Maps the word, level and position range with the docids that corresponds to it. pub word_level_position_docids: Database, /// Maps the level positions of a word prefix with all the docids where this prefix appears. pub word_prefix_level_position_docids: Database, - /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. - pub facet_field_id_value_docids: Database, - /// Maps the document id, the facet field id and the globally ordered value. - pub field_id_docid_facet_values: Database, + + /// Maps the facet field id, level and the number with the docids that corresponds to it. 
+ pub facet_id_f64_docids: Database, + /// Maps the facet field id and the string with the docids that corresponds to it. + pub facet_id_string_docids: Database, + + /// Maps the document id, the facet field id and the numbers. + pub field_id_docid_facet_f64s: Database, + /// Maps the document id, the facet field id and the strings. + pub field_id_docid_facet_strings: Database, + /// Maps the document id to the document as an obkv store. pub documents: Database, ObkvCodec>, } impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(11); + options.max_dbs(13); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -77,8 +93,10 @@ impl Index { let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?; - let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; - let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; + let facet_id_f64_docids = env.create_database(Some("facet-id-f64-docids"))?; + let facet_id_string_docids = env.create_database(Some("facet-id-string-docids"))?; + let field_id_docid_facet_f64s = env.create_database(Some("field-id-docid-facet-f64s"))?; + let field_id_docid_facet_strings = env.create_database(Some("field-id-docid-facet-strings"))?; let documents = env.create_database(Some("documents"))?; Index::initialize_creation_dates(&env, main)?; @@ -93,12 +111,26 @@ impl Index { word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, - facet_field_id_value_docids, - field_id_docid_facet_values, + facet_id_f64_docids, + facet_id_string_docids, + field_id_docid_facet_f64s, + field_id_docid_facet_strings, documents, }) } + fn initialize_creation_dates(env: &heed::Env, main: PolyDatabase) -> heed::Result<()> { + let mut txn = env.write_txn()?; + // The db was just created, we update its metadata with the relevant information. + if main.get::<_, Str, SerdeJson>>(&txn, CREATED_AT_KEY)?.is_none() { + let now = Utc::now(); + main.put::<_, Str, SerdeJson>>(&mut txn, UPDATED_AT_KEY, &now)?; + main.put::<_, Str, SerdeJson>>(&mut txn, CREATED_AT_KEY, &now)?; + txn.commit()?; + } + Ok(()) + } + /// Create a write transaction to be able to write into the index. 
pub fn write_txn(&self) -> heed::Result { self.env.write_txn() From 837c1041c7c02f47100c78a6ae9b18fc315f68e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 21 Apr 2021 15:43:44 +0200 Subject: [PATCH 03/14] Clear and delete the documents from the facet database --- milli/src/update/clear_documents.rs | 12 ++- milli/src/update/delete_documents.rs | 144 +++++++++++++++++---------- 2 files changed, 100 insertions(+), 56 deletions(-) diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index f89c2d00c..ba0c9e58e 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -30,8 +30,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, - facet_field_id_value_docids, - field_id_docid_facet_values, + facet_id_f64_docids, + facet_id_string_docids, + field_id_docid_facet_f64s, + field_id_docid_facet_strings, documents, } = self.index; @@ -59,8 +61,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_pair_proximity_docids.clear(self.wtxn)?; word_level_position_docids.clear(self.wtxn)?; word_prefix_level_position_docids.clear(self.wtxn)?; - facet_field_id_value_docids.clear(self.wtxn)?; - field_id_docid_facet_values.clear(self.wtxn)?; + facet_id_f64_docids.clear(self.wtxn)?; + facet_id_string_docids.clear(self.wtxn)?; + field_id_docid_facet_f64s.clear(self.wtxn)?; + field_id_docid_facet_strings.clear(self.wtxn)?; documents.clear(self.wtxn)?; Ok(number_of_documents) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 4c5f8d61a..b2b1e8410 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -4,13 +4,12 @@ use std::collections::hash_map::Entry; use anyhow::anyhow; use chrono::Utc; use fst::IntoStreamer; -use heed::types::ByteSlice; +use heed::types::{ByteSlice, Unit}; use roaring::RoaringBitmap; use serde_json::Value; -use crate::facet::FacetType; -use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds}; -use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds}; use super::ClearDocuments; pub struct DeleteDocuments<'t, 'u, 'i> { @@ -90,8 +89,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, - facet_field_id_value_docids, - field_id_docid_facet_values, + facet_id_f64_docids, + facet_id_string_docids, + field_id_docid_facet_f64s, + field_id_docid_facet_strings, documents, } = self.index; @@ -285,52 +286,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); - // Remove the documents ids from the faceted documents ids. - let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; - for (field_id, facet_type) in faceted_fields { - let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; - docids.difference_with(&self.documents_ids); - self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?; - - // We delete the entries that are part of the documents ids. 
- let iter = field_id_docid_facet_values.prefix_iter_mut(self.wtxn, &[field_id])?; - match facet_type { - FacetType::String => { - let mut iter = iter.remap_key_type::(); - while let Some(result) = iter.next() { - let ((_fid, docid, _value), ()) = result?; - if self.documents_ids.contains(docid) { - iter.del_current()?; - } - } - }, - FacetType::Number => { - let mut iter = iter.remap_key_type::(); - while let Some(result) = iter.next() { - let ((_fid, docid, _value), ()) = result?; - if self.documents_ids.contains(docid) { - iter.del_current()?; - } - } - }, - } - } - - // We delete the documents ids that are under the facet field id values. - let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?; - while let Some(result) = iter.next() { - let (bytes, mut docids) = result?; - let previous_len = docids.len(); - docids.difference_with(&self.documents_ids); - if docids.is_empty() { - iter.del_current()?; - } else if docids.len() != previous_len { - iter.put_current(bytes, &docids)?; - } - } - - drop(iter); - // We delete the documents ids that are under the word level position docids. let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::(); while let Some(result) = iter.next() { @@ -361,10 +316,95 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); + // We delete the documents ids that are under the facet field id values. + remove_docids_from_facet_field_id_value_docids( + self.wtxn, + facet_id_f64_docids, + &self.documents_ids, + )?; + + remove_docids_from_facet_field_id_value_docids( + self.wtxn, + facet_id_string_docids, + &self.documents_ids, + )?; + + // Remove the documents ids from the faceted documents ids. + let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; + for (field_id, facet_type) in faceted_fields { + let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; + docids.difference_with(&self.documents_ids); + self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?; + + remove_docids_from_field_id_docid_facet_value( + self.wtxn, + field_id_docid_facet_f64s, + field_id, + &self.documents_ids, + |(_fid, docid, _value)| docid, + )?; + + remove_docids_from_field_id_docid_facet_value( + self.wtxn, + field_id_docid_facet_strings, + field_id, + &self.documents_ids, + |(_fid, docid, _value)| docid, + )?; + } + Ok(self.documents_ids.len()) } } +fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>( + wtxn: &'a mut heed::RwTxn, + db: &heed::Database, + field_id: FieldId, + to_remove: &RoaringBitmap, + convert: F, +) -> heed::Result<()> +where + C: heed::BytesDecode<'a, DItem=K> + heed::BytesEncode<'a, EItem=K>, + F: Fn(K) -> DocumentId, +{ + let mut iter = db.remap_key_type::() + .prefix_iter_mut(wtxn, &[field_id])? 
+ .remap_key_type::(); + + while let Some(result) = iter.next() { + let (key, ()) = result?; + if to_remove.contains(convert(key)) { + iter.del_current()?; + } + } + + Ok(()) +} + +fn remove_docids_from_facet_field_id_value_docids<'a, C>( + wtxn: &'a mut heed::RwTxn, + db: &heed::Database, + to_remove: &RoaringBitmap, +) -> heed::Result<()> +where + C: heed::BytesDecode<'a> + heed::BytesEncode<'a>, +{ + let mut iter = db.remap_key_type::().iter_mut(wtxn)?; + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + let previous_len = docids.len(); + docids.difference_with(to_remove); + if docids.is_empty() { + iter.del_current()?; + } else if docids.len() != previous_len { + iter.put_current(bytes, &docids)?; + } + } + + Ok(()) +} + #[cfg(test)] mod tests { use heed::EnvOpenOptions; From 597144b0b93ff23ef7523685d5112ecc4ce799d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 28 Apr 2021 10:20:31 +0200 Subject: [PATCH 04/14] Use both number and string facet databases in the distinct system --- milli/src/search/criteria/asc_desc.rs | 57 ++++++----- milli/src/search/distinct/facet_distinct.rs | 101 +++++++++++--------- 2 files changed, 85 insertions(+), 73 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 0511ce319..9e8bebb8f 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -8,7 +8,6 @@ use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::facet::FieldDocIdFacetF64Codec; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; @@ -39,8 +38,7 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, - ) -> anyhow::Result - { + ) -> anyhow::Result { Self::new(index, rtxn, parent, field_name, true) } @@ -49,8 +47,7 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, - ) -> anyhow::Result - { + ) -> anyhow::Result { Self::new(index, rtxn, parent, field_name, false) } @@ -60,11 +57,11 @@ impl<'t> AscDesc<'t> { parent: Box, field_name: String, ascending: bool, - ) -> anyhow::Result - { + ) -> anyhow::Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let faceted_fields = index.faceted_fields(rtxn)?; - let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; + let (field_id, facet_type) = + field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; Ok(AscDesc { index, @@ -86,8 +83,10 @@ impl<'t> Criterion for AscDesc<'t> { #[logging_timer::time("AscDesc::{}")] fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { loop { - debug!("Facet {}({}) iteration", - if self.ascending { "Asc" } else { "Desc" }, self.field_name + debug!( + "Facet {}({}) iteration", + if self.ascending { "Asc" } else { "Desc" }, + self.field_name ); match self.candidates.next().transpose()? 
{ @@ -138,7 +137,7 @@ impl<'t> Criterion for AscDesc<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, + } } } } @@ -148,14 +147,13 @@ fn field_id_facet_type( fields_ids_map: &FieldsIdsMap, faceted_fields: &HashMap, field: &str, -) -> anyhow::Result<(FieldId, FacetType)> -{ - let id = fields_ids_map.id(field).with_context(|| { - format!("field {:?} isn't registered", field) - })?; - let facet_type = faceted_fields.get(field).with_context(|| { - format!("field {:?} isn't faceted", field) - })?; +) -> anyhow::Result<(FieldId, FacetType)> { + let id = fields_ids_map + .id(field) + .with_context(|| format!("field {:?} isn't registered", field))?; + let facet_type = faceted_fields + .get(field) + .with_context(|| format!("field {:?} isn't faceted", field))?; Ok((id, *facet_type)) } @@ -170,14 +168,12 @@ fn facet_ordered<'t>( facet_type: FacetType, ascending: bool, candidates: RoaringBitmap, -) -> anyhow::Result> + 't>> -{ +) -> anyhow::Result> + 't>> { match facet_type { FacetType::Number => { if candidates.len() <= CANDIDATES_THRESHOLD { - let iter = iterative_facet_ordered_iter( - index, rtxn, field_id, ascending, candidates, - )?; + let iter = + iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; Ok(Box::new(iter.map(Ok)) as Box>) } else { let facet_fn = if ascending { @@ -188,7 +184,7 @@ fn facet_ordered<'t>( let iter = facet_fn(rtxn, index, field_id, candidates)?; Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) } - }, + } FacetType::String => bail!("criteria facet type must be a number"), } } @@ -202,14 +198,14 @@ fn iterative_facet_ordered_iter<'t>( field_id: FieldId, ascending: bool, candidates: RoaringBitmap, -) -> anyhow::Result + 't> -{ - let db = index.field_id_docid_facet_values.remap_key_type::(); +) -> anyhow::Result + 't> { let mut docids_values = Vec::with_capacity(candidates.len() as usize); for docid in candidates.iter() { let left = (field_id, docid, f64::MIN); let right = (field_id, docid, f64::MAX); - let mut iter = db.range(rtxn, &(left..=right))?; + let mut iter = index + .field_id_docid_facet_f64s + .range(rtxn, &(left..=right))?; let entry = if ascending { iter.next() } else { iter.last() }; if let Some(((_, _, value), ())) = entry.transpose()? { docids_values.push((docid, OrderedFloat(value))); @@ -226,7 +222,8 @@ fn iterative_facet_ordered_iter<'t>( // The itertools GroupBy iterator doesn't provide an owned version, we are therefore // required to collect the result into an owned collection (a Vec). // https://github.com/rust-itertools/itertools/issues/499 - let vec: Vec<_> = iter.group_by(|(_, v)| *v) + let vec: Vec<_> = iter + .group_by(|(_, v)| v.clone()) .into_iter() .map(|(_, ids)| ids.map(|(id, _)| id).collect()) .collect(); diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 3c508b25b..f3952e6f1 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -1,10 +1,14 @@ use std::mem::size_of; +use heed::types::ByteSlice; use roaring::RoaringBitmap; +use super::{Distinct, DocIter}; use crate::heed_codec::facet::*; use crate::{facet::FacetType, DocumentId, FieldId, Index}; -use super::{Distinct, DocIter}; + +const FID_SIZE: usize = size_of::(); +const DOCID_SIZE: usize = size_of::(); /// A distinct implementer that is backed by facets. 
/// @@ -48,31 +52,27 @@ pub struct FacetDistinctIter<'a> { } impl<'a> FacetDistinctIter<'a> { - fn get_facet_docids<'c, KC>(&self, key: &'c KC::EItem) -> anyhow::Result - where - KC: heed::BytesEncode<'c>, - { - let facet_docids = self - .index - .facet_field_id_value_docids - .remap_key_type::() - .get(self.txn, key)? - .expect("Corrupted data: Facet values must exist"); - Ok(facet_docids) + fn facet_string_docids(&self, key: &str) -> heed::Result> { + self.index + .facet_id_string_docids + .get(self.txn, &(self.distinct, key)) + } + + fn facet_number_docids(&self, key: f64) -> heed::Result> { + // get facet docids on level 0 + self.index + .facet_id_f64_docids + .get(self.txn, &(self.distinct, 0, key, key)) } fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> { - let iter = get_facet_values::( - id, - self.distinct, - self.index, - self.txn, - )?; + let iter = facet_string_values(id, self.distinct, self.index, self.txn)?; for item in iter { let ((_, _, value), _) = item?; - let key = (self.distinct, value); - let facet_docids = self.get_facet_docids::(&key)?; + let facet_docids = self + .facet_string_docids(value)? + .expect("Corrupted data: Facet values must exist"); self.excluded.union_with(&facet_docids); } @@ -82,17 +82,13 @@ impl<'a> FacetDistinctIter<'a> { } fn distinct_number(&mut self, id: DocumentId) -> anyhow::Result<()> { - let iter = get_facet_values::(id, - self.distinct, - self.index, - self.txn, - )?; + let iter = facet_number_values(id, self.distinct, self.index, self.txn)?; for item in iter { let ((_, _, value), _) = item?; - // get facet docids on level 0 - let key = (self.distinct, 0, value, value); - let facet_docids = self.get_facet_docids::(&key)?; + let facet_docids = self + .facet_number_docids(value)? + .expect("Corrupted data: Facet values must exist"); self.excluded.union_with(&facet_docids); } @@ -129,26 +125,44 @@ impl<'a> FacetDistinctIter<'a> { } } -fn get_facet_values<'a, KC>( +fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] { + let mut key = [0; FID_SIZE + DOCID_SIZE]; + key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes()); + key[FID_SIZE..].copy_from_slice(&id.to_be_bytes()); + key +} + +fn facet_number_values<'a>( id: DocumentId, distinct: FieldId, index: &Index, txn: &'a heed::RoTxn, -) -> anyhow::Result> -where - KC: heed::BytesDecode<'a>, -{ - const FID_SIZE: usize = size_of::(); - const DOCID_SIZE: usize = size_of::(); - - let mut key = [0; FID_SIZE + DOCID_SIZE]; - key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes()); - key[FID_SIZE..].copy_from_slice(&id.to_be_bytes()); +) -> anyhow::Result> { + let key = facet_values_prefix_key(distinct, id); let iter = index - .field_id_docid_facet_values + .field_id_docid_facet_f64s + .remap_key_type::() .prefix_iter(txn, &key)? - .remap_key_type::(); + .remap_key_type::(); + + Ok(iter) +} + +fn facet_string_values<'a>( + id: DocumentId, + distinct: FieldId, + index: &Index, + txn: &'a heed::RoTxn, +) -> anyhow::Result> { + let key = facet_values_prefix_key(distinct, id); + + let iter = index + .field_id_docid_facet_strings + .remap_key_type::() + .prefix_iter(txn, &key)? + .remap_key_type::(); + Ok(iter) } @@ -186,8 +200,8 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> { mod test { use std::collections::HashMap; - use super::*; use super::super::test::{generate_index, validate_distinct_candidates}; + use super::*; use crate::facet::FacetType; macro_rules! 
test_facet_distinct { @@ -196,7 +210,8 @@ mod test { fn $name() { use std::iter::FromIterator; - let facets = HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); + let facets = + HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); let (index, fid, candidates) = generate_index($distinct, facets); let txn = index.read_txn().unwrap(); let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type); From 038e03a4e42a4c195cab2cd6730cde541af52c85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 28 Apr 2021 10:22:48 +0200 Subject: [PATCH 05/14] Use both facet databases in the FacetIter type --- milli/src/search/facet/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 0252af963..26bcf1b83 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -140,7 +140,7 @@ impl<'t> FacetIter<'t> { documents_ids: RoaringBitmap, ) -> heed::Result> { - let db = index.facet_field_id_value_docids.remap_key_type::(); + let db = index.facet_id_f64_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Left(highest_iter))]; @@ -157,7 +157,7 @@ impl<'t> FacetIter<'t> { documents_ids: RoaringBitmap, ) -> heed::Result> { - let db = index.facet_field_id_value_docids.remap_key_type::(); + let db = index.facet_id_f64_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Right(highest_iter))]; @@ -175,7 +175,7 @@ impl<'t> FacetIter<'t> { documents_ids: RoaringBitmap, ) -> heed::Result> { - let db = index.facet_field_id_value_docids.remap_key_type::(); + let db = index.facet_id_f64_docids.remap_key_type::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; let level_iters = vec![(documents_ids, Left(highest_iter))]; From bd7b285bae9b66866219eb8b73dd8f178f3810e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 28 Apr 2021 17:58:16 +0200 Subject: [PATCH 06/14] Split the update side to use the number and the strings facet databases --- milli/src/criterion.rs | 10 +- milli/src/index.rs | 96 ++++-- milli/src/search/criteria/asc_desc.rs | 41 +-- milli/src/search/distinct/facet_distinct.rs | 17 +- milli/src/search/mod.rs | 2 +- milli/src/update/clear_documents.rs | 6 +- milli/src/update/delete_documents.rs | 13 +- milli/src/update/facets.rs | 88 +++-- milli/src/update/index_documents/mod.rs | 63 ++-- milli/src/update/index_documents/store.rs | 341 ++++++++++++-------- milli/src/update/settings.rs | 14 +- 11 files changed, 406 insertions(+), 285 deletions(-) diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index 8bae99a20..1d7326db7 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::HashSet; use std::fmt; use anyhow::{Context, bail}; @@ -6,8 +6,6 @@ use regex::Regex; use serde::{Serialize, Deserialize}; use once_cell::sync::Lazy; -use crate::facet::FacetType; - static ASC_DESC_REGEX: Lazy = Lazy::new(|| { 
Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap() }); @@ -33,7 +31,7 @@ pub enum Criterion { } impl Criterion { - pub fn from_str(faceted_attributes: &HashMap, txt: &str) -> anyhow::Result { + pub fn from_str(faceted_attributes: &HashSet, txt: &str) -> anyhow::Result { match txt { "words" => Ok(Criterion::Words), "typo" => Ok(Criterion::Typo), @@ -44,7 +42,9 @@ impl Criterion { let caps = ASC_DESC_REGEX.captures(text).with_context(|| format!("unknown criterion name: {}", text))?; let order = caps.get(1).unwrap().as_str(); let field_name = caps.get(2).unwrap().as_str(); - faceted_attributes.get(field_name).with_context(|| format!("Can't use {:?} as a criterion as it isn't a faceted field.", field_name))?; + faceted_attributes.get(field_name).with_context(|| { + format!("Can't use {:?} as a criterion as it isn't a faceted field.", field_name) + })?; match order { "asc" => Ok(Criterion::Asc(field_name.to_string())), "desc" => Ok(Criterion::Desc(field_name.to_string())), diff --git a/milli/src/index.rs b/milli/src/index.rs index 305d95cc7..14b153a2e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::path::Path; use anyhow::Context; @@ -18,24 +18,24 @@ use crate::heed_codec::facet::{ FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FacetValueStringCodec, FacetLevelValueF64Codec, }; -use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; -pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; -pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; +pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; +pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; +pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; -pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; -pub const WORDS_FST_KEY: &str = "words-fst"; pub const STOP_WORDS_KEY: &str = "stop-words"; +pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids"; pub const SYNONYMS_KEY: &str = "synonyms"; +pub const WORDS_FST_KEY: &str = "words-fst"; pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; const CREATED_AT_KEY: &str = "created-at"; const UPDATED_AT_KEY: &str = "updated-at"; @@ -321,53 +321,97 @@ impl Index { /* faceted fields */ - /// Writes the facet fields associated with their facet type or `None` if - /// the facet type is currently unknown. - pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap) -> heed::Result<()> { - self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types) + /// Writes the facet fields names in the database. + pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields: &HashSet) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields) } - /// Deletes the facet fields ids associated with their facet type. 
+ /// Deletes the facet fields ids in the database. pub fn delete_faceted_fields(&self, wtxn: &mut RwTxn) -> heed::Result { self.main.delete::<_, Str>(wtxn, FACETED_FIELDS_KEY) } - /// Returns the facet fields names associated with their facet type. - pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result> { + /// Returns the facet fields names. + pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result> { Ok(self.main.get::<_, Str, SerdeJson<_>>(rtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) } /// Same as `faceted_fields`, but returns ids instead. - pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> heed::Result> { + pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> heed::Result> { let faceted_fields = self.faceted_fields(rtxn)?; let fields_ids_map = self.fields_ids_map(rtxn)?; let faceted_fields = faceted_fields .iter() - .map(|(k, v)| { - let kid = fields_ids_map + .map(|k| { + fields_ids_map .id(k) .ok_or_else(|| format!("{:?} should be present in the field id map", k)) - .expect("corrupted data: "); - (kid, *v) + .expect("corrupted data: ") }) .collect(); + Ok(faceted_fields) } /* faceted documents ids */ - /// Writes the documents ids that are faceted under this field id. - pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: FieldId, docids: &RoaringBitmap) -> heed::Result<()> { - let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; - buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + /// Writes the documents ids that are faceted with numbers under this field id. + pub fn put_number_faceted_documents_ids( + &self, + wtxn: &mut RwTxn, + field_id: FieldId, + docids: &RoaringBitmap, + ) -> heed::Result<()> + { + let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); *buffer.last_mut().unwrap() = field_id; self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } - /// Retrieve all the documents ids that faceted under this field id. - pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: FieldId) -> heed::Result { - let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; - buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + /// Retrieve all the documents ids that faceted with numbers under this field id. + pub fn number_faceted_documents_ids( + &self, + rtxn: &RoTxn, + field_id: FieldId, + ) -> heed::Result + { + let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + *buffer.last_mut().unwrap() = field_id; + match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + + /// Writes the documents ids that are faceted with strings under this field id. 
+ pub fn put_string_faceted_documents_ids( + &self, + wtxn: &mut RwTxn, + field_id: FieldId, + docids: &RoaringBitmap, + ) -> heed::Result<()> + { + let mut buffer = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); + *buffer.last_mut().unwrap() = field_id; + self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) + } + + /// Retrieve all the documents ids that faceted with strings under this field id. + pub fn string_faceted_documents_ids( + &self, + rtxn: &RoTxn, + field_id: FieldId, + ) -> heed::Result + { + let mut buffer = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; + buffer[..NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] + .copy_from_slice(NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); *buffer.last_mut().unwrap() = field_id; match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { Some(docids) => Ok(docids), diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 9e8bebb8f..32857b8d7 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::mem::take; -use anyhow::{bail, Context as _}; +use anyhow::Context; use itertools::Itertools; use log::debug; use ordered_float::OrderedFloat; @@ -23,7 +23,6 @@ pub struct AscDesc<'t> { rtxn: &'t heed::RoTxn<'t>, field_name: String, field_id: FieldId, - facet_type: FacetType, ascending: bool, query_tree: Option, candidates: Box> + 't>, @@ -51,6 +50,7 @@ impl<'t> AscDesc<'t> { Self::new(index, rtxn, parent, field_name, false) } + fn new( index: &'t Index, rtxn: &'t heed::RoTxn, @@ -60,19 +60,19 @@ impl<'t> AscDesc<'t> { ) -> anyhow::Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let faceted_fields = index.faceted_fields(rtxn)?; - let (field_id, facet_type) = - field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; + let field_id = fields_ids_map + .id(&field_name) + .with_context(|| format!("field {:?} isn't registered", field_name))?; Ok(AscDesc { index, rtxn, field_name, field_id, - facet_type, ascending, query_tree: None, candidates: Box::new(std::iter::empty()), - faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, + faceted_candidates: index.number_faceted_documents_ids(rtxn, field_id)?, bucket_candidates: RoaringBitmap::new(), parent, }) @@ -165,27 +165,20 @@ fn facet_ordered<'t>( index: &'t Index, rtxn: &'t heed::RoTxn, field_id: FieldId, - facet_type: FacetType, ascending: bool, candidates: RoaringBitmap, ) -> anyhow::Result> + 't>> { - match facet_type { - FacetType::Number => { - if candidates.len() <= CANDIDATES_THRESHOLD { - let iter = - iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; - Ok(Box::new(iter.map(Ok)) as Box>) - } else { - let facet_fn = if ascending { - FacetIter::new_reducing - } else { - FacetIter::new_reverse_reducing - }; - let iter = facet_fn(rtxn, index, field_id, candidates)?; - Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) - } - } - FacetType::String => bail!("criteria facet type must be a number"), + if candidates.len() <= CANDIDATES_THRESHOLD { + let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; + Ok(Box::new(iter.map(Ok)) as Box>) + } else { + let facet_fn = if ascending { + FacetIter::new_reducing + } else { + FacetIter::new_reverse_reducing + }; + let iter = facet_fn(rtxn, index, field_id, 
candidates)?; + Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) } } diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index f3952e6f1..44dd6bc66 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -5,7 +5,7 @@ use roaring::RoaringBitmap; use super::{Distinct, DocIter}; use crate::heed_codec::facet::*; -use crate::{facet::FacetType, DocumentId, FieldId, Index}; +use crate::{DocumentId, FieldId, Index}; const FID_SIZE: usize = size_of::(); const DOCID_SIZE: usize = size_of::(); @@ -22,7 +22,6 @@ pub struct FacetDistinct<'a> { distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>, - facet_type: FacetType, } impl<'a> FacetDistinct<'a> { @@ -30,14 +29,9 @@ impl<'a> FacetDistinct<'a> { distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>, - facet_type: FacetType, - ) -> Self { - Self { - distinct, - index, - txn, - facet_type, - } + ) -> Self + { + Self { distinct, index, txn } } } @@ -45,7 +39,6 @@ pub struct FacetDistinctIter<'a> { candidates: RoaringBitmap, distinct: FieldId, excluded: RoaringBitmap, - facet_type: FacetType, index: &'a Index, iter_offset: usize, txn: &'a heed::RoTxn<'a>, @@ -117,6 +110,7 @@ impl<'a> FacetDistinctIter<'a> { // increasing the offset we make sure to get the first valid value for the next // distinct document to keep. self.iter_offset += 1; + Ok(Some(id)) } // no more candidate at this offset, return. @@ -188,7 +182,6 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> { candidates, distinct: self.distinct, excluded, - facet_type: self.facet_type, index: self.index, iter_offset: 0, txn: self.txn, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index be107bf72..640f081ba 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -145,7 +145,7 @@ impl<'a> Search<'a> { let faceted_fields = self.index.faceted_fields(self.rtxn)?; match faceted_fields.get(name) { Some(facet_type) => { - let distinct = FacetDistinct::new(id, self.index, self.rtxn, *facet_type); + let distinct = FacetDistinct::new(id, self.index, self.rtxn); self.perform_sort(distinct, matching_words, criteria) } None => { diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ba0c9e58e..c163046ec 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -49,8 +49,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?; // We clean all the faceted documents ids. - for (field_id, _) in faceted_fields { - self.index.put_faceted_documents_ids(self.wtxn, field_id, &RoaringBitmap::default())?; + let empty = RoaringBitmap::default(); + for field_id in faceted_fields { + self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty)?; + self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty)?; } // Clear the other databases. diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index b2b1e8410..e93ff9a0a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -330,11 +330,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { )?; // Remove the documents ids from the faceted documents ids. 
- let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; - for (field_id, facet_type) in faceted_fields { - let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; + for field_id in self.index.faceted_fields_ids(self.wtxn)? { + // Remove docids from the number faceted documents ids + let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?; docids.difference_with(&self.documents_ids); - self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?; + self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?; remove_docids_from_field_id_docid_facet_value( self.wtxn, @@ -344,6 +344,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { |(_fid, docid, _value)| docid, )?; + // Remove docids from the string faceted documents ids + let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?; + docids.difference_with(&self.documents_ids); + self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?; + remove_docids_from_field_id_docid_facet_value( self.wtxn, field_id_docid_facet_strings, diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index b9e4d7488..af72133a2 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -9,7 +9,6 @@ use heed::{BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::Index; @@ -62,56 +61,51 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - for (field_id, facet_type) in faceted_fields { - let (content, documents_ids) = match facet_type { - FacetType::String => { - let documents_ids = compute_faceted_documents_ids( - self.wtxn, - self.index.facet_field_id_value_docids, - field_id, - )?; - (None, documents_ids) - }, - FacetType::Number => { - clear_field_number_levels( - self.wtxn, - self.index.facet_field_id_value_docids.remap_key_type::(), - field_id, - )?; + for field_id in faceted_fields { + // Compute and store the faceted strings documents ids. + let string_documents_ids = compute_faceted_documents_ids( + self.wtxn, + self.index.facet_id_string_docids.remap_key_type::(), + field_id, + )?; - let documents_ids = compute_faceted_documents_ids( - self.wtxn, - self.index.facet_field_id_value_docids, - field_id, - )?; + // Clear the facet number levels. + clear_field_number_levels( + self.wtxn, + self.index.facet_id_f64_docids, + field_id, + )?; - let content = compute_facet_number_levels( - self.wtxn, - self.index.facet_field_id_value_docids.remap_key_type::(), - self.chunk_compression_type, - self.chunk_compression_level, - self.chunk_fusing_shrink_size, - self.level_group_size, - self.min_level_size, - field_id, - )?; + // Compute and store the faceted numbers documents ids. 
+ let number_documents_ids = compute_faceted_documents_ids( + self.wtxn, + self.index.facet_id_f64_docids.remap_key_type::(), + field_id, + )?; - (Some(content), documents_ids) - }, - }; + let content = compute_facet_number_levels( + self.wtxn, + self.index.facet_id_f64_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.level_group_size, + self.min_level_size, + field_id, + )?; - if let Some(content) = content { - write_into_lmdb_database( - self.wtxn, - *self.index.facet_field_id_value_docids.as_polymorph(), - content, - |_, _| anyhow::bail!("invalid facet level merging"), - WriteMethod::GetMergePut, - )?; - } + self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?; + self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?; - self.index.put_faceted_documents_ids(self.wtxn, field_id, &documents_ids)?; + // Store the + write_into_lmdb_database( + self.wtxn, + *self.index.facet_id_f64_docids.as_polymorph(), + content, + |_, _| anyhow::bail!("invalid facet number level merging"), + WriteMethod::GetMergePut, + )?; } Ok(()) @@ -205,10 +199,12 @@ fn compute_faceted_documents_ids( ) -> anyhow::Result { let mut documents_ids = RoaringBitmap::new(); + for result in db.prefix_iter(rtxn, &[field_id])? { let (_key, docids) = result?; - documents_ids.union_with(&docids); + documents_ids |= docids; } + Ok(documents_ids) } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 3acae7821..10c2e41e7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -412,7 +412,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { Main, WordDocids, WordLevel0PositionDocids, - FacetLevel0ValuesDocids, + FacetLevel0NumbersDocids, } let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; @@ -478,8 +478,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); let mut word_level_position_docids_readers = Vec::with_capacity(readers.len()); - let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len()); - let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len()); + let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len()); + let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len()); + let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len()); + let mut field_id_docid_facet_strings_readers = Vec::with_capacity(readers.len()); let mut documents_readers = Vec::with_capacity(readers.len()); readers.into_iter().for_each(|readers| { let Readers { @@ -488,17 +490,21 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions, words_pairs_proximities_docids, word_level_position_docids, - facet_field_value_docids, - field_id_docid_facet_values, - documents + facet_field_numbers_docids, + facet_field_strings_docids, + field_id_docid_facet_numbers, + field_id_docid_facet_strings, + documents, } = readers; main_readers.push(main); word_docids_readers.push(word_docids); docid_word_positions_readers.push(docid_word_positions); words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); word_level_position_docids_readers.push(word_level_position_docids); - 
facet_field_value_docids_readers.push(facet_field_value_docids); - field_id_docid_facet_values_readers.push(field_id_docid_facet_values); + facet_field_numbers_docids_readers.push(facet_field_numbers_docids); + facet_field_strings_docids_readers.push(facet_field_strings_docids); + field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers); + field_id_docid_facet_strings_readers.push(field_id_docid_facet_strings); documents_readers.push(documents); }); @@ -523,8 +529,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { (DatabaseType::Main, main_readers, main_merge as MergeFn), (DatabaseType::WordDocids, word_docids_readers, word_docids_merge), ( - DatabaseType::FacetLevel0ValuesDocids, - facet_field_value_docids_readers, + DatabaseType::FacetLevel0NumbersDocids, + facet_field_numbers_docids_readers, facet_field_value_docids_merge, ), ( @@ -547,7 +553,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, - field_id_docid_facet_values_readers, + facet_field_numbers_docids_readers, + facet_field_strings_docids_readers, + field_id_docid_facet_numbers_readers, + field_id_docid_facet_strings_readers, )) as anyhow::Result<_> })?; @@ -556,7 +565,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, - field_id_docid_facet_values_readers, + facet_field_numbers_docids_readers, + facet_field_strings_docids_readers, + field_id_docid_facet_numbers_readers, + field_id_docid_facet_strings_readers, ) = readers; let mut documents_ids = self.index.documents_ids(self.wtxn)?; @@ -624,11 +636,26 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { total_databases, }); - debug!("Writing the field id docid facet values into LMDB on disk..."); + debug!("Writing the field id docid facet numbers into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, - *self.index.field_id_docid_facet_values.as_polymorph(), - field_id_docid_facet_values_readers, + *self.index.field_id_docid_facet_f64s.as_polymorph(), + field_id_docid_facet_numbers_readers, + field_id_docid_facet_values_merge, + write_method, + )?; + + database_count += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: database_count, + total_databases, + }); + + debug!("Writing the field id docid facet strings into LMDB on disk..."); + merge_into_lmdb_database( + self.wtxn, + *self.index.field_id_docid_facet_strings.as_polymorph(), + field_id_docid_facet_strings_readers, field_id_docid_facet_values_merge, write_method, )?; @@ -678,9 +705,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { write_method, )?; }, - DatabaseType::FacetLevel0ValuesDocids => { - debug!("Writing the facet level 0 values docids into LMDB on disk..."); - let db = *self.index.facet_field_id_value_docids.as_polymorph(); + DatabaseType::FacetLevel0NumbersDocids => { + debug!("Writing the facet numbers docids into LMDB on disk..."); + let db = *self.index.facet_id_f64_docids.as_polymorph(); write_into_lmdb_database( self.wtxn, db, diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 0f97476d9..ba8da6d16 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -6,25 +6,24 @@ use std::iter::FromIterator; use std::time::Instant; use std::{cmp, iter}; -use anyhow::{bail, Context}; +use anyhow::Context; use bstr::ByteSlice as _; use fst::Set; 
use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; use heed::BytesEncode; use linked_hash_map::LinkedHashMap; -use log::{debug, info, warn}; +use log::{debug, info}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind}; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::Value; use tempfile::tempfile; -use crate::facet::{FacetType, FacetValue}; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId, FieldsIdsMap}; +use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, FieldsIdsMap}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ @@ -45,8 +44,10 @@ pub struct Readers { pub docid_word_positions: Reader, pub words_pairs_proximities_docids: Reader, pub word_level_position_docids: Reader, - pub facet_field_value_docids: Reader, - pub field_id_docid_facet_values: Reader, + pub facet_field_numbers_docids: Reader, + pub facet_field_strings_docids: Reader, + pub field_id_docid_facet_numbers: Reader, + pub field_id_docid_facet_strings: Reader, pub documents: Reader, } @@ -55,13 +56,14 @@ pub struct Store<'s, A> { primary_key: String, fields_ids_map: FieldsIdsMap, searchable_fields: HashSet, - faceted_fields: HashMap, + faceted_fields: HashSet, // Caches word_docids: LinkedHashMap, RoaringBitmap>, word_docids_limit: usize, words_pairs_proximities_docids: LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, words_pairs_proximities_docids_limit: usize, - facet_field_value_docids: LinkedHashMap<(u8, FacetValue), RoaringBitmap>, + facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat), RoaringBitmap>, + facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>, facet_field_value_docids_limit: usize, // MTBL parameters chunk_compression_type: CompressionType, @@ -72,8 +74,10 @@ pub struct Store<'s, A> { word_docids_sorter: Sorter, words_pairs_proximities_docids_sorter: Sorter, word_level_position_docids_sorter: Sorter, - facet_field_value_docids_sorter: Sorter, - field_id_docid_facet_values_sorter: Sorter, + facet_field_numbers_docids_sorter: Sorter, + facet_field_strings_docids_sorter: Sorter, + field_id_docid_facet_numbers_sorter: Sorter, + field_id_docid_facet_strings_sorter: Sorter, // MTBL writers docid_word_positions_writer: Writer, documents_writer: Writer, @@ -86,7 +90,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { primary_key: String, fields_ids_map: FieldsIdsMap, searchable_fields: HashSet, - faceted_fields: HashMap, + faceted_fields: HashSet, linked_hash_map_size: Option, max_nb_chunks: Option, max_memory: Option, @@ -132,7 +136,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_nb_chunks, max_memory, ); - let facet_field_value_docids_sorter = create_sorter( + let facet_field_numbers_docids_sorter = create_sorter( facet_field_value_docids_merge, chunk_compression_type, chunk_compression_level, @@ -140,7 +144,23 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_nb_chunks, max_memory, ); - let field_id_docid_facet_values_sorter = create_sorter( + let facet_field_strings_docids_sorter = create_sorter( + facet_field_value_docids_merge, + chunk_compression_type, + chunk_compression_level, + chunk_fusing_shrink_size, + 
max_nb_chunks, + max_memory, + ); + let field_id_docid_facet_numbers_sorter = create_sorter( + field_id_docid_facet_values_merge, + chunk_compression_type, + chunk_compression_level, + chunk_fusing_shrink_size, + max_nb_chunks, + Some(1024 * 1024 * 1024), // 1MB + ); + let field_id_docid_facet_strings_sorter = create_sorter( field_id_docid_facet_values_merge, chunk_compression_type, chunk_compression_level, @@ -173,7 +193,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { word_docids_limit: linked_hash_map_size, words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size), words_pairs_proximities_docids_limit: linked_hash_map_size, - facet_field_value_docids: LinkedHashMap::with_capacity(linked_hash_map_size), + facet_field_number_docids: LinkedHashMap::with_capacity(linked_hash_map_size), + facet_field_string_docids: LinkedHashMap::with_capacity(linked_hash_map_size), facet_field_value_docids_limit: linked_hash_map_size, // MTBL parameters chunk_compression_type, @@ -184,8 +205,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { word_docids_sorter, words_pairs_proximities_docids_sorter, word_level_position_docids_sorter, - facet_field_value_docids_sorter, - field_id_docid_facet_values_sorter, + facet_field_numbers_docids_sorter, + facet_field_strings_docids_sorter, + field_id_docid_facet_numbers_sorter, + field_id_docid_facet_strings_sorter, // MTBL writers docid_word_positions_writer, documents_writer, @@ -215,34 +238,68 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - // Save the documents ids under the facet field id and value we have seen it. - fn insert_facet_values_docid( + fn insert_facet_number_values_docid( &mut self, field_id: FieldId, - field_value: FacetValue, + value: OrderedFloat, id: DocumentId, ) -> anyhow::Result<()> { - Self::write_field_id_docid_facet_value(&mut self.field_id_docid_facet_values_sorter, field_id, id, &field_value)?; + let sorter = &mut self.field_id_docid_facet_numbers_sorter; + Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?; - let key = (field_id, field_value); + let key = (field_id, value); // if get_refresh finds the element it is assured to be at the end of the linked hash map. - match self.facet_field_value_docids.get_refresh(&key) { + match self.facet_field_number_docids.get_refresh(&key) { Some(old) => { old.insert(id); }, None => { // A newly inserted element is append at the end of the linked hash map. - self.facet_field_value_docids.insert(key, RoaringBitmap::from_iter(Some(id))); + self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id))); // If the word docids just reached it's capacity we must make sure to remove // one element, this way next time we insert we doesn't grow the capacity. - if self.facet_field_value_docids.len() == self.facet_field_value_docids_limit { + if self.facet_field_number_docids.len() == self.facet_field_value_docids_limit { // Removing the front element is equivalent to removing the LRU element. - Self::write_facet_field_value_docids( - &mut self.facet_field_value_docids_sorter, - self.facet_field_value_docids.pop_front(), + Self::write_facet_field_number_docids( + &mut self.facet_field_numbers_docids_sorter, + self.facet_field_number_docids.pop_front(), )?; } } } + + Ok(()) + } + + // Save the documents ids under the facet field id and value we have seen it. 
+ fn insert_facet_string_values_docid( + &mut self, + field_id: FieldId, + value: String, + id: DocumentId, + ) -> anyhow::Result<()> + { + let sorter = &mut self.field_id_docid_facet_strings_sorter; + Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; + + let key = (field_id, value); + // if get_refresh finds the element it is assured to be at the end of the linked hash map. + match self.facet_field_string_docids.get_refresh(&key) { + Some(old) => { old.insert(id); }, + None => { + // A newly inserted element is append at the end of the linked hash map. + self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id))); + // If the word docids just reached it's capacity we must make sure to remove + // one element, this way next time we insert we doesn't grow the capacity. + if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit { + // Removing the front element is equivalent to removing the LRU element. + Self::write_facet_field_string_docids( + &mut self.facet_field_strings_docids_sorter, + self.facet_field_string_docids.pop_front(), + )?; + } + } + } + Ok(()) } @@ -287,7 +344,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { &mut self, document_id: DocumentId, words_positions: &mut HashMap>, - facet_values: &mut HashMap>, + facet_numbers_values: &mut HashMap>, + facet_strings_values: &mut HashMap>, record: &[u8], ) -> anyhow::Result<()> { @@ -306,10 +364,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { words_positions.clear(); - // We store document_id associated with all the field id and values. - for (field, values) in facet_values.drain() { + // We store document_id associated with all the facet numbers fields ids and values. + for (field, values) in facet_numbers_values.drain() { for value in values { - self.insert_facet_values_docid(field, value, document_id)?; + let value = OrderedFloat::from(value); + self.insert_facet_number_values_docid(field, value, document_id)?; + } + } + + // We store document_id associated with all the facet strings fields ids and values. 
+ for (field, values) in facet_strings_values.drain() { + for value in values { + self.insert_facet_string_values_docid(field, value, document_id)?; } } @@ -409,20 +475,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_facet_field_value_docids( + fn write_facet_field_string_docids( sorter: &mut Sorter, iter: I, ) -> anyhow::Result<()> - where I: IntoIterator + where I: IntoIterator { - use FacetValue::*; - for ((field_id, value), docids) in iter { - let result = match value { - String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned), - Number(f) => FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *f, *f)).map(Cow::into_owned), - }; - let key = result.context("could not serialize facet key")?; + let key = FacetValueStringCodec::bytes_encode(&(field_id, &value)) + .map(Cow::into_owned) + .context("could not serialize facet key")?; let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) .context("could not serialize docids")?; if lmdb_key_valid_size(&key) { @@ -433,21 +495,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(()) } - fn write_field_id_docid_facet_value( + fn write_facet_field_number_docids( + sorter: &mut Sorter, + iter: I, + ) -> anyhow::Result<()> + where I: IntoIterator), RoaringBitmap)> + { + for ((field_id, value), docids) in iter { + let key = FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *value, *value)) + .map(Cow::into_owned) + .context("could not serialize facet key")?; + let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) + .context("could not serialize docids")?; + if lmdb_key_valid_size(&key) { + sorter.insert(&key, &bytes)?; + } + } + + Ok(()) + } + + fn write_field_id_docid_facet_number_value( sorter: &mut Sorter, field_id: FieldId, document_id: DocumentId, - value: &FacetValue, + value: OrderedFloat, ) -> anyhow::Result<()> { - use FacetValue::*; + let key = FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, *value)) + .map(Cow::into_owned) + .context("could not serialize facet key")?; - let result = match value { - String(s) => FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, s)).map(Cow::into_owned), - Number(f) => FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, **f)).map(Cow::into_owned), - }; + if lmdb_key_valid_size(&key) { + sorter.insert(&key, &[])?; + } + + Ok(()) + } + + fn write_field_id_docid_facet_string_value( + sorter: &mut Sorter, + field_id: FieldId, + document_id: DocumentId, + value: &str, + ) -> anyhow::Result<()> + { + let key = FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, value)) + .map(Cow::into_owned) + .context("could not serialize facet key")?; - let key = result.context("could not serialize facet key")?; if lmdb_key_valid_size(&key) { sorter.insert(&key, &[])?; } @@ -493,7 +589,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let mut before = Instant::now(); let mut words_positions = HashMap::new(); - let mut facet_values = HashMap::new(); + let mut facet_numbers_values = HashMap::new(); + let mut facet_strings_values = HashMap::new(); let mut count: usize = 0; while let Some((key, value)) = documents.next()? 
{ @@ -513,32 +610,12 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { } for (attr, content) in document.iter() { - if self.faceted_fields.contains_key(&attr) || self.searchable_fields.contains(&attr) { + if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) { let value = serde_json::from_slice(content)?; - if let Some(ftype) = self.faceted_fields.get(&attr) { - let mut values = match parse_facet_value(*ftype, &value) { - Ok(values) => values, - Err(e) => { - // We extract the name of the attribute and the document id - // to help users debug a facet type conversion. - let attr_name = self.fields_ids_map.name(attr).unwrap(); - let document_id: Value = self.fields_ids_map.id(&self.primary_key) - .and_then(|fid| document.get(fid)) - .map(serde_json::from_slice) - .unwrap()?; - - let context = format!( - "while extracting facet from the {:?} attribute in the {} document", - attr_name, document_id, - ); - warn!("{}", e.context(context)); - - SmallVec8::default() - }, - }; - facet_values.entry(attr).or_insert_with(SmallVec8::new).extend(values.drain(..)); - } + let (facet_numbers, facet_strings) = extract_facet_values(&value); + facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers); + facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings); if self.searchable_fields.contains(&attr) { let content = match json_to_string(&value) { @@ -558,7 +635,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { } // We write the document in the documents store. - self.write_document(document_id, &mut words_positions, &mut facet_values, value)?; + self.write_document( + document_id, + &mut words_positions, + &mut facet_numbers_values, + &mut facet_strings_values, + value, + )?; } // Compute the document id of the next document. 
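The loop above feeds every faceted attribute through extract_facet_values, which replaces the old typed parse_facet_value and appears in a later hunk of this patch. A condensed, hedged re-statement of its rules (not the patch's code verbatim): numbers go to the f64 side, booleans and trimmed lowercased strings go to the string side, top-level arrays are flattened one level, and nulls, objects and nested arrays are ignored.

    use serde_json::{json, Value};

    fn extract(value: &Value, recurse: bool, nums: &mut Vec<f64>, strs: &mut Vec<String>) {
        match value {
            Value::Null | Value::Object(_) => (),
            Value::Bool(b) => strs.push(b.to_string()),
            Value::Number(n) => if let Some(f) = n.as_f64() { nums.push(f) },
            Value::String(s) => strs.push(s.trim().to_lowercase()),
            Value::Array(values) if recurse => {
                for v in values { extract(v, false, nums, strs) }
            }
            Value::Array(_) => (),
        }
    }

    fn main() {
        let (mut nums, mut strs) = (Vec::new(), Vec::new());
        extract(&json!(["Bordeaux", 7.5, true, null, [1, 2]]), true, &mut nums, &mut strs);
        assert_eq!(nums, vec![7.5]);                 // the nested array is ignored
        assert_eq!(strs, vec!["bordeaux", "true"]);  // strings lowercased, bools stringified
    }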
@@ -585,9 +668,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { &mut self.words_pairs_proximities_docids_sorter, self.words_pairs_proximities_docids, )?; - Self::write_facet_field_value_docids( - &mut self.facet_field_value_docids_sorter, - self.facet_field_value_docids, + Self::write_facet_field_number_docids( + &mut self.facet_field_numbers_docids_sorter, + self.facet_field_number_docids, + )?; + + Self::write_facet_field_string_docids( + &mut self.facet_field_strings_docids_sorter, + self.facet_field_string_docids, )?; let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; @@ -613,18 +701,26 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; - let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?; + let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?; - let mut field_id_docid_facet_values_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; - self.field_id_docid_facet_values_sorter.write_into(&mut field_id_docid_facet_values_wtr)?; + let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?; + + let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?; + + let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?; let main = writer_into_reader(main_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; - let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?; - let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?; + let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; + let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; + let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; + let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?; let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; let documents = writer_into_reader(self.documents_writer, shrink_size)?; @@ -634,8 +730,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { docid_word_positions, words_pairs_proximities_docids, word_level_position_docids, - facet_field_value_docids, - field_id_docid_facet_values, + facet_field_numbers_docids, + facet_field_strings_docids, + field_id_docid_facet_numbers, + field_id_docid_facet_strings, documents, }) } @@ 
-710,71 +808,36 @@ fn process_tokens<'a>(tokens: impl Iterator>) -> impl Iterator< .filter(|(_, t)| t.is_word()) } -fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result> { - use FacetValue::*; - - fn inner_parse_facet_value( - ftype: FacetType, +fn extract_facet_values(value: &Value) -> (Vec, Vec) { + fn inner_extract_facet_values( value: &Value, can_recurse: bool, - output: &mut SmallVec8, - ) -> anyhow::Result<()> - { + output_numbers: &mut Vec, + output_strings: &mut Vec, + ) { match value { - Value::Null => Ok(()), - Value::Bool(b) => match ftype { - FacetType::String => { - output.push(String(b.to_string())); - Ok(()) - }, - FacetType::Number => { - output.push(Number(OrderedFloat(if *b { 1.0 } else { 0.0 }))); - Ok(()) - }, - }, - Value::Number(number) => match ftype { - FacetType::String => { - output.push(String(number.to_string())); - Ok(()) - }, - FacetType::Number => match number.as_f64() { - Some(float) => { - output.push(Number(OrderedFloat(float))); - Ok(()) - }, - None => bail!("invalid facet type, expecting {} found number", ftype), - }, + Value::Null => (), + Value::Bool(b) => output_strings.push(b.to_string()), + Value::Number(number) => if let Some(float) = number.as_f64() { + output_numbers.push(float); }, Value::String(string) => { // TODO must be normalized and not only lowercased. let string = string.trim().to_lowercase(); - match ftype { - FacetType::String => { - output.push(String(string)); - Ok(()) - }, - FacetType::Number => match string.parse() { - Ok(float) => { - output.push(Number(OrderedFloat(float))); - Ok(()) - }, - Err(_err) => bail!("invalid facet type, expecting {} found string", ftype), - }, - } + output_strings.push(string); }, Value::Array(values) => if can_recurse { - values.iter().map(|v| inner_parse_facet_value(ftype, v, false, output)).collect() - } else { - bail!( - "invalid facet type, expecting {} found array (recursive arrays are not supported)", - ftype, - ); + for value in values { + inner_extract_facet_values(value, false, output_numbers, output_strings); + } }, - Value::Object(_) => bail!("invalid facet type, expecting {} found object", ftype), + Value::Object(_) => (), } } - let mut facet_values = SmallVec8::new(); - inner_parse_facet_value(ftype, value, true, &mut facet_values)?; - Ok(facet_values) + let mut facet_number_values = Vec::new(); + let mut facet_string_values = Vec::new(); + inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); + + (facet_number_values, facet_string_values) } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c4d4fcfce..79c447834 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeSet, HashMap}; +use std::collections::{BTreeSet, HashMap, HashSet}; use std::str::FromStr; use anyhow::Context; @@ -11,7 +11,6 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer}; use crate::{FieldsIdsMap, Index}; use crate::criterion::Criterion; -use crate::facet::FacetType; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; use crate::update::index_documents::{IndexDocumentsMethod, Transform}; @@ -68,7 +67,7 @@ pub struct Settings<'a, 't, 'u, 'i> { searchable_fields: Setting>, displayed_fields: Setting>, - faceted_fields: Setting>, + faceted_fields: Setting>, criteria: Setting>, stop_words: Setting>, distinct_attribute: Setting, @@ -123,7 +122,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.faceted_fields = Setting::Reset; } - pub 
fn set_faceted_fields(&mut self, names_facet_types: HashMap) { + pub fn set_faceted_fields(&mut self, names_facet_types: HashSet) { self.faceted_fields = Setting::Set(names_facet_types); } @@ -387,11 +386,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { match self.faceted_fields { Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - let mut new_facets = HashMap::new(); - for (name, ty) in fields { + let mut new_facets = HashSet::new(); + for name in fields { fields_ids_map.insert(name).context("field id limit exceeded")?; - let ty = FacetType::from_str(&ty)?; - new_facets.insert(name.clone(), ty); + new_facets.insert(name.clone()); } self.index.put_faceted_fields(self.wtxn, &new_facets)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; From e62b89a2edaf252fb7713074b171b82fe851258f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 3 May 2021 10:05:36 +0200 Subject: [PATCH 07/14] Make the facet distinct work with the new split facets --- milli/src/search/distinct/facet_distinct.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 44dd6bc66..7411e4af9 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -25,13 +25,12 @@ pub struct FacetDistinct<'a> { } impl<'a> FacetDistinct<'a> { - pub fn new( - distinct: FieldId, - index: &'a Index, - txn: &'a heed::RoTxn<'a>, - ) -> Self - { - Self { distinct, index, txn } + pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self { + Self { + distinct, + index, + txn, + } } } @@ -100,10 +99,9 @@ impl<'a> FacetDistinctIter<'a> { let mut candidates_iter = self.candidates.iter().skip(self.iter_offset); match candidates_iter.next() { Some(id) => { - match self.facet_type { - FacetType::String => self.distinct_string(id)?, - FacetType::Number => self.distinct_number(id)?, - }; + // We distinct the document id on its facet strings and facet numbers. + self.distinct_string(id)?; + self.distinct_number(id)?; // The first document of each iteration is kept, since the next call to // `difference_with` will filter out all the documents for that facet value. 
By From f7efde11d9bf60a012141efeae63ecb309fd6f17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 3 May 2021 11:45:45 +0200 Subject: [PATCH 08/14] Refine the facet condition to use both facet databases --- milli/src/search/facet/facet_condition.rs | 304 ++++++++-------------- milli/src/search/facet/mod.rs | 2 +- milli/src/search/mod.rs | 4 +- 3 files changed, 111 insertions(+), 199 deletions(-) diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index 525450ee1..a02a08571 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::ops::Bound::{self, Included, Excluded}; use std::str::FromStr; @@ -21,76 +21,52 @@ use super::parser::Rule; use super::parser::{PREC_CLIMBER, FilterParser}; use self::FacetCondition::*; -use self::FacetNumberOperator::*; +use self::Operator::*; -#[derive(Debug, Copy, Clone, PartialEq)] -pub enum FacetNumberOperator { +#[derive(Debug, Clone, PartialEq)] +pub enum Operator { GreaterThan(f64), GreaterThanOrEqual(f64), - Equal(f64), - NotEqual(f64), + Equal(Option, String), + NotEqual(Option, String), LowerThan(f64), LowerThanOrEqual(f64), Between(f64, f64), } -impl FacetNumberOperator { +impl Operator { /// This method can return two operations in case it must express /// an OR operation for the between case (i.e. `TO`). fn negate(self) -> (Self, Option) { match self { - GreaterThan(x) => (LowerThanOrEqual(x), None), - GreaterThanOrEqual(x) => (LowerThan(x), None), - Equal(x) => (NotEqual(x), None), - NotEqual(x) => (Equal(x), None), - LowerThan(x) => (GreaterThanOrEqual(x), None), - LowerThanOrEqual(x) => (GreaterThan(x), None), - Between(x, y) => (LowerThan(x), Some(GreaterThan(y))), - } - } -} - -#[derive(Debug, Clone, PartialEq)] -pub enum FacetStringOperator { - Equal(String), - NotEqual(String), -} - -impl FacetStringOperator { - fn equal(s: &str) -> Self { - FacetStringOperator::Equal(s.to_lowercase()) - } - - #[allow(dead_code)] - fn not_equal(s: &str) -> Self { - FacetStringOperator::equal(s).negate() - } - - fn negate(self) -> Self { - match self { - FacetStringOperator::Equal(x) => FacetStringOperator::NotEqual(x), - FacetStringOperator::NotEqual(x) => FacetStringOperator::Equal(x), + GreaterThan(n) => (LowerThanOrEqual(n), None), + GreaterThanOrEqual(n) => (LowerThan(n), None), + Equal(n, s) => (NotEqual(n, s), None), + NotEqual(n, s) => (Equal(n, s), None), + LowerThan(n) => (GreaterThanOrEqual(n), None), + LowerThanOrEqual(n) => (GreaterThan(n), None), + Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), } } } #[derive(Debug, Clone, PartialEq)] pub enum FacetCondition { - OperatorString(FieldId, FacetStringOperator), - OperatorNumber(FieldId, FacetNumberOperator), + Operator(FieldId, Operator), Or(Box, Box), And(Box, Box), } -fn get_field_id_facet_type<'a>( +fn field_id<'a>( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, items: &mut Pairs<'a, Rule>, -) -> Result<(FieldId, FacetType), PestError> +) -> Result> { // lexing ensures that we at least have a key let key = items.next().unwrap(); - let field_id = fields_ids_map + + fields_ids_map .id(key.as_str()) .ok_or_else(|| { PestError::new_from_span( @@ -103,32 +79,14 @@ fn get_field_id_facet_type<'a>( }, key.as_span(), ) - })?; - - let facet_type = faceted_fields - .get(&field_id) - .copied() - .ok_or_else(|| { - 
PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `{}` is not faceted, available faceted attributes are: {}", - key.as_str(), - faceted_fields.keys().flat_map(|id| fields_ids_map.name(*id)).collect::>().join(", ") - ), - }, - key.as_span(), - ) - })?; - - Ok((field_id, facet_type)) + }) } -fn pest_parse(pair: Pair) -> Result> +fn pest_parse(pair: Pair) -> (Result>, String) where T: FromStr, T::Err: ToString, { - match pair.as_str().parse() { + let result = match pair.as_str().parse::() { Ok(value) => Ok(value), Err(e) => { Err(PestError::::new_from_span( @@ -136,7 +94,9 @@ where T: FromStr, pair.as_span(), )) } - } + }; + + (result, pair.as_str().to_string()) } impl FacetCondition { @@ -232,7 +192,7 @@ impl FacetCondition { fn from_pairs( fim: &FieldsIdsMap, - ff: &HashMap, + ff: &HashSet, expression: Pairs, ) -> anyhow::Result { @@ -263,10 +223,9 @@ impl FacetCondition { fn negate(self) -> FacetCondition { match self { - OperatorString(fid, op) => OperatorString(fid, op.negate()), - OperatorNumber(fid, op) => match op.negate() { - (op, None) => OperatorNumber(fid, op), - (a, Some(b)) => Or(Box::new(OperatorNumber(fid, a)), Box::new(OperatorNumber(fid, b))), + Operator(fid, op) => match op.negate() { + (op, None) => Operator(fid, op), + (a, Some(b)) => Or(Box::new(Operator(fid, a)), Box::new(Operator(fid, b))), }, Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())), And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())), @@ -275,137 +234,100 @@ impl FacetCondition { fn between( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, item: Pair, ) -> anyhow::Result { let item_span = item.as_span(); let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; - let lvalue = items.next().unwrap(); - let rvalue = items.next().unwrap(); - match ftype { - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - FacetType::Number => { - let lvalue = pest_parse(lvalue)?; - let rvalue = pest_parse(rvalue)?; - Ok(OperatorNumber(fid, Between(lvalue, rvalue))) - }, - } + let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + + let (lresult, _) = pest_parse(items.next().unwrap()); + let (rresult, _) = pest_parse(items.next().unwrap()); + + let lvalue = lresult?; + let rvalue = rresult?; + + Ok(Operator(fid, Between(lvalue, rvalue))) } fn equal( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, item: Pair, ) -> anyhow::Result { let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); - match ftype { - FacetType::String => Ok(OperatorString(fid, FacetStringOperator::equal(value.as_str()))), - FacetType::Number => Ok(OperatorNumber(fid, Equal(pest_parse(value)?))), - } + let (result, svalue) = pest_parse(value); + + Ok(Operator(fid, Equal(Some(result?), svalue))) } fn greater_than( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, item: Pair, ) -> anyhow::Result { let item_span = item.as_span(); let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, faceted_fields, &mut 
items)?; + let value = items.next().unwrap(); - match ftype { - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - FacetType::Number => Ok(OperatorNumber(fid, GreaterThan(pest_parse(value)?))), - } + let (result, _svalue) = pest_parse(value); + + Ok(Operator(fid, GreaterThan(result?))) } fn greater_than_or_equal( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, item: Pair, ) -> anyhow::Result { let item_span = item.as_span(); let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); - match ftype { - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - FacetType::Number => Ok(OperatorNumber(fid, GreaterThanOrEqual(pest_parse(value)?))), - } + let (result, _svalue) = pest_parse(value); + + Ok(Operator(fid, GreaterThanOrEqual(result?))) } fn lower_than( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, item: Pair, ) -> anyhow::Result { let item_span = item.as_span(); let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); - match ftype { - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - FacetType::Number => Ok(OperatorNumber(fid, LowerThan(pest_parse(value)?))), - } + let (result, _svalue) = pest_parse(value); + + Ok(Operator(fid, LowerThan(result?))) } fn lower_than_or_equal( fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, + faceted_fields: &HashSet, item: Pair, ) -> anyhow::Result { let item_span = item.as_span(); let mut items = item.into_inner(); - let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; + let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; + let value = items.next().unwrap(); - match ftype { - FacetType::String => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { - message: "invalid operator on a faceted string".to_string(), - }, - item_span, - ).into()) - }, - FacetType::Number => Ok(OperatorNumber(fid, LowerThanOrEqual(pest_parse(value)?))), - } + let (result, _svalue) = pest_parse(value); + + Ok(Operator(fid, LowerThanOrEqual(result?))) } } @@ -485,34 +407,53 @@ impl FacetCondition { Ok(()) } - fn evaluate_number_operator<>( + fn evaluate_operator( rtxn: &heed::RoTxn, index: &Index, - db: heed::Database, + numbers_db: heed::Database, + strings_db: heed::Database, field_id: FieldId, - operator: FacetNumberOperator, + operator: &Operator, ) -> anyhow::Result { // Make sure we always bound the ranges with the field id and the level, // as the facets values are all in the same database and prefixed by the // field id and the level. 
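Before the range match below, it helps to spell out the set algebra that Equal and NotEqual reduce to now that a single value may live in both facet databases. A hedged sketch with hard-coded bitmaps standing in for the database lookups, assuming the roaring crate:

    use roaring::RoaringBitmap;

    fn main() {
        // Docids taken from the strings database for the textual form of the value.
        let string_docids: RoaringBitmap = (0..4u32).collect();
        // Docids found by walking the f64 levels for the numeric form, if any.
        let number_docids: RoaringBitmap = (2..6u32).collect();
        // Equal is the union of the two facet kinds.
        let equal = &string_docids | &number_docids;
        assert_eq!(equal.len(), 6);

        // NotEqual is the complement of Equal within the faceted documents,
        // i.e. (number-faceted ids | string-faceted ids) - Equal.
        let all_faceted: RoaringBitmap = (0..10u32).collect();
        let not_equal = &all_faceted - &equal;
        assert_eq!(not_equal.len(), 4);
    }

This is what lets the parser keep a single Operator enum instead of the old per-type FacetNumberOperator and FacetStringOperator.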
let (left, right) = match operator { - GreaterThan(val) => (Excluded(val), Included(f64::MAX)), - GreaterThanOrEqual(val) => (Included(val), Included(f64::MAX)), - Equal(val) => (Included(val), Included(val)), - NotEqual(val) => { - let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; - let docids = Self::evaluate_number_operator(rtxn, index, db, field_id, Equal(val))?; - return Ok(all_documents_ids - docids); + GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), + GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), + Equal(number, string) => { + let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); + let number_docids = match number { + Some(n) => { + let n = Included(*n); + let mut output = RoaringBitmap::new(); + Self::explore_facet_number_levels(rtxn, numbers_db, field_id, 0, n, n, &mut output)?; + output + }, + None => RoaringBitmap::new(), + }; + return Ok(string_docids | number_docids); }, - LowerThan(val) => (Included(f64::MIN), Excluded(val)), - LowerThanOrEqual(val) => (Included(f64::MIN), Included(val)), - Between(left, right) => (Included(left), Included(right)), + NotEqual(number, string) => { + let all_numbers_ids = if number.is_some() { + index.number_faceted_documents_ids(rtxn, field_id)? + } else { + RoaringBitmap::new() + }; + let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; + let operator = Equal(*number, string.clone()); + let docids = Self::evaluate_operator(rtxn, index, numbers_db, strings_db, field_id, &operator)?; + return Ok((all_numbers_ids | all_strings_ids) - docids); + }, + LowerThan(val) => (Included(f64::MIN), Excluded(*val)), + LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), + Between(left, right) => (Included(*left), Included(*right)), }; // Ask for the biggest value that can exist for this specific field, if it exists // that's fine if it don't, the value just before will be returned instead. - let biggest_level = db + let biggest_level = numbers_db .remap_data_type::() .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, f64::MAX, f64::MAX))? .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); @@ -520,52 +461,25 @@ impl FacetCondition { match biggest_level { Some(level) => { let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels(rtxn, db, field_id, level, left, right, &mut output)?; + Self::explore_facet_number_levels(rtxn, numbers_db, field_id, level, left, right, &mut output)?; Ok(output) }, None => Ok(RoaringBitmap::new()), } } - fn evaluate_string_operator( - rtxn: &heed::RoTxn, - index: &Index, - db: heed::Database, - field_id: FieldId, - operator: &FacetStringOperator, - ) -> anyhow::Result - { - match operator { - FacetStringOperator::Equal(string) => { - match db.get(rtxn, &(field_id, string))? 
{ - Some(docids) => Ok(docids), - None => Ok(RoaringBitmap::new()) - } - }, - FacetStringOperator::NotEqual(string) => { - let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; - let op = FacetStringOperator::Equal(string.clone()); - let docids = Self::evaluate_string_operator(rtxn, index, db, field_id, &op)?; - Ok(all_documents_ids - docids) - }, - } - } - pub fn evaluate( &self, rtxn: &heed::RoTxn, index: &Index, ) -> anyhow::Result { - let db = index.facet_field_id_value_docids; + let numbers_db = index.facet_id_f64_docids; + let strings_db = index.facet_id_string_docids; + match self { - OperatorString(fid, op) => { - let db = db.remap_key_type::(); - Self::evaluate_string_operator(rtxn, index, db, *fid, op) - }, - OperatorNumber(fid, op) => { - let db = db.remap_key_type::(); - Self::evaluate_number_operator(rtxn, index, db, *fid, *op) + Operator(fid, op) => { + Self::evaluate_operator(rtxn, index, numbers_db, strings_db, *fid, op) }, Or(lhs, rhs) => { let lhs = lhs.evaluate(rtxn, index)?; diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 26bcf1b83..fff1d14a8 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -9,7 +9,7 @@ use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::{Index, FieldId}; -pub use self::facet_condition::{FacetCondition, FacetNumberOperator, FacetStringOperator}; +pub use self::facet_condition::{FacetCondition, Operator}; pub use self::facet_distribution::FacetDistribution; mod facet_condition; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 640f081ba..f2211ad78 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -16,9 +16,7 @@ use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct}; use crate::search::criteria::r#final::{Final, FinalResult}; use crate::{Index, DocumentId}; -pub use self::facet::{ - FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator, -}; +pub use self::facet::{FacetCondition, FacetDistribution, FacetIter, Operator}; pub use self::query_tree::MatchingWords; use self::query_tree::QueryTreeBuilder; From 79efded841368f23393b877543fc4b1d59efeafc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 3 May 2021 12:51:33 +0200 Subject: [PATCH 09/14] Refine the FacetCondition from_array constructor --- milli/src/search/facet/facet_condition.rs | 68 ++++++----------------- 1 file changed, 16 insertions(+), 52 deletions(-) diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index a02a08571..899b99b71 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -1,9 +1,8 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::HashSet; use std::fmt::Debug; use std::ops::Bound::{self, Included, Excluded}; use std::str::FromStr; -use anyhow::Context; use either::Either; use heed::types::DecodeIgnore; use log::debug; @@ -12,7 +11,6 @@ use pest::iterators::{Pair, Pairs}; use pest::Parser; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::{Index, FieldId, FieldsIdsMap, CboRoaringBitmapCodec}; @@ -65,21 +63,19 @@ fn field_id<'a>( { // lexing ensures that we at least have a key let key = items.next().unwrap(); - - fields_ids_map - .id(key.as_str()) - .ok_or_else(|| { - PestError::new_from_span( - 
ErrorVariant::CustomError { - message: format!( - "attribute `{}` not found, available attributes are: {}", - key.as_str(), - fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", ") - ), - }, - key.as_span(), - ) - }) + match fields_ids_map.id(key.as_str()) { + Some(field_id) => Ok(field_id), + None => Err(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` not found, available attributes are: {}", + key.as_str(), + fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", ") + ), + }, + key.as_span(), + )), + } } fn pest_parse(pair: Pair) -> (Result>, String) @@ -110,32 +106,6 @@ impl FacetCondition { A: AsRef, B: AsRef, { - fn facet_condition( - fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, - key: &str, - value: &str, - ) -> anyhow::Result - { - let fid = fields_ids_map.id(key).with_context(|| { - format!("{:?} isn't present in the fields ids map", key) - })?; - let ftype = faceted_fields.get(key).copied().with_context(|| { - format!("{:?} isn't a faceted field", key) - })?; - let (neg, value) = match value.trim().strip_prefix('-') { - Some(value) => (true, value.trim()), - None => (false, value.trim()), - }; - - let operator = match ftype { - FacetType::String => OperatorString(fid, FacetStringOperator::equal(value)), - FacetType::Number => OperatorNumber(fid, FacetNumberOperator::Equal(value.parse()?)), - }; - - if neg { Ok(operator.negate()) } else { Ok(operator) } - } - let fields_ids_map = index.fields_ids_map(rtxn)?; let faceted_fields = index.faceted_fields(rtxn)?; let mut ands = None; @@ -145,10 +115,7 @@ impl FacetCondition { Either::Left(array) => { let mut ors = None; for rule in array { - let mut iter = rule.as_ref().splitn(2, ':'); - let key = iter.next().context("missing facet condition key")?; - let value = iter.next().context("missing facet condition value")?; - let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?; + let condition = FacetCondition::from_str(rtxn, index, rule.as_ref())?; ors = match ors.take() { Some(ors) => Some(Or(Box::new(ors), Box::new(condition))), None => Some(condition), @@ -163,10 +130,7 @@ impl FacetCondition { } }, Either::Right(rule) => { - let mut iter = rule.as_ref().splitn(2, ':'); - let key = iter.next().context("missing facet condition key")?; - let value = iter.next().context("missing facet condition value")?; - let condition = facet_condition(&fields_ids_map, &faceted_fields, key, value)?; + let condition = FacetCondition::from_str(rtxn, index, rule.as_ref())?; ands = match ands.take() { Some(ands) => Some(And(Box::new(ands), Box::new(condition))), None => Some(condition), From 02c655ff1a153eb548509284eec73e5e024a9d63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 3 May 2021 15:17:24 +0200 Subject: [PATCH 10/14] Refine the facet distribution to use both databases --- milli/src/search/criteria/asc_desc.rs | 2 - milli/src/search/facet/facet_condition.rs | 47 +++--- milli/src/search/facet/facet_distribution.rs | 156 +++++++++---------- milli/src/search/mod.rs | 15 +- 4 files changed, 106 insertions(+), 114 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 32857b8d7..f57d6d54f 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -50,7 +50,6 @@ impl<'t> AscDesc<'t> { Self::new(index, rtxn, parent, field_name, false) } - fn new( index: &'t Index, rtxn: &'t heed::RoTxn, @@ -59,7 +58,6 @@ impl<'t> AscDesc<'t> { ascending: bool, 
) -> anyhow::Result { let fields_ids_map = index.fields_ids_map(rtxn)?; - let faceted_fields = index.faceted_fields(rtxn)?; let field_id = fields_ids_map .id(&field_name) .with_context(|| format!("field {:?} isn't registered", field_name))?; diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index 899b99b71..b189f5f0f 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -55,27 +55,45 @@ pub enum FacetCondition { And(Box, Box), } -fn field_id<'a>( +fn field_id( fields_ids_map: &FieldsIdsMap, faceted_fields: &HashSet, - items: &mut Pairs<'a, Rule>, + items: &mut Pairs, ) -> Result> { // lexing ensures that we at least have a key let key = items.next().unwrap(); - match fields_ids_map.id(key.as_str()) { - Some(field_id) => Ok(field_id), - None => Err(PestError::new_from_span( + + let field_id = match fields_ids_map.id(key.as_str()) { + Some(field_id) => field_id, + None => return Err(PestError::new_from_span( ErrorVariant::CustomError { message: format!( "attribute `{}` not found, available attributes are: {}", key.as_str(), - fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", ") + fields_ids_map.iter().map(|(_, n)| n).collect::>().join(", "), ), }, key.as_span(), )), + }; + + if !faceted_fields.contains(&field_id) { + return Err(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` is not faceted, available faceted attributes are: {}", + key.as_str(), + faceted_fields.iter().flat_map(|id| { + fields_ids_map.name(*id) + }).collect::>().join(", "), + ), + }, + key.as_span(), + )); } + + Ok(field_id) } fn pest_parse(pair: Pair) -> (Result>, String) @@ -84,12 +102,10 @@ where T: FromStr, { let result = match pair.as_str().parse::() { Ok(value) => Ok(value), - Err(e) => { - Err(PestError::::new_from_span( - ErrorVariant::CustomError { message: e.to_string() }, - pair.as_span(), - )) - } + Err(e) => Err(PestError::::new_from_span( + ErrorVariant::CustomError { message: e.to_string() }, + pair.as_span(), + )), }; (result, pair.as_str().to_string()) @@ -106,8 +122,6 @@ impl FacetCondition { A: AsRef, B: AsRef, { - let fields_ids_map = index.fields_ids_map(rtxn)?; - let faceted_fields = index.faceted_fields(rtxn)?; let mut ands = None; for either in array { @@ -202,7 +216,6 @@ impl FacetCondition { item: Pair, ) -> anyhow::Result { - let item_span = item.as_span(); let mut items = item.into_inner(); let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; @@ -236,7 +249,6 @@ impl FacetCondition { item: Pair, ) -> anyhow::Result { - let item_span = item.as_span(); let mut items = item.into_inner(); let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; @@ -252,7 +264,6 @@ impl FacetCondition { item: Pair, ) -> anyhow::Result { - let item_span = item.as_span(); let mut items = item.into_inner(); let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; @@ -268,7 +279,6 @@ impl FacetCondition { item: Pair, ) -> anyhow::Result { - let item_span = item.as_span(); let mut items = item.into_inner(); let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; @@ -284,7 +294,6 @@ impl FacetCondition { item: Pair, ) -> anyhow::Result { - let item_span = item.as_span(); let mut items = item.into_inner(); let fid = field_id(fields_ids_map, faceted_fields, &mut items)?; diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 7fd2d385b..c6122cc77 100644 --- 
a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -3,12 +3,12 @@ use std::ops::Bound::Unbounded; use std::{cmp, fmt}; use anyhow::Context; -use heed::BytesDecode; +use heed::{Database, BytesDecode}; +use heed::types::{ByteSlice, Unit}; use roaring::RoaringBitmap; -use crate::facet::{FacetType, FacetValue}; -use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; -use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; +use crate::facet::FacetType; +use crate::heed_codec::facet::FacetValueStringCodec; use crate::search::facet::{FacetIter, FacetRange}; use crate::{Index, FieldId, DocumentId}; @@ -60,86 +60,81 @@ impl<'a> FacetDistribution<'a> { /// There is a small amount of candidates OR we ask for facet string values so we /// decide to iterate over the facet values of each one of them, one by one. - fn facet_values_from_documents( + fn facet_distribution_from_documents( &self, field_id: FieldId, facet_type: FacetType, candidates: &RoaringBitmap, - ) -> heed::Result> + distribution: &mut BTreeMap, + ) -> heed::Result<()> { fn fetch_facet_values<'t, KC, K: 't>( - index: &Index, rtxn: &'t heed::RoTxn, + db: Database, field_id: FieldId, candidates: &RoaringBitmap, - ) -> heed::Result> + distribution: &mut BTreeMap, + ) -> heed::Result<()> where + K: fmt::Display, KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>, - K: Into, { - let mut facet_values = BTreeMap::new(); let mut key_buffer = vec![field_id]; for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { key_buffer.truncate(1); key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = index.field_id_docid_facet_values + let iter = db + .remap_key_type::() .prefix_iter(rtxn, &key_buffer)? .remap_key_type::(); for result in iter { let ((_, _, value), ()) = result?; - *facet_values.entry(value.into()).or_insert(0) += 1; + *distribution.entry(value.to_string()).or_insert(0) += 1; } } - Ok(facet_values) + Ok(()) } - let index = self.index; - let rtxn = self.rtxn; match facet_type { - FacetType::String => { - fetch_facet_values::(index, rtxn, field_id, candidates) - }, FacetType::Number => { - fetch_facet_values::(index, rtxn, field_id, candidates) + let db = self.index.field_id_docid_facet_f64s; + fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) }, + FacetType::String => { + let db = self.index.field_id_docid_facet_strings; + fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) + } } } /// There is too much documents, we use the facet levels to move throught /// the facet values, to find the candidates and values associated. 
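A hedged illustration of the prefix key assembled by fetch_facet_values above: one byte of field id followed by the big-endian document id, so that prefix_iter yields every facet value stored for that (field, document) pair. The field id and docid below are made up:

    fn main() {
        let field_id: u8 = 3;
        let docid: u32 = 42;

        // Same layout as the key_buffer built in fetch_facet_values.
        let mut key_buffer = vec![field_id];
        key_buffer.extend_from_slice(&docid.to_be_bytes());

        assert_eq!(key_buffer, vec![3u8, 0, 0, 0, 42]);
    }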
- fn facet_values_from_facet_levels( + fn facet_numbers_distribution_from_facet_levels( &self, field_id: FieldId, - facet_type: FacetType, candidates: &RoaringBitmap, - ) -> heed::Result> + distribution: &mut BTreeMap, + ) -> heed::Result<()> { - let iter = match facet_type { - FacetType::String => unreachable!(), - FacetType::Number => { - let iter = FacetIter::new_non_reducing( - self.rtxn, self.index, field_id, candidates.clone(), - )?; - iter.map(|r| r.map(|(v, docids)| (FacetValue::from(v), docids))) - }, - }; + let iter = FacetIter::new_non_reducing( + self.rtxn, self.index, field_id, candidates.clone(), + )?; - let mut facet_values = BTreeMap::new(); for result in iter { let (value, mut docids) = result?; docids.intersect_with(candidates); if !docids.is_empty() { - facet_values.insert(value, docids.len()); + distribution.insert(value.to_string(), docids.len()); } - if facet_values.len() == self.max_values_by_facet { + if distribution.len() == self.max_values_by_facet { break; } } - Ok(facet_values) + Ok(()) } /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the @@ -147,80 +142,73 @@ impl<'a> FacetDistribution<'a> { fn facet_values_from_raw_facet_database( &self, field_id: FieldId, - facet_type: FacetType, - ) -> heed::Result> + ) -> heed::Result> { - let db = self.index.facet_field_id_value_docids; - let level = 0; - let iter = match facet_type { - FacetType::String => { - let iter = db - .prefix_iter(self.rtxn, &[field_id])? - .remap_key_type::() - .map(|r| r.map(|((_, v), docids)| (FacetValue::from(v), docids))); - Box::new(iter) as Box::> - }, - FacetType::Number => { - let db = db.remap_key_type::(); - let range = FacetRange::new( - self.rtxn, db, field_id, level, Unbounded, Unbounded, - )?; - Box::new(range.map(|r| r.map(|((_, _, v, _), docids)| (FacetValue::from(v), docids)))) - }, - }; + let mut distribution = BTreeMap::new(); - let mut facet_values = BTreeMap::new(); - for result in iter { - let (value, docids) = result?; - facet_values.insert(value, docids.len()); - if facet_values.len() == self.max_values_by_facet { + let db = self.index.facet_id_f64_docids; + let range = FacetRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; + + for result in range { + let ((_, _, value, _), docids) = result?; + distribution.insert(value.to_string(), docids.len()); + if distribution.len() == self.max_values_by_facet { break; } } - Ok(facet_values) + let iter = self.index + .facet_id_string_docids + .remap_key_type::() + .prefix_iter(self.rtxn, &[field_id])? + .remap_key_type::(); + + for result in iter { + let ((_, value), docids) = result?; + distribution.insert(value.to_string(), docids.len()); + if distribution.len() == self.max_values_by_facet { + break; + } + } + + Ok(distribution) } - fn facet_values( - &self, - field_id: FieldId, - facet_type: FacetType, - ) -> heed::Result> - { + fn facet_values(&self, field_id: FieldId) -> heed::Result> { + use FacetType::{Number, String}; + if let Some(candidates) = self.candidates.as_ref() { // Classic search, candidates were specified, we must return facet values only related // to those candidates. We also enter here for facet strings for performance reasons. 
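One consequence of this rewrite, worth making explicit before the strategy switch below: numbers and strings of the same field now land in a single BTreeMap keyed by their textual form, which is what execute() returns per field. A hedged sketch with made-up counts:

    use std::collections::BTreeMap;

    fn main() {
        let mut distribution: BTreeMap<String, u64> = BTreeMap::new();
        // Counts coming from the f64 side...
        distribution.insert(20.5f64.to_string(), 2);
        // ...and counts coming from the string side share the same map.
        distribution.insert("bordeaux".to_string(), 7);

        assert_eq!(distribution.get("20.5"), Some(&2));
        assert_eq!(distribution.get("bordeaux"), Some(&7));
    }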
- if candidates.len() <= CANDIDATES_THRESHOLD || facet_type == FacetType::String { - self.facet_values_from_documents(field_id, facet_type, candidates) + let mut distribution = BTreeMap::new(); + if candidates.len() <= CANDIDATES_THRESHOLD { + self.facet_distribution_from_documents(field_id, Number, candidates, &mut distribution)?; + self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?; } else { - self.facet_values_from_facet_levels(field_id, facet_type, candidates) + self.facet_numbers_distribution_from_facet_levels(field_id, candidates, &mut distribution)?; + self.facet_distribution_from_documents(field_id, String, candidates, &mut distribution)?; } + + Ok(distribution) } else { - self.facet_values_from_raw_facet_database(field_id, facet_type) + self.facet_values_from_raw_facet_database(field_id) } } - pub fn execute(&self) -> anyhow::Result>> { + pub fn execute(&self) -> anyhow::Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; let faceted_fields = self.index.faceted_fields(self.rtxn)?; - let fields_ids: Vec<_> = match &self.facets { - Some(names) => names - .iter() - .filter_map(|n| faceted_fields.get(n).map(|t| (n.to_string(), *t))) - .collect(), - None => faceted_fields.into_iter().collect(), - }; - let mut facets_values = BTreeMap::new(); - for (name, ftype) in fields_ids { + let mut distribution = BTreeMap::new(); + for name in faceted_fields { let fid = fields_ids_map.id(&name).with_context(|| { format!("missing field name {:?} from the fields id map", name) })?; - let values = self.facet_values(fid, ftype)?; - facets_values.insert(name, values); + let values = self.facet_values(fid)?; + distribution.insert(name, values); } - Ok(facets_values) + Ok(distribution) } } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f2211ad78..623581706 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -141,15 +141,12 @@ impl<'a> Search<'a> { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; let id = field_ids_map.id(name).expect("distinct not present in field map"); let faceted_fields = self.index.faceted_fields(self.rtxn)?; - match faceted_fields.get(name) { - Some(facet_type) => { - let distinct = FacetDistinct::new(id, self.index, self.rtxn); - self.perform_sort(distinct, matching_words, criteria) - } - None => { - let distinct = MapDistinct::new(id, self.index, self.rtxn); - self.perform_sort(distinct, matching_words, criteria) - } + if faceted_fields.contains(name) { + let distinct = FacetDistinct::new(id, self.index, self.rtxn); + self.perform_sort(distinct, matching_words, criteria) + } else { + let distinct = MapDistinct::new(id, self.index, self.rtxn); + self.perform_sort(distinct, matching_words, criteria) } } } From 3a4a150ef04b0b866b6061036c2cd0cb13b0fcdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 3 May 2021 15:58:47 +0200 Subject: [PATCH 11/14] Fix the tests and remaining warnings --- milli/src/search/criteria/asc_desc.rs | 19 +------ milli/src/search/distinct/facet_distinct.rs | 16 +++--- milli/src/search/distinct/map_distinct.rs | 4 +- milli/src/search/distinct/mod.rs | 4 +- milli/src/search/facet/facet_condition.rs | 56 ++++++++++----------- milli/src/update/clear_documents.rs | 6 ++- milli/src/update/index_documents/mod.rs | 21 ++++++-- milli/src/update/index_documents/store.rs | 8 +-- milli/src/update/settings.rs | 43 ++++++++++------ 9 files changed, 88 insertions(+), 89 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs 
b/milli/src/search/criteria/asc_desc.rs index f57d6d54f..c80bb38f1 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -1,4 +1,3 @@ -use std::collections::HashMap; use std::mem::take; use anyhow::Context; @@ -7,11 +6,10 @@ use log::debug; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; -use crate::facet::FacetType; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; -use crate::{FieldsIdsMap, FieldId, Index}; +use crate::{FieldId, Index}; use super::{Criterion, CriterionParameters, CriterionResult}; /// Threshold on the number of candidates that will make @@ -119,7 +117,6 @@ impl<'t> Criterion for AscDesc<'t> { self.index, self.rtxn, self.field_id, - self.facet_type, self.ascending, candidates, )?; @@ -141,20 +138,6 @@ impl<'t> Criterion for AscDesc<'t> { } } -fn field_id_facet_type( - fields_ids_map: &FieldsIdsMap, - faceted_fields: &HashMap, - field: &str, -) -> anyhow::Result<(FieldId, FacetType)> { - let id = fields_ids_map - .id(field) - .with_context(|| format!("field {:?} isn't registered", field))?; - let facet_type = faceted_fields - .get(field) - .with_context(|| format!("field {:?} isn't faceted", field))?; - Ok((id, *facet_type)) -} - /// Returns an iterator over groups of the given candidates in ascending or descending order. /// /// It will either use an iterative or a recursive method on the whole facet database depending diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 7411e4af9..9485087d3 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -189,23 +189,21 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> { #[cfg(test)] mod test { - use std::collections::HashMap; + use std::collections::HashSet; use super::super::test::{generate_index, validate_distinct_candidates}; use super::*; - use crate::facet::FacetType; macro_rules! 
test_facet_distinct { - ($name:ident, $distinct:literal, $facet_type:expr) => { + ($name:ident, $distinct:literal) => { #[test] fn $name() { use std::iter::FromIterator; - let facets = - HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); + let facets = HashSet::from_iter(Some(($distinct.to_string()))); let (index, fid, candidates) = generate_index($distinct, facets); let txn = index.read_txn().unwrap(); - let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type); + let mut map_distinct = FacetDistinct::new(fid, &index, &txn); let excluded = RoaringBitmap::new(); let mut iter = map_distinct.distinct(candidates.clone(), excluded); let count = validate_distinct_candidates(iter.by_ref(), fid, &index); @@ -215,7 +213,7 @@ mod test { }; } - test_facet_distinct!(test_string, "txt", FacetType::String); - test_facet_distinct!(test_strings, "txts", FacetType::String); - test_facet_distinct!(test_number, "cat-int", FacetType::Number); + test_facet_distinct!(test_string, "txt"); + test_facet_distinct!(test_strings, "txts"); + test_facet_distinct!(test_number, "cat-int"); } diff --git a/milli/src/search/distinct/map_distinct.rs b/milli/src/search/distinct/map_distinct.rs index 4c01d1ded..465af2c3b 100644 --- a/milli/src/search/distinct/map_distinct.rs +++ b/milli/src/search/distinct/map_distinct.rs @@ -110,7 +110,7 @@ impl<'a, 'b> Distinct<'b> for MapDistinct<'a> { #[cfg(test)] mod test { - use std::collections::HashMap; + use std::collections::HashSet; use super::*; use super::super::test::{generate_index, validate_distinct_candidates}; @@ -119,7 +119,7 @@ mod test { ($name:ident, $distinct:literal) => { #[test] fn $name() { - let (index, fid, candidates) = generate_index($distinct, HashMap::new()); + let (index, fid, candidates) = generate_index($distinct, HashSet::new()); let txn = index.read_txn().unwrap(); let mut map_distinct = MapDistinct::new(fid, &index, &txn); let excluded = RoaringBitmap::new(); diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 776f0d2b3..0dd628d5b 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -28,7 +28,7 @@ pub trait Distinct<'a> { #[cfg(test)] mod test { - use std::collections::{HashMap, HashSet}; + use std::collections::HashSet; use once_cell::sync::Lazy; use rand::{seq::SliceRandom, Rng}; @@ -74,7 +74,7 @@ mod test { /// Returns a temporary index populated with random test documents, the FieldId for the /// distinct attribute, and the RoaringBitmap with the document ids. - pub(crate) fn generate_index(distinct: &str, facets: HashMap) -> (TempIndex, FieldId, RoaringBitmap) { + pub(crate) fn generate_index(distinct: &str, facets: HashSet) -> (TempIndex, FieldId, RoaringBitmap) { let index = TempIndex::new(); let mut txn = index.write_txn().unwrap(); diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index b189f5f0f..e7917df97 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -240,7 +240,10 @@ impl FacetCondition { let value = items.next().unwrap(); let (result, svalue) = pest_parse(value); - Ok(Operator(fid, Equal(Some(result?), svalue))) + // TODO we must normalize instead of lowercase. 
+ let svalue = svalue.to_lowercase(); + + Ok(Operator(fid, Equal(result.ok(), svalue))) } fn greater_than( @@ -473,7 +476,8 @@ mod tests { use super::*; use crate::update::Settings; use heed::EnvOpenOptions; - use maplit::hashmap; + use maplit::hashset; + use big_s::S; #[test] fn string() { @@ -485,22 +489,22 @@ mod tests { // Set the faceted fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() }); + builder.set_faceted_fields(hashset!{ S("channel") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); - let condition = FacetCondition::from_str(&rtxn, &index, "channel = ponce").unwrap(); - let expected = OperatorString(0, FacetStringOperator::equal("Ponce")); + let condition = FacetCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap(); + let expected = Operator(0, Operator::Equal(None, S("ponce"))); assert_eq!(condition, expected); let condition = FacetCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); - let expected = OperatorString(0, FacetStringOperator::not_equal("ponce")); + let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); assert_eq!(condition, expected); let condition = FacetCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); - let expected = OperatorString(0, FacetStringOperator::not_equal("ponce")); + let expected = Operator(0, Operator::NotEqual(None, S("ponce"))); assert_eq!(condition, expected); } @@ -514,20 +518,20 @@ mod tests { // Set the faceted fields to be the channel. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashmap!{ "timestamp".into() => "number".into() }); + builder.set_faceted_fields(hashset!{ "timestamp".into() }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); // Test that the facet condition is correctly generated. let rtxn = index.read_txn().unwrap(); let condition = FacetCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); - let expected = OperatorNumber(0, Between(22.0, 44.0)); + let expected = Operator(0, Between(22.0, 44.0)); assert_eq!(condition, expected); let condition = FacetCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); let expected = Or( - Box::new(OperatorNumber(0, LowerThan(22.0))), - Box::new(OperatorNumber(0, GreaterThan(44.0))), + Box::new(Operator(0, LowerThan(22.0))), + Box::new(Operator(0, GreaterThan(44.0))), ); assert_eq!(condition, expected); } @@ -542,11 +546,8 @@ mod tests { // Set the faceted fields to be the channel. 
let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order - builder.set_faceted_fields(hashmap!{ - "channel".into() => "string".into(), - "timestamp".into() => "number".into(), - }); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -557,10 +558,10 @@ mod tests { "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", ).unwrap(); let expected = Or( - Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), + Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), Box::new(And( - Box::new(OperatorNumber(1, Between(22.0, 44.0))), - Box::new(OperatorString(0, FacetStringOperator::not_equal("ponce"))), + Box::new(Operator(1, Between(22.0, 44.0))), + Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))), )) ); assert_eq!(condition, expected); @@ -570,13 +571,13 @@ mod tests { "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", ).unwrap(); let expected = Or( - Box::new(OperatorString(0, FacetStringOperator::equal("gotaga"))), + Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), Box::new(Or( Box::new(Or( - Box::new(OperatorNumber(1, LowerThan(22.0))), - Box::new(OperatorNumber(1, GreaterThan(44.0))), + Box::new(Operator(1, LowerThan(22.0))), + Box::new(Operator(1, GreaterThan(44.0))), )), - Box::new(OperatorString(0, FacetStringOperator::equal("ponce"))), + Box::new(Operator(0, Operator::Equal(None, S("ponce")))), )), ); assert_eq!(condition, expected); @@ -592,11 +593,8 @@ mod tests { // Set the faceted fields to be the channel. 
let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order - builder.set_faceted_fields(hashmap!{ - "channel".into() => "string".into(), - "timestamp".into() => "number".into(), - }); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_faceted_fields(hashset!{ S("channel"), S("timestamp") }); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); @@ -604,7 +602,7 @@ mod tests { let rtxn = index.read_txn().unwrap(); let condition = FacetCondition::from_array( &rtxn, &index, - vec![Either::Right("channel:gotaga"), Either::Left(vec!["timestamp:44", "channel:-ponce"])], + vec![Either::Right("channel = gotaga"), Either::Left(vec!["timestamp = 44", "channel != ponce"])], ).unwrap().unwrap(); let expected = FacetCondition::from_str( &rtxn, &index, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index c163046ec..e4c1d35f8 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -118,8 +118,10 @@ mod tests { assert!(index.docid_word_positions.is_empty(&rtxn).unwrap()); assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); - assert!(index.facet_field_id_value_docids.is_empty(&rtxn).unwrap()); - assert!(index.field_id_docid_facet_values.is_empty(&rtxn).unwrap()); + assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); + assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); + assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); + assert!(index.field_id_docid_facet_strings.is_empty(&rtxn).unwrap()); assert!(index.documents.is_empty(&rtxn).unwrap()); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 10c2e41e7..064f4e6fd 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -450,8 +450,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { .enumerate() .map(|(i, documents)| { let store = Store::new( - primary_key.clone(), - fields_ids_map.clone(), searchable_fields.clone(), faceted_fields.clone(), linked_hash_map_size, @@ -553,7 +551,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, - facet_field_numbers_docids_readers, facet_field_strings_docids_readers, field_id_docid_facet_numbers_readers, field_id_docid_facet_strings_readers, @@ -565,7 +562,6 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, - facet_field_numbers_docids_readers, facet_field_strings_docids_readers, field_id_docid_facet_numbers_readers, field_id_docid_facet_strings_readers, @@ -599,7 +595,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.put_documents_ids(self.wtxn, &documents_ids)?; let mut database_count = 0; - let total_databases = 8; + let total_databases = 10; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: 0, @@ -636,6 +632,21 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { total_databases, }); + debug!("Writing the facet id string docids into LMDB on disk..."); + merge_into_lmdb_database( + self.wtxn, + *self.index.facet_id_string_docids.as_polymorph(), + 
facet_field_strings_docids_readers, + facet_field_value_docids_merge, + write_method, + )?; + + database_count += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: database_count, + total_databases, + }); + debug!("Writing the field id docid facet numbers into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index ba8da6d16..afc199293 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -23,7 +23,7 @@ use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, FieldsIdsMap}; +use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ @@ -53,8 +53,6 @@ pub struct Readers { pub struct Store<'s, A> { // Indexing parameters - primary_key: String, - fields_ids_map: FieldsIdsMap, searchable_fields: HashSet, faceted_fields: HashSet, // Caches @@ -87,8 +85,6 @@ pub struct Store<'s, A> { impl<'s, A: AsRef<[u8]>> Store<'s, A> { pub fn new( - primary_key: String, - fields_ids_map: FieldsIdsMap, searchable_fields: HashSet, faceted_fields: HashSet, linked_hash_map_size: Option, @@ -184,8 +180,6 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Ok(Store { // Indexing parameters. - primary_key, - fields_ids_map, searchable_fields, faceted_fields, // Caches diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 79c447834..1571f627d 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1,5 +1,4 @@ use std::collections::{BTreeSet, HashMap, HashSet}; -use std::str::FromStr; use anyhow::Context; use chrono::Utc; @@ -443,9 +442,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { #[cfg(test)] mod tests { use heed::EnvOpenOptions; - use maplit::{btreeset, hashmap}; + use heed::types::ByteSlice; + use maplit::{btreeset, hashmap, hashset}; + use big_s::S; - use crate::facet::FacetType; use crate::update::{IndexDocuments, UpdateFormat}; use super::*; @@ -620,37 +620,53 @@ mod tests { // Set the faceted fields to be the age. let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_faceted_fields(hashmap!{ "age".into() => "number".into() }); + builder.set_faceted_fields(hashset!{ S("age") }); builder.execute(|_, _| ()).unwrap(); // Then index some documents. - let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let content = &br#"[ + { "name": "kevin", "age": 23 }, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]"#[..]; let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.update_format(UpdateFormat::Json); builder.enable_autogenerate_docids(); - builder.update_format(UpdateFormat::Csv); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); // Check that the displayed fields are correctly set. 
let rtxn = index.read_txn().unwrap(); let fields_ids = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashmap!{ "age".to_string() => FacetType::Number }); + assert_eq!(fields_ids, hashset!{ S("age") }); // Only count the field_id 0 and level 0 facet values. - let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count(); + // TODO we must support typed CSVs for numbers to be understood. + let count = index.facet_id_f64_docids + .remap_key_type::() + .prefix_iter(&rtxn, &[0, 0]).unwrap().count(); assert_eq!(count, 3); drop(rtxn); // Index a little more documents with new and current facets values. let mut wtxn = index.write_txn().unwrap(); - let content = &b"name,age\nkevin2,23\nkevina2,21\nbenoit2,35\n"[..]; + let content = &br#"[ + { "name": "kevin2", "age": 23 }, + { "name": "kevina2", "age": 21 }, + { "name": "benoit", "age": 35 } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 2); - builder.update_format(UpdateFormat::Csv); + builder.enable_autogenerate_docids(); + builder.update_format(UpdateFormat::Json); builder.execute(content, |_, _| ()).unwrap(); wtxn.commit().unwrap(); let rtxn = index.read_txn().unwrap(); // Only count the field_id 0 and level 0 facet values. - let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[0, 0]).unwrap().count(); + // TODO we must support typed CSVs for numbers to be understood. + let count = index.facet_id_f64_docids + .remap_key_type::() + .prefix_iter(&rtxn, &[0, 0]).unwrap().count(); assert_eq!(count, 4); } @@ -817,10 +833,7 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_displayed_fields(vec!["hello".to_string()]); - builder.set_faceted_fields(hashmap!{ - "age".into() => "number".into(), - "toto".into() => "number".into(), - }); + builder.set_faceted_fields(hashset!{ S("age"), S("toto") }); builder.set_criteria(vec!["asc(toto)".to_string()]); builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); From 28bd9e183e789f03388ce48d44d0e9df0699f330 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 4 May 2021 12:05:43 +0200 Subject: [PATCH 12/14] Fix the infos crate to support split facet databases --- infos/src/main.rs | 219 +++++++++++++++++++++++++--------------------- 1 file changed, 121 insertions(+), 98 deletions(-) diff --git a/infos/src/main.rs b/infos/src/main.rs index 902394af8..ee2060d38 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -5,6 +5,7 @@ use std::{str, io, fmt}; use anyhow::Context; use byte_unit::Byte; use heed::EnvOpenOptions; +use milli::facet::FacetType; use milli::{Index, TreeLevel}; use structopt::StructOpt; @@ -22,8 +23,11 @@ const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids"; -const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids"; -const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values"; +const FACET_ID_F64_DOCIDS_DB_NAME: &str = "facet-id-f64-docids"; +const FACET_ID_STRING_DOCIDS_DB_NAME: &str = "facet-id-string-docids"; +const FIELD_ID_DOCID_FACET_F64S_DB_NAME: &str = "field-id-docid-facet-f64s"; +const FIELD_ID_DOCID_FACET_STRINGS_DB_NAME: &str = "field-id-docid-facet-strings"; + const DOCUMENTS_DB_NAME: &str = 
"documents"; const ALL_DATABASE_NAMES: &[&str] = &[ @@ -35,8 +39,10 @@ const ALL_DATABASE_NAMES: &[&str] = &[ WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, WORD_LEVEL_POSITION_DOCIDS_DB_NAME, WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME, - FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME, - FIELD_ID_DOCID_FACET_VALUES_DB_NAME, + FACET_ID_F64_DOCIDS_DB_NAME, + FACET_ID_STRING_DOCIDS_DB_NAME, + FIELD_ID_DOCID_FACET_F64S_DB_NAME, + FIELD_ID_DOCID_FACET_STRINGS_DB_NAME, DOCUMENTS_DB_NAME, ]; @@ -108,8 +114,18 @@ enum Command { prefixes: Vec, }, - /// Outputs a CSV with the documents ids along with the facet values where it appears. - FacetValuesDocids { + /// Outputs a CSV with the documents ids along with the facet numbers where it appears. + FacetNumbersDocids { + /// Display the whole documents ids in details. + #[structopt(long)] + full_display: bool, + + /// The field name in the document. + field_name: String, + }, + + /// Outputs a CSV with the documents ids along with the facet strings where it appears. + FacetStringsDocids { /// Display the whole documents ids in details. #[structopt(long)] full_display: bool, @@ -149,8 +165,8 @@ enum Command { internal_documents_ids: Vec, }, - /// Outputs some facets statistics for the given facet name. - FacetStats { + /// Outputs some facets numbers statistics for the given facet name. + FacetNumberStats { /// The field name in the document. field_name: String, }, @@ -243,8 +259,11 @@ fn main() -> anyhow::Result<()> { WordsPrefixesDocids { full_display, prefixes } => { words_prefixes_docids(&index, &rtxn, !full_display, prefixes) }, - FacetValuesDocids { full_display, field_name } => { - facet_values_docids(&index, &rtxn, !full_display, field_name) + FacetNumbersDocids { full_display, field_name } => { + facet_values_docids(&index, &rtxn, !full_display, FacetType::Number, field_name) + }, + FacetStringsDocids { full_display, field_name } => { + facet_values_docids(&index, &rtxn, !full_display, FacetType::String, field_name) }, WordsLevelPositionsDocids { full_display, words } => { words_level_positions_docids(&index, &rtxn, !full_display, words) @@ -255,7 +274,7 @@ fn main() -> anyhow::Result<()> { DocidsWordsPositions { full_display, internal_documents_ids } => { docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) }, - FacetStats { field_name } => facet_stats(&index, &rtxn, field_name), + FacetNumberStats { field_name } => facet_number_stats(&index, &rtxn, field_name), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfPositionsByWord => { average_number_of_positions_by_word(&index, &rtxn) @@ -297,36 +316,22 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow: } /// Helper function that converts the facet value key to a unique type -/// that can be used to log or display purposes. -fn facet_values_iter<'txn, DC: 'txn, T>( +/// that can be used for log or display purposes. +fn facet_values_iter<'txn, KC: 'txn, DC: 'txn>( rtxn: &'txn heed::RoTxn, - db: heed::Database, + db: heed::Database, field_id: u8, - facet_type: milli::facet::FacetType, - string_fn: impl Fn(&str) -> T + 'txn, - float_fn: impl Fn(u8, f64, f64) -> T + 'txn, -) -> heed::Result> + 'txn>> +) -> heed::Result> + 'txn>> where + KC: heed::BytesDecode<'txn>, DC: heed::BytesDecode<'txn>, { - use milli::facet::FacetType; - use milli::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; + let iter = db + .remap_key_type::() + .prefix_iter(&rtxn, &[field_id])? 
+ .remap_key_type::(); - let iter = db.prefix_iter(&rtxn, &[field_id])?; - match facet_type { - FacetType::String => { - let iter = iter.remap_key_type::() - .map(move |r| r.map(|((_, key), value)| (string_fn(key), value))); - Ok(Box::new(iter) as Box>) - }, - FacetType::Number => { - let iter = iter.remap_key_type::() - .map(move |r| r.map(|((_, level, left, right), value)| { - (float_fn(level, left, right), value) - })); - Ok(Box::new(iter)) - }, - } + Ok(Box::new(iter)) } fn facet_number_value_to_string(level: u8, left: T, right: T) -> (u8, String) { @@ -352,9 +357,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, - facet_field_id_value_docids, - field_id_docid_facet_values: _, - documents + facet_id_f64_docids, + facet_id_string_docids, + field_id_docid_facet_f64s: _, + field_id_docid_facet_strings: _, + documents, } = index; let main_name = "main"; @@ -365,7 +372,8 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let word_pair_proximity_docids_name = "word_pair_proximity_docids"; let word_level_position_docids_name = "word_level_position_docids"; let word_prefix_level_position_docids_name = "word_prefix_level_position_docids"; - let facet_field_id_value_docids_name = "facet_field_id_value_docids"; + let facet_id_f64_docids_name = "facet_id_f64_docids"; + let facet_id_string_docids_name = "facet_id_string_docids"; let documents_name = "documents"; let mut heap = BinaryHeap::with_capacity(limit + 1); @@ -437,27 +445,27 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let faceted_fields = index.faceted_fields_ids(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?; - for (field_id, field_type) in faceted_fields { - let facet_name = fields_ids_map.name(field_id).unwrap(); - let db = facet_field_id_value_docids.remap_data_type::(); - let iter = facet_values_iter( - rtxn, - db, - field_id, - field_type, - |key| key.to_owned(), - |level, left, right| { - let mut output = facet_number_value_to_string(level, left, right).1; - let _ = write!(&mut output, " (level {})", level); - output - }, - )?; + for facet_id in faceted_fields { + let facet_name = fields_ids_map.name(facet_id).unwrap(); - for result in iter { - let (fvalue, value) = result?; + // List the facet numbers of this facet id. + let db = facet_id_f64_docids.remap_data_type::(); + for result in facet_values_iter(rtxn, db, facet_id)? { + let ((_fid, level, left, right), value) = result?; + let mut output = facet_number_value_to_string(level, left, right).1; + write!(&mut output, " (level {})", level)?; + let key = format!("{} {}", facet_name, output); + heap.push(Reverse((value.len(), key, facet_id_f64_docids_name))); + if heap.len() > limit { heap.pop(); } + } + + // List the facet strings of this facet id. + let db = facet_id_string_docids.remap_data_type::(); + for result in facet_values_iter(rtxn, db, facet_id)? { + let ((_fid, fvalue), value) = result?; let key = format!("{} {}", facet_name, fvalue); - heap.push(Reverse((value.len(), key, facet_field_id_value_docids_name))); + heap.push(Reverse((value.len(), key, facet_id_string_docids_name))); if heap.len() > limit { heap.pop(); } } } @@ -536,38 +544,55 @@ fn words_prefixes_docids( Ok(wtr.flush()?) 
} -fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> { +fn facet_values_docids( + index: &Index, + rtxn: &heed::RoTxn, + debug: bool, + facet_type: FacetType, + field_name: String, +) -> anyhow::Result<()> +{ let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?; let field_id = fields_ids_map.id(&field_name) .with_context(|| format!("field {} not found", field_name))?; - let field_type = faceted_fields.get(&field_id) - .with_context(|| format!("field {} is not faceted", field_name))?; + + if !faceted_fields.contains(&field_id) { + anyhow::bail!("field {} is not faceted", field_name); + } let stdout = io::stdout(); let mut wtr = csv::Writer::from_writer(stdout.lock()); - wtr.write_record(&["facet_value", "facet_level", "documents_count", "documents_ids"])?; - let db = index.facet_field_id_value_docids; - let iter = facet_values_iter( - rtxn, - db, - field_id, - *field_type, - |key| (0, key.to_owned()), - facet_number_value_to_string, - )?; - - for result in iter { - let ((level, value), docids) = result?; - let count = docids.len(); - let docids = if debug { - format!("{:?}", docids) - } else { - format!("{:?}", docids.iter().collect::>()) - }; - wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?; + match facet_type { + FacetType::Number => { + wtr.write_record(&["facet_number", "facet_level", "documents_count", "documents_ids"])?; + for result in facet_values_iter(rtxn, index.facet_id_f64_docids, field_id)? { + let ((_fid, level, left, right), docids) = result?; + let value = facet_number_value_to_string(level, left, right).1; + let count = docids.len(); + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + wtr.write_record(&[value, level.to_string(), count.to_string(), docids])?; + } + }, + FacetType::String => { + wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?; + for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? { + let ((_fid, value), docids) = result?; + let count = docids.len(); + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + wtr.write_record(&[value.to_string(), count.to_string(), docids])?; + } + } } Ok(wtr.flush()?) @@ -684,31 +709,24 @@ fn docids_words_positions( Ok(wtr.flush()?) 
} -fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { +fn facet_number_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { let fields_ids_map = index.fields_ids_map(&rtxn)?; let faceted_fields = index.faceted_fields_ids(&rtxn)?; let field_id = fields_ids_map.id(&field_name) .with_context(|| format!("field {} not found", field_name))?; - let field_type = faceted_fields.get(&field_id) - .with_context(|| format!("field {} is not faceted", field_name))?; - let db = index.facet_field_id_value_docids; - let iter = facet_values_iter( - rtxn, - db, - field_id, - *field_type, - |_key| 0u8, - |level, _left, _right| level, - )?; + if !faceted_fields.contains(&field_id) { + anyhow::bail!("field {} is not faceted", field_name); + } + let iter = facet_values_iter(rtxn, index.facet_id_f64_docids, field_id)?; println!("The database {:?} facet stats", field_name); let mut level_size = 0; let mut current_level = None; for result in iter { - let (level, _) = result?; + let ((_fid, level, _left, _right), _) = result?; if let Some(current) = current_level { if current != level { println!("\tnumber of groups at level {}: {}", current, level_size); @@ -843,7 +861,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a use heed::types::ByteSlice; let Index { - env: _, + env: _env, main, word_docids, word_prefix_docids, @@ -852,8 +870,10 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a word_prefix_pair_proximity_docids, word_level_position_docids, word_prefix_level_position_docids, - facet_field_id_value_docids, - field_id_docid_facet_values, + facet_id_f64_docids, + facet_id_string_docids, + field_id_docid_facet_f64s, + field_id_docid_facet_strings, documents, } = index; @@ -873,8 +893,11 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec) -> a WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(), WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(), WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(), - FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => facet_field_id_value_docids.as_polymorph(), - FIELD_ID_DOCID_FACET_VALUES_DB_NAME => field_id_docid_facet_values.as_polymorph(), + FACET_ID_F64_DOCIDS_DB_NAME => facet_id_f64_docids.as_polymorph(), + FACET_ID_STRING_DOCIDS_DB_NAME => facet_id_string_docids.as_polymorph(), + FIELD_ID_DOCID_FACET_F64S_DB_NAME => field_id_docid_facet_f64s.as_polymorph(), + FIELD_ID_DOCID_FACET_STRINGS_DB_NAME => field_id_docid_facet_strings.as_polymorph(), + DOCUMENTS_DB_NAME => documents.as_polymorph(), unknown => anyhow::bail!("unknown database {:?}", unknown), }; From 5012cc3a32ffd7345ec929b74484b9216cc20db2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 4 May 2021 12:09:43 +0200 Subject: [PATCH 13/14] Fix the http-ui crate to support split facet databases --- http-ui/src/main.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 00618f58a..7d51098c3 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -30,7 +30,6 @@ use warp::{Filter, http::Response}; use warp::filters::ws::Message; use milli::{FacetCondition, Index, MatchingWords, obkv_to_json, SearchResult, UpdateStore}; -use milli::facet::FacetValue; use milli::update::{IndexDocumentsMethod, Setting, UpdateBuilder, UpdateFormat}; use milli::update::UpdateIndexingStep::*; @@ -252,7 +251,7 @@ struct 
Settings { searchable_attributes: Setting>, #[serde(default, skip_serializing_if = "Setting::is_not_set")] - faceted_attributes: Setting>, + faceted_attributes: Setting>, #[serde(default, skip_serializing_if = "Setting::is_not_set")] criteria: Setting>, @@ -671,7 +670,7 @@ async fn main() -> anyhow::Result<()> { struct Answer { documents: Vec>, number_of_candidates: u64, - facets: BTreeMap>, + facets: BTreeMap>, } let disable_highlighting = opt.disable_highlighting; @@ -985,7 +984,7 @@ async fn main() -> anyhow::Result<()> { #[cfg(test)] mod tests { - use maplit::{btreeset,hashmap}; + use maplit::{btreeset,hashmap, hashset}; use serde_test::{assert_tokens, Token}; use milli::update::Setting; @@ -997,10 +996,10 @@ mod tests { let settings = Settings { displayed_attributes: Setting::Set(vec!["name".to_string()]), searchable_attributes: Setting::Set(vec!["age".to_string()]), - faceted_attributes: Setting::Set(hashmap! { "age".into() => "integer".into() }), + faceted_attributes: Setting::Set(hashset!{ "age".to_string() }), criteria: Setting::Set(vec!["asc(age)".to_string()]), stop_words: Setting::Set(btreeset! { "and".to_string() }), - synonyms: Setting::Set(hashmap! { "alex".to_string() => vec!["alexey".to_string()] }) + synonyms: Setting::Set(hashmap!{ "alex".to_string() => vec!["alexey".to_string()] }) }; assert_tokens(&settings, &[ From 1c0a5cd136146e0b290445ed16130e48014402d8 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 31 May 2021 15:22:50 +0200 Subject: [PATCH 14/14] Resolve code modification suggestions --- milli/src/search/facet/facet_condition.rs | 2 -- milli/src/update/facets.rs | 1 - milli/src/update/index_documents/store.rs | 1 - 3 files changed, 4 deletions(-) diff --git a/milli/src/search/facet/facet_condition.rs b/milli/src/search/facet/facet_condition.rs index e7917df97..fd7053269 100644 --- a/milli/src/search/facet/facet_condition.rs +++ b/milli/src/search/facet/facet_condition.rs @@ -240,9 +240,7 @@ impl FacetCondition { let value = items.next().unwrap(); let (result, svalue) = pest_parse(value); - // TODO we must normalize instead of lowercase. let svalue = svalue.to_lowercase(); - Ok(Operator(fid, Equal(result.ok(), svalue))) } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index af72133a2..f0eab6023 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -98,7 +98,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?; self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?; - // Store the write_into_lmdb_database( self.wtxn, *self.index.facet_id_f64_docids.as_polymorph(), diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index afc199293..4f65d77e1 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -816,7 +816,6 @@ fn extract_facet_values(value: &Value) -> (Vec, Vec) { output_numbers.push(float); }, Value::String(string) => { - // TODO must be normalized and not only lowercased. let string = string.trim().to_lowercase(); output_strings.push(string); },
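
Both call sites touched by this final patch prepare facet strings before they reach the new facet_id_string_docids database: the document extractor trims and lowercases each value, and the filter parser lowercases the parsed value so that lookups match what was indexed. The sketch below illustrates that shared behaviour; the normalize_facet_string helper is illustrative only and does not exist anywhere in these patches.

    // Sketch of how facet strings end up keyed in facet_id_string_docids after
    // these patches: values are trimmed and lowercased at extraction time, and
    // filter values are lowercased at parse time so both sides agree.
    // `normalize_facet_string` is a hypothetical helper, not part of the patch set.
    fn normalize_facet_string(raw: &str) -> String {
        raw.trim().to_lowercase()
    }

    fn main() {
        // A filter like `channel = Ponce` and a document field "  Ponce "
        // both resolve to the same stored key, "ponce".
        assert_eq!(normalize_facet_string("  Ponce "), "ponce");
        assert_eq!(normalize_facet_string("GoTaGa"), "gotaga");
    }

Lowercasing is the only normalization applied at this point; the TODO comments removed in this commit had flagged fuller normalization as future work.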