From 27c7ab6e001826bdd233c8adfea2e2b3f3384394 Mon Sep 17 00:00:00 2001 From: Alexey Shekhirin Date: Wed, 31 Mar 2021 18:14:23 +0300 Subject: [PATCH] feat(index): store fields distribution in index --- Cargo.lock | 6 +-- milli/src/index.rs | 42 +++++++++---------- milli/src/update/index_documents/mod.rs | 4 ++ milli/src/update/index_documents/transform.rs | 17 ++++++++ milli/src/update/settings.rs | 2 +- 5 files changed, 45 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8f296f2fa..91e72450a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1520,7 +1520,8 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "pest" version = "2.1.3" -source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" dependencies = [ "ucd-trie", ] @@ -1528,8 +1529,7 @@ dependencies = [ [[package]] name = "pest" version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" +source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" dependencies = [ "ucd-trie", ] diff --git a/milli/src/index.rs b/milli/src/index.rs index 642ad4ab7..2b364b068 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -23,6 +23,7 @@ pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; +pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; pub 
const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; @@ -33,6 +34,8 @@ pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; const CREATED_AT_KEY: &str = "created-at"; const UPDATED_AT_KEY: &str = "updated-at"; +pub type FieldsDistribution = HashMap<String, u64>; + #[derive(Clone)] pub struct Index { /// The LMDB environment which this index is associated with. @@ -204,23 +207,18 @@ impl Index { Ok(self.main.get::<_, Str, SerdeJson<FieldsIdsMap>>(rtxn, FIELDS_IDS_MAP_KEY)?.unwrap_or_default()) } - /* fields ids distribution */ + /* fields distribution */ - /// Returns the fields ids distribution which associate the internal field ids - /// with the number of times it occurs in the obkv documents. - // TODO store in the index itself and change only within updates that modify the documents - pub fn fields_ids_distribution(&self, rtxn: &RoTxn) -> anyhow::Result<HashMap<FieldId, u64>> { - let mut distribution = HashMap::new(); + /// Writes the fields distribution which associate the field with the number of times + /// it occurs in the obkv documents. + pub fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, FIELDS_DISTRIBUTION_KEY, &distribution) + } - for document in self.documents.iter(rtxn)? { - let (_, obkv) = document?; - - for (field_id, _) in obkv.iter() { - *distribution.entry(field_id).or_default() += 1; - } - } - - Ok(distribution) + /// Returns the fields distribution which associate the field with the number of times + /// it occurs in the obkv documents. 
+ pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> { + Ok(self.main.get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, FIELDS_DISTRIBUTION_KEY)?.unwrap_or_default()) } /* displayed fields */ @@ -469,6 +467,7 @@ impl Index { #[cfg(test)] mod tests { use heed::EnvOpenOptions; + use maplit::hashmap; use crate::Index; use crate::update::{IndexDocuments, UpdateFormat}; @@ -493,16 +492,15 @@ mod tests { } #[test] - fn fields_ids_distribution() { + fn initial_fields_distribution() { let index = prepare_index(); let rtxn = index.read_txn().unwrap(); - let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); - - let fields_ids_distribution = index.fields_ids_distribution(&rtxn).unwrap(); - assert_eq!(fields_ids_distribution.len(), 2); - assert_eq!(fields_ids_distribution.get(&fields_ids_map.id("age").unwrap()), Some(&1)); - assert_eq!(fields_ids_distribution.get(&fields_ids_map.id("name").unwrap()), Some(&2)); + let fields_distribution = index.fields_distribution(&rtxn).unwrap(); + assert_eq!(fields_distribution, hashmap!{ + "age".to_string() => 1, + "name".to_string() => 2 + }); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index f4a7c7f25..fb1a2d6c0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -358,6 +358,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let TransformOutput { primary_key, fields_ids_map, + fields_distribution, external_documents_ids, new_documents_ids, replaced_documents_ids, @@ -551,6 +552,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // We write the fields ids map into the main database self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + // We write the fields distribution into the main database + self.index.put_fields_distribution(self.wtxn, &fields_distribution)?; + // We write the primary key field id into the main database self.index.put_primary_key(self.wtxn, &primary_key)?; diff --git 
a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index b22cd14c6..f8aac60d7 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -14,12 +14,14 @@ use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use super::merge_function::merge_two_obkvs; use super::{create_writer, create_sorter, IndexDocumentsMethod}; +use crate::index::FieldsDistribution; const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; pub struct TransformOutput { pub primary_key: String, pub fields_ids_map: FieldsIdsMap, + pub fields_distribution: FieldsDistribution, pub external_documents_ids: ExternalDocumentsIds<'static>, pub new_documents_ids: RoaringBitmap, pub replaced_documents_ids: RoaringBitmap, @@ -74,6 +76,7 @@ impl Transform<'_, '_> { F: Fn(UpdateIndexingStep) + Sync, { let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; + let mut fields_distribution = self.index.fields_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); // Deserialize the whole batch of documents in memory. @@ -103,6 +106,7 @@ impl Transform<'_, '_> { return Ok(TransformOutput { primary_key, fields_ids_map, + fields_distribution, external_documents_ids: ExternalDocumentsIds::default(), new_documents_ids: RoaringBitmap::new(), replaced_documents_ids: RoaringBitmap::new(), @@ -148,6 +152,8 @@ impl Transform<'_, '_> { // We prepare the fields ids map with the documents keys. 
for (key, _value) in &document { fields_ids_map.insert(&key).context("field id limit reached")?; + + *fields_distribution.entry(key.to_owned()).or_default() += 1; } // We retrieve the user id from the document based on the primary key name, @@ -200,6 +206,7 @@ impl Transform<'_, '_> { sorter, primary_key, fields_ids_map, + fields_distribution, documents_count, external_documents_ids, progress_callback, @@ -212,6 +219,7 @@ impl Transform<'_, '_> { F: Fn(UpdateIndexingStep) + Sync, { let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?; + let mut fields_distribution = self.index.fields_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap(); let mut csv = csv::Reader::from_reader(reader); @@ -307,6 +315,10 @@ impl Transform<'_, '_> { json_buffer.clear(); serde_json::to_writer(&mut json_buffer, &field)?; writer.insert(*field_id, &json_buffer)?; + + let field_name = fields_ids_map.name(*field_id).unwrap(); + + *fields_distribution.entry(field_name.to_string()).or_default() += 1; } // We use the extracted/generated user id as the key for this document. 
@@ -328,6 +340,7 @@ impl Transform<'_, '_> { sorter, primary_key_name, fields_ids_map, + fields_distribution, documents_count, external_documents_ids, progress_callback, @@ -342,6 +355,7 @@ impl Transform<'_, '_> { sorter: grenad::Sorter<MergeFn>, primary_key: String, fields_ids_map: FieldsIdsMap, + fields_distribution: FieldsDistribution, approximate_number_of_documents: usize, mut external_documents_ids: ExternalDocumentsIds<'_>, progress_callback: F, @@ -439,6 +453,7 @@ impl Transform<'_, '_> { Ok(TransformOutput { primary_key, fields_ids_map, + fields_distribution, external_documents_ids: external_documents_ids.into_static(), new_documents_ids, replaced_documents_ids, @@ -457,6 +472,7 @@ impl Transform<'_, '_> { new_fields_ids_map: FieldsIdsMap, ) -> anyhow::Result<TransformOutput> { + let fields_distribution = self.index.fields_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; let documents_ids = self.index.documents_ids(self.rtxn)?; let documents_count = documents_ids.len() as usize; @@ -492,6 +508,7 @@ impl Transform<'_, '_> { Ok(TransformOutput { primary_key, fields_ids_map: new_fields_ids_map, + fields_distribution, external_documents_ids: external_documents_ids.into_static(), new_documents_ids: documents_ids, replaced_documents_ids: RoaringBitmap::default(), diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 451447102..45a4c204c 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -183,7 +183,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(true) } - /// Udpates the index's searchable attributes. This causes the field map to be recomputed to + /// Updates the index's searchable attributes. This causes the field map to be recomputed to /// reflect the order of the searchable attributes. fn update_searchable(&mut self) -> anyhow::Result<bool> { match self.searchable_fields {