From 3c304c89d4885fd3fd0cbef58826bf0fa3aace80 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 1 Jun 2021 16:29:14 +0200 Subject: [PATCH] Make sure that we generate the faceted database when required --- milli/src/index.rs | 18 ++-- milli/src/search/distinct/mod.rs | 2 +- milli/src/search/mod.rs | 2 +- milli/src/update/facets.rs | 16 ++-- milli/src/update/settings.rs | 139 ++++++++++++++++++++++++++----- 5 files changed, 135 insertions(+), 42 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 9cfcd841c..4e32f673a 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -23,7 +23,7 @@ use crate::fields_ids_map::FieldsIdsMap; pub const CRITERIA_KEY: &str = "criteria"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; -pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key"; +pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; @@ -365,7 +365,7 @@ impl Index { /// Faceted fields are the union of all the filterable, distinct, and Asc/Desc fields. pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result> { let filterable_fields = self.filterable_fields(rtxn)?; - let distinct_field = self.distinct_attribute(rtxn)?; + let distinct_field = self.distinct_field(rtxn)?; let asc_desc_fields = self.criteria(rtxn)? .into_iter() .filter_map(|criterion| match criterion { @@ -465,18 +465,18 @@ impl Index { } } - /* Distinct attribute */ + /* distinct field */ - pub(crate) fn put_distinct_attribute(&self, wtxn: &mut RwTxn, distinct_attribute: &str) -> heed::Result<()> { - self.main.put::<_, Str, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY, distinct_attribute) + pub(crate) fn put_distinct_field(&self, wtxn: &mut RwTxn, distinct_field: &str) -> heed::Result<()> { + self.main.put::<_, Str, Str>(wtxn, DISTINCT_FIELD_KEY, distinct_field) } - pub fn distinct_attribute<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { - self.main.get::<_, Str, Str>(rtxn, DISTINCT_ATTRIBUTE_KEY) + pub fn distinct_field<'a>(&self, rtxn: &'a RoTxn) -> heed::Result> { + self.main.get::<_, Str, Str>(rtxn, DISTINCT_FIELD_KEY) } - pub(crate) fn delete_distinct_attribute(&self, wtxn: &mut RwTxn) -> heed::Result { - self.main.delete::<_, Str>(wtxn, DISTINCT_ATTRIBUTE_KEY) + pub(crate) fn delete_distinct_field(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, DISTINCT_FIELD_KEY) } /* criteria */ diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 945beb7e6..1b7c69c7a 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -79,7 +79,7 @@ mod test { // set distinct and faceted attributes for the index. let builder = UpdateBuilder::new(0); let mut update = builder.settings(&mut txn, &index); - update.set_distinct_attribute(distinct.to_string()); + update.set_distinct_field(distinct.to_string()); update.execute(|_, _| ()).unwrap(); // add documents to the index diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index c152d47a4..872ebfca6 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -136,7 +136,7 @@ impl<'a> Search<'a> { let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; let criteria = criteria_builder.build(query_tree, primitive_query, filtered_candidates)?; - match self.index.distinct_attribute(self.rtxn)? { + match self.index.distinct_field(self.rtxn)? { None => self.perform_sort(NoopDistinct, matching_words, criteria), Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 1c235a509..f0eab6023 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -57,14 +57,14 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { pub fn execute(self) -> anyhow::Result<()> { self.index.set_updated_at(self.wtxn, &Utc::now())?; - // We get the filterable fields to be able to create the facet levels. - let filterable_fields = self.index.filterable_fields_ids(self.wtxn)?; + // We get the faceted fields to be able to create the facet levels. + let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - for field_id in filterable_fields { - // Compute and store the filterable strings documents ids. - let string_documents_ids = compute_filterable_documents_ids( + for field_id in faceted_fields { + // Compute and store the faceted strings documents ids. + let string_documents_ids = compute_faceted_documents_ids( self.wtxn, self.index.facet_id_string_docids.remap_key_type::(), field_id, @@ -77,8 +77,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; - // Compute and store the filterable numbers documents ids. - let number_documents_ids = compute_filterable_documents_ids( + // Compute and store the faceted numbers documents ids. + let number_documents_ids = compute_faceted_documents_ids( self.wtxn, self.index.facet_id_f64_docids.remap_key_type::(), field_id, @@ -191,7 +191,7 @@ fn compute_facet_number_levels<'t>( writer_into_reader(writer, shrink_size) } -fn compute_filterable_documents_ids( +fn compute_faceted_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, field_id: u8, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index eba14a17c..ef32c5c44 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -68,7 +68,7 @@ pub struct Settings<'a, 't, 'u, 'i> { filterable_fields: Setting>, criteria: Setting>, stop_words: Setting>, - distinct_attribute: Setting, + distinct_field: Setting, synonyms: Setting>>, } @@ -94,7 +94,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { filterable_fields: Setting::NotSet, criteria: Setting::NotSet, stop_words: Setting::NotSet, - distinct_attribute: Setting::NotSet, + distinct_field: Setting::NotSet, synonyms: Setting::NotSet, update_id, } @@ -144,12 +144,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } - pub fn reset_distinct_attribute(&mut self) { - self.distinct_attribute = Setting::Reset; + pub fn reset_distinct_field(&mut self) { + self.distinct_field = Setting::Reset; } - pub fn set_distinct_attribute(&mut self, distinct_attribute: String) { - self.distinct_attribute = Setting::Set(distinct_attribute); + pub fn set_distinct_field(&mut self, distinct_field: String) { + self.distinct_field = Setting::Set(distinct_field); } pub fn reset_synonyms(&mut self) { @@ -165,8 +165,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> anyhow::Result<()> - where - F: Fn(UpdateIndexingStep, u64) + Sync + where + F: Fn(UpdateIndexingStep, u64) + Sync { let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let update_id = self.update_id; @@ -197,7 +197,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { let output = transform.remap_index_documents( primary_key.to_string(), old_fields_ids_map, - fields_ids_map.clone())?; + fields_ids_map.clone(), + )?; // We clear the full database (words-fst, documents ids and documents content). ClearDocuments::new(self.wtxn, self.index, self.update_id).execute()?; @@ -214,6 +215,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; indexing_builder.thread_pool = self.thread_pool; indexing_builder.execute_raw(output, &cb)?; + Ok(()) } @@ -242,18 +244,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(true) } - fn update_distinct_attribute(&mut self) -> anyhow::Result { - match self.distinct_attribute { + fn update_distinct_field(&mut self) -> anyhow::Result { + match self.distinct_field { Setting::Set(ref attr) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; fields_ids_map .insert(attr) .context("field id limit exceeded")?; - self.index.put_distinct_attribute(self.wtxn, &attr)?; + self.index.put_distinct_field(self.wtxn, &attr)?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } - Setting::Reset => { self.index.delete_distinct_attribute(self.wtxn)?; }, + Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; }, Setting::NotSet => return Ok(false), } Ok(true) @@ -380,7 +382,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } - fn update_facets(&mut self) -> anyhow::Result { + fn update_filterable(&mut self) -> anyhow::Result<()> { match self.filterable_fields { Setting::Set(ref fields) => { let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; @@ -393,9 +395,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; } Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; } - Setting::NotSet => return Ok(false) + Setting::NotSet => (), } - Ok(true) + Ok(()) } fn update_criteria(&mut self) -> anyhow::Result<()> { @@ -419,20 +421,29 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { F: Fn(UpdateIndexingStep, u64) + Sync { self.index.set_updated_at(self.wtxn, &Utc::now())?; + + let old_faceted_fields = self.index.faceted_fields(&self.wtxn)?; let old_fields_ids_map = self.index.fields_ids_map(&self.wtxn)?; + self.update_displayed()?; - let stop_words_updated = self.update_stop_words()?; - let facets_updated = self.update_facets()?; - self.update_distinct_attribute()?; - // update_criteria MUST be called after update_facets, since criterion fields must be set - // as facets. + self.update_filterable()?; + self.update_distinct_field()?; self.update_criteria()?; + + // If there is new faceted fields we indicate that we must reindex as we must + // index new fields as facets. It means that the distinct attribute, + // an Asc/Desc criterion or a filtered attribute as be added or removed. + let new_faceted_fields = self.index.faceted_fields(&self.wtxn)?; + let faceted_updated = old_faceted_fields != new_faceted_fields; + + let stop_words_updated = self.update_stop_words()?; let synonyms_updated = self.update_synonyms()?; let searchable_updated = self.update_searchable()?; - if stop_words_updated || facets_updated || synonyms_updated || searchable_updated { + if stop_words_updated || faceted_updated || synonyms_updated || searchable_updated { self.reindex(&progress_callback, old_fields_ids_map)?; } + Ok(()) } } @@ -444,7 +455,7 @@ mod tests { use maplit::{btreeset, hashmap, hashset}; use big_s::S; - use crate::{Criterion, FilterCondition}; + use crate::{Criterion, FilterCondition, SearchResult}; use crate::update::{IndexDocuments, UpdateFormat}; use super::*; @@ -669,6 +680,88 @@ mod tests { assert_eq!(count, 4); } + #[test] + fn set_asc_desc_field() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the age. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + // Don't display the generated `id` field. + builder.set_displayed_fields(vec![S("name"), S("age")]); + builder.set_criteria(vec![S("asc(age)")]); + builder.execute(|_, _| ()).unwrap(); + + // Then index some documents. + let content = &br#"[ + { "name": "kevin", "age": 23 }, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.update_format(UpdateFormat::Json); + builder.enable_autogenerate_docids(); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Run an empty query just to ensure that the search results are ordered. + let rtxn = index.read_txn().unwrap(); + let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap(); + let documents = index.documents(&rtxn, documents_ids).unwrap(); + + // Fetch the documents "age" field in the ordre in which the documents appear. + let age_field_id = index.fields_ids_map(&rtxn).unwrap().id("age").unwrap(); + let iter = documents.into_iter().map(|(_, doc)| { + let bytes = doc.get(age_field_id).unwrap(); + let string = std::str::from_utf8(bytes).unwrap(); + string.parse::().unwrap() + }); + + assert_eq!(iter.collect::>(), vec![21, 23, 34]); + } + + #[test] + fn set_distinct_field() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the age. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + // Don't display the generated `id` field. + builder.set_displayed_fields(vec![S("name"), S("age")]); + builder.set_distinct_field(S("age")); + builder.execute(|_, _| ()).unwrap(); + + // Then index some documents. + let content = &br#"[ + { "name": "kevin", "age": 23 }, + { "name": "kevina", "age": 21 }, + { "name": "benoit", "age": 34 }, + { "name": "bernard", "age": 34 }, + { "name": "bertrand", "age": 34 }, + { "name": "bernie", "age": 34 }, + { "name": "ben", "age": 34 } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.update_format(UpdateFormat::Json); + builder.enable_autogenerate_docids(); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Run an empty query just to ensure that the search results are ordered. + let rtxn = index.read_txn().unwrap(); + let SearchResult { documents_ids, .. } = index.search(&rtxn).execute().unwrap(); + + // There must be at least one document with a 34 as the age. + assert_eq!(documents_ids.len(), 3); + } + #[test] fn default_stop_words() { let path = tempfile::tempdir().unwrap();