From 713acc408be998c4d6aa33e5c2a2114bfbc90514 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 15 Jun 2021 13:45:20 +0200 Subject: [PATCH] Introduce the primary key to the Settings builder structure --- benchmarks/benches/utils.rs | 8 +- milli/src/error.rs | 8 ++ milli/src/index.rs | 30 +++---- milli/src/update/index_documents/transform.rs | 2 +- milli/src/update/settings.rs | 89 ++++++++++++++++++- milli/tests/search/query_criteria.rs | 7 +- 6 files changed, 121 insertions(+), 23 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 5138de4d2..d5181849f 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -55,15 +55,15 @@ pub fn base_setup(conf: &Conf) -> Index { options.map_size(100 * 1024 * 1024 * 1024); // 100 GB options.max_readers(10); let index = Index::new(options, conf.database_name).unwrap(); - if let Some(primary_key) = conf.primary_key { - let mut wtxn = index.write_txn().unwrap(); - index.put_primary_key(&mut wtxn, primary_key).unwrap(); - } let update_builder = UpdateBuilder::new(0); let mut wtxn = index.write_txn().unwrap(); let mut builder = update_builder.settings(&mut wtxn, &index); + if let Some(primary_key) = conf.primary_key { + builder.set_primary_key(primary_key.to_string()); + } + if let Some(criterion) = conf.criterion { builder.reset_filterable_fields(); builder.reset_criteria(); diff --git a/milli/src/error.rs b/milli/src/error.rs index 5a8dfc90b..19f9c364c 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -60,6 +60,8 @@ pub enum UserError { MissingDocumentId { document: Object }, MissingPrimaryKey, NoSpaceLeftOnDevice, + PrimaryKeyCannotBeChanged, + PrimaryKeyCannotBeReset, SerdeJson(serde_json::Error), UnknownInternalDocumentId { document_id: DocumentId }, } @@ -211,6 +213,12 @@ impl fmt::Display for UserError { // TODO where can we find it instead of writing the text ourselves? Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"), Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), + Self::PrimaryKeyCannotBeChanged => { + f.write_str("primary key cannot be changed if the database contains documents") + }, + Self::PrimaryKeyCannotBeReset => { + f.write_str("primary key cannot be reset if the database contains documents") + }, Self::SerdeJson(error) => error.fmt(f), Self::UnknownInternalDocumentId { document_id } => { write!(f, "an unknown internal document id have been used ({})", document_id) diff --git a/milli/src/index.rs b/milli/src/index.rs index 02a1f9d58..bf4b3e023 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -184,7 +184,7 @@ impl Index { /* documents ids */ /// Writes the documents ids that corresponds to the user-ids-documents-ids FST. - pub fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> { + pub(crate) fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> { self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids) } @@ -202,7 +202,7 @@ impl Index { /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. - pub fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { + pub(crate) fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { self.set_updated_at(wtxn, &Utc::now())?; self.main.put::<_, Str, Str>(wtxn, main_key::PRIMARY_KEY_KEY, &primary_key) } @@ -220,7 +220,7 @@ impl Index { /* external documents ids */ /// Writes the external documents ids and internal ids (i.e. `u32`). - pub fn put_external_documents_ids<'a>( + pub(crate) fn put_external_documents_ids<'a>( &self, wtxn: &mut RwTxn, external_documents_ids: &ExternalDocumentsIds<'a>, @@ -254,7 +254,7 @@ impl Index { /// Writes the fields ids map which associate the documents keys with an internal field id /// (i.e. `u8`), this field id is used to identify fields in the obkv documents. - pub fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { + pub(crate) fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map) } @@ -271,7 +271,7 @@ impl Index { /// Writes the fields distribution which associates every field name with /// the number of times it occurs in the documents. - pub fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { + pub(crate) fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson>(wtxn, main_key::FIELDS_DISTRIBUTION_KEY, distribution) } @@ -288,7 +288,7 @@ impl Index { /// Writes the fields that must be displayed in the defined order. /// There must be not be any duplicate field id. - pub fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { + pub(crate) fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::DISPLAYED_FIELDS_KEY, &fields) } @@ -328,7 +328,7 @@ impl Index { /* searchable fields */ /// Writes the searchable fields, when this list is specified, only these are indexed. - pub fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { + pub(crate) fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::SEARCHABLE_FIELDS_KEY, &fields) } @@ -367,7 +367,7 @@ impl Index { /* filterable fields */ /// Writes the filterable fields names in the database. - pub fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet) -> heed::Result<()> { + pub(crate) fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields) } @@ -453,7 +453,7 @@ impl Index { /* faceted documents ids */ /// Writes the documents ids that are faceted with numbers under this field id. - pub fn put_number_faceted_documents_ids( + pub(crate) fn put_number_faceted_documents_ids( &self, wtxn: &mut RwTxn, field_id: FieldId, @@ -485,7 +485,7 @@ impl Index { } /// Writes the documents ids that are faceted with strings under this field id. - pub fn put_string_faceted_documents_ids( + pub(crate) fn put_string_faceted_documents_ids( &self, wtxn: &mut RwTxn, field_id: FieldId, @@ -532,7 +532,7 @@ impl Index { /* criteria */ - pub fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { + pub(crate) fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria) } @@ -550,7 +550,7 @@ impl Index { /* words fst */ /// Writes the FST which is the words dictionary of the engine. - pub fn put_words_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + pub(crate) fn put_words_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes()) } @@ -564,7 +564,7 @@ impl Index { /* stop words */ - pub fn put_stop_words>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + pub(crate) fn put_stop_words>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes()) } @@ -581,7 +581,7 @@ impl Index { /* synonyms */ - pub fn put_synonyms( + pub(crate) fn put_synonyms( &self, wtxn: &mut RwTxn, synonyms: &HashMap, Vec>>, @@ -611,7 +611,7 @@ impl Index { /* words prefixes fst */ /// Writes the FST which is the words prefixes dictionnary of the engine. - pub fn put_words_prefixes_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { + pub(crate) fn put_words_prefixes_fst>(&self, wtxn: &mut RwTxn, fst: &fst::Set) -> heed::Result<()> { self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes()) } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index c44130d7e..9e88559d0 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -375,7 +375,7 @@ impl Transform<'_, '_> { // Once we have sort and deduplicated the documents we write them into a final file. let mut final_sorter = create_sorter( - |_id, _obkvs| Err(InternalError::IndexingMergingKeys { process: "merging documents" }), + |_id, _obkvs| Err(InternalError::IndexingMergingKeys { process: "documents" }), self.chunk_compression_type, self.chunk_compression_level, self.chunk_fusing_shrink_size, diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 1756a21c9..39cb27c00 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -72,6 +72,7 @@ pub struct Settings<'a, 't, 'u, 'i> { stop_words: Setting>, distinct_field: Setting, synonyms: Setting>>, + primary_key: Setting, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -98,6 +99,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { stop_words: Setting::NotSet, distinct_field: Setting::NotSet, synonyms: Setting::NotSet, + primary_key: Setting::NotSet, update_id, } } @@ -166,6 +168,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + pub fn reset_primary_key(&mut self) { + self.primary_key = Setting::Reset; + } + + pub fn set_primary_key(&mut self, primary_key: String) { + self.primary_key = Setting::Set(primary_key); + } + fn reindex(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> where F: Fn(UpdateIndexingStep, u64) + Sync @@ -423,6 +433,31 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { Ok(()) } + fn update_primary_key(&mut self) -> Result<()> { + match self.primary_key { + Setting::Set(ref primary_key) => { + if self.index.number_of_documents(&self.wtxn)? == 0 { + let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?; + self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + self.index.put_primary_key(self.wtxn, primary_key)?; + Ok(()) + } else { + Err(UserError::PrimaryKeyCannotBeChanged.into()) + } + }, + Setting::Reset => { + if self.index.number_of_documents(&self.wtxn)? == 0 { + self.index.delete_primary_key(self.wtxn)?; + Ok(()) + } else { + Err(UserError::PrimaryKeyCannotBeReset.into()) + } + }, + Setting::NotSet => Ok(()), + } + } + pub fn execute(mut self, progress_callback: F) -> Result<()> where F: Fn(UpdateIndexingStep, u64) + Sync @@ -436,6 +471,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.update_filterable()?; self.update_distinct_field()?; self.update_criteria()?; + self.update_primary_key()?; // If there is new faceted fields we indicate that we must reindex as we must // index new fields as facets. It means that the distinct attribute, @@ -462,8 +498,9 @@ mod tests { use maplit::{btreeset, hashmap, hashset}; use big_s::S; - use crate::{Criterion, FilterCondition, SearchResult}; + use crate::error::Error; use crate::update::{IndexDocuments, UpdateFormat}; + use crate::{Criterion, FilterCondition, SearchResult}; use super::*; @@ -977,4 +1014,54 @@ mod tests { let rtxn = index.read_txn().unwrap(); FilterCondition::from_str(&rtxn, &index, "toto = 32").unwrap_err(); } + + #[test] + fn setting_primary_key() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the primary key settings + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_primary_key(S("mykey")); + + builder.execute(|_, _| ()).unwrap(); + assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey")); + + // Then index some documents with the "mykey" primary key. + let content = &br#"[ + { "mykey": 1, "name": "kevin", "age": 23 }, + { "mykey": 2, "name": "kevina", "age": 21 }, + { "mykey": 3, "name": "benoit", "age": 34 }, + { "mykey": 4, "name": "bernard", "age": 34 }, + { "mykey": 5, "name": "bertrand", "age": 34 }, + { "mykey": 6, "name": "bernie", "age": 34 }, + { "mykey": 7, "name": "ben", "age": 34 } + ]"#[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index, 1); + builder.update_format(UpdateFormat::Json); + builder.disable_autogenerate_docids(); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // We now try to reset the primary key + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.reset_primary_key(); + + let err = builder.execute(|_, _| ()).unwrap_err(); + assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeReset))); + + // But if we clear the database... + let mut wtxn = index.write_txn().unwrap(); + let builder = ClearDocuments::new(&mut wtxn, &index, 0); + builder.execute().unwrap(); + + // ...we can change the primary key + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_primary_key(S("myid")); + builder.execute(|_, _| ()).unwrap(); + } } diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index f0eecfaba..2b9c5ae5e 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -1,5 +1,6 @@ -use milli::{Search, SearchResult, Criterion}; use big_s::S; +use milli::update::Settings; +use milli::{Search, SearchResult, Criterion}; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; use Criterion::*; @@ -189,7 +190,9 @@ fn criteria_mixup() { eprintln!("Testing with criteria order: {:?}", &criteria); //update criteria let mut wtxn = index.write_txn().unwrap(); - index.put_criteria(&mut wtxn, &criteria).unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_criteria(criteria.iter().map(ToString::to_string).collect()); + builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); let mut rtxn = index.read_txn().unwrap();