Introduce the primary key to the Settings builder structure

This commit is contained in:
Kerollmops 2021-06-15 13:45:20 +02:00
parent a7d6930905
commit 713acc408b
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
6 changed files with 121 additions and 23 deletions

View File

@ -55,15 +55,15 @@ pub fn base_setup(conf: &Conf) -> Index {
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
options.max_readers(10); options.max_readers(10);
let index = Index::new(options, conf.database_name).unwrap(); let index = Index::new(options, conf.database_name).unwrap();
if let Some(primary_key) = conf.primary_key {
let mut wtxn = index.write_txn().unwrap();
index.put_primary_key(&mut wtxn, primary_key).unwrap();
}
let update_builder = UpdateBuilder::new(0); let update_builder = UpdateBuilder::new(0);
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
let mut builder = update_builder.settings(&mut wtxn, &index); let mut builder = update_builder.settings(&mut wtxn, &index);
if let Some(primary_key) = conf.primary_key {
builder.set_primary_key(primary_key.to_string());
}
if let Some(criterion) = conf.criterion { if let Some(criterion) = conf.criterion {
builder.reset_filterable_fields(); builder.reset_filterable_fields();
builder.reset_criteria(); builder.reset_criteria();

View File

@ -60,6 +60,8 @@ pub enum UserError {
MissingDocumentId { document: Object }, MissingDocumentId { document: Object },
MissingPrimaryKey, MissingPrimaryKey,
NoSpaceLeftOnDevice, NoSpaceLeftOnDevice,
PrimaryKeyCannotBeChanged,
PrimaryKeyCannotBeReset,
SerdeJson(serde_json::Error), SerdeJson(serde_json::Error),
UnknownInternalDocumentId { document_id: DocumentId }, UnknownInternalDocumentId { document_id: DocumentId },
} }
@ -211,6 +213,12 @@ impl fmt::Display for UserError {
// TODO where can we find it instead of writing the text ourselves? // TODO where can we find it instead of writing the text ourselves?
Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"), Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"),
Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), Self::InvalidStoreFile => f.write_str("store file is not a valid database file"),
Self::PrimaryKeyCannotBeChanged => {
f.write_str("primary key cannot be changed if the database contains documents")
},
Self::PrimaryKeyCannotBeReset => {
f.write_str("primary key cannot be reset if the database contains documents")
},
Self::SerdeJson(error) => error.fmt(f), Self::SerdeJson(error) => error.fmt(f),
Self::UnknownInternalDocumentId { document_id } => { Self::UnknownInternalDocumentId { document_id } => {
write!(f, "an unknown internal document id have been used ({})", document_id) write!(f, "an unknown internal document id have been used ({})", document_id)

View File

@ -184,7 +184,7 @@ impl Index {
/* documents ids */ /* documents ids */
/// Writes the documents ids that corresponds to the user-ids-documents-ids FST. /// Writes the documents ids that corresponds to the user-ids-documents-ids FST.
pub fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> { pub(crate) fn put_documents_ids(&self, wtxn: &mut RwTxn, docids: &RoaringBitmap) -> heed::Result<()> {
self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids) self.main.put::<_, Str, RoaringBitmapCodec>(wtxn, main_key::DOCUMENTS_IDS_KEY, docids)
} }
@ -202,7 +202,7 @@ impl Index {
/* primary key */ /* primary key */
/// Writes the documents primary key, this is the field name that is used to store the id. /// Writes the documents primary key, this is the field name that is used to store the id.
pub fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> { pub(crate) fn put_primary_key(&self, wtxn: &mut RwTxn, primary_key: &str) -> heed::Result<()> {
self.set_updated_at(wtxn, &Utc::now())?; self.set_updated_at(wtxn, &Utc::now())?;
self.main.put::<_, Str, Str>(wtxn, main_key::PRIMARY_KEY_KEY, &primary_key) self.main.put::<_, Str, Str>(wtxn, main_key::PRIMARY_KEY_KEY, &primary_key)
} }
@ -220,7 +220,7 @@ impl Index {
/* external documents ids */ /* external documents ids */
/// Writes the external documents ids and internal ids (i.e. `u32`). /// Writes the external documents ids and internal ids (i.e. `u32`).
pub fn put_external_documents_ids<'a>( pub(crate) fn put_external_documents_ids<'a>(
&self, &self,
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
external_documents_ids: &ExternalDocumentsIds<'a>, external_documents_ids: &ExternalDocumentsIds<'a>,
@ -254,7 +254,7 @@ impl Index {
/// Writes the fields ids map which associate the documents keys with an internal field id /// Writes the fields ids map which associate the documents keys with an internal field id
/// (i.e. `u8`), this field id is used to identify fields in the obkv documents. /// (i.e. `u8`), this field id is used to identify fields in the obkv documents.
pub fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> { pub(crate) fn put_fields_ids_map(&self, wtxn: &mut RwTxn, map: &FieldsIdsMap) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<FieldsIdsMap>>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map) self.main.put::<_, Str, SerdeJson<FieldsIdsMap>>(wtxn, main_key::FIELDS_IDS_MAP_KEY, map)
} }
@ -271,7 +271,7 @@ impl Index {
/// Writes the fields distribution which associates every field name with /// Writes the fields distribution which associates every field name with
/// the number of times it occurs in the documents. /// the number of times it occurs in the documents.
pub fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> { pub(crate) fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, main_key::FIELDS_DISTRIBUTION_KEY, distribution) self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, main_key::FIELDS_DISTRIBUTION_KEY, distribution)
} }
@ -288,7 +288,7 @@ impl Index {
/// Writes the fields that must be displayed in the defined order. /// Writes the fields that must be displayed in the defined order.
/// There must be not be any duplicate field id. /// There must be not be any duplicate field id.
pub fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { pub(crate) fn put_displayed_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::DISPLAYED_FIELDS_KEY, &fields) self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::DISPLAYED_FIELDS_KEY, &fields)
} }
@ -328,7 +328,7 @@ impl Index {
/* searchable fields */ /* searchable fields */
/// Writes the searchable fields, when this list is specified, only these are indexed. /// Writes the searchable fields, when this list is specified, only these are indexed.
pub fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> { pub(crate) fn put_searchable_fields(&self, wtxn: &mut RwTxn, fields: &[&str]) -> heed::Result<()> {
self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::SEARCHABLE_FIELDS_KEY, &fields) self.main.put::<_, Str, SerdeBincode<&[&str]>>(wtxn, main_key::SEARCHABLE_FIELDS_KEY, &fields)
} }
@ -367,7 +367,7 @@ impl Index {
/* filterable fields */ /* filterable fields */
/// Writes the filterable fields names in the database. /// Writes the filterable fields names in the database.
pub fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet<String>) -> heed::Result<()> { pub(crate) fn put_filterable_fields(&self, wtxn: &mut RwTxn, fields: &HashSet<String>) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields) self.main.put::<_, Str, SerdeJson<_>>(wtxn, main_key::FILTERABLE_FIELDS_KEY, fields)
} }
@ -453,7 +453,7 @@ impl Index {
/* faceted documents ids */ /* faceted documents ids */
/// Writes the documents ids that are faceted with numbers under this field id. /// Writes the documents ids that are faceted with numbers under this field id.
pub fn put_number_faceted_documents_ids( pub(crate) fn put_number_faceted_documents_ids(
&self, &self,
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
field_id: FieldId, field_id: FieldId,
@ -485,7 +485,7 @@ impl Index {
} }
/// Writes the documents ids that are faceted with strings under this field id. /// Writes the documents ids that are faceted with strings under this field id.
pub fn put_string_faceted_documents_ids( pub(crate) fn put_string_faceted_documents_ids(
&self, &self,
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
field_id: FieldId, field_id: FieldId,
@ -532,7 +532,7 @@ impl Index {
/* criteria */ /* criteria */
pub fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> { pub(crate) fn put_criteria(&self, wtxn: &mut RwTxn, criteria: &[Criterion]) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria) self.main.put::<_, Str, SerdeJson<&[Criterion]>>(wtxn, main_key::CRITERIA_KEY, &criteria)
} }
@ -550,7 +550,7 @@ impl Index {
/* words fst */ /* words fst */
/// Writes the FST which is the words dictionary of the engine. /// Writes the FST which is the words dictionary of the engine.
pub fn put_words_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> { pub(crate) fn put_words_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes()) self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_FST_KEY, fst.as_fst().as_bytes())
} }
@ -564,7 +564,7 @@ impl Index {
/* stop words */ /* stop words */
pub fn put_stop_words<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> { pub(crate) fn put_stop_words<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes()) self.main.put::<_, Str, ByteSlice>(wtxn, main_key::STOP_WORDS_KEY, fst.as_fst().as_bytes())
} }
@ -581,7 +581,7 @@ impl Index {
/* synonyms */ /* synonyms */
pub fn put_synonyms( pub(crate) fn put_synonyms(
&self, &self,
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>, synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>,
@ -611,7 +611,7 @@ impl Index {
/* words prefixes fst */ /* words prefixes fst */
/// Writes the FST which is the words prefixes dictionnary of the engine. /// Writes the FST which is the words prefixes dictionnary of the engine.
pub fn put_words_prefixes_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> { pub(crate) fn put_words_prefixes_fst<A: AsRef<[u8]>>(&self, wtxn: &mut RwTxn, fst: &fst::Set<A>) -> heed::Result<()> {
self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes()) self.main.put::<_, Str, ByteSlice>(wtxn, main_key::WORDS_PREFIXES_FST_KEY, fst.as_fst().as_bytes())
} }

View File

@ -375,7 +375,7 @@ impl Transform<'_, '_> {
// Once we have sort and deduplicated the documents we write them into a final file. // Once we have sort and deduplicated the documents we write them into a final file.
let mut final_sorter = create_sorter( let mut final_sorter = create_sorter(
|_id, _obkvs| Err(InternalError::IndexingMergingKeys { process: "merging documents" }), |_id, _obkvs| Err(InternalError::IndexingMergingKeys { process: "documents" }),
self.chunk_compression_type, self.chunk_compression_type,
self.chunk_compression_level, self.chunk_compression_level,
self.chunk_fusing_shrink_size, self.chunk_fusing_shrink_size,

View File

@ -72,6 +72,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
stop_words: Setting<BTreeSet<String>>, stop_words: Setting<BTreeSet<String>>,
distinct_field: Setting<String>, distinct_field: Setting<String>,
synonyms: Setting<HashMap<String, Vec<String>>>, synonyms: Setting<HashMap<String, Vec<String>>>,
primary_key: Setting<String>,
} }
impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
@ -98,6 +99,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
stop_words: Setting::NotSet, stop_words: Setting::NotSet,
distinct_field: Setting::NotSet, distinct_field: Setting::NotSet,
synonyms: Setting::NotSet, synonyms: Setting::NotSet,
primary_key: Setting::NotSet,
update_id, update_id,
} }
} }
@ -166,6 +168,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
} }
} }
pub fn reset_primary_key(&mut self) {
self.primary_key = Setting::Reset;
}
pub fn set_primary_key(&mut self, primary_key: String) {
self.primary_key = Setting::Set(primary_key);
}
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()> fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
where where
F: Fn(UpdateIndexingStep, u64) + Sync F: Fn(UpdateIndexingStep, u64) + Sync
@ -423,6 +433,31 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
Ok(()) Ok(())
} }
fn update_primary_key(&mut self) -> Result<()> {
match self.primary_key {
Setting::Set(ref primary_key) => {
if self.index.number_of_documents(&self.wtxn)? == 0 {
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
self.index.put_primary_key(self.wtxn, primary_key)?;
Ok(())
} else {
Err(UserError::PrimaryKeyCannotBeChanged.into())
}
},
Setting::Reset => {
if self.index.number_of_documents(&self.wtxn)? == 0 {
self.index.delete_primary_key(self.wtxn)?;
Ok(())
} else {
Err(UserError::PrimaryKeyCannotBeReset.into())
}
},
Setting::NotSet => Ok(()),
}
}
pub fn execute<F>(mut self, progress_callback: F) -> Result<()> pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
where where
F: Fn(UpdateIndexingStep, u64) + Sync F: Fn(UpdateIndexingStep, u64) + Sync
@ -436,6 +471,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.update_filterable()?; self.update_filterable()?;
self.update_distinct_field()?; self.update_distinct_field()?;
self.update_criteria()?; self.update_criteria()?;
self.update_primary_key()?;
// If there is new faceted fields we indicate that we must reindex as we must // If there is new faceted fields we indicate that we must reindex as we must
// index new fields as facets. It means that the distinct attribute, // index new fields as facets. It means that the distinct attribute,
@ -462,8 +498,9 @@ mod tests {
use maplit::{btreeset, hashmap, hashset}; use maplit::{btreeset, hashmap, hashset};
use big_s::S; use big_s::S;
use crate::{Criterion, FilterCondition, SearchResult}; use crate::error::Error;
use crate::update::{IndexDocuments, UpdateFormat}; use crate::update::{IndexDocuments, UpdateFormat};
use crate::{Criterion, FilterCondition, SearchResult};
use super::*; use super::*;
@ -977,4 +1014,54 @@ mod tests {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
FilterCondition::from_str(&rtxn, &index, "toto = 32").unwrap_err(); FilterCondition::from_str(&rtxn, &index, "toto = 32").unwrap_err();
} }
#[test]
fn setting_primary_key() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// Set the primary key settings
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_primary_key(S("mykey"));
builder.execute(|_, _| ()).unwrap();
assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey"));
// Then index some documents with the "mykey" primary key.
let content = &br#"[
{ "mykey": 1, "name": "kevin", "age": 23 },
{ "mykey": 2, "name": "kevina", "age": 21 },
{ "mykey": 3, "name": "benoit", "age": 34 },
{ "mykey": 4, "name": "bernard", "age": 34 },
{ "mykey": 5, "name": "bertrand", "age": 34 },
{ "mykey": 6, "name": "bernie", "age": 34 },
{ "mykey": 7, "name": "ben", "age": 34 }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.disable_autogenerate_docids();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
// We now try to reset the primary key
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.reset_primary_key();
let err = builder.execute(|_, _| ()).unwrap_err();
assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeReset)));
// But if we clear the database...
let mut wtxn = index.write_txn().unwrap();
let builder = ClearDocuments::new(&mut wtxn, &index, 0);
builder.execute().unwrap();
// ...we can change the primary key
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_primary_key(S("myid"));
builder.execute(|_, _| ()).unwrap();
}
} }

View File

@ -1,5 +1,6 @@
use milli::{Search, SearchResult, Criterion};
use big_s::S; use big_s::S;
use milli::update::Settings;
use milli::{Search, SearchResult, Criterion};
use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};
use Criterion::*; use Criterion::*;
@ -189,7 +190,9 @@ fn criteria_mixup() {
eprintln!("Testing with criteria order: {:?}", &criteria); eprintln!("Testing with criteria order: {:?}", &criteria);
//update criteria //update criteria
let mut wtxn = index.write_txn().unwrap(); let mut wtxn = index.write_txn().unwrap();
index.put_criteria(&mut wtxn, &criteria).unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_criteria(criteria.iter().map(ToString::to_string).collect());
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap(); wtxn.commit().unwrap();
let mut rtxn = index.read_txn().unwrap(); let mut rtxn = index.read_txn().unwrap();