From 3cdf14d4c5bb865473b5f6dd184ddd9db73ba99b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 2 Dec 2020 18:31:41 +0100 Subject: [PATCH] Introduce the field-id-docid-facet-values database --- .../facet/field_doc_id_facet_f64_codec.rs | 36 +++++++++++++ .../facet/field_doc_id_facet_i64_codec.rs | 34 ++++++++++++ .../facet/field_doc_id_facet_string_codec.rs | 31 +++++++++++ src/heed_codec/facet/mod.rs | 6 +++ src/index.rs | 6 ++- src/subcommand/infos.rs | 1 + src/update/clear_documents.rs | 2 + src/update/delete_documents.rs | 39 +++++++++++++- src/update/index_documents/merge_function.rs | 8 ++- src/update/index_documents/mod.rs | 23 +++++++- src/update/index_documents/store.rs | 52 +++++++++++++++++-- 11 files changed, 229 insertions(+), 9 deletions(-) create mode 100644 src/heed_codec/facet/field_doc_id_facet_f64_codec.rs create mode 100644 src/heed_codec/facet/field_doc_id_facet_i64_codec.rs create mode 100644 src/heed_codec/facet/field_doc_id_facet_string_codec.rs diff --git a/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs b/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs new file mode 100644 index 000000000..e9b5abeb8 --- /dev/null +++ b/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs @@ -0,0 +1,36 @@ +use std::borrow::Cow; +use std::convert::TryInto; + +use crate::{FieldId, DocumentId}; +use crate::facet::value_encoding::f64_into_bytes; + +pub struct FieldDocIdFacetF64Codec; + +impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec { + type DItem = (FieldId, DocumentId, f64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + + let (document_id_bytes, bytes) = bytes.split_at(4); + let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?; + + let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?; + + Some((*field_id, document_id, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec { + type EItem = (FieldId, DocumentId, f64); + + fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(1 + 4 + 8 + 8); + bytes.push(*field_id); + bytes.extend_from_slice(&document_id.to_be_bytes()); + let value_bytes = f64_into_bytes(*value)?; + bytes.extend_from_slice(&value_bytes); + bytes.extend_from_slice(&value.to_be_bytes()); + Some(Cow::Owned(bytes)) + } +} diff --git a/src/heed_codec/facet/field_doc_id_facet_i64_codec.rs b/src/heed_codec/facet/field_doc_id_facet_i64_codec.rs new file mode 100644 index 000000000..a9eaf188c --- /dev/null +++ b/src/heed_codec/facet/field_doc_id_facet_i64_codec.rs @@ -0,0 +1,34 @@ +use std::borrow::Cow; +use std::convert::TryInto; + +use crate::facet::value_encoding::{i64_into_bytes, i64_from_bytes}; +use crate::{FieldId, DocumentId}; + +pub struct FieldDocIdFacetI64Codec; + +impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetI64Codec { + type DItem = (FieldId, DocumentId, i64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + + let (document_id_bytes, bytes) = bytes.split_at(4); + let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?; + + let value = bytes[..8].try_into().map(i64_from_bytes).ok()?; + + Some((*field_id, document_id, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetI64Codec { + type EItem = (FieldId, DocumentId, i64); + + fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(1 + 4 + 8); + bytes.push(*field_id); + bytes.extend_from_slice(&document_id.to_be_bytes()); + bytes.extend_from_slice(&i64_into_bytes(*value)); + Some(Cow::Owned(bytes)) + } +} diff --git a/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/src/heed_codec/facet/field_doc_id_facet_string_codec.rs new file mode 100644 index 000000000..2e282b2a0 --- /dev/null +++ b/src/heed_codec/facet/field_doc_id_facet_string_codec.rs @@ -0,0 +1,31 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::str; + +use crate::{FieldId, DocumentId}; + +pub struct FieldDocIdFacetStringCodec; + +impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec { + type DItem = (FieldId, DocumentId, &'a str); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + let (document_id_bytes, bytes) = bytes.split_at(4); + let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?; + let value = str::from_utf8(bytes).ok()?; + Some((*field_id, document_id, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec { + type EItem = (FieldId, DocumentId, &'a str); + + fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(1 + 4 + value.len()); + bytes.push(*field_id); + bytes.extend_from_slice(&document_id.to_be_bytes()); + bytes.extend_from_slice(value.as_bytes()); + Some(Cow::Owned(bytes)) + } +} diff --git a/src/heed_codec/facet/mod.rs b/src/heed_codec/facet/mod.rs index ef97e6add..d8ce936e0 100644 --- a/src/heed_codec/facet/mod.rs +++ b/src/heed_codec/facet/mod.rs @@ -1,7 +1,13 @@ mod facet_level_value_f64_codec; mod facet_level_value_i64_codec; mod facet_value_string_codec; +mod field_doc_id_facet_f64_codec; +mod field_doc_id_facet_i64_codec; +mod field_doc_id_facet_string_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_i64_codec::FacetLevelValueI64Codec; pub use self::facet_value_string_codec::FacetValueStringCodec; +pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; +pub use self::field_doc_id_facet_i64_codec::FieldDocIdFacetI64Codec; +pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; diff --git a/src/index.rs b/src/index.rs index 81b306dce..26ca4ae29 100644 --- a/src/index.rs +++ b/src/index.rs @@ -42,13 +42,15 @@ pub struct Index { pub word_pair_proximity_docids: Database, /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. pub facet_field_id_value_docids: Database, + /// Maps the document id, the facet field id and the globally ordered value. + pub field_id_docid_facet_values: Database, /// Maps the document id to the document as an obkv store. pub documents: Database, ObkvCodec>, } impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(6); + options.max_dbs(7); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; @@ -56,6 +58,7 @@ impl Index { let docid_word_positions = env.create_database(Some("docid-word-positions"))?; let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; + let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; let documents = env.create_database(Some("documents"))?; Ok(Index { @@ -65,6 +68,7 @@ impl Index { docid_word_positions, word_pair_proximity_docids, facet_field_id_value_docids, + field_id_docid_facet_values, documents, }) } diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index 54ce620cb..ccac80101 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -297,6 +297,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho docid_word_positions, word_pair_proximity_docids, facet_field_id_value_docids, + field_id_docid_facet_values: _, documents, } = index; diff --git a/src/update/clear_documents.rs b/src/update/clear_documents.rs index 5dc14f97d..a6e54c5e8 100644 --- a/src/update/clear_documents.rs +++ b/src/update/clear_documents.rs @@ -19,6 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, facet_field_id_value_docids, + field_id_docid_facet_values, documents, } = self.index; @@ -41,6 +42,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; facet_field_id_value_docids.clear(self.wtxn)?; + field_id_docid_facet_values.clear(self.wtxn)?; documents.clear(self.wtxn)?; Ok(number_of_documents) diff --git a/src/update/delete_documents.rs b/src/update/delete_documents.rs index b1db4f94c..a1d00de43 100644 --- a/src/update/delete_documents.rs +++ b/src/update/delete_documents.rs @@ -2,7 +2,9 @@ use fst::IntoStreamer; use heed::types::ByteSlice; use roaring::RoaringBitmap; +use crate::facet::FacetType; use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds}; +use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use super::ClearDocuments; pub struct DeleteDocuments<'t, 'u, 'i> { @@ -75,6 +77,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, facet_field_id_value_docids, + field_id_docid_facet_values, documents, } = self.index; @@ -186,10 +189,42 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // Remove the documents ids from the faceted documents ids. let faceted_fields = self.index.faceted_fields(self.wtxn)?; - for (field_id, _) in faceted_fields { + for (field_id, facet_type) in faceted_fields { let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; docids.difference_with(&self.documents_ids); self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?; + + // We delete the entries that are part of the documents ids. + let iter = field_id_docid_facet_values.prefix_iter_mut(self.wtxn, &[field_id])?; + match facet_type { + FacetType::String => { + let mut iter = iter.remap_key_type::(); + while let Some(result) = iter.next() { + let ((_fid, docid, _value), ()) = result?; + if self.documents_ids.contains(docid) { + iter.del_current()?; + } + } + }, + FacetType::Float => { + let mut iter = iter.remap_key_type::(); + while let Some(result) = iter.next() { + let ((_fid, docid, _value), ()) = result?; + if self.documents_ids.contains(docid) { + iter.del_current()?; + } + } + }, + FacetType::Integer => { + let mut iter = iter.remap_key_type::(); + while let Some(result) = iter.next() { + let ((_fid, docid, _value), ()) = result?; + if self.documents_ids.contains(docid) { + iter.del_current()?; + } + } + }, + } } // We delete the documents ids that are under the facet field id values. @@ -205,6 +240,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } + drop(iter); + Ok(self.documents_ids.len() as usize) } } diff --git a/src/update/index_documents/merge_function.rs b/src/update/index_documents/merge_function.rs index fb785fd11..6f24fcad9 100644 --- a/src/update/index_documents/merge_function.rs +++ b/src/update/index_documents/merge_function.rs @@ -1,6 +1,6 @@ use std::borrow::Cow; -use anyhow::{bail, ensure}; +use anyhow::{bail, ensure, Context}; use bstr::ByteSlice as _; use fst::IntoStreamer; use roaring::RoaringBitmap; @@ -42,6 +42,12 @@ pub fn docid_word_positions_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow:: bail!("merging docid word positions is an error ({:?})", key.as_bstr()) } +pub fn field_id_docid_facet_values_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + let first = values.first().context("no value to merge")?; + ensure!(values.iter().all(|v| v == first), "invalid field id docid facet value merging"); + Ok(first.to_vec()) +} + pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { cbo_roaring_bitmap_merge(values) } diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index 796a0910a..8b538b03d 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -21,6 +21,7 @@ use self::store::{Store, Readers}; use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, docid_word_positions_merge, documents_merge, facet_field_value_docids_merge, + field_id_docid_facet_values_merge, }; pub use self::transform::{Transform, TransformOutput}; @@ -395,6 +396,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len()); + let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len()); let mut documents_readers = Vec::with_capacity(readers.len()); readers.into_iter().for_each(|readers| { let Readers { @@ -403,6 +405,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions, words_pairs_proximities_docids, facet_field_value_docids, + field_id_docid_facet_values, documents } = readers; main_readers.push(main); @@ -410,6 +413,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers.push(docid_word_positions); words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); facet_field_value_docids_readers.push(facet_field_value_docids); + field_id_docid_facet_values_readers.push(field_id_docid_facet_values); documents_readers.push(documents); }); @@ -453,6 +457,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, + field_id_docid_facet_values_readers, )) as anyhow::Result<_> })?; @@ -461,6 +466,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { docid_word_positions_readers, documents_readers, words_pairs_proximities_docids_readers, + field_id_docid_facet_values_readers, ) = readers; let mut documents_ids = self.index.documents_ids(self.wtxn)?; @@ -488,7 +494,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.put_documents_ids(self.wtxn, &documents_ids)?; let mut database_count = 0; - let total_databases = 6; + let total_databases = 7; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: 0, @@ -525,6 +531,21 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { total_databases, }); + debug!("Writing the field id docid facet values into LMDB on disk..."); + merge_into_lmdb_database( + self.wtxn, + *self.index.field_id_docid_facet_values.as_polymorph(), + field_id_docid_facet_values_readers, + field_id_docid_facet_values_merge, + write_method, + )?; + + database_count += 1; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { + databases_seen: database_count, + total_databases, + }); + debug!("Writing the words pairs proximities docids into LMDB on disk..."); merge_into_lmdb_database( self.wtxn, diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index c56b8b09e..b107d4be6 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -20,6 +20,7 @@ use tempfile::tempfile; use crate::facet::FacetType; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; +use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use crate::tokenizer::{simple_tokenizer, only_token}; use crate::update::UpdateIndexingStep; use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId}; @@ -27,7 +28,7 @@ use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, Docu use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; use super::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - facet_field_value_docids_merge, + facet_field_value_docids_merge, field_id_docid_facet_values_merge, }; const LMDB_MAX_KEY_LENGTH: usize = 511; @@ -42,6 +43,7 @@ pub struct Readers { pub docid_word_positions: Reader, pub words_pairs_proximities_docids: Reader, pub facet_field_value_docids: Reader, + pub field_id_docid_facet_values: Reader, pub documents: Reader, } @@ -65,6 +67,7 @@ pub struct Store { word_docids_sorter: Sorter, words_pairs_proximities_docids_sorter: Sorter, facet_field_value_docids_sorter: Sorter, + field_id_docid_facet_values_sorter: Sorter, // MTBL writers docid_word_positions_writer: Writer, documents_writer: Writer, @@ -118,6 +121,14 @@ impl Store { max_nb_chunks, max_memory, ); + let field_id_docid_facet_values_sorter = create_sorter( + field_id_docid_facet_values_merge, + chunk_compression_type, + chunk_compression_level, + chunk_fusing_shrink_size, + max_nb_chunks, + Some(1024 * 1024 * 1024), // 1MB + ); let documents_writer = tempfile().and_then(|f| { create_writer(chunk_compression_type, chunk_compression_level, f) @@ -146,6 +157,7 @@ impl Store { word_docids_sorter, words_pairs_proximities_docids_sorter, facet_field_value_docids_sorter, + field_id_docid_facet_values_sorter, // MTBL writers docid_word_positions_writer, documents_writer, @@ -181,6 +193,8 @@ impl Store { id: DocumentId, ) -> anyhow::Result<()> { + Self::write_field_id_docid_facet_value(&mut self.field_id_docid_facet_values_sorter, field_id, id, &field_value)?; + let key = (field_id, field_value); // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.facet_field_value_docids.get_refresh(&key) { @@ -192,7 +206,7 @@ impl Store { // one element, this way next time we insert we doesn't grow the capacity. if self.facet_field_value_docids.len() == self.facet_field_value_docids_limit { // Removing the front element is equivalent to removing the LRU element. - Self::write_docid_facet_field_values( + Self::write_facet_field_value_docids( &mut self.facet_field_value_docids_sorter, self.facet_field_value_docids.pop_front(), )?; @@ -326,11 +340,11 @@ impl Store { Ok(()) } - fn write_docid_facet_field_values( + fn write_facet_field_value_docids( sorter: &mut Sorter, iter: I, ) -> anyhow::Result<()> - where I: IntoIterator + where I: IntoIterator { use FacetValue::*; @@ -351,6 +365,29 @@ impl Store { Ok(()) } + fn write_field_id_docid_facet_value( + sorter: &mut Sorter, + field_id: FieldId, + document_id: DocumentId, + value: &FacetValue, + ) -> anyhow::Result<()> + { + use FacetValue::*; + + let result = match value { + String(s) => FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, s)).map(Cow::into_owned), + Float(f) => FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, **f)).map(Cow::into_owned), + Integer(i) => FieldDocIdFacetI64Codec::bytes_encode(&(field_id, document_id, *i)).map(Cow::into_owned), + }; + + let key = result.context("could not serialize facet key")?; + if lmdb_key_valid_size(&key) { + sorter.insert(&key, &[])?; + } + + Ok(()) + } + fn write_word_docids(sorter: &mut Sorter, iter: I) -> anyhow::Result<()> where I: IntoIterator, RoaringBitmap)> { @@ -463,7 +500,7 @@ impl Store { &mut self.words_pairs_proximities_docids_sorter, self.words_pairs_proximities_docids, )?; - Self::write_docid_facet_field_values( + Self::write_facet_field_value_docids( &mut self.facet_field_value_docids_sorter, self.facet_field_value_docids, )?; @@ -491,10 +528,14 @@ impl Store { let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?; + let mut field_id_docid_facet_values_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.field_id_docid_facet_values_sorter.write_into(&mut field_id_docid_facet_values_wtr)?; + let main = writer_into_reader(main_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?; + let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?; let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; let documents = writer_into_reader(self.documents_writer, shrink_size)?; @@ -504,6 +545,7 @@ impl Store { docid_word_positions, words_pairs_proximities_docids, facet_field_value_docids, + field_id_docid_facet_values, documents, }) }