From e0058c1125e1a383a485358be2cf87d04c860669 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Nov 2020 15:48:24 +0100 Subject: [PATCH 01/13] Introduce codecs for facet types (string, f64, u64, i64) --- src/facet/facet_type.rs | 15 ++++ src/facet/mod.rs | 4 + src/facet/value_encoding.rs | 89 +++++++++++++++++++ src/heed_codec/facet/facet_value_f64_codec.rs | 50 +++++++++++ src/heed_codec/facet/facet_value_i64_codec.rs | 28 ++++++ .../facet/facet_value_string_codec.rs | 25 ++++++ src/heed_codec/facet/facet_value_u64_codec.rs | 28 ++++++ src/heed_codec/facet/mod.rs | 9 ++ src/heed_codec/mod.rs | 3 + src/heed_codec/str_bytes_codec.rs | 28 ++++++ src/lib.rs | 1 + 11 files changed, 280 insertions(+) create mode 100644 src/facet/facet_type.rs create mode 100644 src/facet/mod.rs create mode 100644 src/facet/value_encoding.rs create mode 100644 src/heed_codec/facet/facet_value_f64_codec.rs create mode 100644 src/heed_codec/facet/facet_value_i64_codec.rs create mode 100644 src/heed_codec/facet/facet_value_string_codec.rs create mode 100644 src/heed_codec/facet/facet_value_u64_codec.rs create mode 100644 src/heed_codec/facet/mod.rs create mode 100644 src/heed_codec/str_bytes_codec.rs diff --git a/src/facet/facet_type.rs b/src/facet/facet_type.rs new file mode 100644 index 000000000..13482e8b3 --- /dev/null +++ b/src/facet/facet_type.rs @@ -0,0 +1,15 @@ +use std::cmp; + +#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub enum FacetType { + String, + F64, + I64, + U64, +} + +impl FacetType { + pub fn merge(a: FacetType, b: FacetType) -> FacetType { + cmp::min(a, b) + } +} diff --git a/src/facet/mod.rs b/src/facet/mod.rs new file mode 100644 index 000000000..9ec99f2d3 --- /dev/null +++ b/src/facet/mod.rs @@ -0,0 +1,4 @@ +mod facet_type; +pub mod value_encoding; + +pub use self::facet_type::FacetType; diff --git a/src/facet/value_encoding.rs b/src/facet/value_encoding.rs new file mode 100644 index 000000000..01fdba723 --- /dev/null +++ b/src/facet/value_encoding.rs @@ -0,0 +1,89 @@ +// https://stackoverflow.com/a/43305015/1941280 +#[inline] +pub fn f64_into_bytes(float: f64) -> Option<[u8; 8]> { + if float.is_finite() { + if float == 0.0 || float == -0.0 { + return Some(xor_first_bit(0.0_f64.to_be_bytes())); + } else if float.is_sign_negative() { + return Some(xor_all_bits(float.to_be_bytes())); + } else if float.is_sign_positive() { + return Some(xor_first_bit(float.to_be_bytes())); + } + } + None +} + +#[inline] +pub fn u64_into_bytes(int: u64) -> [u8; 8] { + int.to_be_bytes() +} + +#[inline] +pub fn u64_from_bytes(bytes: [u8; 8]) -> u64 { + u64::from_be_bytes(bytes) +} + +#[inline] +pub fn i64_into_bytes(int: i64) -> [u8; 8] { + xor_first_bit(int.to_be_bytes()) +} + +#[inline] +pub fn i64_from_bytes(bytes: [u8; 8]) -> i64 { + i64::from_be_bytes(xor_first_bit(bytes)) +} + +#[inline] +fn xor_first_bit(mut x: [u8; 8]) -> [u8; 8] { + x[0] ^= 0x80; + x +} + +#[inline] +fn xor_all_bits(mut x: [u8; 8]) -> [u8; 8] { + x.iter_mut().for_each(|b| *b ^= 0xff); + x +} + +#[cfg(test)] +mod tests { + use std::cmp::Ordering::Less; + use super::*; + + fn is_sorted(x: &[T]) -> bool { + x.windows(2).map(|x| x[0].cmp(&x[1])).all(|o| o == Less) + } + + #[test] + fn ordered_f64_bytes() { + let a = -13_f64; + let b = -10.0; + let c = -0.0; + let d = 1.0; + let e = 43.0; + + let vec: Vec<_> = [a, b, c, d, e].iter().cloned().map(f64_into_bytes).collect(); + assert!(is_sorted(&vec), "{:?}", vec); + } + + #[test] + fn ordered_u64_bytes() { + let a = 0_u64; + let b = 1_u64; + let c = 43_u64; + + let vec: Vec<_> = [a, b, c].iter().cloned().map(u64_into_bytes).collect(); + assert!(is_sorted(&vec), "{:?}", vec); + } + + #[test] + fn ordered_i64_bytes() { + let a = -10_i64; + let b = -0_i64; + let c = 1_i64; + let d = 43_i64; + + let vec: Vec<_> = [a, b, c, d].iter().cloned().map(i64_into_bytes).collect(); + assert!(is_sorted(&vec), "{:?}", vec); + } +} diff --git a/src/heed_codec/facet/facet_value_f64_codec.rs b/src/heed_codec/facet/facet_value_f64_codec.rs new file mode 100644 index 000000000..fdacb1f08 --- /dev/null +++ b/src/heed_codec/facet/facet_value_f64_codec.rs @@ -0,0 +1,50 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::str; + +use crate::heed_codec::StrBytesCodec; +use crate::facet::value_encoding::f64_into_bytes; + +pub struct FacetValueF64Codec; + +impl<'a> heed::BytesDecode<'a> for FacetValueF64Codec { + type DItem = (&'a str, f64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (name, buffer) = StrBytesCodec::bytes_decode(bytes)?; + let value = buffer[8..].try_into().ok().map(f64::from_be_bytes)?; + Some((name, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FacetValueF64Codec { + type EItem = (&'a str, f64); + + fn bytes_encode((name, value): &Self::EItem) -> Option> { + let mut buffer = [0u8; 16]; + + // Write the globally ordered float. + let bytes = f64_into_bytes(*value)?; + buffer[..8].copy_from_slice(&bytes[..]); + + // Then the f64 value just to be able to read it back. + let bytes = value.to_be_bytes(); + buffer[8..].copy_from_slice(&bytes[..]); + + let tuple = (*name, &buffer[..]); + StrBytesCodec::bytes_encode(&tuple).map(Cow::into_owned).map(Cow::Owned) + } +} + +#[cfg(test)] +mod tests { + use heed::{BytesEncode, BytesDecode}; + use super::*; + + #[test] + fn globally_ordered_f64() { + let bytes = FacetValueF64Codec::bytes_encode(&("hello", -32.0)).unwrap(); + let (name, value) = FacetValueF64Codec::bytes_decode(&bytes).unwrap(); + assert_eq!((name, value), ("hello", -32.0)); + } +} diff --git a/src/heed_codec/facet/facet_value_i64_codec.rs b/src/heed_codec/facet/facet_value_i64_codec.rs new file mode 100644 index 000000000..e3c333883 --- /dev/null +++ b/src/heed_codec/facet/facet_value_i64_codec.rs @@ -0,0 +1,28 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::str; + +use crate::heed_codec::StrBytesCodec; +use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; + +pub struct FacetValueI64Codec; + +impl<'a> heed::BytesDecode<'a> for FacetValueI64Codec { + type DItem = (&'a str, i64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (name, bytes) = StrBytesCodec::bytes_decode(bytes)?; + let value = bytes.try_into().map(i64_from_bytes).ok()?; + Some((name, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FacetValueI64Codec { + type EItem = (&'a str, i64); + + fn bytes_encode((name, value): &Self::EItem) -> Option> { + let value = i64_into_bytes(*value); + let tuple = (*name, &value[..]); + StrBytesCodec::bytes_encode(&tuple).map(Cow::into_owned).map(Cow::Owned) + } +} diff --git a/src/heed_codec/facet/facet_value_string_codec.rs b/src/heed_codec/facet/facet_value_string_codec.rs new file mode 100644 index 000000000..8b046192a --- /dev/null +++ b/src/heed_codec/facet/facet_value_string_codec.rs @@ -0,0 +1,25 @@ +use std::borrow::Cow; +use std::str; + +use crate::heed_codec::StrBytesCodec; + +pub struct FacetValueStringCodec; + +impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec { + type DItem = (&'a str, &'a str); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (name, bytes) = StrBytesCodec::bytes_decode(bytes)?; + let value = str::from_utf8(bytes).ok()?; + Some((name, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FacetValueStringCodec { + type EItem = (&'a str, &'a str); + + fn bytes_encode((name, value): &Self::EItem) -> Option> { + let tuple = (*name, value.as_bytes()); + StrBytesCodec::bytes_encode(&tuple).map(Cow::into_owned).map(Cow::Owned) + } +} diff --git a/src/heed_codec/facet/facet_value_u64_codec.rs b/src/heed_codec/facet/facet_value_u64_codec.rs new file mode 100644 index 000000000..2fdc7416e --- /dev/null +++ b/src/heed_codec/facet/facet_value_u64_codec.rs @@ -0,0 +1,28 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::str; + +use crate::heed_codec::StrBytesCodec; +use crate::facet::value_encoding::{u64_from_bytes, u64_into_bytes}; + +pub struct FacetValueU64Codec; + +impl<'a> heed::BytesDecode<'a> for FacetValueU64Codec { + type DItem = (&'a str, u64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (name, bytes) = StrBytesCodec::bytes_decode(bytes)?; + let value = bytes.try_into().map(u64_from_bytes).ok()?; + Some((name, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FacetValueU64Codec { + type EItem = (&'a str, u64); + + fn bytes_encode((name, value): &Self::EItem) -> Option> { + let value = u64_into_bytes(*value); + let tuple = (*name, &value[..]); + StrBytesCodec::bytes_encode(&tuple).map(Cow::into_owned).map(Cow::Owned) + } +} diff --git a/src/heed_codec/facet/mod.rs b/src/heed_codec/facet/mod.rs new file mode 100644 index 000000000..ba34f6220 --- /dev/null +++ b/src/heed_codec/facet/mod.rs @@ -0,0 +1,9 @@ +mod facet_value_f64_codec; +mod facet_value_i64_codec; +mod facet_value_string_codec; +mod facet_value_u64_codec; + +pub use self::facet_value_f64_codec::FacetValueF64Codec; +pub use self::facet_value_i64_codec::FacetValueI64Codec; +pub use self::facet_value_string_codec::FacetValueStringCodec; +pub use self::facet_value_u64_codec::FacetValueU64Codec; diff --git a/src/heed_codec/mod.rs b/src/heed_codec/mod.rs index 68739fbf1..260e79c4b 100644 --- a/src/heed_codec/mod.rs +++ b/src/heed_codec/mod.rs @@ -1,8 +1,10 @@ mod beu32_str_codec; mod bo_roaring_bitmap_codec; mod cbo_roaring_bitmap_codec; +mod facet; mod obkv_codec; mod roaring_bitmap_codec; +mod str_bytes_codec; mod str_str_u8_codec; pub use self::beu32_str_codec::BEU32StrCodec; @@ -10,4 +12,5 @@ pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec; pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec; pub use self::obkv_codec::ObkvCodec; pub use self::roaring_bitmap_codec::RoaringBitmapCodec; +pub use self::str_bytes_codec::StrBytesCodec; pub use self::str_str_u8_codec::StrStrU8Codec; diff --git a/src/heed_codec/str_bytes_codec.rs b/src/heed_codec/str_bytes_codec.rs new file mode 100644 index 000000000..1a864c1fb --- /dev/null +++ b/src/heed_codec/str_bytes_codec.rs @@ -0,0 +1,28 @@ +use std::borrow::Cow; +use std::str; + +pub struct StrBytesCodec; + +impl<'a> heed::BytesDecode<'a> for StrBytesCodec { + type DItem = (&'a str, &'a [u8]); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let s1_end = bytes.iter().position(|b| *b == 0)?; + let (s1_bytes, s2_bytes) = bytes.split_at(s1_end); + let s1 = str::from_utf8(s1_bytes).ok()?; + let s2 = &s2_bytes[1..]; + Some((s1, s2)) + } +} + +impl<'a> heed::BytesEncode<'a> for StrBytesCodec { + type EItem = (&'a str, &'a [u8]); + + fn bytes_encode((s1, s2): &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); + bytes.extend_from_slice(s1.as_bytes()); + bytes.push(0); + bytes.extend_from_slice(s2); + Some(Cow::Owned(bytes)) + } +} diff --git a/src/lib.rs b/src/lib.rs index bea05e68a..2e879fd03 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,7 @@ mod index; mod mdfs; mod query_tokens; mod search; +pub mod facet; pub mod heed_codec; pub mod proximity; pub mod subcommand; From 92ec908303a5dc6f0aa7347f76a7e5fd141a7a9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Nov 2020 16:04:04 +0100 Subject: [PATCH 02/13] Introduce the facet field id values engine database --- src/index.rs | 15 +++++++++++++-- src/update/clear_documents.rs | 24 ++++++++---------------- src/update/delete_documents.rs | 19 ++++++++++++++++++- 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/src/index.rs b/src/index.rs index 1a0c5d4d6..49f877d81 100644 --- a/src/index.rs +++ b/src/index.rs @@ -34,22 +34,33 @@ pub struct Index { pub docid_word_positions: Database, /// Maps the proximity between a pair of words with all the docids where this relation appears. pub word_pair_proximity_docids: Database, + /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. + pub facet_field_id_value_docids: Database, /// Maps the document id to the document as an obkv store. pub documents: Database, ObkvCodec>, } impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result { - options.max_dbs(5); + options.max_dbs(6); let env = options.open(path)?; let main = env.create_poly_database(Some("main"))?; let word_docids = env.create_database(Some("word-docids"))?; let docid_word_positions = env.create_database(Some("docid-word-positions"))?; let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; + let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; let documents = env.create_database(Some("documents"))?; - Ok(Index { env, main, word_docids, docid_word_positions, word_pair_proximity_docids, documents }) + Ok(Index { + env, + main, + word_docids, + docid_word_positions, + word_pair_proximity_docids, + facet_field_id_value_docids, + documents, + }) } /// Create a write transaction to be able to write into the index. diff --git a/src/update/clear_documents.rs b/src/update/clear_documents.rs index ae739bd0b..c49ae9104 100644 --- a/src/update/clear_documents.rs +++ b/src/update/clear_documents.rs @@ -18,33 +18,25 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_docids, docid_word_positions, word_pair_proximity_docids, + facet_field_id_value_docids, documents, } = self.index; - // We clear the word fst. + // We retrieve the number of documents ids that we are deleting. + let number_of_documents = self.index.number_of_documents(self.wtxn)?; + + // We clean some of the main engine datastructures. self.index.put_words_fst(self.wtxn, &fst::Set::default())?; - - // We clear the users ids documents ids. self.index.put_users_ids_documents_ids(self.wtxn, &fst::Map::default())?; - - // We retrieve the documents ids. - let documents_ids = self.index.documents_ids(self.wtxn)?; - - // We clear the internal documents ids. self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; - // We clear the word docids. + // Clear the other databases. word_docids.clear(self.wtxn)?; - - // We clear the docid word positions. docid_word_positions.clear(self.wtxn)?; - - // We clear the word pair proximity docids. word_pair_proximity_docids.clear(self.wtxn)?; - - // We clear the documents themselves. + facet_field_id_value_docids.clear(self.wtxn)?; documents.clear(self.wtxn)?; - Ok(documents_ids.len() as usize) + Ok(number_of_documents) } } diff --git a/src/update/delete_documents.rs b/src/update/delete_documents.rs index e3a3be15a..d68bca81c 100644 --- a/src/update/delete_documents.rs +++ b/src/update/delete_documents.rs @@ -76,6 +76,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_docids, docid_word_positions, word_pair_proximity_docids, + facet_field_id_value_docids, documents, } = self.index; @@ -158,7 +159,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } // We construct an FST set that contains the words to delete from the words FST. - let words_to_delete = words.iter().filter_map(|(w, d)| if *d { Some(w.as_ref()) } else { None }); + let words_to_delete = words.iter().filter_map(|(word, must_remove)| { + if *must_remove { Some(word.as_ref()) } else { None } + }); let words_to_delete = fst::Set::from_iter(words_to_delete)?; let new_words_fst = { @@ -191,6 +194,20 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } + drop(iter); + + // We delete the documents ids that are under the facet field id values. + let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?; + while let Some(result) = iter.next() { + let (bytes, mut docids) = result?; + docids.difference_with(&self.documents_ids); + if docids.is_empty() { + iter.del_current()?; + } else { + iter.put_current(bytes, &docids)?; + } + } + Ok(self.documents_ids.len() as usize) } } From 72f18759babf511c1ef5a5a1003a56da22bc1f2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Nov 2020 16:17:37 +0100 Subject: [PATCH 03/13] Introduce getters and setters for the facet fields ids facet types --- src/facet/facet_type.rs | 2 ++ src/index.rs | 23 ++++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/facet/facet_type.rs b/src/facet/facet_type.rs index 13482e8b3..93c98316d 100644 --- a/src/facet/facet_type.rs +++ b/src/facet/facet_type.rs @@ -1,6 +1,8 @@ use std::cmp; +use serde::{Serialize, Deserialize}; #[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +#[derive(Serialize, Deserialize)] pub enum FacetType { String, F64, diff --git a/src/index.rs b/src/index.rs index 49f877d81..4946545e4 100644 --- a/src/index.rs +++ b/src/index.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::collections::HashMap; use std::path::Path; use anyhow::Context; @@ -6,9 +7,10 @@ use heed::types::*; use heed::{PolyDatabase, Database, RwTxn, RoTxn}; use roaring::RoaringBitmap; +use crate::facet::FacetType; +use crate::fields_ids_map::FieldsIdsMap; use crate::Search; use crate::{BEU32, DocumentId}; -use crate::fields_ids_map::FieldsIdsMap; use crate::{ RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, @@ -16,6 +18,7 @@ use crate::{ pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; +pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const PRIMARY_KEY_KEY: &str = "primary-key"; pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; @@ -186,6 +189,24 @@ impl Index { self.main.get::<_, Str, ByteSlice>(rtxn, SEARCHABLE_FIELDS_KEY) } + /* faceted fields */ + + /// Writes the facet fields ids associated with their facet type or `None` if + /// the facet type is currently unknown. + pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap>) -> heed::Result<()> { + self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types) + } + + /// Deletes the facet fields ids associated with their facet type. + pub fn delete_faceted_fields(&self, wtxn: &mut RwTxn) -> heed::Result { + self.main.delete::<_, Str>(wtxn, FACETED_FIELDS_KEY) + } + + /// Returns the facet fields ids associated with their facet type. + pub fn faceted_fields(&self, wtxn: &RoTxn) -> heed::Result>> { + Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) + } + /* words fst */ /// Writes the FST which is the words dictionnary of the engine. From ebe7087bff1af538980c2faf5730e9a4f291b5b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Nov 2020 17:08:18 +0100 Subject: [PATCH 04/13] Introduce the faceted fields setting --- src/update/settings.rs | 89 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 4 deletions(-) diff --git a/src/update/settings.rs b/src/update/settings.rs index ad191ceec..f6528ed94 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; + use anyhow::Context; use grenad::CompressionType; use rayon::ThreadPool; @@ -22,6 +24,7 @@ pub struct Settings<'a, 't, 'u, 'i> { // however if it is `Some(None)` it means that the user forced a reset of the setting. searchable_fields: Option>>, displayed_fields: Option>>, + faceted_fields: Option>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -39,6 +42,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { thread_pool: None, searchable_fields: None, displayed_fields: None, + faceted_fields: None, } } @@ -58,13 +62,77 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.displayed_fields = Some(Some(names)); } + pub fn set_faceted_fields(&mut self, names: Vec) { + self.faceted_fields = Some(names); + } + pub fn execute(self, progress_callback: F) -> anyhow::Result<()> where F: Fn(UpdateIndexingStep) + Sync { + if let Some(fields_names) = self.faceted_fields { + let current_faceted_fields = self.index.faceted_fields(self.wtxn)?; + let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + + let mut fields_ids_map = current_fields_ids_map.clone(); + let mut faceted_fields = HashMap::new(); + for name in fields_names { + let id = fields_ids_map.insert(&name).context("field id limit reached")?; + match current_faceted_fields.get(&id) { + Some(ftype) => faceted_fields.insert(id, ftype.clone()), + None => faceted_fields.insert(id, None), + }; + } + + let transform = Transform { + rtxn: &self.wtxn, + index: self.index, + log_every_n: self.log_every_n, + chunk_compression_type: self.chunk_compression_type, + chunk_compression_level: self.chunk_compression_level, + chunk_fusing_shrink_size: self.chunk_fusing_shrink_size, + max_nb_chunks: self.max_nb_chunks, + max_memory: self.max_memory, + index_documents_method: IndexDocumentsMethod::ReplaceDocuments, + autogenerate_docids: false, + }; + + // We compute or generate the new primary key field id. + let primary_key = match self.index.primary_key(&self.wtxn)? { + Some(id) => { + let name = current_fields_ids_map.name(id).unwrap(); + fields_ids_map.insert(name).context("field id limit reached")? + }, + None => fields_ids_map.insert("id").context("field id limit reached")?, + }; + + // We remap the documents fields based on the new `FieldsIdsMap`. + let output = transform.remap_index_documents(primary_key, fields_ids_map.clone())?; + + // We write the faceted_fields fields into the database here. + self.index.put_faceted_fields(self.wtxn, &faceted_fields)?; + + // We clear the full database (words-fst, documents ids and documents content). + ClearDocuments::new(self.wtxn, self.index).execute()?; + + // We index the generated `TransformOutput` which must contain + // all the documents with fields in the newly defined searchable order. + let mut indexing_builder = IndexDocuments::new(self.wtxn, self.index); + indexing_builder.log_every_n = self.log_every_n; + indexing_builder.max_nb_chunks = self.max_nb_chunks; + indexing_builder.max_memory = self.max_memory; + indexing_builder.linked_hash_map_size = self.linked_hash_map_size; + indexing_builder.chunk_compression_type = self.chunk_compression_type; + indexing_builder.chunk_compression_level = self.chunk_compression_level; + indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; + indexing_builder.thread_pool = self.thread_pool; + indexing_builder.execute_raw(output, &progress_callback)?; + } + // Check that the searchable attributes have been specified. if let Some(value) = self.searchable_fields { let current_displayed_fields = self.index.displayed_fields(self.wtxn)?; + let current_faceted_fields = self.index.faceted_fields(self.wtxn)?; let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let result = match value { @@ -82,6 +150,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fields_ids_map, Some(searchable_fields), current_displayed_fields.map(ToOwned::to_owned), + current_faceted_fields, ) } else { // We create or generate the fields ids corresponding to those names. @@ -97,7 +166,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { fields_ids_map.insert(name).context("field id limit reached")?; } - // We must also update the displayed fields according to the new `FieldsIdsMap`. + // We must update the displayed fields according to the new `FieldsIdsMap`. let displayed_fields = match current_displayed_fields { Some(fields) => { let mut displayed_fields = Vec::new(); @@ -111,17 +180,26 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { None => None, }; - (fields_ids_map, Some(searchable_fields), displayed_fields) + // We must update the faceted fields according to the new `FieldsIdsMap`. + let mut faceted_fields = HashMap::new(); + for (id, ftype) in current_faceted_fields { + let name = current_fields_ids_map.name(id).unwrap(); + let id = fields_ids_map.id(name).context("field id limit reached")?; + faceted_fields.insert(id, ftype); + } + + (fields_ids_map, Some(searchable_fields), displayed_fields, faceted_fields) } }, None => ( current_fields_ids_map.clone(), None, current_displayed_fields.map(ToOwned::to_owned), + current_faceted_fields, ), }; - let (mut fields_ids_map, searchable_fields, displayed_fields) = result; + let (mut fields_ids_map, searchable_fields, displayed_fields, faceted_fields) = result; let transform = Transform { rtxn: &self.wtxn, @@ -166,6 +244,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { None => self.index.delete_displayed_fields(self.wtxn).map(drop)?, } + // We write the faceted_fields fields into the database here. + self.index.put_faceted_fields(self.wtxn, &faceted_fields)?; + // We clear the full database (words-fst, documents ids and documents content). ClearDocuments::new(self.wtxn, self.index).execute()?; @@ -180,7 +261,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { indexing_builder.chunk_compression_level = self.chunk_compression_level; indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; indexing_builder.thread_pool = self.thread_pool; - indexing_builder.execute_raw(output, progress_callback)?; + indexing_builder.execute_raw(output, &progress_callback)?; } // Check that the displayed attributes have been specified. From 466fb601d679cae10cf61c703beb3da7a957f0c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 11 Nov 2020 17:33:05 +0100 Subject: [PATCH 05/13] Faceted fields settings must specify the facet type --- src/index.rs | 4 ++-- src/lib.rs | 8 +++---- src/update/index_documents/mod.rs | 2 ++ src/update/index_documents/store.rs | 35 ++++++++++++++++++----------- src/update/settings.rs | 22 +++++++++++------- 5 files changed, 44 insertions(+), 27 deletions(-) diff --git a/src/index.rs b/src/index.rs index 4946545e4..68d7dfe5f 100644 --- a/src/index.rs +++ b/src/index.rs @@ -193,7 +193,7 @@ impl Index { /// Writes the facet fields ids associated with their facet type or `None` if /// the facet type is currently unknown. - pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap>) -> heed::Result<()> { + pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types) } @@ -203,7 +203,7 @@ impl Index { } /// Returns the facet fields ids associated with their facet type. - pub fn faceted_fields(&self, wtxn: &RoTxn) -> heed::Result>> { + pub fn faceted_fields(&self, wtxn: &RoTxn) -> heed::Result> { Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) } diff --git a/src/lib.rs b/src/lib.rs index 2e879fd03..0e9ee0ec0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -61,9 +61,9 @@ pub fn obkv_to_json( } /// Transform a JSON value into a string that can be indexed. -pub fn json_to_string(value: Value) -> Option { +pub fn json_to_string(value: &Value) -> Option { - fn inner(value: Value, output: &mut String) -> bool { + fn inner(value: &Value, output: &mut String) -> bool { use std::fmt::Write; match value { Value::Null => false, @@ -122,7 +122,7 @@ mod tests { "not_there": null, }); - let string = json_to_string(value).unwrap(); + let string = json_to_string(&value).unwrap(); assert_eq!(string, "name: John Doe. age: 43. "); } @@ -136,7 +136,7 @@ mod tests { null, ]); - let string = json_to_string(value).unwrap(); + let string = json_to_string(&value).unwrap(); // We don't care about having two point (.) after the other as // the distance of hard separators is clamped to 8 anyway. assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . "); diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index f078ec9ed..c060b04a1 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -329,6 +329,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { WordDocids, } + let faceted_fields = self.index.faceted_fields(self.wtxn)?; let searchable_fields: HashSet<_> = match self.index.searchable_fields(self.wtxn)? { Some(fields) => fields.iter().copied().collect(), None => fields_ids_map.iter().map(|(id, _name)| id).collect(), @@ -362,6 +363,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { .map(|(i, documents)| { let store = Store::new( searchable_fields.clone(), + faceted_fields.clone(), linked_hash_map_size, max_nb_chunks, max_memory_by_job, diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index 6d42f0119..e9548eec3 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -14,6 +14,7 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; use roaring::RoaringBitmap; use tempfile::tempfile; +use crate::facet::FacetType; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::tokenizer::{simple_tokenizer, only_token}; use crate::update::UpdateIndexingStep; @@ -39,6 +40,7 @@ pub struct Readers { pub struct Store { // Indexing parameters searchable_fields: HashSet, + faceted_fields: HashMap, // Caches word_docids: LinkedHashMap, RoaringBitmap>, word_docids_limit: usize, @@ -60,6 +62,7 @@ pub struct Store { impl Store { pub fn new( searchable_fields: HashSet, + faceted_fields: HashMap, linked_hash_map_size: Option, max_nb_chunks: Option, max_memory: Option, @@ -107,6 +110,7 @@ impl Store { Ok(Store { // Indexing parameters. searchable_fields, + faceted_fields, // Caches word_docids: LinkedHashMap::with_capacity(linked_hash_map_size), word_docids_limit: linked_hash_map_size, @@ -320,21 +324,26 @@ impl Store { } for (attr, content) in document.iter() { - if !self.searchable_fields.contains(&attr) { - continue; - } + if self.faceted_fields.contains_key(&attr) || self.searchable_fields.contains(&attr) { + let value = serde_json::from_slice(content)?; - let value = serde_json::from_slice(content)?; - let content = match json_to_string(value) { - Some(content) => content, - None => continue, - }; + if let Some(ftype) = self.faceted_fields.get(&attr) { + todo!("parse facet field value") + } - let tokens = simple_tokenizer(&content).filter_map(only_token); - for (pos, token) in tokens.enumerate().take(MAX_POSITION) { - let word = token.to_lowercase(); - let position = (attr as usize * MAX_POSITION + pos) as u32; - words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); + if self.searchable_fields.contains(&attr) { + let content = match json_to_string(&value) { + Some(content) => content, + None => continue, + }; + + let tokens = simple_tokenizer(&content).filter_map(only_token); + for (pos, token) in tokens.enumerate().take(MAX_POSITION) { + let word = token.to_lowercase(); + let position = (attr as usize * MAX_POSITION + pos) as u32; + words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); + } + } } } diff --git a/src/update/settings.rs b/src/update/settings.rs index f6528ed94..d5ffba23b 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -1,11 +1,13 @@ use std::collections::HashMap; +use std::str::FromStr; -use anyhow::Context; +use anyhow::{ensure, Context}; use grenad::CompressionType; use rayon::ThreadPool; use crate::update::index_documents::{Transform, IndexDocumentsMethod}; use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; +use crate::facet::FacetType; use crate::{Index, FieldsIdsMap}; pub struct Settings<'a, 't, 'u, 'i> { @@ -24,7 +26,7 @@ pub struct Settings<'a, 't, 'u, 'i> { // however if it is `Some(None)` it means that the user forced a reset of the setting. searchable_fields: Option>>, displayed_fields: Option>>, - faceted_fields: Option>, + faceted_fields: Option>, } impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { @@ -62,25 +64,29 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.displayed_fields = Some(Some(names)); } - pub fn set_faceted_fields(&mut self, names: Vec) { - self.faceted_fields = Some(names); + pub fn set_faceted_fields(&mut self, names_facet_types: HashMap) { + self.faceted_fields = Some(names_facet_types); } pub fn execute(self, progress_callback: F) -> anyhow::Result<()> where F: Fn(UpdateIndexingStep) + Sync { - if let Some(fields_names) = self.faceted_fields { + if let Some(fields_names_facet_types) = self.faceted_fields { let current_faceted_fields = self.index.faceted_fields(self.wtxn)?; let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut fields_ids_map = current_fields_ids_map.clone(); let mut faceted_fields = HashMap::new(); - for name in fields_names { + for (name, sftype) in fields_names_facet_types { + let ftype = FacetType::from_str(&sftype).with_context(|| format!("parsing facet type {:?}", sftype))?; let id = fields_ids_map.insert(&name).context("field id limit reached")?; match current_faceted_fields.get(&id) { - Some(ftype) => faceted_fields.insert(id, ftype.clone()), - None => faceted_fields.insert(id, None), + Some(pftype) => { + ensure!(ftype == *pftype, "{} facet type changed from {} to {}", name, ftype, pftype); + faceted_fields.insert(id, ftype) + }, + None => faceted_fields.insert(id, ftype), }; } From cf9ddd293dd61be00ef95ce3bf681277d49155c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 13 Nov 2020 11:45:37 +0100 Subject: [PATCH 06/13] Simplify the the facet types --- src/facet/facet_type.rs | 47 ++++++++++++++++--- src/facet/value_encoding.rs | 20 -------- src/heed_codec/facet/facet_value_u64_codec.rs | 28 ----------- src/heed_codec/facet/mod.rs | 2 - 4 files changed, 40 insertions(+), 57 deletions(-) delete mode 100644 src/heed_codec/facet/facet_value_u64_codec.rs diff --git a/src/facet/facet_type.rs b/src/facet/facet_type.rs index 93c98316d..4fdc80798 100644 --- a/src/facet/facet_type.rs +++ b/src/facet/facet_type.rs @@ -1,17 +1,50 @@ -use std::cmp; +use std::error::Error; +use std::fmt; +use std::str::FromStr; + use serde::{Serialize, Deserialize}; #[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] #[derive(Serialize, Deserialize)] pub enum FacetType { String, - F64, - I64, - U64, + Float, + Integer, } -impl FacetType { - pub fn merge(a: FacetType, b: FacetType) -> FacetType { - cmp::min(a, b) +impl fmt::Display for FacetType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + FacetType::String => f.write_str("string"), + FacetType::Float => f.write_str("float"), + FacetType::Integer => f.write_str("integer"), + } } } + +impl FromStr for FacetType { + type Err = InvalidFacetType; + + fn from_str(s: &str) -> Result { + if s.eq_ignore_ascii_case("string") { + Ok(FacetType::String) + } else if s.eq_ignore_ascii_case("float") { + Ok(FacetType::Float) + } else if s.eq_ignore_ascii_case("integer") { + Ok(FacetType::Integer) + } else { + Err(InvalidFacetType) + } + } +} + +#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +pub struct InvalidFacetType; + +impl fmt::Display for InvalidFacetType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.write_str(r#"Invalid facet type, must be "string", "float" or "integer""#) + } +} + +impl Error for InvalidFacetType { } diff --git a/src/facet/value_encoding.rs b/src/facet/value_encoding.rs index 01fdba723..3cb012a0e 100644 --- a/src/facet/value_encoding.rs +++ b/src/facet/value_encoding.rs @@ -13,16 +13,6 @@ pub fn f64_into_bytes(float: f64) -> Option<[u8; 8]> { None } -#[inline] -pub fn u64_into_bytes(int: u64) -> [u8; 8] { - int.to_be_bytes() -} - -#[inline] -pub fn u64_from_bytes(bytes: [u8; 8]) -> u64 { - u64::from_be_bytes(bytes) -} - #[inline] pub fn i64_into_bytes(int: i64) -> [u8; 8] { xor_first_bit(int.to_be_bytes()) @@ -66,16 +56,6 @@ mod tests { assert!(is_sorted(&vec), "{:?}", vec); } - #[test] - fn ordered_u64_bytes() { - let a = 0_u64; - let b = 1_u64; - let c = 43_u64; - - let vec: Vec<_> = [a, b, c].iter().cloned().map(u64_into_bytes).collect(); - assert!(is_sorted(&vec), "{:?}", vec); - } - #[test] fn ordered_i64_bytes() { let a = -10_i64; diff --git a/src/heed_codec/facet/facet_value_u64_codec.rs b/src/heed_codec/facet/facet_value_u64_codec.rs deleted file mode 100644 index 2fdc7416e..000000000 --- a/src/heed_codec/facet/facet_value_u64_codec.rs +++ /dev/null @@ -1,28 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; -use std::str; - -use crate::heed_codec::StrBytesCodec; -use crate::facet::value_encoding::{u64_from_bytes, u64_into_bytes}; - -pub struct FacetValueU64Codec; - -impl<'a> heed::BytesDecode<'a> for FacetValueU64Codec { - type DItem = (&'a str, u64); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (name, bytes) = StrBytesCodec::bytes_decode(bytes)?; - let value = bytes.try_into().map(u64_from_bytes).ok()?; - Some((name, value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FacetValueU64Codec { - type EItem = (&'a str, u64); - - fn bytes_encode((name, value): &Self::EItem) -> Option> { - let value = u64_into_bytes(*value); - let tuple = (*name, &value[..]); - StrBytesCodec::bytes_encode(&tuple).map(Cow::into_owned).map(Cow::Owned) - } -} diff --git a/src/heed_codec/facet/mod.rs b/src/heed_codec/facet/mod.rs index ba34f6220..abe2c1d8a 100644 --- a/src/heed_codec/facet/mod.rs +++ b/src/heed_codec/facet/mod.rs @@ -1,9 +1,7 @@ mod facet_value_f64_codec; mod facet_value_i64_codec; mod facet_value_string_codec; -mod facet_value_u64_codec; pub use self::facet_value_f64_codec::FacetValueF64Codec; pub use self::facet_value_i64_codec::FacetValueI64Codec; pub use self::facet_value_string_codec::FacetValueStringCodec; -pub use self::facet_value_u64_codec::FacetValueU64Codec; From 8ae988895943a52b1352811204dbbc562cdec31c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 13 Nov 2020 14:04:24 +0100 Subject: [PATCH 07/13] Store the field id instead of the field name in the facets database --- src/heed_codec/facet/facet_value_f64_codec.rs | 24 ++++++++-------- src/heed_codec/facet/facet_value_i64_codec.rs | 22 +++++++-------- .../facet/facet_value_string_codec.rs | 18 ++++++------ src/heed_codec/mod.rs | 4 +-- src/heed_codec/str_bytes_codec.rs | 28 ------------------- 5 files changed, 33 insertions(+), 63 deletions(-) delete mode 100644 src/heed_codec/str_bytes_codec.rs diff --git a/src/heed_codec/facet/facet_value_f64_codec.rs b/src/heed_codec/facet/facet_value_f64_codec.rs index fdacb1f08..228514de5 100644 --- a/src/heed_codec/facet/facet_value_f64_codec.rs +++ b/src/heed_codec/facet/facet_value_f64_codec.rs @@ -1,26 +1,24 @@ use std::borrow::Cow; use std::convert::TryInto; -use std::str; -use crate::heed_codec::StrBytesCodec; use crate::facet::value_encoding::f64_into_bytes; pub struct FacetValueF64Codec; impl<'a> heed::BytesDecode<'a> for FacetValueF64Codec { - type DItem = (&'a str, f64); + type DItem = (u8, f64); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (name, buffer) = StrBytesCodec::bytes_decode(bytes)?; + let (field_id, buffer) = bytes.split_first()?; let value = buffer[8..].try_into().ok().map(f64::from_be_bytes)?; - Some((name, value)) + Some((*field_id, value)) } } -impl<'a> heed::BytesEncode<'a> for FacetValueF64Codec { - type EItem = (&'a str, f64); +impl heed::BytesEncode<'_> for FacetValueF64Codec { + type EItem = (u8, f64); - fn bytes_encode((name, value): &Self::EItem) -> Option> { + fn bytes_encode((field_id, value): &Self::EItem) -> Option> { let mut buffer = [0u8; 16]; // Write the globally ordered float. @@ -31,8 +29,10 @@ impl<'a> heed::BytesEncode<'a> for FacetValueF64Codec { let bytes = value.to_be_bytes(); buffer[8..].copy_from_slice(&bytes[..]); - let tuple = (*name, &buffer[..]); - StrBytesCodec::bytes_encode(&tuple).map(Cow::into_owned).map(Cow::Owned) + let mut bytes = Vec::with_capacity(buffer.len() + 1); + bytes.push(*field_id); + bytes.extend_from_slice(&buffer[..]); + Some(Cow::Owned(bytes)) } } @@ -43,8 +43,8 @@ mod tests { #[test] fn globally_ordered_f64() { - let bytes = FacetValueF64Codec::bytes_encode(&("hello", -32.0)).unwrap(); + let bytes = FacetValueF64Codec::bytes_encode(&(3, -32.0)).unwrap(); let (name, value) = FacetValueF64Codec::bytes_decode(&bytes).unwrap(); - assert_eq!((name, value), ("hello", -32.0)); + assert_eq!((name, value), (3, -32.0)); } } diff --git a/src/heed_codec/facet/facet_value_i64_codec.rs b/src/heed_codec/facet/facet_value_i64_codec.rs index e3c333883..f99b8a3ea 100644 --- a/src/heed_codec/facet/facet_value_i64_codec.rs +++ b/src/heed_codec/facet/facet_value_i64_codec.rs @@ -1,28 +1,28 @@ use std::borrow::Cow; use std::convert::TryInto; -use std::str; -use crate::heed_codec::StrBytesCodec; use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; pub struct FacetValueI64Codec; impl<'a> heed::BytesDecode<'a> for FacetValueI64Codec { - type DItem = (&'a str, i64); + type DItem = (u8, i64); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (name, bytes) = StrBytesCodec::bytes_decode(bytes)?; - let value = bytes.try_into().map(i64_from_bytes).ok()?; - Some((name, value)) + let (field_id, buffer) = bytes.split_first()?; + let value = buffer.try_into().map(i64_from_bytes).ok()?; + Some((*field_id, value)) } } -impl<'a> heed::BytesEncode<'a> for FacetValueI64Codec { - type EItem = (&'a str, i64); +impl heed::BytesEncode<'_> for FacetValueI64Codec { + type EItem = (u8, i64); - fn bytes_encode((name, value): &Self::EItem) -> Option> { + fn bytes_encode((field_id, value): &Self::EItem) -> Option> { let value = i64_into_bytes(*value); - let tuple = (*name, &value[..]); - StrBytesCodec::bytes_encode(&tuple).map(Cow::into_owned).map(Cow::Owned) + let mut bytes = Vec::with_capacity(value.len() + 1); + bytes.push(*field_id); + bytes.extend_from_slice(&value[..]); + Some(Cow::Owned(bytes)) } } diff --git a/src/heed_codec/facet/facet_value_string_codec.rs b/src/heed_codec/facet/facet_value_string_codec.rs index 8b046192a..faa8b407b 100644 --- a/src/heed_codec/facet/facet_value_string_codec.rs +++ b/src/heed_codec/facet/facet_value_string_codec.rs @@ -1,25 +1,25 @@ use std::borrow::Cow; use std::str; -use crate::heed_codec::StrBytesCodec; - pub struct FacetValueStringCodec; impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec { - type DItem = (&'a str, &'a str); + type DItem = (u8, &'a str); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (name, bytes) = StrBytesCodec::bytes_decode(bytes)?; + let (field_id, bytes) = bytes.split_first()?; let value = str::from_utf8(bytes).ok()?; - Some((name, value)) + Some((*field_id, value)) } } impl<'a> heed::BytesEncode<'a> for FacetValueStringCodec { - type EItem = (&'a str, &'a str); + type EItem = (u8, &'a str); - fn bytes_encode((name, value): &Self::EItem) -> Option> { - let tuple = (*name, value.as_bytes()); - StrBytesCodec::bytes_encode(&tuple).map(Cow::into_owned).map(Cow::Owned) + fn bytes_encode((field_id, value): &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(value.len() + 1); + bytes.push(*field_id); + bytes.extend_from_slice(value.as_bytes()); + Some(Cow::Owned(bytes)) } } diff --git a/src/heed_codec/mod.rs b/src/heed_codec/mod.rs index 260e79c4b..e7b8cf256 100644 --- a/src/heed_codec/mod.rs +++ b/src/heed_codec/mod.rs @@ -1,16 +1,14 @@ mod beu32_str_codec; mod bo_roaring_bitmap_codec; mod cbo_roaring_bitmap_codec; -mod facet; mod obkv_codec; mod roaring_bitmap_codec; -mod str_bytes_codec; mod str_str_u8_codec; +pub mod facet; pub use self::beu32_str_codec::BEU32StrCodec; pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec; pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec; pub use self::obkv_codec::ObkvCodec; pub use self::roaring_bitmap_codec::RoaringBitmapCodec; -pub use self::str_bytes_codec::StrBytesCodec; pub use self::str_str_u8_codec::StrStrU8Codec; diff --git a/src/heed_codec/str_bytes_codec.rs b/src/heed_codec/str_bytes_codec.rs deleted file mode 100644 index 1a864c1fb..000000000 --- a/src/heed_codec/str_bytes_codec.rs +++ /dev/null @@ -1,28 +0,0 @@ -use std::borrow::Cow; -use std::str; - -pub struct StrBytesCodec; - -impl<'a> heed::BytesDecode<'a> for StrBytesCodec { - type DItem = (&'a str, &'a [u8]); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let s1_end = bytes.iter().position(|b| *b == 0)?; - let (s1_bytes, s2_bytes) = bytes.split_at(s1_end); - let s1 = str::from_utf8(s1_bytes).ok()?; - let s2 = &s2_bytes[1..]; - Some((s1, s2)) - } -} - -impl<'a> heed::BytesEncode<'a> for StrBytesCodec { - type EItem = (&'a str, &'a [u8]); - - fn bytes_encode((s1, s2): &Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); - bytes.extend_from_slice(s1.as_bytes()); - bytes.push(0); - bytes.extend_from_slice(s2); - Some(Cow::Owned(bytes)) - } -} From 4e5e55c21a1574825882d41e81f3c413ba7351d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 13 Nov 2020 14:33:53 +0100 Subject: [PATCH 08/13] Simplify the merge functions --- src/update/index_documents/merge_function.rs | 56 ++++++++++++-------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/src/update/index_documents/merge_function.rs b/src/update/index_documents/merge_function.rs index 7f91b6716..fb785fd11 100644 --- a/src/update/index_documents/merge_function.rs +++ b/src/update/index_documents/merge_function.rs @@ -29,23 +29,13 @@ pub fn main_merge(key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { ensure!(values.windows(2).all(|vs| vs[0] == vs[1]), "fields ids map doesn't match"); Ok(values[0].to_vec()) }, - DOCUMENTS_IDS_KEY => word_docids_merge(&[], values), + DOCUMENTS_IDS_KEY => roaring_bitmap_merge(values), otherwise => bail!("wut {:?}", otherwise), } } pub fn word_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - let (head, tail) = values.split_first().unwrap(); - let mut head = RoaringBitmap::deserialize_from(&head[..])?; - - for value in tail { - let bitmap = RoaringBitmap::deserialize_from(&value[..])?; - head.union_with(&bitmap); - } - - let mut vec = Vec::with_capacity(head.serialized_size()); - head.serialize_into(&mut vec)?; - Ok(vec) + roaring_bitmap_merge(values) } pub fn docid_word_positions_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result> { @@ -53,17 +43,11 @@ pub fn docid_word_positions_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow:: } pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { - let (head, tail) = values.split_first().unwrap(); - let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; + cbo_roaring_bitmap_merge(values) +} - for value in tail { - let bitmap = CboRoaringBitmapCodec::deserialize_from(&value[..])?; - head.union_with(&bitmap); - } - - let mut vec = Vec::new(); - CboRoaringBitmapCodec::serialize_into(&head, &mut vec)?; - Ok(vec) +pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result> { + cbo_roaring_bitmap_merge(values) } pub fn documents_merge(key: &[u8], _values: &[Cow<[u8]>]) -> anyhow::Result> { @@ -85,3 +69,31 @@ pub fn merge_two_obkvs(base: obkv::KvReader, update: obkv::KvReader, buffer: &mu writer.finish().unwrap(); } + +fn roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result> { + let (head, tail) = values.split_first().unwrap(); + let mut head = RoaringBitmap::deserialize_from(&head[..])?; + + for value in tail { + let bitmap = RoaringBitmap::deserialize_from(&value[..])?; + head.union_with(&bitmap); + } + + let mut vec = Vec::with_capacity(head.serialized_size()); + head.serialize_into(&mut vec)?; + Ok(vec) +} + +fn cbo_roaring_bitmap_merge(values: &[Cow<[u8]>]) -> anyhow::Result> { + let (head, tail) = values.split_first().unwrap(); + let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; + + for value in tail { + let bitmap = CboRoaringBitmapCodec::deserialize_from(&value[..])?; + head.union_with(&bitmap); + } + + let mut vec = Vec::new(); + CboRoaringBitmapCodec::serialize_into(&head, &mut vec)?; + Ok(vec) +} From a18d9a1f871c5c039b4bb8fa0f4820ebbb0d90a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 13 Nov 2020 14:49:48 +0100 Subject: [PATCH 09/13] Parse and store the faceted fields --- Cargo.lock | 17 +++ Cargo.toml | 2 + src/lib.rs | 1 + src/update/index_documents/mod.rs | 55 +++++++-- src/update/index_documents/store.rs | 183 ++++++++++++++++++++++++++-- src/update/settings.rs | 28 +++++ 6 files changed, 262 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 63e30b65c..ddb2e9ec5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -550,6 +550,12 @@ dependencies = [ "cfg-if 0.1.10", ] +[[package]] +name = "maplit" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" + [[package]] name = "matches" version = "0.1.8" @@ -608,10 +614,12 @@ dependencies = [ "levenshtein_automata", "linked-hash-map", "log", + "maplit", "memmap", "near-proximity", "obkv", "once_cell", + "ordered-float", "rayon", "ringtail", "roaring", @@ -708,6 +716,15 @@ version = "11.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a170cebd8021a008ea92e4db85a72f80b35df514ec664b296fdcbb654eac0b2c" +[[package]] +name = "ordered-float" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fe9037165d7023b1228bc4ae9a2fa1a2b0095eca6c2998c624723dfd01314a5" +dependencies = [ + "num-traits", +] + [[package]] name = "page_size" version = "0.4.2" diff --git a/Cargo.toml b/Cargo.toml index 441397275..fd453a5f2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ memmap = "0.7.0" near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } obkv = "0.1.0" once_cell = "1.4.0" +ordered-float = "2.0.0" rayon = "1.3.1" ringtail = "0.3.0" roaring = "0.6.1" @@ -44,6 +45,7 @@ stderrlog = "0.5.0" [dev-dependencies] criterion = "0.3.3" +maplit = "1.0.2" [build-dependencies] fst = "0.4.4" diff --git a/src/lib.rs b/src/lib.rs index 0e9ee0ec0..808c54a4a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -34,6 +34,7 @@ pub type FastMap8 = HashMap>; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; +pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; pub type BEU32 = heed::zerocopy::U32; pub type BEU64 = heed::zerocopy::U64; pub type DocumentId = u32; diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index c060b04a1..13b725e19 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -16,10 +16,10 @@ use rayon::ThreadPool; use crate::index::Index; use crate::update::UpdateIndexingStep; -use self::store::Store; +use self::store::{Store, Readers}; use self::merge_function::{ main_merge, word_docids_merge, words_pairs_proximities_docids_merge, - docid_word_positions_merge, documents_merge, + docid_word_positions_merge, documents_merge, facet_field_value_docids_merge, }; pub use self::transform::{Transform, TransformOutput}; @@ -327,6 +327,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { enum DatabaseType { Main, WordDocids, + FacetValuesDocids, } let faceted_fields = self.index.faceted_fields(self.wtxn)?; @@ -386,13 +387,23 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let mut word_docids_readers = Vec::with_capacity(readers.len()); let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); + let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len()); let mut documents_readers = Vec::with_capacity(readers.len()); readers.into_iter().for_each(|readers| { - main_readers.push(readers.main); - word_docids_readers.push(readers.word_docids); - docid_word_positions_readers.push(readers.docid_word_positions); - words_pairs_proximities_docids_readers.push(readers.words_pairs_proximities_docids); - documents_readers.push(readers.documents); + let Readers { + main, + word_docids, + docid_word_positions, + words_pairs_proximities_docids, + facet_field_value_docids, + documents + } = readers; + main_readers.push(main); + word_docids_readers.push(word_docids); + docid_word_positions_readers.push(docid_word_positions); + words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); + facet_field_value_docids_readers.push(facet_field_value_docids); + documents_readers.push(documents); }); // This is the function that merge the readers @@ -415,6 +426,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { vec![ (DatabaseType::Main, main_readers, main_merge as MergeFn), (DatabaseType::WordDocids, word_docids_readers, word_docids_merge), + ( + DatabaseType::FacetValuesDocids, + facet_field_value_docids_readers, + facet_field_value_docids_merge, + ), ] .into_par_iter() .for_each(|(dbtype, readers, merge)| { @@ -465,9 +481,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.index.put_documents_ids(self.wtxn, &documents_ids)?; let mut database_count = 0; + let total_databases = 6; + progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: 0, - total_databases: 5, + total_databases, }); debug!("Writing the docid word positions into LMDB on disk..."); @@ -482,7 +500,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { database_count += 1; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: database_count, - total_databases: 5, + total_databases, }); debug!("Writing the documents into LMDB on disk..."); @@ -497,7 +515,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { database_count += 1; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: database_count, - total_databases: 5, + total_databases, }); debug!("Writing the words pairs proximities docids into LMDB on disk..."); @@ -512,7 +530,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { database_count += 1; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: database_count, - total_databases: 5, + total_databases, }); for (db_type, result) in receiver { @@ -539,16 +557,27 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { write_method, )?; }, + DatabaseType::FacetValuesDocids => { + debug!("Writing the facet values docids into LMDB on disk..."); + let db = *self.index.facet_field_id_value_docids.as_polymorph(); + write_into_lmdb_database( + self.wtxn, + db, + content, + facet_field_value_docids_merge, + write_method, + )?; + }, } database_count += 1; progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen: database_count, - total_databases: 5, + total_databases, }); } - debug_assert_eq!(database_count, 5); + debug_assert_eq!(database_count, total_databases); info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index e9548eec3..3182e2ccd 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::collections::{BTreeMap, HashMap, HashSet}; use std::convert::{TryFrom, TryInto}; use std::fs::File; @@ -5,23 +6,29 @@ use std::iter::FromIterator; use std::time::Instant; use std::{cmp, iter}; -use anyhow::Context; +use anyhow::{bail, Context}; use bstr::ByteSlice as _; +use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; use heed::BytesEncode; use linked_hash_map::LinkedHashMap; use log::{debug, info}; -use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; +use ordered_float::OrderedFloat; use roaring::RoaringBitmap; +use serde_json::Value; use tempfile::tempfile; use crate::facet::FacetType; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; +use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec}; use crate::tokenizer::{simple_tokenizer, only_token}; use crate::update::UpdateIndexingStep; -use crate::{json_to_string, SmallVec32, Position, DocumentId}; +use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId}; use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; -use super::merge_function::{main_merge, word_docids_merge, words_pairs_proximities_docids_merge}; +use super::merge_function::{ + main_merge, word_docids_merge, words_pairs_proximities_docids_merge, + facet_field_value_docids_merge, +}; const LMDB_MAX_KEY_LENGTH: usize = 511; const ONE_KILOBYTE: usize = 1024 * 1024; @@ -34,6 +41,7 @@ pub struct Readers { pub word_docids: Reader, pub docid_word_positions: Reader, pub words_pairs_proximities_docids: Reader, + pub facet_field_value_docids: Reader, pub documents: Reader, } @@ -46,6 +54,8 @@ pub struct Store { word_docids_limit: usize, words_pairs_proximities_docids: LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, words_pairs_proximities_docids_limit: usize, + facet_field_value_docids: LinkedHashMap<(u8, FacetValue), RoaringBitmap>, + facet_field_value_docids_limit: usize, // MTBL parameters chunk_compression_type: CompressionType, chunk_compression_level: Option, @@ -54,6 +64,7 @@ pub struct Store { main_sorter: Sorter, word_docids_sorter: Sorter, words_pairs_proximities_docids_sorter: Sorter, + facet_field_value_docids_sorter: Sorter, // MTBL writers docid_word_positions_writer: Writer, documents_writer: Writer, @@ -72,7 +83,7 @@ impl Store { ) -> anyhow::Result { // We divide the max memory by the number of sorter the Store have. - let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 3)); + let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4)); let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); let main_sorter = create_sorter( @@ -99,6 +110,14 @@ impl Store { max_nb_chunks, max_memory, ); + let facet_field_value_docids_sorter = create_sorter( + facet_field_value_docids_merge, + chunk_compression_type, + chunk_compression_level, + chunk_fusing_shrink_size, + max_nb_chunks, + max_memory, + ); let documents_writer = tempfile().and_then(|f| { create_writer(chunk_compression_type, chunk_compression_level, f) @@ -116,6 +135,8 @@ impl Store { word_docids_limit: linked_hash_map_size, words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size), words_pairs_proximities_docids_limit: linked_hash_map_size, + facet_field_value_docids: LinkedHashMap::with_capacity(linked_hash_map_size), + facet_field_value_docids_limit: linked_hash_map_size, // MTBL parameters chunk_compression_type, chunk_compression_level, @@ -124,6 +145,7 @@ impl Store { main_sorter, word_docids_sorter, words_pairs_proximities_docids_sorter, + facet_field_value_docids_sorter, // MTBL writers docid_word_positions_writer, documents_writer, @@ -151,6 +173,35 @@ impl Store { Ok(()) } + // Save the documents ids under the facet field id and value we have seen it. + fn insert_facet_values_docid( + &mut self, + field_id: u8, + field_value: FacetValue, + id: DocumentId, + ) -> anyhow::Result<()> + { + let key = (field_id, field_value); + // if get_refresh finds the element it is assured to be at the end of the linked hash map. + match self.facet_field_value_docids.get_refresh(&key) { + Some(old) => { old.insert(id); }, + None => { + // A newly inserted element is append at the end of the linked hash map. + self.facet_field_value_docids.insert(key, RoaringBitmap::from_iter(Some(id))); + // If the word docids just reached it's capacity we must make sure to remove + // one element, this way next time we insert we doesn't grow the capacity. + if self.facet_field_value_docids.len() == self.facet_field_value_docids_limit { + // Removing the front element is equivalent to removing the LRU element. + Self::write_docid_facet_field_values( + &mut self.facet_field_value_docids_sorter, + self.facet_field_value_docids.pop_front(), + )?; + } + } + } + Ok(()) + } + // Save the documents ids under the words pairs proximities that it contains. fn insert_words_pairs_proximities_docids<'a>( &mut self, @@ -191,7 +242,8 @@ impl Store { fn write_document( &mut self, document_id: DocumentId, - words_positions: &HashMap>, + words_positions: &mut HashMap>, + facet_values: &mut HashMap>, record: &[u8], ) -> anyhow::Result<()> { @@ -200,13 +252,20 @@ impl Store { self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?; // We store document_id associated with all the words the record contains. - for (word, _) in words_positions { - self.insert_word_docid(word, document_id)?; + for (word, _) in words_positions.drain() { + self.insert_word_docid(&word, document_id)?; } self.documents_writer.insert(document_id.to_be_bytes(), record)?; Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; + // We store document_id associated with all the field id and values. + for (field, values) in facet_values.drain() { + for value in values { + self.insert_facet_values_docid(field, value, document_id)?; + } + } + Ok(()) } @@ -267,6 +326,31 @@ impl Store { Ok(()) } + fn write_docid_facet_field_values( + sorter: &mut Sorter, + iter: I, + ) -> anyhow::Result<()> + where I: IntoIterator + { + use FacetValue::*; + + for ((field_id, value), docids) in iter { + let result = match value { + String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned), + Float(f) => FacetValueF64Codec::bytes_encode(&(field_id, *f)).map(Cow::into_owned), + Integer(i) => FacetValueI64Codec::bytes_encode(&(field_id, i)).map(Cow::into_owned), + }; + let key = result.context("could not serialize facet key")?; + let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) + .context("could not serialize docids")?; + if lmdb_key_valid_size(&key) { + sorter.insert(&key, &bytes)?; + } + } + + Ok(()) + } + fn write_word_docids(sorter: &mut Sorter, iter: I) -> anyhow::Result<()> where I: IntoIterator, RoaringBitmap)> { @@ -305,6 +389,7 @@ impl Store { let mut before = Instant::now(); let mut words_positions = HashMap::new(); + let mut facet_values = HashMap::new(); let mut count: usize = 0; while let Some((key, value)) = documents.next()? { @@ -328,7 +413,10 @@ impl Store { let value = serde_json::from_slice(content)?; if let Some(ftype) = self.faceted_fields.get(&attr) { - todo!("parse facet field value") + let mut values = parse_facet_value(*ftype, &value).with_context(|| { + format!("extracting facets from the value {}", value) + })?; + facet_values.entry(attr).or_insert_with(SmallVec8::new).extend(values.drain(..)); } if self.searchable_fields.contains(&attr) { @@ -348,8 +436,7 @@ impl Store { } // We write the document in the documents store. - self.write_document(document_id, &words_positions, value)?; - words_positions.clear(); + self.write_document(document_id, &mut words_positions, &mut facet_values, value)?; } // Compute the document id of the next document. @@ -376,6 +463,10 @@ impl Store { &mut self.words_pairs_proximities_docids_sorter, self.words_pairs_proximities_docids, )?; + Self::write_docid_facet_field_values( + &mut self.facet_field_value_docids_sorter, + self.facet_field_value_docids, + )?; let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; let mut builder = fst::SetBuilder::memory(); @@ -397,9 +488,13 @@ impl Store { let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?; + let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; + self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?; + let main = writer_into_reader(main_wtr, shrink_size)?; let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; + let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?; let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; let documents = writer_into_reader(self.documents_writer, shrink_size)?; @@ -408,6 +503,7 @@ impl Store { word_docids, docid_word_positions, words_pairs_proximities_docids, + facet_field_value_docids, documents, }) } @@ -453,3 +549,68 @@ fn format_count(n: usize) -> String { fn lmdb_key_valid_size(key: &[u8]) -> bool { !key.is_empty() && key.len() <= LMDB_MAX_KEY_LENGTH } + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +enum FacetValue { + String(SmallString32), + Float(OrderedFloat), + Integer(i64), +} + +fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result> { + use FacetValue::*; + + fn inner_parse_facet_value( + ftype: FacetType, + value: &Value, + can_recurse: bool, + output: &mut SmallVec8, + ) -> anyhow::Result<()> + { + match value { + Value::Null => Ok(()), + Value::Bool(b) => Ok(output.push(Integer(*b as i64))), + Value::Number(number) => match ftype { + FacetType::String => bail!("invalid facet type, expecting {} found number", ftype), + FacetType::Float => match number.as_f64() { + Some(float) => Ok(output.push(Float(OrderedFloat(float)))), + None => bail!("invalid facet type, expecting {} found integer", ftype), + }, + FacetType::Integer => match number.as_i64() { + Some(integer) => Ok(output.push(Integer(integer))), + None => if number.is_f64() { + bail!("invalid facet type, expecting {} found float", ftype) + } else { + bail!("invalid facet type, expecting {} found out-of-bound integer (64bit)", ftype) + }, + }, + }, + Value::String(string) => { + match ftype { + FacetType::String => { + let string = SmallString32::from(string.as_str()); + Ok(output.push(String(string))) + }, + FacetType::Float => match string.parse() { + Ok(float) => Ok(output.push(Float(OrderedFloat(float)))), + Err(_err) => bail!("invalid facet type, expecting {} found string", ftype), + }, + FacetType::Integer => match string.parse() { + Ok(integer) => Ok(output.push(Integer(integer))), + Err(_err) => bail!("invalid facet type, expecting {} found string", ftype), + }, + } + }, + Value::Array(values) => if can_recurse { + values.iter().map(|v| inner_parse_facet_value(ftype, v, false, output)).collect() + } else { + bail!("invalid facet type, expecting {} found sub-array ()", ftype) + }, + Value::Object(_) => bail!("invalid facet type, expecting {} found object", ftype), + } + } + + let mut facet_values = SmallVec8::new(); + inner_parse_facet_value(ftype, value, true, &mut facet_values)?; + Ok(facet_values) +} diff --git a/src/update/settings.rs b/src/update/settings.rs index d5ffba23b..de165926f 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -306,6 +306,7 @@ mod tests { use super::*; use crate::update::{IndexDocuments, UpdateFormat}; use heed::EnvOpenOptions; + use maplit::hashmap; #[test] fn set_and_reset_searchable_fields() { @@ -473,4 +474,31 @@ mod tests { assert_eq!(fields_ids, None); drop(rtxn); } + + #[test] + fn set_faceted_fields() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the faceted fields to be the age. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index); + builder.set_faceted_fields(hashmap!{ "age".into() => "integer".into() }); + builder.execute(|_| ()).unwrap(); + + // Then index some documents. + let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..]; + let mut builder = IndexDocuments::new(&mut wtxn, &index); + builder.update_format(UpdateFormat::Csv); + builder.execute(content, |_| ()).unwrap(); + wtxn.commit().unwrap(); + + // Check that the displayed fields are correctly set. + let rtxn = index.read_txn().unwrap(); + let fields_ids = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(fields_ids, hashmap!{ 0 => FacetType::Integer }); + drop(rtxn); + } } From 8e6efe4d8797d244a4f46b5202b508355553d17d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 13 Nov 2020 16:15:05 +0100 Subject: [PATCH 10/13] Introduce an infos subcommand to display the facet values --- src/subcommand/infos.rs | 71 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/src/subcommand/infos.rs b/src/subcommand/infos.rs index 0a4dabeba..aa5cd3d7b 100644 --- a/src/subcommand/infos.rs +++ b/src/subcommand/infos.rs @@ -78,6 +78,16 @@ enum Command { words: Vec, }, + /// Outputs a CSV with the documents ids along with the facet values where it appears. + FacetValuesDocids { + /// Display the whole documents ids in details. + #[structopt(long)] + full_display: bool, + + /// The field name in the document. + field_name: String, + }, + /// Outputs the total size of all the docid-word-positions keys and values. TotalDocidWordPositionsSize, @@ -147,6 +157,9 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { MostCommonWords { limit } => most_common_words(&index, &rtxn, limit), BiggestValues { limit } => biggest_value_sizes(&index, &rtxn, limit), WordsDocids { full_display, words } => words_docids(&index, &rtxn, !full_display, words), + FacetValuesDocids { full_display, field_name } => { + facet_values_docids(&index, &rtxn, !full_display, field_name) + }, TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn), AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), AverageNumberOfPositionsByWord => { @@ -256,6 +269,64 @@ fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec anyhow::Result<()> { + use crate::facet::FacetType; + use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec}; + use heed::{BytesDecode, Error::Decoding}; + + let fields_ids_map = index.fields_ids_map(&rtxn)?; + let faceted_fields = index.faceted_fields(&rtxn)?; + + let field_id = fields_ids_map.id(&field_name) + .with_context(|| format!("field {} not found", field_name))?; + let field_type = faceted_fields.get(&field_id) + .with_context(|| format!("field {} is not faceted", field_name))?; + + let iter = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?; + let iter = match field_type { + FacetType::String => { + let iter = iter + .map(|result| result.and_then(|(key, value)| { + let (_, key) = FacetValueStringCodec::bytes_decode(key).ok_or(Decoding)?; + Ok((key.to_string(), value)) + })); + Box::new(iter) as Box> + }, + FacetType::Float => { + let iter = iter + .map(|result| result.and_then(|(key, value)| { + let (_, key) = FacetValueF64Codec::bytes_decode(key).ok_or(Decoding)?; + Ok((key.to_string(), value)) + })); + Box::new(iter) + }, + FacetType::Integer => { + let iter = iter + .map(|result| result.and_then(|(key, value)| { + let (_, key) = FacetValueI64Codec::bytes_decode(key).ok_or(Decoding)?; + Ok((key.to_string(), value)) + })); + Box::new(iter) + }, + }; + + let stdout = io::stdout(); + let mut wtr = csv::Writer::from_writer(stdout.lock()); + wtr.write_record(&["facet_value", "documents_ids"])?; + + for result in iter { + let (value, docids) = result?; + let docids = if debug { + format!("{:?}", docids) + } else { + format!("{:?}", docids.iter().collect::>()) + }; + wtr.write_record(&[value, docids])?; + } + + Ok(wtr.flush()?) +} + fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> { use std::fs::File; use std::io::Write as _; From 23f9a22edc24499bcfd48a42197a980052f68ac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 13 Nov 2020 16:16:07 +0100 Subject: [PATCH 11/13] Update the HTTP settings route to accept the faceted fields --- http-ui/Cargo.lock | 10 ++++++++++ http-ui/src/main.rs | 12 +++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock index 7c7759e2e..23fed5bbe 100644 --- a/http-ui/Cargo.lock +++ b/http-ui/Cargo.lock @@ -989,6 +989,7 @@ dependencies = [ "near-proximity", "obkv", "once_cell", + "ordered-float", "rayon", "ringtail", "roaring", @@ -1205,6 +1206,15 @@ version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5" +[[package]] +name = "ordered-float" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fe9037165d7023b1228bc4ae9a2fa1a2b0095eca6c2998c624723dfd01314a5" +dependencies = [ + "num-traits", +] + [[package]] name = "page_size" version = "0.4.2" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 2af2c9d90..094b2fb79 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fs::{File, create_dir_all}; use std::net::SocketAddr; use std::path::PathBuf; @@ -210,6 +210,8 @@ enum UpdateMetaProgress { } #[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +#[serde(rename_all = "camelCase")] struct Settings { #[serde( default, @@ -224,6 +226,9 @@ struct Settings { skip_serializing_if = "Option::is_none", )] searchable_attributes: Option>>, + + #[serde(default)] + faceted_attributes: Option>, } // Any value that is present is considered Some value, including null. @@ -367,6 +372,11 @@ async fn main() -> anyhow::Result<()> { } } + // We transpose the settings JSON struct into a real setting update. + if let Some(facet_types) = settings.faceted_attributes { + builder.set_faceted_fields(facet_types); + } + let result = builder.execute(|indexing_step| { let (current, total) = match indexing_step { TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None), From f9cc12ae0f99794982e359ed7674a539185c3391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 13 Nov 2020 18:33:51 +0100 Subject: [PATCH 12/13] Do not try to parse empty faceted strings --- src/update/index_documents/store.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index 3182e2ccd..9c75f10fe 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -586,9 +586,11 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result { + let string = string.trim(); + if string.is_empty() { return Ok(()) } match ftype { FacetType::String => { - let string = SmallString32::from(string.as_str()); + let string = SmallString32::from(string); Ok(output.push(String(string))) }, FacetType::Float => match string.parse() { From e76558b0cc74879bfd644e2d455dca2d636167d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 13 Nov 2020 22:35:02 +0100 Subject: [PATCH 13/13] Change the settings update system to reindex only one time --- src/update/settings.rs | 237 +++++++++++++---------------------------- 1 file changed, 75 insertions(+), 162 deletions(-) diff --git a/src/update/settings.rs b/src/update/settings.rs index de165926f..03f184ef6 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -72,11 +72,50 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { where F: Fn(UpdateIndexingStep) + Sync { + let mut updated_searchable_fields = None; + let mut updated_faceted_fields = None; + let mut updated_displayed_fields = None; + + // Construct the new FieldsIdsMap based on the searchable fields order. + let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + let mut fields_ids_map = match self.searchable_fields { + Some(Some(searchable_fields)) => { + let mut new_fields_ids_map = FieldsIdsMap::new(); + let mut new_searchable_fields = Vec::new(); + + for name in searchable_fields { + let id = new_fields_ids_map.insert(&name).context("field id limit reached")?; + new_searchable_fields.push(id); + } + + for (_, name) in fields_ids_map.iter() { + new_fields_ids_map.insert(name).context("field id limit reached")?; + } + + updated_searchable_fields = Some(Some(new_searchable_fields)); + new_fields_ids_map + }, + Some(None) => { + updated_searchable_fields = Some(None); + fields_ids_map + }, + None => fields_ids_map, + }; + + // We compute or generate the new primary key field id. + // TODO make the primary key settable. + let primary_key = match self.index.primary_key(&self.wtxn)? { + Some(id) => { + let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; + let name = current_fields_ids_map.name(id).unwrap(); + fields_ids_map.insert(name).context("field id limit reached")? + }, + None => fields_ids_map.insert("id").context("field id limit reached")?, + }; + if let Some(fields_names_facet_types) = self.faceted_fields { let current_faceted_fields = self.index.faceted_fields(self.wtxn)?; - let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - let mut fields_ids_map = current_fields_ids_map.clone(); let mut faceted_fields = HashMap::new(); for (name, sftype) in fields_names_facet_types { let ftype = FacetType::from_str(&sftype).with_context(|| format!("parsing facet type {:?}", sftype))?; @@ -90,123 +129,27 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { }; } - let transform = Transform { - rtxn: &self.wtxn, - index: self.index, - log_every_n: self.log_every_n, - chunk_compression_type: self.chunk_compression_type, - chunk_compression_level: self.chunk_compression_level, - chunk_fusing_shrink_size: self.chunk_fusing_shrink_size, - max_nb_chunks: self.max_nb_chunks, - max_memory: self.max_memory, - index_documents_method: IndexDocumentsMethod::ReplaceDocuments, - autogenerate_docids: false, - }; - - // We compute or generate the new primary key field id. - let primary_key = match self.index.primary_key(&self.wtxn)? { - Some(id) => { - let name = current_fields_ids_map.name(id).unwrap(); - fields_ids_map.insert(name).context("field id limit reached")? - }, - None => fields_ids_map.insert("id").context("field id limit reached")?, - }; - - // We remap the documents fields based on the new `FieldsIdsMap`. - let output = transform.remap_index_documents(primary_key, fields_ids_map.clone())?; - - // We write the faceted_fields fields into the database here. - self.index.put_faceted_fields(self.wtxn, &faceted_fields)?; - - // We clear the full database (words-fst, documents ids and documents content). - ClearDocuments::new(self.wtxn, self.index).execute()?; - - // We index the generated `TransformOutput` which must contain - // all the documents with fields in the newly defined searchable order. - let mut indexing_builder = IndexDocuments::new(self.wtxn, self.index); - indexing_builder.log_every_n = self.log_every_n; - indexing_builder.max_nb_chunks = self.max_nb_chunks; - indexing_builder.max_memory = self.max_memory; - indexing_builder.linked_hash_map_size = self.linked_hash_map_size; - indexing_builder.chunk_compression_type = self.chunk_compression_type; - indexing_builder.chunk_compression_level = self.chunk_compression_level; - indexing_builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; - indexing_builder.thread_pool = self.thread_pool; - indexing_builder.execute_raw(output, &progress_callback)?; + updated_faceted_fields = Some(faceted_fields); } - // Check that the searchable attributes have been specified. - if let Some(value) = self.searchable_fields { - let current_displayed_fields = self.index.displayed_fields(self.wtxn)?; - let current_faceted_fields = self.index.faceted_fields(self.wtxn)?; - let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - - let result = match value { - Some(fields_names) => { - let mut fields_ids_map = current_fields_ids_map.clone(); - let searchable_fields: Vec<_> = - fields_names.iter() - .map(|name| fields_ids_map.insert(name)) - .collect::>>() - .context("field id limit reached")?; - - // If the searchable fields are ordered we don't have to generate a new `FieldsIdsMap`. - if searchable_fields.windows(2).all(|win| win[0] < win[1]) { - ( - fields_ids_map, - Some(searchable_fields), - current_displayed_fields.map(ToOwned::to_owned), - current_faceted_fields, - ) - } else { - // We create or generate the fields ids corresponding to those names. - let mut fields_ids_map = FieldsIdsMap::new(); - let mut searchable_fields = Vec::new(); - for name in fields_names { - let id = fields_ids_map.insert(&name).context("field id limit reached")?; - searchable_fields.push(id); - } - - // We complete the new FieldsIdsMap with the previous names. - for (_id, name) in current_fields_ids_map.iter() { - fields_ids_map.insert(name).context("field id limit reached")?; - } - - // We must update the displayed fields according to the new `FieldsIdsMap`. - let displayed_fields = match current_displayed_fields { - Some(fields) => { - let mut displayed_fields = Vec::new(); - for id in fields { - let name = current_fields_ids_map.name(*id).unwrap(); - let id = fields_ids_map.id(name).context("field id limit reached")?; - displayed_fields.push(id); - } - Some(displayed_fields) - }, - None => None, - }; - - // We must update the faceted fields according to the new `FieldsIdsMap`. - let mut faceted_fields = HashMap::new(); - for (id, ftype) in current_faceted_fields { - let name = current_fields_ids_map.name(id).unwrap(); - let id = fields_ids_map.id(name).context("field id limit reached")?; - faceted_fields.insert(id, ftype); - } - - (fields_ids_map, Some(searchable_fields), displayed_fields, faceted_fields) + // Check that the displayed attributes have been specified. + if let Some(value) = self.displayed_fields { + match value { + Some(names) => { + let mut new_displayed_fields = Vec::new(); + for name in names { + let id = fields_ids_map.insert(&name).context("field id limit reached")?; + new_displayed_fields.push(id); } - }, - None => ( - current_fields_ids_map.clone(), - None, - current_displayed_fields.map(ToOwned::to_owned), - current_faceted_fields, - ), - }; - - let (mut fields_ids_map, searchable_fields, displayed_fields, faceted_fields) = result; + updated_displayed_fields = Some(Some(new_displayed_fields)); + } + None => updated_displayed_fields = Some(None), + } + } + // If any setting have modified any of the datastructures it means that we need + // to retrieve the documents and then reindex then with the new settings. + if updated_searchable_fields.is_some() || updated_faceted_fields.is_some() { let transform = Transform { rtxn: &self.wtxn, index: self.index, @@ -220,15 +163,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { autogenerate_docids: false, }; - // We compute or generate the new primary key field id. - let primary_key = match self.index.primary_key(&self.wtxn)? { - Some(id) => { - let name = current_fields_ids_map.name(id).unwrap(); - fields_ids_map.insert(name).context("field id limit reached")? - }, - None => fields_ids_map.insert("id").context("field id limit reached")?, - }; - // We remap the documents fields based on the new `FieldsIdsMap`. let output = transform.remap_index_documents(primary_key, fields_ids_map.clone())?; @@ -236,23 +170,20 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { // this way next indexing methods will be based on that. self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; - // The new searchable fields are also written down to make sure - // that the IndexDocuments system takes only these ones into account. - match searchable_fields { - Some(fields) => self.index.put_searchable_fields(self.wtxn, &fields)?, - None => self.index.delete_searchable_fields(self.wtxn).map(drop)?, + if let Some(faceted_fields) = updated_faceted_fields { + // We write the faceted_fields fields into the database here. + self.index.put_faceted_fields(self.wtxn, &faceted_fields)?; } - // We write the displayed fields into the database here - // to make sure that the right fields are displayed. - match displayed_fields { - Some(fields) => self.index.put_displayed_fields(self.wtxn, &fields)?, - None => self.index.delete_displayed_fields(self.wtxn).map(drop)?, + if let Some(searchable_fields) = updated_searchable_fields { + // The new searchable fields are also written down to make sure + // that the IndexDocuments system takes only these ones into account. + match searchable_fields { + Some(fields) => self.index.put_searchable_fields(self.wtxn, &fields)?, + None => self.index.delete_searchable_fields(self.wtxn).map(drop)?, + } } - // We write the faceted_fields fields into the database here. - self.index.put_faceted_fields(self.wtxn, &faceted_fields)?; - // We clear the full database (words-fst, documents ids and documents content). ClearDocuments::new(self.wtxn, self.index).execute()?; @@ -270,30 +201,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { indexing_builder.execute_raw(output, &progress_callback)?; } - // Check that the displayed attributes have been specified. - if let Some(value) = self.displayed_fields { - match value { - // If it has been set, and it was a list of fields names, we create - // or generate the fields ids corresponds to those names and store them - // in the database in the order they were specified. - Some(fields_names) => { - let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; - - // We create or generate the fields ids corresponding to those names. - let mut fields_ids = Vec::new(); - for name in fields_names { - let id = fields_ids_map.insert(&name).context("field id limit reached")?; - fields_ids.push(id); - } - - self.index.put_displayed_fields(self.wtxn, &fields_ids)?; - }, - // If it was set to `null` it means that the user wants to get the default behavior - // which is displaying all the attributes in no specific order (FieldsIdsMap order), - // we just have to delete the displayed fields. - None => { - self.index.delete_displayed_fields(self.wtxn)?; - }, + if let Some(displayed_fields) = updated_displayed_fields { + // We write the displayed fields into the database here + // to make sure that the right fields are displayed. + match displayed_fields { + Some(fields) => self.index.put_displayed_fields(self.wtxn, &fields)?, + None => self.index.delete_displayed_fields(self.wtxn).map(drop)?, } } @@ -498,7 +411,7 @@ mod tests { // Check that the displayed fields are correctly set. let rtxn = index.read_txn().unwrap(); let fields_ids = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(fields_ids, hashmap!{ 0 => FacetType::Integer }); + assert_eq!(fields_ids, hashmap!{ 1 => FacetType::Integer }); drop(rtxn); } }