From 453d593ce804eac35af4f7abd1b3a88c218941d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 09:30:19 +0200 Subject: [PATCH 01/18] Add a database containing the docids where each field exists --- infos/src/main.rs | 15 ++ milli/src/heed_codec/facet/mod.rs | 40 +++++ milli/src/index.rs | 24 ++- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 11 +- .../extract/extract_facet_exists_docids.rs | 42 +++++ .../extract/extract_fid_docid_facet_values.rs | 29 +++- .../src/update/index_documents/extract/mod.rs | 47 ++++-- milli/src/update/index_documents/mod.rs | 149 ++++++++++++++++++ .../src/update/index_documents/typed_chunk.rs | 13 ++ 10 files changed, 350 insertions(+), 22 deletions(-) create mode 100644 milli/src/update/index_documents/extract/extract_facet_exists_docids.rs diff --git a/infos/src/main.rs b/infos/src/main.rs index 29a87cdcf..89aec6182 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -384,6 +384,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, + facet_id_exists_docids, exact_word_docids, exact_word_prefix_docids, field_id_docid_facet_f64s: _, @@ -402,6 +403,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let field_id_word_count_docids_name = "field_id_word_count_docids"; let facet_id_f64_docids_name = "facet_id_f64_docids"; let facet_id_string_docids_name = "facet_id_string_docids"; + let facet_id_exists_docids_name = "facet_id_exists_docids"; let documents_name = "documents"; let mut heap = BinaryHeap::with_capacity(limit + 1); @@ -544,6 +546,17 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho heap.pop(); } } + + // List the docids where the facet exists + let db = facet_id_exists_docids.remap_data_type::<ByteSlice>(); + for result in facet_values_iter(rtxn, db, facet_id)? { + let (_fid, value) = result?; + let key = format!("{}", facet_name); + heap.push(Reverse((value.len(), key, facet_id_exists_docids_name))); + if heap.len() > limit { + heap.pop(); + } + } } for result in index.all_documents(rtxn)? { @@ -984,6 +997,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a facet_id_string_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, + facet_id_exists_docids, exact_word_prefix_docids, exact_word_docids, ..
@@ -1007,6 +1021,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a FIELD_ID_WORD_COUNT_DOCIDS => field_id_word_count_docids.as_polymorph(), FACET_ID_F64_DOCIDS => facet_id_f64_docids.as_polymorph(), FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(), + FACET_ID_EXISTS_DOCIDS => facet_id_exists_docids.as_polymorph(), FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(), FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(), EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(), diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index e93fb57b9..79dbffa1d 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -25,3 +25,43 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { None } } + +use crate::{try_split_array_at, DocumentId, FieldId}; +use std::borrow::Cow; +use std::convert::TryInto; + +pub struct FieldIdCodec; + +impl<'a> heed::BytesDecode<'a> for FieldIdCodec { + type DItem = FieldId; + + fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { + let (field_id_bytes, _) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); + Some(field_id) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldIdCodec { + type EItem = FieldId; + + fn bytes_encode(field_id: &Self::EItem) -> Option<Cow<[u8]>> { + Some(Cow::Owned(field_id.to_be_bytes().to_vec())) + } +} + +pub struct FieldIdDocIdCodec; + +impl<'a> heed::BytesDecode<'a> for FieldIdDocIdCodec { + type DItem = (FieldId, DocumentId); + + fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); + + let document_id_bytes = bytes[..4].try_into().ok()?; + let document_id = u32::from_be_bytes(document_id_bytes); + + Some((field_id, document_id)) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 9637b4103..816112178 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -15,7 +15,7 @@ use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, + FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec, }; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, @@ -75,6 +75,7 @@ pub mod db_name { pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; + pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids"; pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; @@ -116,6 +117,9 @@ pub struct Index { /// Maps the position of a word prefix with all the docids where this prefix appears. pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>, + /// Maps the facet field id and the docids for which this field exists + pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>, + /// Maps the facet field id, level and the number with the docids that corresponds to it. pub facet_id_f64_docids: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>, /// Maps the facet field id and the string with the original string and docids that corresponds to it.
@@ -134,7 +138,7 @@ impl Index { pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> { use db_name::*; - options.max_dbs(16); + options.max_dbs(17); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -152,6 +156,9 @@ impl Index { let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; + let facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec> = + env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; + let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_strings = env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; @@ -174,6 +181,7 @@ impl Index { field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, + facet_id_exists_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, @@ -806,6 +814,18 @@ impl Index { } } + /// Retrieve all the documents which contain this field id + pub fn exists_faceted_documents_ids( + &self, + rtxn: &RoTxn, + field_id: FieldId, + ) -> heed::Result<RoaringBitmap> { + match self.facet_id_exists_docids.get(rtxn, &field_id)? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + /* distinct field */ pub(crate) fn put_distinct_field( diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index d1939df7b..db438d019 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -30,6 +30,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_position_docids, facet_id_f64_docids, facet_id_string_docids, + facet_id_exists_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, @@ -69,6 +70,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { field_id_word_count_docids.clear(self.wtxn)?; word_prefix_position_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?; + facet_id_exists_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?; field_id_docid_facet_f64s.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 3b519c101..6dfdb9a7c 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -170,6 +170,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_position_docids, word_prefix_position_docids, facet_id_f64_docids, + facet_id_exists_docids, facet_id_string_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, @@ -424,11 +425,17 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_field_id_number_docids( + remove_docids_from_facet_field_id_docids( self.wtxn, facet_id_f64_docids, &self.to_delete_docids, )?; + // We delete the documents ids that are under the facet field id values.
+ remove_docids_from_facet_field_id_docids( + self.wtxn, + facet_id_exists_docids, + &self.to_delete_docids, + )?; remove_docids_from_facet_field_id_string_docids( self.wtxn, @@ -618,7 +625,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( Ok(()) } -fn remove_docids_from_facet_field_id_number_docids<'a, C>( +fn remove_docids_from_facet_field_id_docids<'a, C>( wtxn: &'a mut heed::RwTxn, db: &heed::Database<C, CboRoaringBitmapCodec>, to_remove: &RoaringBitmap, diff --git a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs new file mode 100644 index 000000000..e7a001c08 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs @@ -0,0 +1,42 @@ +use std::fs::File; +use std::io; + +use heed::{BytesDecode, BytesEncode}; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, +}; +use crate::heed_codec::facet::{FieldIdCodec, FieldIdDocIdCodec}; +use crate::Result; + +/// Extracts the documents ids where this field appears. +/// +/// Returns a grenad reader whose key is the field id encoded +/// with `FieldIdCodec` and the value is a document_id (u32) +/// encoded as native-endian bytes. +#[logging_timer::time] +pub fn extract_facet_exists_docids<R: io::Read>( + docid_fid_facet_number: grenad::Reader<R>, + indexer: GrenadParameters, +) -> Result<grenad::Reader<File>> { + let max_memory = indexer.max_memory_by_thread(); + + let mut facet_exists_docids_sorter = create_sorter( + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut cursor = docid_fid_facet_number.into_cursor()?; + while let Some((key_bytes, _)) = cursor.move_on_next()?
{ + let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap(); + + let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap(); + + facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + } + + sorter_into_reader(facet_exists_docids_sorter, indexer) +} diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 628636f78..d93bde500 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -20,7 +20,7 @@ pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader)> { +) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -28,7 +28,7 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 2), + max_memory.map(|m| m / 3), ); let mut fid_docid_facet_strings_sorter = create_sorter( @@ -36,7 +36,15 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 2), + max_memory.map(|m| m / 3), + ); + + let mut fid_docid_facet_exists_sorter = create_sorter( + keep_first, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 3), ); let mut key_buffer = Vec::new(); @@ -46,15 +54,19 @@ pub fn extract_fid_docid_facet_values( for (field_id, field_bytes) in obkv.iter() { if faceted_fields.contains(&field_id) { - let value = - serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - let (numbers, strings) = extract_facet_values(&value); - key_buffer.clear(); + // here, we know already that the document must be added to the “field id exists” database // prefix key with the field_id and the document_id + key_buffer.extend_from_slice(&field_id.to_be_bytes()); key_buffer.extend_from_slice(&docid_bytes); + fid_docid_facet_exists_sorter.insert(&key_buffer, ().as_bytes())?; + + let value = + serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + + let (numbers, strings) = extract_facet_values(&value); // insert facet numbers in sorter for number in numbers { @@ -79,7 +91,8 @@ pub fn extract_fid_docid_facet_values( Ok(( sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?, - sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, + sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?, + sorter_into_reader(fid_docid_facet_exists_sorter, indexer)?, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index c3c2033a6..7d26e0984 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -1,4 +1,5 @@ mod extract_docid_word_positions; +mod extract_facet_exists_docids; mod extract_facet_number_docids; mod extract_facet_string_docids; mod extract_fid_docid_facet_values; @@ -16,6 +17,7 @@ use log::debug; use rayon::prelude::*; use self::extract_docid_word_positions::extract_docid_word_positions; +use self::extract_facet_exists_docids::extract_facet_exists_docids; use 
self::extract_facet_number_docids::extract_facet_number_docids; use self::extract_facet_string_docids::extract_facet_string_docids; use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; @@ -53,7 +55,7 @@ pub(crate) fn data_from_obkv_documents( }) .collect::>()?; - let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = flattened_obkv_chunks + let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks .par_bridge() .map(|flattened_obkv_chunks| { send_and_extract_flattened_documents_data( @@ -72,7 +74,10 @@ pub(crate) fn data_from_obkv_documents( let ( docid_word_positions_chunks, - (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), + ( + docid_fid_facet_numbers_chunks, + (docid_fid_facet_strings_chunks, docid_fid_facet_exists_chunks), + ), ) = result?; spawn_extraction_task::<_, _, Vec>>( @@ -137,6 +142,15 @@ pub(crate) fn data_from_obkv_documents( TypedChunk::FieldIdFacetNumberDocids, "field-id-facet-number-docids", ); + spawn_extraction_task::<_, _, Vec>>( + docid_fid_facet_exists_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_facet_exists_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::FieldIdFacetExistsDocids, + "field-id-facet-exists-docids", + ); Ok(()) } @@ -197,6 +211,7 @@ fn send_original_documents_data( /// - docid_word_positions /// - docid_fid_facet_numbers /// - docid_fid_facet_strings +/// - docid_fid_facet_exists fn send_and_extract_flattened_documents_data( flattened_documents_chunk: Result>, indexer: GrenadParameters, @@ -209,7 +224,10 @@ fn send_and_extract_flattened_documents_data( max_positions_per_attributes: Option, ) -> Result<( grenad::Reader, - (grenad::Reader, grenad::Reader), + ( + grenad::Reader, + (grenad::Reader, grenad::Reader), + ), )> { let flattened_documents_chunk = flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; @@ -250,12 +268,15 @@ fn send_and_extract_flattened_documents_data( Ok(docid_word_positions_chunk) }, || { - let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = - extract_fid_docid_facet_values( - flattened_documents_chunk.clone(), - indexer.clone(), - faceted_fields, - )?; + let ( + docid_fid_facet_numbers_chunk, + docid_fid_facet_strings_chunk, + docid_fid_facet_exists_chunk, + ) = extract_fid_docid_facet_values( + flattened_documents_chunk.clone(), + indexer.clone(), + faceted_fields, + )?; // send docid_fid_facet_numbers_chunk to DB writer let docid_fid_facet_numbers_chunk = @@ -273,7 +294,13 @@ fn send_and_extract_flattened_documents_data( docid_fid_facet_strings_chunk.clone(), ))); - Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) + let docid_fid_facet_exists_chunk = + unsafe { as_cloneable_grenad(&docid_fid_facet_exists_chunk)? 
}; + + Ok(( + docid_fid_facet_numbers_chunk, + (docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk), + )) }, ); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ba428f078..82457762e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1931,4 +1931,153 @@ mod tests { assert_eq!(ids.len(), map.len()); } + + #[test] + fn index_documents_check_exists_database_reindex() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = documents!([ + { + "id": 0, + "colour": 0, + }, + { + "id": 1, + "colour": [] + }, + { + "id": 2, + "colour": {} + }, + { + "id": 3, + "colour": null + }, + { + "id": 4, + "colour": [1] + }, + { + "id": 5 + }, + { + "id": 6, + "colour": { + "green": 1 + } + } + ]); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + let faceted_fields = hashset!(S("colour")); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); + + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + + let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); + + let bitmap_colour_green = + index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); + } + + #[test] + fn index_documents_check_exists_database() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let config = IndexerConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + let faceted_fields = hashset!(S("colour")); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let content = documents!([ + { + "id": 0, + "colour": 0, + }, + { + "id": 1, + "colour": [] + }, + { + "id": 2, + "colour": {} + }, + { + "id": 3, + "colour": null + }, + { + "id": 4, + "colour": [1] + }, + { + "id": 5 + }, + { + "id": 6, + "colour": { + "green": 1 + } + } + ]); + + let indexing_config = IndexDocumentsConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + 
assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); + + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + + let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + assert_eq!(bitmap_colour.into_iter().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 6]); + + let bitmap_colour_green = + index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + assert_eq!(bitmap_colour_green.into_iter().collect::<Vec<_>>(), vec![6]); + } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 26b97c3a0..e501e5efd 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -35,6 +35,7 @@ pub(crate) enum TypedChunk { WordPairProximityDocids(grenad::Reader<File>), FieldIdFacetStringDocids(grenad::Reader<File>), FieldIdFacetNumberDocids(grenad::Reader<File>), + FieldIdFacetExistsDocids(grenad::Reader<File>), GeoPoints(grenad::Reader<File>), } @@ -146,6 +147,18 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids_iter) => { + append_entries_into_database( + facet_id_exists_docids_iter, + &index.facet_id_exists_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + ) + .unwrap(); + is_merged_database = true; + } TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { append_entries_into_database( word_pair_proximity_docids_iter, From a8641b42a7963666eedb9846b6aa69481a3f92ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 14 Jun 2022 15:05:49 +0200 Subject: [PATCH 02/18] Modify flatten_serde_json to keep dummy value for all object keys Example: ```json { "id": 0, "colour" : { "green": 1 } } ``` becomes: ```json { "id": 0, "colour" : [], "colour.green": 1 } ``` to retain the information that the key "colour" exists in the original JSON value.
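A quick way to check this behaviour end to end (a sketch using the crate's public `flatten` function and mirroring the tests added below; the snippet itself is not part of the patch):

```rust
use flatten_serde_json::flatten;
use serde_json::json;

fn main() {
    let mut value = json!({ "id": 0, "colour": { "green": 1 } });
    // flatten() takes the top-level object by reference
    let object = std::mem::take(value.as_object_mut().unwrap());
    let flat = flatten(&object);

    // "colour" is kept with a dummy `[]` value, so the EXISTS filter
    // can later tell that the key was present in the original document
    assert_eq!(
        &flat,
        json!({ "id": 0, "colour": [], "colour.green": 1 }).as_object().unwrap()
    );
}
```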
--- flatten-serde-json/src/lib.rs | 53 ++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/flatten-serde-json/src/lib.rs b/flatten-serde-json/src/lib.rs index 8312f5bd6..e1b2b20c7 100644 --- a/flatten-serde-json/src/lib.rs +++ b/flatten-serde-json/src/lib.rs @@ -4,7 +4,11 @@ use serde_json::{Map, Value}; pub fn flatten(json: &Map<String, Value>) -> Map<String, Value> { let mut obj = Map::new(); - insert_object(&mut obj, None, json); + let mut all_keys = vec![]; + insert_object(&mut obj, None, json, &mut all_keys); + for key in all_keys { + obj.entry(key).or_insert(Value::Array(vec![])); + } obj } @@ -12,26 +16,32 @@ fn insert_object( base_json: &mut Map<String, Value>, base_key: Option<&str>, object: &Map<String, Value>, + all_keys: &mut Vec<String>, ) { for (key, value) in object { let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}")); - + all_keys.push(new_key.clone()); if let Some(array) = value.as_array() { - insert_array(base_json, &new_key, array); + insert_array(base_json, &new_key, array, all_keys); } else if let Some(object) = value.as_object() { - insert_object(base_json, Some(&new_key), object); + insert_object(base_json, Some(&new_key), object, all_keys); } else { insert_value(base_json, &new_key, value.clone()); } } } -fn insert_array(base_json: &mut Map<String, Value>, base_key: &str, array: &Vec<Value>) { +fn insert_array( + base_json: &mut Map<String, Value>, + base_key: &str, + array: &Vec<Value>, + all_keys: &mut Vec<String>, +) { for value in array { if let Some(object) = value.as_object() { - insert_object(base_json, Some(base_key), object); + insert_object(base_json, Some(base_key), object, all_keys); } else if let Some(sub_array) = value.as_array() { - insert_array(base_json, base_key, sub_array); + insert_array(base_json, base_key, sub_array, all_keys); } else { insert_value(base_json, base_key, value.clone()); } @@ -103,6 +113,7 @@ mod tests { assert_eq!( &flat, json!({ + "a": [], "a.b": "c", "a.d": "e", "a.f": "g" @@ -116,6 +127,10 @@ fn flatten_array() { let mut base: Value = json!({ "a": [ + 1, + "b", + [], + [{}], { "b": "c" }, { "b": "d" }, { "b": "e" }, @@ -127,6 +142,7 @@ assert_eq!( &flat, json!({ + "a": [1, "b"], "a.b": ["c", "d", "e"], }) .as_object() .unwrap() ); @@ -154,6 +170,28 @@ .as_object() .unwrap() ); + + // here we must keep the null in "a" + let mut base: Value = json!({ + "a": [ + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + null, + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": null, + "a.b": ["c", "d", "e"], + }) + .as_object() + .unwrap() + ); } #[test] @@ -170,6 +208,7 @@ mod tests { assert_eq!( &flat, json!({ + "a": [], "a.b": ["c", "d"], }) .as_object() From 72452f0cb270d5225e83b31ec4d707845afd1eea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 25 May 2022 11:55:16 +0200 Subject: [PATCH 03/18] Implements the EXIST filter operator --- filter-parser/src/condition.rs | 13 ++++++++++++- filter-parser/src/error.rs | 4 ++-- filter-parser/src/lib.rs | 27 ++++++++++++++++++++++----- milli/src/search/facet/filter.rs | 19 +++++++++++++++++++ 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 264787055..0ece99a0d 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -8,7 +8,7 @@ use nom::branch::alt; use nom::bytes::complete::tag; use nom::combinator::cut; -use nom::sequence::tuple; +use nom::sequence::{terminated, tuple}; use 
Condition::*; use crate::{parse_value, FilterCondition, IResult, Span, Token}; @@ -19,6 +19,8 @@ pub enum Condition<'a> { GreaterThanOrEqual(Token<'a>), Equal(Token<'a>), NotEqual(Token<'a>), + Exist, + NotExist, LowerThan(Token<'a>), LowerThanOrEqual(Token<'a>), Between { from: Token<'a>, to: Token<'a> }, @@ -33,6 +35,8 @@ impl<'a> Condition<'a> { GreaterThanOrEqual(n) => (LowerThan(n), None), Equal(s) => (NotEqual(s), None), NotEqual(s) => (Equal(s), None), + Exist => (NotExist, None), + NotExist => (Exist, None), LowerThan(n) => (GreaterThanOrEqual(n), None), LowerThanOrEqual(n) => (GreaterThan(n), None), Between { from, to } => (LowerThan(from), Some(GreaterThan(to))), @@ -58,6 +62,13 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } +/// exist = value EXIST +pub fn parse_exist(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("EXIST"))(input)?; + + Ok((input, FilterCondition::Condition { fid: key.into(), op: Exist })) +} + /// to = value value TO value pub fn parse_to(input: Span) -> IResult { let (input, (key, from, _, to)) = diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index ddf7bea47..8136732c8 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -128,10 +128,10 @@ impl<'a> Display for Error<'a> { writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)? } ErrorKind::InvalidPrimary if input.trim().is_empty() => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing.")? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing.")? } ErrorKind::InvalidPrimary => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `{}`.", escaped_input)? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `{}`.", escaped_input)? } ErrorKind::ExpectedEof => { writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 243d1a3f4..ee4edc122 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -6,8 +6,9 @@ //! or = and (~ "OR" ~ and) //! and = not (~ "AND" not)* //! not = ("NOT" ~ not) | primary -//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to +//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | exist | to //! condition = value ("==" | ">" ...) value +//! exist = value EXIST //! to = value value TO value //! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* //! 
singleQuoted = "'" .* all but quotes "'" @@ -42,6 +43,7 @@ mod value; use std::fmt::Debug; use std::str::FromStr; +use condition::parse_exist; pub use condition::{parse_condition, parse_to, Condition}; use error::{cut_with_err, NomErrorExt}; pub use error::{Error, ErrorKind}; @@ -248,6 +250,7 @@ fn parse_primary(input: Span) -> IResult { ), parse_geo_radius, parse_condition, + parse_exist, parse_to, // the next lines are only for error handling and are written at the end to have the less possible performance impact parse_geo_point, @@ -420,6 +423,20 @@ pub mod tests { op: Condition::LowerThan(rtok("NOT subscribers >= ", "1000")), }, ), + ( + "subscribers EXIST", + Fc::Condition { + fid: rtok("", "subscribers"), + op: Condition::Exist, + }, + ), + ( + "NOT subscribers EXIST", + Fc::Condition { + fid: rtok("NOT ", "subscribers"), + op: Condition::NotExist, + }, + ), ( "subscribers 100 TO 1000", Fc::Condition { @@ -577,10 +594,10 @@ pub mod tests { ("channel = ", "Was expecting a value but instead got nothing."), ("channel = 🐻", "Was expecting a value but instead got `🐻`."), ("channel = 🐻 AND followers < 100", "Was expecting a value but instead got `🐻`."), - ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `OR`."), - ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `AND`."), - ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing."), + ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `OR`."), + ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `AND`."), + ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `channel Ponce`."), + ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index d89413f62..a5c13ec2a 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -280,6 +280,25 @@ impl<'a> Filter<'a> { Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse()?)), Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(val.parse()?)), Condition::Between { from, to } => (Included(from.parse()?), Included(to.parse()?)), + Condition::Exist => { + let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; + return Ok(exist); + } + Condition::NotExist => { + let all_ids = index.documents_ids(rtxn)?; + + let exist = Self::evaluate_operator( + rtxn, + index, + numbers_db, + strings_db, + field_id, + &Condition::Exist, + )?; + + let notexist = all_ids - exist; + return Ok(notexist); + } Condition::Equal(val) => { let (_original_value, string_docids) = strings_db .get(rtxn, &(field_id, &val.value().to_lowercase()))? From dc64170a69e0be95b2c9e5bb96e1a5be0b58ec06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 14 Jun 2022 16:42:09 +0200 Subject: [PATCH 04/18] =?UTF-8?q?Improve=20syntax=20of=20EXISTS=20filter,?= =?UTF-8?q?=20allow=20=E2=80=9Cvalue=20NOT=20EXISTS=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- filter-parser/src/condition.rs | 22 ++++++++++------ filter-parser/src/error.rs | 4 +-- filter-parser/src/lib.rs | 45 +++++++++++++++++++------------- milli/src/search/facet/filter.rs | 6 ++--- 4 files changed, 46 insertions(+), 31 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 0ece99a0d..b57d68b75 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -19,8 +19,8 @@ pub enum Condition<'a> { GreaterThanOrEqual(Token<'a>), Equal(Token<'a>), NotEqual(Token<'a>), - Exist, - NotExist, + Exists, + NotExists, LowerThan(Token<'a>), LowerThanOrEqual(Token<'a>), Between { from: Token<'a>, to: Token<'a> }, @@ -35,8 +35,8 @@ impl<'a> Condition<'a> { GreaterThanOrEqual(n) => (LowerThan(n), None), Equal(s) => (NotEqual(s), None), NotEqual(s) => (Equal(s), None), - Exist => (NotExist, None), - NotExist => (Exist, None), + Exists => (NotExists, None), + NotExists => (Exists, None), LowerThan(n) => (GreaterThanOrEqual(n), None), LowerThanOrEqual(n) => (GreaterThan(n), None), Between { from, to } => (LowerThan(from), Some(GreaterThan(to))), @@ -62,11 +62,17 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } -/// exist = value EXIST -pub fn parse_exist(input: Span) -> IResult { - let (input, key) = terminated(parse_value, tag("EXIST"))(input)?; +/// exist = value NOT EXISTS +pub fn parse_exists(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; - Ok((input, FilterCondition::Condition { fid: key.into(), op: Exist })) + Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists })) +} +/// exist = value NOT EXISTS +pub fn parse_not_exists(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("NOT EXISTS"))(input)?; + + Ok((input, FilterCondition::Condition { fid: key.into(), op: NotExists })) } /// to = value value TO value diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index 8136732c8..a3720f7bf 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ 
-128,10 +128,10 @@ impl<'a> Display for Error<'a> { writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)? } ErrorKind::InvalidPrimary if input.trim().is_empty() => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing.")? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing.")? } ErrorKind::InvalidPrimary => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `{}`.", escaped_input)? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `{}`.", escaped_input)? } ErrorKind::ExpectedEof => { writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index ee4edc122..69215798c 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -3,23 +3,24 @@ //! ```text //! filter = expression ~ EOF //! expression = or -//! or = and (~ "OR" ~ and) -//! and = not (~ "AND" not)* -//! not = ("NOT" ~ not) | primary -//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | exist | to +//! or = and ("OR" and) +//! and = not ("AND" not)* +//! not = ("NOT" not) | primary +//! primary = (WS* "(" expression ")" WS*) | geoRadius | condition | exists | not_exists | to //! condition = value ("==" | ">" ...) value -//! exist = value EXIST +//! exists = value EXISTS +//! not_exists = value NOT EXISTS //! to = value value TO value -//! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* +//! value = WS* ( word | singleQuoted | doubleQuoted) ~ WS* //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* ~ "_geoRadius(" ~ WS* ~ float ~ WS* ~ "," ~ WS* ~ float ~ WS* ~ "," float ~ WS* ~ ")" +//! geoRadius = WS* ~ "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" //! ``` //! //! Other BNF grammar used to handle some specific errors: //! ```text -//! geoPoint = WS* ~ "_geoPoint(" ~ (float ~ ",")* ~ ")" +//! geoPoint = WS* "_geoPoint(" (float ",")* ")" //! ``` //! //! 
Specific errors: @@ -43,8 +44,8 @@ mod value; use std::fmt::Debug; use std::str::FromStr; -use condition::parse_exist; pub use condition::{parse_condition, parse_to, Condition}; +use condition::{parse_exists, parse_not_exists}; use error::{cut_with_err, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; @@ -250,7 +251,8 @@ fn parse_primary(input: Span) -> IResult { ), parse_geo_radius, parse_condition, - parse_exist, + parse_exists, + parse_not_exists, parse_to, // the next lines are only for error handling and are written at the end to have the less possible performance impact parse_geo_point, @@ -424,17 +426,24 @@ pub mod tests { }, ), ( - "subscribers EXIST", + "subscribers EXISTS", Fc::Condition { fid: rtok("", "subscribers"), - op: Condition::Exist, + op: Condition::Exists, }, ), ( - "NOT subscribers EXIST", + "NOT subscribers EXISTS", Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Condition::NotExist, + op: Condition::NotExists, + }, + ), + ( + "subscribers NOT EXISTS", + Fc::Condition { + fid: rtok("", "subscribers"), + op: Condition::NotExists, }, ), ( @@ -594,10 +603,10 @@ pub mod tests { ("channel = ", "Was expecting a value but instead got nothing."), ("channel = 🐻", "Was expecting a value but instead got `🐻`."), ("channel = 🐻 AND followers < 100", "Was expecting a value but instead got `🐻`."), - ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `OR`."), - ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `AND`."), - ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing."), + ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `OR`."), + ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `AND`."), + ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`."), + ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index a5c13ec2a..7f3b928dd 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -280,11 +280,11 @@ impl<'a> Filter<'a> { Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse()?)), Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(val.parse()?)), Condition::Between { from, to } => (Included(from.parse()?), Included(to.parse()?)), - Condition::Exist => { + Condition::Exists => { let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; return Ok(exist); } - Condition::NotExist => { + Condition::NotExists => { let all_ids = index.documents_ids(rtxn)?; let exist = Self::evaluate_operator( @@ -293,7 +293,7 @@ impl<'a> Filter<'a> { numbers_db, strings_db, field_id, - &Condition::Exist, + &Condition::Exists, )?; let notexist = all_ids - exist; From 0388b2d46382e5578824dd5b2b522cead764f35b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 08:58:41 +0200 Subject: [PATCH 05/18] Run cargo fmt --- milli/src/heed_codec/facet/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 79dbffa1d..8c5a4c118 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -26,10 +26,11 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { } } -use crate::{try_split_array_at, DocumentId, FieldId}; use std::borrow::Cow; use std::convert::TryInto; +use crate::{try_split_array_at, DocumentId, FieldId}; + pub struct FieldIdCodec; impl<'a> heed::BytesDecode<'a> for FieldIdCodec { From a5c916225077b0dfe8ba35cea702af730f692b1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 09:14:19 +0200 Subject: [PATCH 06/18] Improve parser for NOT EXISTS filter Allow multiple spaces between NOT and EXISTS --- filter-parser/src/condition.rs | 10 ++++---- filter-parser/src/lib.rs | 43 +++++++++++++++++++++++----------- filter-parser/src/value.rs | 2 +- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index b57d68b75..6a5ecbe0a 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -7,11 +7,12 @@ use nom::branch::alt; use nom::bytes::complete::tag; +use nom::character::complete::multispace1; use nom::combinator::cut; use nom::sequence::{terminated, tuple}; use Condition::*; -use crate::{parse_value, FilterCondition, IResult, Span, Token}; +use crate::{parse_value, ws, FilterCondition, IResult, Span, Token}; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Condition<'a> { @@ -62,16 +63,17 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } -/// exist = value NOT EXISTS +/// exist = value "EXISTS" pub fn parse_exists(input: Span) -> IResult { let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists })) } -/// exist = value NOT EXISTS +/// exist = value "NOT" WS* "EXISTS" pub fn parse_not_exists(input: Span) -> IResult { - let (input, key) = terminated(parse_value, tag("NOT EXISTS"))(input)?; + let (input, key) = parse_value(input)?; + let (input, _) = tuple((tag("NOT"), multispace1, tag("EXISTS")))(input)?; Ok((input, FilterCondition::Condition { fid: key.into(), 
op: NotExists })) } diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 69215798c..e40519e87 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -1,21 +1,21 @@ //! BNF grammar: //! //! ```text -//! filter = expression ~ EOF +//! filter = expression EOF //! expression = or //! or = and ("OR" and) //! and = not ("AND" not)* //! not = ("NOT" not) | primary //! primary = (WS* "(" expression ")" WS*) | geoRadius | condition | exists | not_exists | to //! condition = value ("==" | ">" ...) value -//! exists = value EXISTS -//! not_exists = value NOT EXISTS -//! to = value value TO value -//! value = WS* ( word | singleQuoted | doubleQuoted) ~ WS* +//! exists = value "EXISTS" +//! not_exists = value "NOT" WS* "EXISTS" +//! to = value value "TO" value +//! value = WS* ( word | singleQuoted | doubleQuoted) WS* //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* ~ "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" +//! geoRadius = WS* "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" //! ``` //! //! Other BNF grammar used to handle some specific errors: @@ -31,7 +31,7 @@ //! field < 12 AND _geoPoint(1, 2) //! ``` //! -//! - If a user try to use a geoRadius as a value we must throw an error. +//! - If a user try to use a geoRadius as a value we must throw an error. //! ```text //! field = _geoRadius(12, 13, 14) //! ``` @@ -170,7 +170,7 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } -/// or = and (~ "OR" ~ and) +/// or = and ("OR" and) fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; // if we found a `OR` then we MUST find something next @@ -182,7 +182,7 @@ fn parse_or(input: Span) -> IResult { Ok((input, expr)) } -/// and = not (~ "AND" not)* +/// and = not ("AND" not)* fn parse_and(input: Span) -> IResult { let (input, lhs) = parse_not(input)?; // if we found a `AND` then we MUST find something next @@ -193,14 +193,14 @@ fn parse_and(input: Span) -> IResult { Ok((input, expr)) } -/// not = ("NOT" ~ not) | primary +/// not = ("NOT" not) | primary /// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. /// If we parse a `NOT` we MUST parse something behind. fn parse_not(input: Span) -> IResult { alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input) } -/// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) +/// geoRadius = WS* "_geoRadius(float "," float "," float) /// If we parse `_geoRadius` we MUST parse the rest of the expression. 
fn parse_geo_radius(input: Span) -> IResult { // we want to forbid space BEFORE the _geoRadius but not after @@ -224,7 +224,7 @@ fn parse_geo_radius(input: Span) -> IResult { Ok((input, res)) } -/// geoPoint = WS* ~ "_geoPoint(float ~ "," ~ float ~ "," float) +/// geoPoint = WS* "_geoPoint(float "," float "," float) fn parse_geo_point(input: Span) -> IResult { // we want to forbid space BEFORE the _geoPoint but not after tuple(( @@ -238,7 +238,7 @@ fn parse_geo_point(input: Span) -> IResult { Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) } -/// primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to +/// primary = (WS* "(" expression ")" WS*) | geoRadius | condition | to fn parse_primary(input: Span) -> IResult { alt(( // if we find a first parenthesis, then we must parse an expression and find the closing parenthesis @@ -266,7 +266,7 @@ pub fn parse_expression(input: Span) -> IResult { parse_or(input) } -/// filter = expression ~ EOF +/// filter = expression EOF pub fn parse_filter(input: Span) -> IResult { terminated(parse_expression, eof)(input) } @@ -446,6 +446,20 @@ pub mod tests { op: Condition::NotExists, }, ), + ( + "NOT subscribers NOT EXISTS", + Fc::Condition { + fid: rtok("NOT ", "subscribers"), + op: Condition::Exists, + }, + ), + ( + "subscribers NOT EXISTS", + Fc::Condition { + fid: rtok("", "subscribers"), + op: Condition::NotExists, + }, + ), ( "subscribers 100 TO 1000", Fc::Condition { @@ -616,6 +630,7 @@ pub mod tests { ("channel = \"ponce", "Expression `\\\"ponce` is missing the following closing delimiter: `\"`."), ("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delimiter: `)`."), ("channel = mv OR followers >= 1000)", "Found unexpected characters at the end of the filter: `)`. 
You probably forgot an `OR` or an `AND` rule."), + ("colour NOT EXIST", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`."), ]; for (input, expected) in test_case { diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 84dd21902..18ae58ae5 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -48,7 +48,7 @@ fn quoted_by(quote: char, input: Span) -> IResult { )) } -/// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* +/// value = WS* ( word | singleQuoted | doubleQuoted) WS* pub fn parse_value<'a>(input: Span<'a>) -> IResult> { // to get better diagnostic message we are going to strip the left whitespaces from the input right now let (input, _) = take_while(char::is_whitespace)(input)?; From 722db7b088a86f2f4cabef93927e6815a791136e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 09:28:21 +0200 Subject: [PATCH 07/18] Ignore target directory of filter-parser/fuzz crate --- filter-parser/fuzz/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/filter-parser/fuzz/.gitignore b/filter-parser/fuzz/.gitignore index cb73742e4..9a2e1d58c 100644 --- a/filter-parser/fuzz/.gitignore +++ b/filter-parser/fuzz/.gitignore @@ -1,2 +1,3 @@ /corpus/ /artifacts/ +/target/ \ No newline at end of file From bd15f5625af5d3032f386d8c67cfb9a4d5f8330d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 13:31:51 +0200 Subject: [PATCH 08/18] Fix compiler warning --- filter-parser/src/condition.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 6a5ecbe0a..c63f1d926 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -12,7 +12,7 @@ use nom::combinator::cut; use nom::sequence::{terminated, tuple}; use Condition::*; -use crate::{parse_value, ws, FilterCondition, IResult, Span, Token}; +use crate::{parse_value, FilterCondition, IResult, Span, Token}; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Condition<'a> { From 392472f4bb9f1f16aa27e6b9feef249f2d1f31f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 06:19:33 +0200 Subject: [PATCH 09/18] Apply suggestions from code review Co-authored-by: Tamo --- filter-parser/src/lib.rs | 6 +++--- infos/src/main.rs | 2 +- milli/src/index.rs | 3 +-- .../index_documents/extract/extract_facet_exists_docids.rs | 2 -- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index e40519e87..5cce5f4c3 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -170,7 +170,7 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } -/// or = and ("OR" and) +/// or = and ("OR" and)* fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; // if we found a `OR` then we MUST find something next @@ -200,7 +200,7 @@ fn parse_not(input: Span) -> IResult { alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input) } -/// geoRadius = WS* "_geoRadius(float "," float "," float) +/// geoRadius = WS* "_geoRadius(float WS* "," WS* float WS* "," WS* float) /// If we parse `_geoRadius` we MUST parse the rest of the expression. 
fn parse_geo_radius(input: Span) -> IResult { // we want to forbid space BEFORE the _geoRadius but not after @@ -224,7 +224,7 @@ fn parse_geo_radius(input: Span) -> IResult { Ok((input, res)) } -/// geoPoint = WS* "_geoPoint(float "," float "," float) +/// geoPoint = WS* "_geoPoint(float WS* "," WS* float WS* "," WS* float) fn parse_geo_point(input: Span) -> IResult { // we want to forbid space BEFORE the _geoPoint but not after tuple(( diff --git a/infos/src/main.rs b/infos/src/main.rs index 89aec6182..feec17557 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -551,7 +551,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let db = facet_id_exists_docids.remap_data_type::(); for result in facet_values_iter(rtxn, db, facet_id)? { let (_fid, value) = result?; - let key = format!("{}", facet_name); + let key = facet_name.to_string(); heap.push(Reverse((value.len(), key, facet_id_exists_docids_name))); if heap.len() > limit { heap.pop(); diff --git a/milli/src/index.rs b/milli/src/index.rs index 816112178..b0897271e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -156,8 +156,7 @@ impl Index { let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; - let facet_id_exists_docids: Database = - env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; + let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_strings = diff --git a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs index e7a001c08..d25c57aea 100644 --- a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs @@ -32,9 +32,7 @@ pub fn extract_facet_exists_docids( let mut cursor = docid_fid_facet_number.into_cursor()?; while let Some((key_bytes, _)) = cursor.move_on_next()? 
{ let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap(); - let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap(); - facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } From 30bd4db0fcbcf31d80baec5892d28076c16b8577 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 08:24:16 +0200 Subject: [PATCH 10/18] Simplify indexing task for facet_exists_docids database --- milli/src/heed_codec/facet/field_id_codec.rs | 25 +++++++++++ milli/src/heed_codec/facet/mod.rs | 43 +------------------ milli/src/lib.rs | 1 + .../extract/extract_facet_exists_docids.rs | 40 ----------------- .../extract/extract_fid_docid_facet_values.rs | 24 +++++++---- .../src/update/index_documents/extract/mod.rs | 22 +++------- 6 files changed, 50 insertions(+), 105 deletions(-) create mode 100644 milli/src/heed_codec/facet/field_id_codec.rs delete mode 100644 milli/src/update/index_documents/extract/extract_facet_exists_docids.rs diff --git a/milli/src/heed_codec/facet/field_id_codec.rs b/milli/src/heed_codec/facet/field_id_codec.rs new file mode 100644 index 000000000..d147423f2 --- /dev/null +++ b/milli/src/heed_codec/facet/field_id_codec.rs @@ -0,0 +1,25 @@ +use crate::{FieldId, BEU16}; +use heed::zerocopy::AsBytes; +use std::{borrow::Cow, convert::TryInto}; + +pub struct FieldIdCodec; + +impl<'a> heed::BytesDecode<'a> for FieldIdCodec { + type DItem = FieldId; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let bytes: [u8; 2] = bytes[..2].try_into().ok()?; + let field_id = BEU16::from(bytes).get(); + Some(field_id) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldIdCodec { + type EItem = FieldId; + + fn bytes_encode(field_id: &Self::EItem) -> Option> { + let field_id = BEU16::new(*field_id); + let bytes = field_id.as_bytes(); + Some(Cow::Owned(bytes.to_vec())) + } +} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 8c5a4c118..384991fd7 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -5,6 +5,7 @@ mod facet_string_level_zero_value_codec; mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; +mod field_id_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; @@ -15,6 +16,7 @@ pub use self::facet_string_level_zero_value_codec::{ pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; +pub use self::field_id_codec::FieldIdCodec; /// Tries to split a slice in half at the given middle point, /// `None` if the slice is too short. 
@@ -25,44 +27,3 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { None } } - -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::{try_split_array_at, DocumentId, FieldId}; - -pub struct FieldIdCodec; - -impl<'a> heed::BytesDecode<'a> for FieldIdCodec { - type DItem = FieldId; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, _) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - Some(field_id) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldIdCodec { - type EItem = FieldId; - - fn bytes_encode(field_id: &Self::EItem) -> Option> { - Some(Cow::Owned(field_id.to_be_bytes().to_vec())) - } -} - -pub struct FieldIdDocIdCodec; - -impl<'a> heed::BytesDecode<'a> for FieldIdDocIdCodec { - type DItem = (FieldId, DocumentId); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - - let document_id_bytes = bytes[..4].try_into().ok()?; - let document_id = u32::from_be_bytes(document_id_bytes); - - Some((field_id, document_id)) - } -} diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 81cd057d5..20fdceaec 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -49,6 +49,7 @@ pub type SmallString32 = smallstr::SmallString<[u8; 32]>; pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; +pub type BEU16 = heed::zerocopy::U16; pub type BEU32 = heed::zerocopy::U32; pub type BEU64 = heed::zerocopy::U64; pub type Attribute = u32; diff --git a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs deleted file mode 100644 index d25c57aea..000000000 --- a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs +++ /dev/null @@ -1,40 +0,0 @@ -use std::fs::File; -use std::io; - -use heed::{BytesDecode, BytesEncode}; - -use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, -}; -use crate::heed_codec::facet::{FieldIdCodec, FieldIdDocIdCodec}; -use crate::Result; - -/// Extracts the documents ids where this field appears. -/// -/// Returns a grenad reader whose key is the field id encoded -/// with `FieldIdCodec` and the value is a document_id (u32) -/// encoded as native-endian bytes. -#[logging_timer::time] -pub fn extract_facet_exists_docids( - docid_fid_facet_number: grenad::Reader, - indexer: GrenadParameters, -) -> Result> { - let max_memory = indexer.max_memory_by_thread(); - - let mut facet_exists_docids_sorter = create_sorter( - merge_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ); - - let mut cursor = docid_fid_facet_number.into_cursor()?; - while let Some((key_bytes, _)) = cursor.move_on_next()? 
{ - let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap(); - let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap(); - facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; - } - - sorter_into_reader(facet_exists_docids_sorter, indexer) -} diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index d93bde500..c83ac49e0 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,15 +1,16 @@ +use heed::zerocopy::AsBytes; +use serde_json::Value; use std::collections::HashSet; +use std::convert::TryInto; use std::fs::File; use std::io; use std::mem::size_of; -use heed::zerocopy::AsBytes; -use serde_json::Value; - use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::{DocumentId, FieldId, Result}; +use crate::update::index_documents::merge_cbo_roaring_bitmaps; +use crate::{DocumentId, FieldId, Result, BEU32}; /// Extracts the facet values of each faceted field of each document. /// @@ -40,7 +41,7 @@ pub fn extract_fid_docid_facet_values( ); let mut fid_docid_facet_exists_sorter = create_sorter( - keep_first, + merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -56,12 +57,17 @@ pub fn extract_fid_docid_facet_values( if faceted_fields.contains(&field_id) { key_buffer.clear(); - // here, we know already that the document must be added to the “field id exists” database - // prefix key with the field_id and the document_id - + // Set key to the field_id + // Note: this encoding is consistent with FieldIdCodec key_buffer.extend_from_slice(&field_id.to_be_bytes()); + + // Here, we know already that the document must be added to the “field id exists” database + let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); + let document = BEU32::from(document).get(); + fid_docid_facet_exists_sorter.insert(&key_buffer, document.to_ne_bytes())?; + + // For the other extraction tasks, prefix the key with the field_id and the document_id key_buffer.extend_from_slice(&docid_bytes); - fid_docid_facet_exists_sorter.insert(&key_buffer, ().as_bytes())?; let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 7d26e0984..bb695a99f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -1,5 +1,4 @@ mod extract_docid_word_positions; -mod extract_facet_exists_docids; mod extract_facet_number_docids; mod extract_facet_string_docids; mod extract_fid_docid_facet_values; @@ -17,7 +16,6 @@ use log::debug; use rayon::prelude::*; use self::extract_docid_word_positions::extract_docid_word_positions; -use self::extract_facet_exists_docids::extract_facet_exists_docids; use self::extract_facet_number_docids::extract_facet_number_docids; use self::extract_facet_string_docids::extract_facet_string_docids; use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; @@ -142,15 +140,12 @@ pub(crate) fn data_from_obkv_documents( TypedChunk::FieldIdFacetNumberDocids, "field-id-facet-number-docids", ); - spawn_extraction_task::<_, _, Vec>>( - 
docid_fid_facet_exists_chunks.clone(), - indexer.clone(), - lmdb_writer_sx.clone(), - extract_facet_exists_docids, - merge_cbo_roaring_bitmaps, - TypedChunk::FieldIdFacetExistsDocids, - "field-id-facet-exists-docids", - ); + + // spawn extraction task for field-id-facet-exists-docids + rayon::spawn(move || { + let reader = docid_fid_facet_exists_chunks.merge(merge_cbo_roaring_bitmaps, &indexer); + let _ = lmdb_writer_sx.send(reader.map(TypedChunk::FieldIdFacetExistsDocids)); + }); Ok(()) } @@ -226,7 +221,7 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, grenad::Reader), + (grenad::Reader, grenad::Reader), ), )> { let flattened_documents_chunk = @@ -294,9 +289,6 @@ fn send_and_extract_flattened_documents_data( docid_fid_facet_strings_chunk.clone(), ))); - let docid_fid_facet_exists_chunk = - unsafe { as_cloneable_grenad(&docid_fid_facet_exists_chunk)? }; - Ok(( docid_fid_facet_numbers_chunk, (docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk), From c17d616250cee38d34dbad51173188ae8cc54ead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 08:41:33 +0200 Subject: [PATCH 11/18] Refactor index_documents_check_exists_database tests --- milli/src/update/index_documents/mod.rs | 222 +++++++++--------------- 1 file changed, 86 insertions(+), 136 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 82457762e..99f474eb6 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1932,152 +1932,102 @@ mod tests { assert_eq!(ids.len(), map.len()); } - #[test] - fn index_documents_check_exists_database_reindex() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { - "id": 0, - "colour": 0, - }, - { - "id": 1, - "colour": [] - }, - { - "id": 2, - "colour": {} - }, - { - "id": 3, - "colour": null - }, - { - "id": 4, - "colour": [1] - }, - { - "id": 5 - }, - { - "id": 6, - "colour": { - "green": 1 - } - } - ]); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - builder.add_documents(content).unwrap(); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - - let faceted_fields = hashset!(S("colour")); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); - - let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); - let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); - - let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); - assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); - - let bitmap_colour_green = - index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); - assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); - } - #[test] fn 
index_documents_check_exists_database() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - - let faceted_fields = hashset!(S("colour")); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - - let content = documents!([ - { - "id": 0, - "colour": 0, - }, - { - "id": 1, - "colour": [] - }, - { - "id": 2, - "colour": {} - }, - { - "id": 3, - "colour": null - }, - { - "id": 4, - "colour": [1] - }, - { - "id": 5 - }, - { - "id": 6, - "colour": { - "green": 1 - } - } - ]); - let indexing_config = IndexDocumentsConfig::default(); - let mut wtxn = index.write_txn().unwrap(); + let faceted_fields = hashset!(S("colour")); + let content = || { + documents!([ + { + "id": 0, + "colour": 0, + }, + { + "id": 1, + "colour": [] + }, + { + "id": 2, + "colour": {} + }, + { + "id": 3, + "colour": null + }, + { + "id": 4, + "colour": [1] + }, + { + "id": 5 + }, + { + "id": 6, + "colour": { + "green": 1 + } + }, + { + "id": 7, + "colour": { + "green": { + "blue": [] + } + } + } + ]) + }; + let make_index = || { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + Index::new(options, &path).unwrap() + }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - builder.add_documents(content).unwrap(); - builder.execute().unwrap(); + let set_filterable_fields = |index: &Index| { + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + builder.set_filterable_fields(faceted_fields.clone()); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + }; + let add_documents = |index: &Index| { + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content()).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + }; - wtxn.commit().unwrap(); + let check_ok = |index: &Index| { + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); - let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); - let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); - let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + let bitmap_colour = + index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6, 7]); - let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); - assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); + let bitmap_colour_green = + index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + 
assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6, 7]); + }; - let bitmap_colour_green = - index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); - assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); + let index = make_index(); + add_documents(&index); + set_filterable_fields(&index); + check_ok(&index); + + let index = make_index(); + set_filterable_fields(&index); + add_documents(&index); + check_ok(&index); } } From ea0642c32d799b201c61cc81d8f979e33cc74380 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 09:12:37 +0200 Subject: [PATCH 12/18] Make filter parser more strict regarding spacing around operators OR, AND, NOT, TO must now be followed by spaces --- filter-parser/src/condition.rs | 11 ++++----- filter-parser/src/lib.rs | 45 ++++++++++++++++++++-------------- filter-parser/src/value.rs | 2 +- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index c63f1d926..cbf73b96a 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -44,8 +44,7 @@ impl<'a> Condition<'a> { } } } - -/// condition = value ("==" | ">" ...) value +/// condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value pub fn parse_condition(input: Span) -> IResult { let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); let (input, (fid, op, value)) = tuple((parse_value, operator, cut(parse_value)))(input)?; @@ -69,7 +68,7 @@ pub fn parse_exists(input: Span) -> IResult { Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists })) } -/// exist = value "NOT" WS* "EXISTS" +/// exist = value "NOT" WS+ "EXISTS" pub fn parse_not_exists(input: Span) -> IResult { let (input, key) = parse_value(input)?; @@ -77,10 +76,10 @@ pub fn parse_not_exists(input: Span) -> IResult { Ok((input, FilterCondition::Condition { fid: key.into(), op: NotExists })) } -/// to = value value TO value +/// to = value value "TO" WS+ value pub fn parse_to(input: Span) -> IResult { - let (input, (key, from, _, to)) = - tuple((parse_value, parse_value, tag("TO"), cut(parse_value)))(input)?; + let (input, (key, from, _, _, to)) = + tuple((parse_value, parse_value, tag("TO"), multispace1, cut(parse_value)))(input)?; Ok((input, FilterCondition::Condition { fid: key, op: Between { from, to } })) } diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 5cce5f4c3..01be432d7 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -3,19 +3,19 @@ //! ```text //! filter = expression EOF //! expression = or -//! or = and ("OR" and) -//! and = not ("AND" not)* -//! not = ("NOT" not) | primary -//! primary = (WS* "(" expression ")" WS*) | geoRadius | condition | exists | not_exists | to -//! condition = value ("==" | ">" ...) value +//! or = and ("OR" WS+ and)* +//! and = not ("AND" WS+ not)* +//! not = ("NOT" WS+ not) | primary +//! primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to +//! condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value //! exists = value "EXISTS" -//! not_exists = value "NOT" WS* "EXISTS" -//! to = value value "TO" value -//! value = WS* ( word | singleQuoted | doubleQuoted) WS* +//! not_exists = value "NOT" WS+ "EXISTS" +//! to = value value "TO" WS+ value +//! value = WS* ( word | singleQuoted | doubleQuoted) WS+ //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! 
word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" +//! geoRadius = "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" //! ``` //! //! Other BNF grammar used to handle some specific errors: @@ -50,7 +50,7 @@ use error::{cut_with_err, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; use nom::bytes::complete::tag; -use nom::character::complete::{char, multispace0}; +use nom::character::complete::{char, multispace0, multispace1}; use nom::combinator::{cut, eof, map}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; @@ -170,11 +170,11 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } -/// or = and ("OR" and)* +/// or = and ("OR" WS+ and)* fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; // if we found a `OR` then we MUST find something next - let (input, ors) = many0(preceded(ws(tag("OR")), cut(parse_and)))(input)?; + let (input, ors) = many0(preceded(ws(tuple((tag("OR"), multispace1))), cut(parse_and)))(input)?; let expr = ors .into_iter() @@ -186,24 +186,28 @@ fn parse_or(input: Span) -> IResult { fn parse_and(input: Span) -> IResult { let (input, lhs) = parse_not(input)?; // if we found a `AND` then we MUST find something next - let (input, ors) = many0(preceded(ws(tag("AND")), cut(parse_not)))(input)?; + let (input, ors) = + many0(preceded(ws(tuple((tag("AND"), multispace1))), cut(parse_not)))(input)?; let expr = ors .into_iter() .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); Ok((input, expr)) } -/// not = ("NOT" not) | primary -/// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. +/// not = ("NOT" WS+ not) | primary +/// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. /// If we parse a `NOT` we MUST parse something behind. fn parse_not(input: Span) -> IResult { - alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input) + alt(( + map(preceded(ws(tuple((tag("NOT"), multispace1))), cut(parse_not)), |e| e.negate()), + parse_primary, + ))(input) } /// geoRadius = WS* "_geoRadius(float WS* "," WS* float WS* "," WS* float) /// If we parse `_geoRadius` we MUST parse the rest of the expression. 
fn parse_geo_radius(input: Span) -> IResult { - // we want to forbid space BEFORE the _geoRadius but not after + // we want to allow space BEFORE the _geoRadius but not after let parsed = preceded( tuple((multispace0, tag("_geoRadius"))), // if we were able to parse `_geoRadius` and can't parse the rest of the input we return a failure @@ -238,7 +242,7 @@ fn parse_geo_point(input: Span) -> IResult { Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) } -/// primary = (WS* "(" expression ")" WS*) | geoRadius | condition | to +/// primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to fn parse_primary(input: Span) -> IResult { alt(( // if we find a first parenthesis, then we must parse an expression and find the closing parenthesis @@ -620,7 +624,7 @@ pub mod tests { ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `OR`."), ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `AND`."), ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing."), + ("channel = Ponce OR", "Found unexpected characters at the end of the filter: `OR`. You probably forgot an `OR` or an `AND` rule."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), @@ -631,6 +635,9 @@ pub mod tests { ("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delimiter: `)`."), ("channel = mv OR followers >= 1000)", "Found unexpected characters at the end of the filter: `)`. You probably forgot an `OR` or an `AND` rule."), ("colour NOT EXIST", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`."), + ("subscribers 100 TO1000", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `subscribers 100 TO1000`."), + ("channel = ponce ORdog != 'bernese mountain'", "Found unexpected characters at the end of the filter: `ORdog != \\'bernese mountain\\'`. You probably forgot an `OR` or an `AND` rule."), + ("channel = ponce AND'dog' != 'bernese mountain'", "Found unexpected characters at the end of the filter: `AND\\'dog\\' != \\'bernese mountain\\'`. 
You probably forgot an `OR` or an `AND` rule."), ]; for (input, expected) in test_case { diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 18ae58ae5..22da6a0df 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -48,7 +48,7 @@ fn quoted_by(quote: char, input: Span) -> IResult { )) } -/// value = WS* ( word | singleQuoted | doubleQuoted) WS* +/// value = WS* ( word | singleQuoted | doubleQuoted) WS+ pub fn parse_value<'a>(input: Span<'a>) -> IResult> { // to get better diagnostic message we are going to strip the left whitespaces from the input right now let (input, _) = take_while(char::is_whitespace)(input)?; From 80b962b4f4aeacaec68e22279e3349adf3138746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 09:16:53 +0200 Subject: [PATCH 13/18] Run cargo fmt --- milli/src/heed_codec/facet/field_id_codec.rs | 7 +++++-- .../extract/extract_fid_docid_facet_values.rs | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/milli/src/heed_codec/facet/field_id_codec.rs b/milli/src/heed_codec/facet/field_id_codec.rs index d147423f2..871b05a09 100644 --- a/milli/src/heed_codec/facet/field_id_codec.rs +++ b/milli/src/heed_codec/facet/field_id_codec.rs @@ -1,6 +1,9 @@ -use crate::{FieldId, BEU16}; +use std::borrow::Cow; +use std::convert::TryInto; + use heed::zerocopy::AsBytes; -use std::{borrow::Cow, convert::TryInto}; + +use crate::{FieldId, BEU16}; pub struct FieldIdCodec; diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index c83ac49e0..6d66a7a64 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,11 +1,12 @@ -use heed::zerocopy::AsBytes; -use serde_json::Value; use std::collections::HashSet; use std::convert::TryInto; use std::fs::File; use std::io; use std::mem::size_of; +use heed::zerocopy::AsBytes; +use serde_json::Value; + use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; From 4f0bd317dff811a74a86649d4c6e064f31949c15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 4 Jul 2022 08:41:54 +0200 Subject: [PATCH 14/18] Remove custom implementation of BytesEncode/Decode for the FieldId --- milli/src/heed_codec/facet/field_id_codec.rs | 28 -------------------- milli/src/heed_codec/facet/mod.rs | 7 +++-- milli/src/index.rs | 4 +-- 3 files changed, 7 insertions(+), 32 deletions(-) delete mode 100644 milli/src/heed_codec/facet/field_id_codec.rs diff --git a/milli/src/heed_codec/facet/field_id_codec.rs b/milli/src/heed_codec/facet/field_id_codec.rs deleted file mode 100644 index 871b05a09..000000000 --- a/milli/src/heed_codec/facet/field_id_codec.rs +++ /dev/null @@ -1,28 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use heed::zerocopy::AsBytes; - -use crate::{FieldId, BEU16}; - -pub struct FieldIdCodec; - -impl<'a> heed::BytesDecode<'a> for FieldIdCodec { - type DItem = FieldId; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let bytes: [u8; 2] = bytes[..2].try_into().ok()?; - let field_id = BEU16::from(bytes).get(); - Some(field_id) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldIdCodec { - type EItem = FieldId; - - fn bytes_encode(field_id: &Self::EItem) -> Option> { - let field_id = 
BEU16::new(*field_id); - let bytes = field_id.as_bytes(); - Some(Cow::Owned(bytes.to_vec())) - } -} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 384991fd7..51812d97a 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -5,7 +5,9 @@ mod facet_string_level_zero_value_codec; mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; -mod field_id_codec; + +use crate::BEU16; +use heed::types::OwnedType; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; @@ -16,7 +18,8 @@ pub use self::facet_string_level_zero_value_codec::{ pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; -pub use self::field_id_codec::FieldIdCodec; + +pub type FieldIdCodec = OwnedType; /// Tries to split a slice in half at the given middle point, /// `None` if the slice is too short. diff --git a/milli/src/index.rs b/milli/src/index.rs index b0897271e..aec7aa396 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -21,7 +21,7 @@ use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, - Search, StrBEU32Codec, StrStrU8Codec, BEU32, + Search, StrBEU32Codec, StrStrU8Codec, BEU16, BEU32, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -819,7 +819,7 @@ impl Index { rtxn: &RoTxn, field_id: FieldId, ) -> heed::Result { - match self.facet_id_exists_docids.get(rtxn, &field_id)? { + match self.facet_id_exists_docids.get(rtxn, &BEU16::new(field_id))? 
{ Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), } From 1eb1e73bb3d909d2be54ff835126f008b812aa9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 4 Jul 2022 09:28:23 +0200 Subject: [PATCH 15/18] Add integration tests for the EXISTS filter --- milli/src/heed_codec/facet/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 11 ++++-- milli/tests/assets/test_set.ndjson | 30 +++++++------- milli/tests/search/filters.rs | 6 +++ milli/tests/search/mod.rs | 52 +++++++++++++++++++++++-- 5 files changed, 78 insertions(+), 23 deletions(-) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 51812d97a..0b2d9186f 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -6,7 +6,6 @@ mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; -use crate::BEU16; use heed::types::OwnedType; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; @@ -18,6 +17,7 @@ pub use self::facet_string_level_zero_value_codec::{ pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; +use crate::BEU16; pub type FieldIdCodec = OwnedType; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 99f474eb6..950b3a417 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -592,7 +592,7 @@ mod tests { use super::*; use crate::documents::DocumentBatchBuilder; use crate::update::DeleteDocuments; - use crate::HashMap; + use crate::{HashMap, BEU16}; #[test] fn simple_document_replacement() { @@ -2012,11 +2012,14 @@ mod tests { let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); let bitmap_colour = - index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + index.facet_id_exists_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap(); assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6, 7]); - let bitmap_colour_green = - index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + let bitmap_colour_green = index + .facet_id_exists_docids + .get(&rtxn, &BEU16::new(colour_green_id)) + .unwrap() + .unwrap(); assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6, 7]); }; diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 6383d274e..427daca8c 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -1,17 +1,17 @@ -{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":""} -{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":""} 
-{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":""} -{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":""} -{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":""} -{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":""} -{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":""} -{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":""} +{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":"", "opt1": [null]} +{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":"", "opt1": []} +{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":"", "opt1": null} +{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the 
construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":"", "opt1": 4} +{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":"", "opt1": "E"} +{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":"", "opt1": ["F"]} +{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":"", "opt1": [7]} +{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":"", "opt1": ["H", 8]} {"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":""} -{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":""} -{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":""} -{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":""} -{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using 
the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":""} -{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":""} -{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":""} -{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":""} +{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":"", "opt1": {}} +{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":"", "opt1": [{"opt2": 11}] } +{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":"", "opt1": {"opt2": [12]}} +{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":"", "opt1": [13, [{"opt2": null}]]} +{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":"", "opt1": {"a": 1, "opt2": {"opt3": 14}} } +{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":"", "opt1": [[[[]]]] } 
+{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":"", "opt1.opt2": 16} {"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":9339230,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","_geo": { "lat": 34.39548365683149, "lng": 132.4535960928883 },"":""} diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index fe926d17a..1700a1478 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -80,3 +80,9 @@ test_filter!( lower_complex_filter_2, vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])] ); +test_filter!(exists_filter_1, vec![Right("opt1 EXISTS")]); +test_filter!(exists_filter_1_not, vec![Right("opt1 NOT EXISTS")]); +test_filter!(exists_filter_1_not_alt, vec![Right("NOT opt1 EXISTS")]); +test_filter!(exists_filter_1_double_not, vec![Right("NOT opt1 NOT EXISTS")]); + +test_filter!(exists_filter_2, vec![Right("opt1.opt2 EXISTS")]); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 472fbafe0..ec784bfc0 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -9,7 +9,7 @@ use maplit::{hashmap, hashset}; use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, DocumentId, Index, Member}; -use serde::Deserialize; +use serde::{Deserialize, Deserializer}; use slice_group_by::GroupBy; mod distinct; @@ -43,6 +43,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("tag"), S("asc_desc_rank"), S("_geo"), + S("opt1"), + S("opt1.opt2") }); builder.set_sortable_fields(hashset! 
{ S("tag"), @@ -196,12 +198,44 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { id = (document.geo_rank < 100000).then(|| document.id.clone()); } else if filter.starts_with("NOT _geoRadius") { id = (document.geo_rank > 1000000).then(|| document.id.clone()); + } else if matches!(filter, "opt1 EXISTS" | "NOT opt1 NOT EXISTS") { + id = document.opt1.is_some().then(|| document.id.clone()); + } else if matches!(filter, "NOT opt1 EXISTS" | "opt1 NOT EXISTS") { + id = document.opt1.is_none().then(|| document.id.clone()); + } else if matches!(filter, "opt1.opt2 EXISTS") { + if document.opt1opt2.is_some() { + id = Some(document.id.clone()); + } else if let Some(opt1) = &document.opt1 { + id = contains_key_rec(opt1, "opt2").then(|| document.id.clone()); + } } id } +pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool { + match v { + serde_json::Value::Array(v) => { + for v in v.iter() { + if contains_key_rec(v, key) { + return true; + } + } + false + } + serde_json::Value::Object(v) => { + for (k, v) in v.iter() { + if k == key || contains_key_rec(v, key) { + return true; + } + } + false + } + _ => false, + } +} + pub fn expected_filtered_ids(filters: Vec, &str>>) -> HashSet { - let dataset: HashSet = + let dataset: Vec = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); let mut filtered_ids: HashSet<_> = dataset.iter().map(|d| d.id.clone()).collect(); @@ -229,7 +263,7 @@ pub fn expected_filtered_ids(filters: Vec, &str>>) -> HashSet, + #[serde(default, deserialize_with = "some_option", rename = "opt1.opt2")] + pub opt1opt2: Option, +} + +fn some_option<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let result = serde_json::Value::deserialize(deserializer)?; + Ok(Some(result)) } From aed8c69bcb94b83ccb2957b741584f908c22d094 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 09:57:28 +0200 Subject: [PATCH 16/18] Refactor indexation of the "facet-id-exists-docids" database The idea is to directly create a sorted and merged list of bitmaps in the form of a BTreeMap instead of creating a grenad::Reader where the keys are field_id and the values are docids. Then we send that BTreeMap to the thing that handles TypedChunks, which inserts its content into the database. --- .../extract/extract_fid_docid_facet_values.rs | 23 +++--- .../src/update/index_documents/extract/mod.rs | 35 +++++---- .../src/update/index_documents/typed_chunk.rs | 73 ++++++++++++++++--- 3 files changed, 92 insertions(+), 39 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 6d66a7a64..368378792 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,16 +1,16 @@ -use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::convert::TryInto; use std::fs::File; use std::io; use std::mem::size_of; use heed::zerocopy::AsBytes; +use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::{DocumentId, FieldId, Result, BEU32}; /// Extracts the facet values of each faceted field of each document. 
@@ -22,7 +22,7 @@ pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { +) -> Result<(grenad::Reader, grenad::Reader, BTreeMap)> { let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -30,7 +30,7 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 3), + max_memory.map(|m| m / 2), ); let mut fid_docid_facet_strings_sorter = create_sorter( @@ -38,16 +38,10 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 3), + max_memory.map(|m| m / 2), ); - let mut fid_docid_facet_exists_sorter = create_sorter( - merge_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory.map(|m| m / 3), - ); + let mut facet_exists_docids = BTreeMap::::new(); let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; @@ -65,7 +59,8 @@ pub fn extract_fid_docid_facet_values( // Here, we know already that the document must be added to the “field id exists” database let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); let document = BEU32::from(document).get(); - fid_docid_facet_exists_sorter.insert(&key_buffer, document.to_ne_bytes())?; + + facet_exists_docids.entry(field_id).or_default().insert(document); // For the other extraction tasks, prefix the key with the field_id and the document_id key_buffer.extend_from_slice(&docid_bytes); @@ -99,7 +94,7 @@ pub fn extract_fid_docid_facet_values( Ok(( sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?, sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?, - sorter_into_reader(fid_docid_facet_exists_sorter, indexer)?, + facet_exists_docids, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index bb695a99f..76d968919 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -8,12 +8,13 @@ mod extract_word_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; -use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::fs::File; use crossbeam_channel::Sender; use log::debug; use rayon::prelude::*; +use roaring::RoaringBitmap; use self::extract_docid_word_positions::extract_docid_word_positions; use self::extract_facet_number_docids::extract_facet_number_docids; @@ -72,12 +73,24 @@ pub(crate) fn data_from_obkv_documents( let ( docid_word_positions_chunks, - ( - docid_fid_facet_numbers_chunks, - (docid_fid_facet_strings_chunks, docid_fid_facet_exists_chunks), - ), + (docid_fid_facet_numbers_chunks, (docid_fid_facet_strings_chunks, facet_exists_docids)), ) = result?; + // merge facet_exists_docids hashmaps and send them as a typed chunk + { + let lmdb_writer_sx = lmdb_writer_sx.clone(); + rayon::spawn(move || { + let mut all = BTreeMap::default(); + for facet_exists_docids in facet_exists_docids { + for (field_id, docids) in facet_exists_docids { + let docids0 = all.entry(field_id).or_default(); + *docids0 |= docids; + } + } + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(all))); + }); + } + spawn_extraction_task::<_, _, Vec>>( 
docid_word_positions_chunks.clone(), indexer.clone(), @@ -141,12 +154,6 @@ pub(crate) fn data_from_obkv_documents( "field-id-facet-number-docids", ); - // spawn extraction task for field-id-facet-exists-docids - rayon::spawn(move || { - let reader = docid_fid_facet_exists_chunks.merge(merge_cbo_roaring_bitmaps, &indexer); - let _ = lmdb_writer_sx.send(reader.map(TypedChunk::FieldIdFacetExistsDocids)); - }); - Ok(()) } @@ -221,7 +228,7 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, grenad::Reader), + (grenad::Reader, BTreeMap), ), )> { let flattened_documents_chunk = @@ -266,7 +273,7 @@ fn send_and_extract_flattened_documents_data( let ( docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk, - docid_fid_facet_exists_chunk, + facet_exists_docids, ) = extract_fid_docid_facet_values( flattened_documents_chunk.clone(), indexer.clone(), @@ -291,7 +298,7 @@ fn send_and_extract_flattened_documents_data( Ok(( docid_fid_facet_numbers_chunk, - (docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk), + (docid_fid_facet_strings_chunk, facet_exists_docids), )) }, ); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e501e5efd..e1fd8f98d 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,11 +1,12 @@ use std::borrow::Cow; +use std::collections::BTreeMap; use std::convert::TryInto; use std::fs::File; use std::io; use grenad::MergerBuilder; use heed::types::ByteSlice; -use heed::{BytesDecode, RwTxn}; +use heed::{BytesDecode, BytesEncode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ @@ -16,8 +17,8 @@ use super::{ClonableMmap, MergeFn}; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ - lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, - Result, + error, lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, + GeoPoint, Index, Result, BEU16, }; pub(crate) enum TypedChunk { @@ -35,7 +36,7 @@ pub(crate) enum TypedChunk { WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), - FieldIdFacetExistsDocids(grenad::Reader), + FieldIdFacetExistsDocids(BTreeMap), GeoPoints(grenad::Reader), } @@ -147,16 +148,14 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } - TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids_iter) => { - append_entries_into_database( - facet_id_exists_docids_iter, + TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => { + write_sorted_iterator_into_database( + facet_id_exists_docids.into_iter().map(|(k, v)| (BEU16::new(k), v)), &index.facet_id_exists_docids, + "facet-id-exists-docids", wtxn, - index_is_empty, - |value, _buffer| Ok(value), merge_cbo_roaring_bitmaps, - ) - .unwrap(); + )?; is_merged_database = true; } TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { @@ -270,6 +269,58 @@ fn merge_cbo_roaring_bitmaps( )?) 
} +fn write_sorted_iterator_into_database( + mut iterator: Iter, + database: &heed::Database, + database_name: &'static str, + wtxn: &mut RwTxn, + merge_values: Merge, +) -> Result<()> +where + for<'a> KeyCodec: BytesEncode<'a, EItem = Key>, + for<'a> ValueCodec: BytesEncode<'a, EItem = Value> + BytesDecode<'a, DItem = Value>, + Iter: Iterator, + Merge: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, +{ + if database.is_empty(wtxn)? { + let mut database = database.iter_mut(wtxn)?.remap_types::(); + + while let Some((key, value)) = iterator.next() { + let key = KeyCodec::bytes_encode(&key) + .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; + if valid_lmdb_key(&key) { + let value = ValueCodec::bytes_encode(&value) + .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; + unsafe { database.append(&key, &value)? }; + } + } + + Ok(()) + } else { + let database = database.remap_types::(); + let mut buffer = Vec::new(); + while let Some((key, value)) = iterator.next() { + let key = KeyCodec::bytes_encode(&key) + .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; + if valid_lmdb_key(&key) { + let value = ValueCodec::bytes_encode(&value) + .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; + let value = match database.get(wtxn, &key)? { + Some(prev_value) => { + merge_values(&value, &prev_value, &mut buffer)?; + &buffer[..] + } + None => &value, + }; + + database.put(wtxn, &key, value)?; + } + } + + Ok(()) + } +} + /// Write provided entries in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. fn write_entries_into_database( From d0eee5ff7a970b42b1a3723c1e627dc2f2ea8608 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 13:44:36 +0200 Subject: [PATCH 17/18] Fix compiler error --- milli/src/search/facet/filter.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 7f3b928dd..903f9644f 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -287,14 +287,7 @@ impl<'a> Filter<'a> { Condition::NotExists => { let all_ids = index.documents_ids(rtxn)?; - let exist = Self::evaluate_operator( - rtxn, - index, - numbers_db, - strings_db, - field_id, - &Condition::Exists, - )?; + let exist = Self::evaluate_operator(rtxn, index, field_id, &Condition::Exists)?; let notexist = all_ids - exist; return Ok(notexist); From 1506683705b0ee01afdef14d1f07a527d6dadc17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 14:42:35 +0200 Subject: [PATCH 18/18] Avoid using too much memory when indexing facet-exists-docids --- .../extract/extract_fid_docid_facet_values.rs | 19 +++++- .../src/update/index_documents/extract/mod.rs | 29 ++++---- .../src/update/index_documents/typed_chunk.rs | 68 +++---------------- 3 files changed, 40 insertions(+), 76 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 368378792..cf116e6f5 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -5,13 +5,15 @@ use std::io; use std::mem::size_of; use heed::zerocopy::AsBytes; +use heed::BytesEncode; use roaring::RoaringBitmap; use 
serde_json::Value; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::{DocumentId, FieldId, Result, BEU32}; +use crate::update::index_documents::{create_writer, writer_into_reader}; +use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32}; /// Extracts the facet values of each faceted field of each document. /// @@ -22,7 +24,7 @@ pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader, BTreeMap)> { +) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -91,10 +93,21 @@ pub fn extract_fid_docid_facet_values( } } + let mut facet_exists_docids_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + for (fid, bitmap) in facet_exists_docids.into_iter() { + let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); + facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + } + let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; + Ok(( sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?, sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?, - facet_exists_docids, + facet_exists_docids_reader, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 76d968919..157886e63 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -8,13 +8,12 @@ mod extract_word_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; -use std::collections::{BTreeMap, HashSet}; +use std::collections::HashSet; use std::fs::File; use crossbeam_channel::Sender; use log::debug; use rayon::prelude::*; -use roaring::RoaringBitmap; use self::extract_docid_word_positions::extract_docid_word_positions; use self::extract_facet_number_docids::extract_facet_number_docids; @@ -73,21 +72,25 @@ pub(crate) fn data_from_obkv_documents( let ( docid_word_positions_chunks, - (docid_fid_facet_numbers_chunks, (docid_fid_facet_strings_chunks, facet_exists_docids)), + ( + docid_fid_facet_numbers_chunks, + (docid_fid_facet_strings_chunks, facet_exists_docids_chunks), + ), ) = result?; - // merge facet_exists_docids hashmaps and send them as a typed chunk + // merge facet_exists_docids and send them as a typed chunk { let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { - let mut all = BTreeMap::default(); - for facet_exists_docids in facet_exists_docids { - for (field_id, docids) in facet_exists_docids { - let docids0 = all.entry(field_id).or_default(); - *docids0 |= docids; + debug!("merge {} database", "facet-id-exists-docids"); + match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + Ok(reader) => { + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader))); + } + Err(e) => { + let _ = lmdb_writer_sx.send(Err(e)); } } - let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(all))); }); } @@ -228,7 +231,7 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, BTreeMap), + (grenad::Reader, grenad::Reader), ), )> { let flattened_documents_chunk = 
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 76d968919..157886e63 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -8,13 +8,12 @@ mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod extract_word_position_docids;
 
-use std::collections::{BTreeMap, HashSet};
+use std::collections::HashSet;
 use std::fs::File;
 
 use crossbeam_channel::Sender;
 use log::debug;
 use rayon::prelude::*;
-use roaring::RoaringBitmap;
 
 use self::extract_docid_word_positions::extract_docid_word_positions;
 use self::extract_facet_number_docids::extract_facet_number_docids;
@@ -73,21 +72,25 @@ pub(crate) fn data_from_obkv_documents(
     let (
         docid_word_positions_chunks,
-        (docid_fid_facet_numbers_chunks, (docid_fid_facet_strings_chunks, facet_exists_docids)),
+        (
+            docid_fid_facet_numbers_chunks,
+            (docid_fid_facet_strings_chunks, facet_exists_docids_chunks),
+        ),
     ) = result?;
 
-    // merge facet_exists_docids hashmaps and send them as a typed chunk
+    // merge facet_exists_docids and send them as a typed chunk
     {
         let lmdb_writer_sx = lmdb_writer_sx.clone();
         rayon::spawn(move || {
-            let mut all = BTreeMap::default();
-            for facet_exists_docids in facet_exists_docids {
-                for (field_id, docids) in facet_exists_docids {
-                    let docids0 = all.entry(field_id).or_default();
-                    *docids0 |= docids;
+            debug!("merge {} database", "facet-id-exists-docids");
+            match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
+                Ok(reader) => {
+                    let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
+                }
+                Err(e) => {
+                    let _ = lmdb_writer_sx.send(Err(e));
                 }
             }
-            let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(all)));
         });
     }
 
@@ -228,7 +231,7 @@ fn send_and_extract_flattened_documents_data(
     grenad::Reader<CursorClonableMmap>,
     (
         grenad::Reader<CursorClonableMmap>,
-        (grenad::Reader<CursorClonableMmap>, BTreeMap<FieldId, RoaringBitmap>),
+        (grenad::Reader<CursorClonableMmap>, grenad::Reader<File>),
     ),
 )> {
     let flattened_documents_chunk =
@@ -273,7 +276,7 @@ fn send_and_extract_flattened_documents_data(
                 let (
                     docid_fid_facet_numbers_chunk,
                     docid_fid_facet_strings_chunk,
-                    facet_exists_docids,
+                    fid_facet_exists_docids_chunk,
                 ) = extract_fid_docid_facet_values(
                     flattened_documents_chunk.clone(),
                     indexer.clone(),
@@ -298,7 +301,7 @@ fn send_and_extract_flattened_documents_data(
 
                 Ok((
                     docid_fid_facet_numbers_chunk,
-                    (docid_fid_facet_strings_chunk, facet_exists_docids),
+                    (docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk),
                 ))
             },
         );
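In the `data_from_obkv_documents` hunk above, each indexing chunk now contributes its own facet-exists-docids reader, and `merge_cbo_roaring_bitmaps` reconciles entries that share a key across chunks. Stripped of the CBO byte encoding, that merge is a plain bitmap union, roughly (helper name ours):

    use roaring::RoaringBitmap;

    // Bitmaps extracted for the same field id by different chunks are merged
    // by union: a document counts as having the field if any chunk saw it.
    fn union_bitmaps<'a>(bitmaps: impl IntoIterator<Item = &'a RoaringBitmap>) -> RoaringBitmap {
        bitmaps.into_iter().fold(RoaringBitmap::new(), |acc, b| acc | b)
    }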
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index e1fd8f98d..5b7b00c21 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -1,12 +1,11 @@
 use std::borrow::Cow;
-use std::collections::BTreeMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io;
 
 use grenad::MergerBuilder;
 use heed::types::ByteSlice;
-use heed::{BytesDecode, BytesEncode, RwTxn};
+use heed::{BytesDecode, RwTxn};
 use roaring::RoaringBitmap;
 
 use super::helpers::{
@@ -17,8 +16,8 @@ use super::{ClonableMmap, MergeFn};
 use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
 use crate::update::index_documents::helpers::as_cloneable_grenad;
 use crate::{
-    error, lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId,
-    GeoPoint, Index, Result, BEU16,
+    lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
+    Result,
 };
 
 pub(crate) enum TypedChunk {
@@ -36,7 +35,7 @@ pub(crate) enum TypedChunk {
     WordPairProximityDocids(grenad::Reader<File>),
     FieldIdFacetStringDocids(grenad::Reader<File>),
     FieldIdFacetNumberDocids(grenad::Reader<File>),
-    FieldIdFacetExistsDocids(BTreeMap<FieldId, RoaringBitmap>),
+    FieldIdFacetExistsDocids(grenad::Reader<File>),
     GeoPoints(grenad::Reader<File>),
 }
 
@@ -149,11 +148,12 @@ pub(crate) fn write_typed_chunk_into_index(
             is_merged_database = true;
         }
         TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
-            write_sorted_iterator_into_database(
-                facet_id_exists_docids.into_iter().map(|(k, v)| (BEU16::new(k), v)),
+            append_entries_into_database(
+                facet_id_exists_docids,
                 &index.facet_id_exists_docids,
-                "facet-id-exists-docids",
                 wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
                 merge_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
@@ -269,58 +269,6 @@ fn merge_cbo_roaring_bitmaps(
     )?)
 }
 
-fn write_sorted_iterator_into_database<Iter, Key, Value, KeyCodec, ValueCodec, Merge>(
-    mut iterator: Iter,
-    database: &heed::Database<KeyCodec, ValueCodec>,
-    database_name: &'static str,
-    wtxn: &mut RwTxn,
-    merge_values: Merge,
-) -> Result<()>
-where
-    for<'a> KeyCodec: BytesEncode<'a, EItem = Key>,
-    for<'a> ValueCodec: BytesEncode<'a, EItem = Value> + BytesDecode<'a, DItem = Value>,
-    Iter: Iterator<Item = (Key, Value)>,
-    Merge: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
-{
-    if database.is_empty(wtxn)? {
-        let mut database = database.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
-
-        while let Some((key, value)) = iterator.next() {
-            let key = KeyCodec::bytes_encode(&key)
-                .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-            if valid_lmdb_key(&key) {
-                let value = ValueCodec::bytes_encode(&value)
-                    .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-                unsafe { database.append(&key, &value)? };
-            }
-        }
-
-        Ok(())
-    } else {
-        let database = database.remap_types::<ByteSlice, ByteSlice>();
-        let mut buffer = Vec::new();
-        while let Some((key, value)) = iterator.next() {
-            let key = KeyCodec::bytes_encode(&key)
-                .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-            if valid_lmdb_key(&key) {
-                let value = ValueCodec::bytes_encode(&value)
-                    .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-                let value = match database.get(wtxn, &key)? {
-                    Some(prev_value) => {
-                        merge_values(&value, &prev_value, &mut buffer)?;
-                        &buffer[..]
-                    }
-                    None => &value,
-                };
-
-                database.put(wtxn, &key, value)?;
-            }
-        }
-
-        Ok(())
-    }
-}
-
 /// Write provided entries in database using serialize_value function.
 /// merge_values function is used if an entry already exist in the database.
 fn write_entries_into_database(