From 453d593ce804eac35af4f7abd1b3a88c218941d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 09:30:19 +0200 Subject: [PATCH 01/18] Add a database containing the docids where each field exists --- infos/src/main.rs | 15 ++ milli/src/heed_codec/facet/mod.rs | 40 +++++ milli/src/index.rs | 24 ++- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 11 +- .../extract/extract_facet_exists_docids.rs | 42 +++++ .../extract/extract_fid_docid_facet_values.rs | 29 +++- .../src/update/index_documents/extract/mod.rs | 47 ++++-- milli/src/update/index_documents/mod.rs | 149 ++++++++++++++++++ .../src/update/index_documents/typed_chunk.rs | 13 ++ 10 files changed, 350 insertions(+), 22 deletions(-) create mode 100644 milli/src/update/index_documents/extract/extract_facet_exists_docids.rs diff --git a/infos/src/main.rs b/infos/src/main.rs index 29a87cdcf..89aec6182 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -384,6 +384,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, + facet_id_exists_docids, exact_word_docids, exact_word_prefix_docids, field_id_docid_facet_f64s: _, @@ -402,6 +403,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let field_id_word_count_docids_name = "field_id_word_count_docids"; let facet_id_f64_docids_name = "facet_id_f64_docids"; let facet_id_string_docids_name = "facet_id_string_docids"; + let facet_id_exists_docids_name = "facet_id_exists_docids"; let documents_name = "documents"; let mut heap = BinaryHeap::with_capacity(limit + 1); @@ -544,6 +546,17 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho heap.pop(); } } + + // List the docids where the facet exists + let db = facet_id_exists_docids.remap_data_type::<ByteSlice>(); + for result in facet_values_iter(rtxn, db, facet_id)? { + let (_fid, value) = result?; + let key = format!("{}", facet_name); + heap.push(Reverse((value.len(), key, facet_id_exists_docids_name))); + if heap.len() > limit { + heap.pop(); + } + } } for result in index.all_documents(rtxn)? { @@ -984,6 +997,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a facet_id_string_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, + facet_id_exists_docids, exact_word_prefix_docids, exact_word_docids, ..
@@ -1007,6 +1021,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a FIELD_ID_WORD_COUNT_DOCIDS => field_id_word_count_docids.as_polymorph(), FACET_ID_F64_DOCIDS => facet_id_f64_docids.as_polymorph(), FACET_ID_STRING_DOCIDS => facet_id_string_docids.as_polymorph(), + FACET_ID_EXISTS_DOCIDS => facet_id_exists_docids.as_polymorph(), FIELD_ID_DOCID_FACET_F64S => field_id_docid_facet_f64s.as_polymorph(), FIELD_ID_DOCID_FACET_STRINGS => field_id_docid_facet_strings.as_polymorph(), EXACT_WORD_DOCIDS => exact_word_docids.as_polymorph(), diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index e93fb57b9..79dbffa1d 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -25,3 +25,43 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { None } } + +use crate::{try_split_array_at, DocumentId, FieldId}; +use std::borrow::Cow; +use std::convert::TryInto; + +pub struct FieldIdCodec; + +impl<'a> heed::BytesDecode<'a> for FieldIdCodec { + type DItem = FieldId; + + fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { + let (field_id_bytes, _) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); + Some(field_id) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldIdCodec { + type EItem = FieldId; + + fn bytes_encode(field_id: &Self::EItem) -> Option<Cow<[u8]>> { + Some(Cow::Owned(field_id.to_be_bytes().to_vec())) + } +} + +pub struct FieldIdDocIdCodec; + +impl<'a> heed::BytesDecode<'a> for FieldIdDocIdCodec { + type DItem = (FieldId, DocumentId); + + fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); + + let document_id_bytes = bytes[..4].try_into().ok()?; + let document_id = u32::from_be_bytes(document_id_bytes); + + Some((field_id, document_id)) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 9637b4103..816112178 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -15,7 +15,7 @@ use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, + FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec, }; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, @@ -75,6 +75,7 @@ pub mod db_name { pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; + pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids"; pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; @@ -116,6 +117,9 @@ pub struct Index { /// Maps the position of a word prefix with all the docids where this prefix appears. pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>, + /// Maps the facet field id and the docids for which this field exists + pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>, + /// Maps the facet field id, level and the number with the docids that corresponds to it. pub facet_id_f64_docids: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>, /// Maps the facet field id and the string with the original string and docids that corresponds to it.
@@ -134,7 +138,7 @@ impl Index { pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> { use db_name::*; - options.max_dbs(16); + options.max_dbs(17); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -152,6 +156,9 @@ impl Index { let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; + let facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec> = + env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; + let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_strings = env.create_database(Some(FIELD_ID_DOCID_FACET_STRINGS))?; @@ -174,6 +181,7 @@ impl Index { field_id_word_count_docids, facet_id_f64_docids, facet_id_string_docids, + facet_id_exists_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, @@ -806,6 +814,18 @@ impl Index { } } + /// Retrieve all the documents which contain this field id + pub fn exists_faceted_documents_ids( + &self, + rtxn: &RoTxn, + field_id: FieldId, + ) -> heed::Result<RoaringBitmap> { + match self.facet_id_exists_docids.get(rtxn, &field_id)? { + Some(docids) => Ok(docids), + None => Ok(RoaringBitmap::new()), + } + } + /* distinct field */ pub(crate) fn put_distinct_field( diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index d1939df7b..db438d019 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -30,6 +30,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_position_docids, facet_id_f64_docids, facet_id_string_docids, + facet_id_exists_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, documents, @@ -69,6 +70,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { field_id_word_count_docids.clear(self.wtxn)?; word_prefix_position_docids.clear(self.wtxn)?; facet_id_f64_docids.clear(self.wtxn)?; + facet_id_exists_docids.clear(self.wtxn)?; facet_id_string_docids.clear(self.wtxn)?; field_id_docid_facet_f64s.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 3b519c101..6dfdb9a7c 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -170,6 +170,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_position_docids, word_prefix_position_docids, facet_id_f64_docids, + facet_id_exists_docids, facet_id_string_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, @@ -424,11 +425,17 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_field_id_number_docids( + remove_docids_from_facet_field_id_docids( self.wtxn, facet_id_f64_docids, &self.to_delete_docids, )?; + // We delete the documents ids that are under the facet field id values.
+ remove_docids_from_facet_field_id_docids( + self.wtxn, + facet_id_exists_docids, + &self.to_delete_docids, + )?; remove_docids_from_facet_field_id_string_docids( self.wtxn, @@ -618,7 +625,7 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( Ok(()) } -fn remove_docids_from_facet_field_id_number_docids<'a, C>( +fn remove_docids_from_facet_field_id_docids<'a, C>( wtxn: &'a mut heed::RwTxn, db: &heed::Database<C, CboRoaringBitmapCodec>, to_remove: &RoaringBitmap, diff --git a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs new file mode 100644 index 000000000..e7a001c08 --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs @@ -0,0 +1,42 @@ +use std::fs::File; +use std::io; + +use heed::{BytesDecode, BytesEncode}; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, +}; +use crate::heed_codec::facet::{FieldIdCodec, FieldIdDocIdCodec}; +use crate::Result; + +/// Extracts the documents ids where this field appears. +/// +/// Returns a grenad reader whose key is the field id encoded +/// with `FieldIdCodec` and the value is a document_id (u32) +/// encoded as native-endian bytes. +#[logging_timer::time] +pub fn extract_facet_exists_docids<R: io::Read>( + docid_fid_facet_number: grenad::Reader<R>, + indexer: GrenadParameters, +) -> Result<grenad::Reader<File>> { + let max_memory = indexer.max_memory_by_thread(); + + let mut facet_exists_docids_sorter = create_sorter( + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut cursor = docid_fid_facet_number.into_cursor()?; + while let Some((key_bytes, _)) = cursor.move_on_next()?
{ + let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap(); + + let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap(); + + facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + } + + sorter_into_reader(facet_exists_docids_sorter, indexer) +} diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 628636f78..d93bde500 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -20,7 +20,7 @@ pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader)> { +) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -28,7 +28,7 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 2), + max_memory.map(|m| m / 3), ); let mut fid_docid_facet_strings_sorter = create_sorter( @@ -36,7 +36,15 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 2), + max_memory.map(|m| m / 3), + ); + + let mut fid_docid_facet_exists_sorter = create_sorter( + keep_first, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory.map(|m| m / 3), ); let mut key_buffer = Vec::new(); @@ -46,15 +54,19 @@ pub fn extract_fid_docid_facet_values( for (field_id, field_bytes) in obkv.iter() { if faceted_fields.contains(&field_id) { - let value = - serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; - let (numbers, strings) = extract_facet_values(&value); - key_buffer.clear(); + // here, we know already that the document must be added to the “field id exists” database // prefix key with the field_id and the document_id + key_buffer.extend_from_slice(&field_id.to_be_bytes()); key_buffer.extend_from_slice(&docid_bytes); + fid_docid_facet_exists_sorter.insert(&key_buffer, ().as_bytes())?; + + let value = + serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; + + let (numbers, strings) = extract_facet_values(&value); // insert facet numbers in sorter for number in numbers { @@ -79,7 +91,8 @@ pub fn extract_fid_docid_facet_values( Ok(( sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?, - sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, + sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?, + sorter_into_reader(fid_docid_facet_exists_sorter, indexer)?, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index c3c2033a6..7d26e0984 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -1,4 +1,5 @@ mod extract_docid_word_positions; +mod extract_facet_exists_docids; mod extract_facet_number_docids; mod extract_facet_string_docids; mod extract_fid_docid_facet_values; @@ -16,6 +17,7 @@ use log::debug; use rayon::prelude::*; use self::extract_docid_word_positions::extract_docid_word_positions; +use self::extract_facet_exists_docids::extract_facet_exists_docids; use 
self::extract_facet_number_docids::extract_facet_number_docids; use self::extract_facet_string_docids::extract_facet_string_docids; use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; @@ -53,7 +55,7 @@ pub(crate) fn data_from_obkv_documents( }) .collect::>()?; - let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = flattened_obkv_chunks + let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks .par_bridge() .map(|flattened_obkv_chunks| { send_and_extract_flattened_documents_data( @@ -72,7 +74,10 @@ pub(crate) fn data_from_obkv_documents( let ( docid_word_positions_chunks, - (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), + ( + docid_fid_facet_numbers_chunks, + (docid_fid_facet_strings_chunks, docid_fid_facet_exists_chunks), + ), ) = result?; spawn_extraction_task::<_, _, Vec>>( @@ -137,6 +142,15 @@ pub(crate) fn data_from_obkv_documents( TypedChunk::FieldIdFacetNumberDocids, "field-id-facet-number-docids", ); + spawn_extraction_task::<_, _, Vec>>( + docid_fid_facet_exists_chunks.clone(), + indexer.clone(), + lmdb_writer_sx.clone(), + extract_facet_exists_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::FieldIdFacetExistsDocids, + "field-id-facet-exists-docids", + ); Ok(()) } @@ -197,6 +211,7 @@ fn send_original_documents_data( /// - docid_word_positions /// - docid_fid_facet_numbers /// - docid_fid_facet_strings +/// - docid_fid_facet_exists fn send_and_extract_flattened_documents_data( flattened_documents_chunk: Result>, indexer: GrenadParameters, @@ -209,7 +224,10 @@ fn send_and_extract_flattened_documents_data( max_positions_per_attributes: Option, ) -> Result<( grenad::Reader, - (grenad::Reader, grenad::Reader), + ( + grenad::Reader, + (grenad::Reader, grenad::Reader), + ), )> { let flattened_documents_chunk = flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; @@ -250,12 +268,15 @@ fn send_and_extract_flattened_documents_data( Ok(docid_word_positions_chunk) }, || { - let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) = - extract_fid_docid_facet_values( - flattened_documents_chunk.clone(), - indexer.clone(), - faceted_fields, - )?; + let ( + docid_fid_facet_numbers_chunk, + docid_fid_facet_strings_chunk, + docid_fid_facet_exists_chunk, + ) = extract_fid_docid_facet_values( + flattened_documents_chunk.clone(), + indexer.clone(), + faceted_fields, + )?; // send docid_fid_facet_numbers_chunk to DB writer let docid_fid_facet_numbers_chunk = @@ -273,7 +294,13 @@ fn send_and_extract_flattened_documents_data( docid_fid_facet_strings_chunk.clone(), ))); - Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk)) + let docid_fid_facet_exists_chunk = + unsafe { as_cloneable_grenad(&docid_fid_facet_exists_chunk)? 
}; + + Ok(( + docid_fid_facet_numbers_chunk, + (docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk), + )) }, ); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ba428f078..82457762e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1931,4 +1931,153 @@ mod tests { assert_eq!(ids.len(), map.len()); } + + #[test] + fn index_documents_check_exists_database_reindex() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = documents!([ + { + "id": 0, + "colour": 0, + }, + { + "id": 1, + "colour": [] + }, + { + "id": 2, + "colour": {} + }, + { + "id": 3, + "colour": null + }, + { + "id": 4, + "colour": [1] + }, + { + "id": 5 + }, + { + "id": 6, + "colour": { + "green": 1 + } + } + ]); + + let config = IndexerConfig::default(); + let indexing_config = IndexDocumentsConfig::default(); + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + let faceted_fields = hashset!(S("colour")); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); + + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + + let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); + + let bitmap_colour_green = + index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); + } + + #[test] + fn index_documents_check_exists_database() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let config = IndexerConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + + let faceted_fields = hashset!(S("colour")); + builder.set_filterable_fields(faceted_fields); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + + let content = documents!([ + { + "id": 0, + "colour": 0, + }, + { + "id": 1, + "colour": [] + }, + { + "id": 2, + "colour": {} + }, + { + "id": 3, + "colour": null + }, + { + "id": 4, + "colour": [1] + }, + { + "id": 5 + }, + { + "id": 6, + "colour": { + "green": 1 + } + } + ]); + + let indexing_config = IndexDocumentsConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + + let mut builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content).unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + 
assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); + + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + + let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + assert_eq!(bitmap_colour.into_iter().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 6]); + + let bitmap_colour_green = + index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + assert_eq!(bitmap_colour_green.into_iter().collect::<Vec<_>>(), vec![6]); + } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 26b97c3a0..e501e5efd 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -35,6 +35,7 @@ pub(crate) enum TypedChunk { WordPairProximityDocids(grenad::Reader<File>), FieldIdFacetStringDocids(grenad::Reader<File>), FieldIdFacetNumberDocids(grenad::Reader<File>), + FieldIdFacetExistsDocids(grenad::Reader<File>), GeoPoints(grenad::Reader<File>), } @@ -146,6 +147,18 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids_iter) => { + append_entries_into_database( + facet_id_exists_docids_iter, + &index.facet_id_exists_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + ) + .unwrap(); + is_merged_database = true; + } TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { append_entries_into_database( word_pair_proximity_docids_iter, From a8641b42a7963666eedb9846b6aa69481a3f92ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 14 Jun 2022 15:05:49 +0200 Subject: [PATCH 02/18] Modify flatten_serde_json to keep dummy value for all object keys Example: ```json { "id": 0, "colour" : { "green": 1 } } ``` becomes: ```json { "id": 0, "colour" : [], "colour.green": 1 } ``` to retain the information that the key "colour" exists in the original JSON value.
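A quick way to check this behaviour end to end (a sketch using the crate's public `flatten` function and mirroring the tests added below; the snippet itself is not part of the patch):

```rust
use flatten_serde_json::flatten;
use serde_json::json;

fn main() {
    let mut value = json!({ "id": 0, "colour": { "green": 1 } });
    // flatten() takes the top-level object by reference
    let object = std::mem::take(value.as_object_mut().unwrap());
    let flat = flatten(&object);

    // "colour" is kept with a dummy `[]` value, so the EXISTS filter
    // can later tell that the key was present in the original document
    assert_eq!(
        &flat,
        json!({ "id": 0, "colour": [], "colour.green": 1 }).as_object().unwrap()
    );
}
```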
--- flatten-serde-json/src/lib.rs | 53 ++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/flatten-serde-json/src/lib.rs b/flatten-serde-json/src/lib.rs index 8312f5bd6..e1b2b20c7 100644 --- a/flatten-serde-json/src/lib.rs +++ b/flatten-serde-json/src/lib.rs @@ -4,7 +4,11 @@ use serde_json::{Map, Value}; pub fn flatten(json: &Map<String, Value>) -> Map<String, Value> { let mut obj = Map::new(); - insert_object(&mut obj, None, json); + let mut all_keys = vec![]; + insert_object(&mut obj, None, json, &mut all_keys); + for key in all_keys { + obj.entry(key).or_insert(Value::Array(vec![])); + } obj } @@ -12,26 +16,32 @@ fn insert_object( base_json: &mut Map<String, Value>, base_key: Option<&str>, object: &Map<String, Value>, + all_keys: &mut Vec<String>, ) { for (key, value) in object { let new_key = base_key.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}")); - + all_keys.push(new_key.clone()); if let Some(array) = value.as_array() { - insert_array(base_json, &new_key, array); + insert_array(base_json, &new_key, array, all_keys); } else if let Some(object) = value.as_object() { - insert_object(base_json, Some(&new_key), object); + insert_object(base_json, Some(&new_key), object, all_keys); } else { insert_value(base_json, &new_key, value.clone()); } } } -fn insert_array(base_json: &mut Map<String, Value>, base_key: &str, array: &Vec<Value>) { +fn insert_array( + base_json: &mut Map<String, Value>, + base_key: &str, + array: &Vec<Value>, + all_keys: &mut Vec<String>, +) { for value in array { if let Some(object) = value.as_object() { - insert_object(base_json, Some(base_key), object); + insert_object(base_json, Some(base_key), object, all_keys); } else if let Some(sub_array) = value.as_array() { - insert_array(base_json, base_key, sub_array); + insert_array(base_json, base_key, sub_array, all_keys); } else { insert_value(base_json, base_key, value.clone()); } @@ -103,6 +113,7 @@ mod tests { assert_eq!( &flat, json!({ + "a": [], "a.b": "c", "a.d": "e", "a.f": "g" @@ -116,6 +127,10 @@ fn flatten_array() { let mut base: Value = json!({ "a": [ + 1, + "b", + [], + [{}], { "b": "c" }, { "b": "d" }, { "b": "e" }, @@ -127,6 +142,7 @@ assert_eq!( &flat, json!({ + "a": [1, "b"], "a.b": ["c", "d", "e"], }) .as_object() .unwrap() ); @@ -154,6 +170,28 @@ .as_object() .unwrap() ); + + // here we must keep the null in "a" + let mut base: Value = json!({ + "a": [ + { "b": "c" }, + { "b": "d" }, + { "b": "e" }, + null, + ] + }); + let json = std::mem::take(base.as_object_mut().unwrap()); + let flat = flatten(&json); + + assert_eq!( + &flat, + json!({ + "a": null, + "a.b": ["c", "d", "e"], + }) + .as_object() + .unwrap() + ); } #[test] @@ -170,6 +208,7 @@ mod tests { assert_eq!( &flat, json!({ + "a": [], "a.b": ["c", "d"], }) .as_object() From 72452f0cb270d5225e83b31ec4d707845afd1eea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 25 May 2022 11:55:16 +0200 Subject: [PATCH 03/18] Implements the EXIST filter operator --- filter-parser/src/condition.rs | 13 ++++++++++++- filter-parser/src/error.rs | 4 ++-- filter-parser/src/lib.rs | 27 ++++++++++++++++++++++----- milli/src/search/facet/filter.rs | 19 +++++++++++++++++++ 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 264787055..0ece99a0d 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -8,7 +8,7 @@ use nom::branch::alt; use nom::bytes::complete::tag; use nom::combinator::cut; -use nom::sequence::tuple; +use nom::sequence::{terminated, tuple}; use 
Condition::*; use crate::{parse_value, FilterCondition, IResult, Span, Token}; @@ -19,6 +19,8 @@ pub enum Condition<'a> { GreaterThanOrEqual(Token<'a>), Equal(Token<'a>), NotEqual(Token<'a>), + Exist, + NotExist, LowerThan(Token<'a>), LowerThanOrEqual(Token<'a>), Between { from: Token<'a>, to: Token<'a> }, @@ -33,6 +35,8 @@ impl<'a> Condition<'a> { GreaterThanOrEqual(n) => (LowerThan(n), None), Equal(s) => (NotEqual(s), None), NotEqual(s) => (Equal(s), None), + Exist => (NotExist, None), + NotExist => (Exist, None), LowerThan(n) => (GreaterThanOrEqual(n), None), LowerThanOrEqual(n) => (GreaterThan(n), None), Between { from, to } => (LowerThan(from), Some(GreaterThan(to))), @@ -58,6 +62,13 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } +/// exist = value EXIST +pub fn parse_exist(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("EXIST"))(input)?; + + Ok((input, FilterCondition::Condition { fid: key.into(), op: Exist })) +} + /// to = value value TO value pub fn parse_to(input: Span) -> IResult { let (input, (key, from, _, to)) = diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index ddf7bea47..8136732c8 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ -128,10 +128,10 @@ impl<'a> Display for Error<'a> { writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)? } ErrorKind::InvalidPrimary if input.trim().is_empty() => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing.")? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing.")? } ErrorKind::InvalidPrimary => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `{}`.", escaped_input)? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `{}`.", escaped_input)? } ErrorKind::ExpectedEof => { writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 243d1a3f4..ee4edc122 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -6,8 +6,9 @@ //! or = and (~ "OR" ~ and) //! and = not (~ "AND" not)* //! not = ("NOT" ~ not) | primary -//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to +//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | exist | to //! condition = value ("==" | ">" ...) value +//! exist = value EXIST //! to = value value TO value //! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* //! 
singleQuoted = "'" .* all but quotes "'" @@ -42,6 +43,7 @@ mod value; use std::fmt::Debug; use std::str::FromStr; +use condition::parse_exist; pub use condition::{parse_condition, parse_to, Condition}; use error::{cut_with_err, NomErrorExt}; pub use error::{Error, ErrorKind}; @@ -248,6 +250,7 @@ fn parse_primary(input: Span) -> IResult { ), parse_geo_radius, parse_condition, + parse_exist, parse_to, // the next lines are only for error handling and are written at the end to have the less possible performance impact parse_geo_point, @@ -420,6 +423,20 @@ pub mod tests { op: Condition::LowerThan(rtok("NOT subscribers >= ", "1000")), }, ), + ( + "subscribers EXIST", + Fc::Condition { + fid: rtok("", "subscribers"), + op: Condition::Exist, + }, + ), + ( + "NOT subscribers EXIST", + Fc::Condition { + fid: rtok("NOT ", "subscribers"), + op: Condition::NotExist, + }, + ), ( "subscribers 100 TO 1000", Fc::Condition { @@ -577,10 +594,10 @@ pub mod tests { ("channel = ", "Was expecting a value but instead got nothing."), ("channel = 🐻", "Was expecting a value but instead got `🐻`."), ("channel = 🐻 AND followers < 100", "Was expecting a value but instead got `🐻`."), - ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `OR`."), - ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `AND`."), - ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO` or `_geoRadius` but instead got nothing."), + ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `OR`."), + ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `AND`."), + ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `channel Ponce`."), + ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index d89413f62..a5c13ec2a 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -280,6 +280,25 @@ impl<'a> Filter<'a> { Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse()?)), Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(val.parse()?)), Condition::Between { from, to } => (Included(from.parse()?), Included(to.parse()?)), + Condition::Exist => { + let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; + return Ok(exist); + } + Condition::NotExist => { + let all_ids = index.documents_ids(rtxn)?; + + let exist = Self::evaluate_operator( + rtxn, + index, + numbers_db, + strings_db, + field_id, + &Condition::Exist, + )?; + + let notexist = all_ids - exist; + return Ok(notexist); + } Condition::Equal(val) => { let (_original_value, string_docids) = strings_db .get(rtxn, &(field_id, &val.value().to_lowercase()))? From dc64170a69e0be95b2c9e5bb96e1a5be0b58ec06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 14 Jun 2022 16:42:09 +0200 Subject: [PATCH 04/18] =?UTF-8?q?Improve=20syntax=20of=20EXISTS=20filter,?= =?UTF-8?q?=20allow=20=E2=80=9Cvalue=20NOT=20EXISTS=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- filter-parser/src/condition.rs | 22 ++++++++++------ filter-parser/src/error.rs | 4 +-- filter-parser/src/lib.rs | 45 +++++++++++++++++++------------- milli/src/search/facet/filter.rs | 6 ++--- 4 files changed, 46 insertions(+), 31 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 0ece99a0d..b57d68b75 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -19,8 +19,8 @@ pub enum Condition<'a> { GreaterThanOrEqual(Token<'a>), Equal(Token<'a>), NotEqual(Token<'a>), - Exist, - NotExist, + Exists, + NotExists, LowerThan(Token<'a>), LowerThanOrEqual(Token<'a>), Between { from: Token<'a>, to: Token<'a> }, @@ -35,8 +35,8 @@ impl<'a> Condition<'a> { GreaterThanOrEqual(n) => (LowerThan(n), None), Equal(s) => (NotEqual(s), None), NotEqual(s) => (Equal(s), None), - Exist => (NotExist, None), - NotExist => (Exist, None), + Exists => (NotExists, None), + NotExists => (Exists, None), LowerThan(n) => (GreaterThanOrEqual(n), None), LowerThanOrEqual(n) => (GreaterThan(n), None), Between { from, to } => (LowerThan(from), Some(GreaterThan(to))), @@ -62,11 +62,17 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } -/// exist = value EXIST -pub fn parse_exist(input: Span) -> IResult { - let (input, key) = terminated(parse_value, tag("EXIST"))(input)?; +/// exist = value NOT EXISTS +pub fn parse_exists(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; - Ok((input, FilterCondition::Condition { fid: key.into(), op: Exist })) + Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists })) +} +/// exist = value NOT EXISTS +pub fn parse_not_exists(input: Span) -> IResult { + let (input, key) = terminated(parse_value, tag("NOT EXISTS"))(input)?; + + Ok((input, FilterCondition::Condition { fid: key.into(), op: NotExists })) } /// to = value value TO value diff --git a/filter-parser/src/error.rs b/filter-parser/src/error.rs index 8136732c8..a3720f7bf 100644 --- a/filter-parser/src/error.rs +++ b/filter-parser/src/error.rs @@ 
-128,10 +128,10 @@ impl<'a> Display for Error<'a> { writeln!(f, "Was expecting a value but instead got `{}`.", escaped_input)? } ErrorKind::InvalidPrimary if input.trim().is_empty() => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing.")? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing.")? } ErrorKind::InvalidPrimary => { - writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `{}`.", escaped_input)? + writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `{}`.", escaped_input)? } ErrorKind::ExpectedEof => { writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)? diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index ee4edc122..69215798c 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -3,23 +3,24 @@ //! ```text //! filter = expression ~ EOF //! expression = or -//! or = and (~ "OR" ~ and) -//! and = not (~ "AND" not)* -//! not = ("NOT" ~ not) | primary -//! primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | exist | to +//! or = and ("OR" and) +//! and = not ("AND" not)* +//! not = ("NOT" not) | primary +//! primary = (WS* "(" expression ")" WS*) | geoRadius | condition | exists | not_exists | to //! condition = value ("==" | ">" ...) value -//! exist = value EXIST +//! exists = value EXISTS +//! not_exists = value NOT EXISTS //! to = value value TO value -//! value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* +//! value = WS* ( word | singleQuoted | doubleQuoted) ~ WS* //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* ~ "_geoRadius(" ~ WS* ~ float ~ WS* ~ "," ~ WS* ~ float ~ WS* ~ "," float ~ WS* ~ ")" +//! geoRadius = WS* ~ "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" //! ``` //! //! Other BNF grammar used to handle some specific errors: //! ```text -//! geoPoint = WS* ~ "_geoPoint(" ~ (float ~ ",")* ~ ")" +//! geoPoint = WS* "_geoPoint(" (float ",")* ")" //! ``` //! //! 
Specific errors: @@ -43,8 +44,8 @@ mod value; use std::fmt::Debug; use std::str::FromStr; -use condition::parse_exist; pub use condition::{parse_condition, parse_to, Condition}; +use condition::{parse_exists, parse_not_exists}; use error::{cut_with_err, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; @@ -250,7 +251,8 @@ fn parse_primary(input: Span) -> IResult { ), parse_geo_radius, parse_condition, - parse_exist, + parse_exists, + parse_not_exists, parse_to, // the next lines are only for error handling and are written at the end to have the less possible performance impact parse_geo_point, @@ -424,17 +426,24 @@ pub mod tests { }, ), ( - "subscribers EXIST", + "subscribers EXISTS", Fc::Condition { fid: rtok("", "subscribers"), - op: Condition::Exist, + op: Condition::Exists, }, ), ( - "NOT subscribers EXIST", + "NOT subscribers EXISTS", Fc::Condition { fid: rtok("NOT ", "subscribers"), - op: Condition::NotExist, + op: Condition::NotExists, + }, + ), + ( + "subscribers NOT EXISTS", + Fc::Condition { + fid: rtok("", "subscribers"), + op: Condition::NotExists, }, ), ( @@ -594,10 +603,10 @@ pub mod tests { ("channel = ", "Was expecting a value but instead got nothing."), ("channel = 🐻", "Was expecting a value but instead got `🐻`."), ("channel = 🐻 AND followers < 100", "Was expecting a value but instead got `🐻`."), - ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `OR`."), - ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `AND`."), - ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXIST`, or `_geoRadius` but instead got nothing."), + ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `OR`."), + ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `AND`."), + ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`."), + ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index a5c13ec2a..7f3b928dd 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -280,11 +280,11 @@ impl<'a> Filter<'a> { Condition::LowerThan(val) => (Included(f64::MIN), Excluded(val.parse()?)), Condition::LowerThanOrEqual(val) => (Included(f64::MIN), Included(val.parse()?)), Condition::Between { from, to } => (Included(from.parse()?), Included(to.parse()?)), - Condition::Exist => { + Condition::Exists => { let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; return Ok(exist); } - Condition::NotExist => { + Condition::NotExists => { let all_ids = index.documents_ids(rtxn)?; let exist = Self::evaluate_operator( @@ -293,7 +293,7 @@ impl<'a> Filter<'a> { numbers_db, strings_db, field_id, - &Condition::Exist, + &Condition::Exists, )?; let notexist = all_ids - exist; From 0388b2d46382e5578824dd5b2b522cead764f35b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 08:58:41 +0200 Subject: [PATCH 05/18] Run cargo fmt --- milli/src/heed_codec/facet/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 79dbffa1d..8c5a4c118 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -26,10 +26,11 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { } } -use crate::{try_split_array_at, DocumentId, FieldId}; use std::borrow::Cow; use std::convert::TryInto; +use crate::{try_split_array_at, DocumentId, FieldId}; + pub struct FieldIdCodec; impl<'a> heed::BytesDecode<'a> for FieldIdCodec { From a5c916225077b0dfe8ba35cea702af730f692b1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 09:14:19 +0200 Subject: [PATCH 06/18] Improve parser for NOT EXISTS filter Allow multiple spaces between NOT and EXISTS --- filter-parser/src/condition.rs | 10 ++++---- filter-parser/src/lib.rs | 43 +++++++++++++++++++++++----------- filter-parser/src/value.rs | 2 +- 3 files changed, 36 insertions(+), 19 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index b57d68b75..6a5ecbe0a 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -7,11 +7,12 @@ use nom::branch::alt; use nom::bytes::complete::tag; +use nom::character::complete::multispace1; use nom::combinator::cut; use nom::sequence::{terminated, tuple}; use Condition::*; -use crate::{parse_value, FilterCondition, IResult, Span, Token}; +use crate::{parse_value, ws, FilterCondition, IResult, Span, Token}; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Condition<'a> { @@ -62,16 +63,17 @@ pub fn parse_condition(input: Span) -> IResult { Ok((input, condition)) } -/// exist = value NOT EXISTS +/// exist = value "EXISTS" pub fn parse_exists(input: Span) -> IResult { let (input, key) = terminated(parse_value, tag("EXISTS"))(input)?; Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists })) } -/// exist = value NOT EXISTS +/// exist = value "NOT" WS* "EXISTS" pub fn parse_not_exists(input: Span) -> IResult { - let (input, key) = terminated(parse_value, tag("NOT EXISTS"))(input)?; + let (input, key) = parse_value(input)?; + let (input, _) = tuple((tag("NOT"), multispace1, tag("EXISTS")))(input)?; Ok((input, FilterCondition::Condition { fid: key.into(), 
op: NotExists })) } diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 69215798c..e40519e87 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -1,21 +1,21 @@ //! BNF grammar: //! //! ```text -//! filter = expression ~ EOF +//! filter = expression EOF //! expression = or //! or = and ("OR" and) //! and = not ("AND" not)* //! not = ("NOT" not) | primary //! primary = (WS* "(" expression ")" WS*) | geoRadius | condition | exists | not_exists | to //! condition = value ("==" | ">" ...) value -//! exists = value EXISTS -//! not_exists = value NOT EXISTS -//! to = value value TO value -//! value = WS* ( word | singleQuoted | doubleQuoted) ~ WS* +//! exists = value "EXISTS" +//! not_exists = value "NOT" WS* "EXISTS" +//! to = value value "TO" value +//! value = WS* ( word | singleQuoted | doubleQuoted) WS* //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* ~ "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" +//! geoRadius = WS* "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" //! ``` //! //! Other BNF grammar used to handle some specific errors: @@ -31,7 +31,7 @@ //! field < 12 AND _geoPoint(1, 2) //! ``` //! -//! - If a user try to use a geoRadius as a value we must throw an error. +//! - If a user try to use a geoRadius as a value we must throw an error. //! ```text //! field = _geoRadius(12, 13, 14) //! ``` @@ -170,7 +170,7 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } -/// or = and (~ "OR" ~ and) +/// or = and ("OR" and) fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; // if we found a `OR` then we MUST find something next @@ -182,7 +182,7 @@ fn parse_or(input: Span) -> IResult { Ok((input, expr)) } -/// and = not (~ "AND" not)* +/// and = not ("AND" not)* fn parse_and(input: Span) -> IResult { let (input, lhs) = parse_not(input)?; // if we found a `AND` then we MUST find something next @@ -193,14 +193,14 @@ fn parse_and(input: Span) -> IResult { Ok((input, expr)) } -/// not = ("NOT" ~ not) | primary +/// not = ("NOT" not) | primary /// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. /// If we parse a `NOT` we MUST parse something behind. fn parse_not(input: Span) -> IResult { alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input) } -/// geoRadius = WS* ~ "_geoRadius(float ~ "," ~ float ~ "," float) +/// geoRadius = WS* "_geoRadius(float "," float "," float) /// If we parse `_geoRadius` we MUST parse the rest of the expression. 
fn parse_geo_radius(input: Span) -> IResult { // we want to forbid space BEFORE the _geoRadius but not after @@ -224,7 +224,7 @@ fn parse_geo_radius(input: Span) -> IResult { Ok((input, res)) } -/// geoPoint = WS* ~ "_geoPoint(float ~ "," ~ float ~ "," float) +/// geoPoint = WS* "_geoPoint(float "," float "," float) fn parse_geo_point(input: Span) -> IResult { // we want to forbid space BEFORE the _geoPoint but not after tuple(( @@ -238,7 +238,7 @@ fn parse_geo_point(input: Span) -> IResult { Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) } -/// primary = (WS* ~ "(" expression ")" ~ WS*) | geoRadius | condition | to +/// primary = (WS* "(" expression ")" WS*) | geoRadius | condition | to fn parse_primary(input: Span) -> IResult { alt(( // if we find a first parenthesis, then we must parse an expression and find the closing parenthesis @@ -266,7 +266,7 @@ pub fn parse_expression(input: Span) -> IResult { parse_or(input) } -/// filter = expression ~ EOF +/// filter = expression EOF pub fn parse_filter(input: Span) -> IResult { terminated(parse_expression, eof)(input) } @@ -446,6 +446,20 @@ pub mod tests { op: Condition::NotExists, }, ), + ( + "NOT subscribers NOT EXISTS", + Fc::Condition { + fid: rtok("NOT ", "subscribers"), + op: Condition::Exists, + }, + ), + ( + "subscribers NOT EXISTS", + Fc::Condition { + fid: rtok("", "subscribers"), + op: Condition::NotExists, + }, + ), ( "subscribers 100 TO 1000", Fc::Condition { @@ -616,6 +630,7 @@ pub mod tests { ("channel = \"ponce", "Expression `\\\"ponce` is missing the following closing delimiter: `\"`."), ("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delimiter: `)`."), ("channel = mv OR followers >= 1000)", "Found unexpected characters at the end of the filter: `)`. 
You probably forgot an `OR` or an `AND` rule."), + ("colour NOT EXIST", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`."), ]; for (input, expected) in test_case { diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 84dd21902..18ae58ae5 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -48,7 +48,7 @@ fn quoted_by(quote: char, input: Span) -> IResult { )) } -/// value = WS* ~ ( word | singleQuoted | doubleQuoted) ~ WS* +/// value = WS* ( word | singleQuoted | doubleQuoted) WS* pub fn parse_value<'a>(input: Span<'a>) -> IResult> { // to get better diagnostic message we are going to strip the left whitespaces from the input right now let (input, _) = take_while(char::is_whitespace)(input)?; From 722db7b088a86f2f4cabef93927e6815a791136e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 09:28:21 +0200 Subject: [PATCH 07/18] Ignore target directory of filter-parser/fuzz crate --- filter-parser/fuzz/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/filter-parser/fuzz/.gitignore b/filter-parser/fuzz/.gitignore index cb73742e4..9a2e1d58c 100644 --- a/filter-parser/fuzz/.gitignore +++ b/filter-parser/fuzz/.gitignore @@ -1,2 +1,3 @@ /corpus/ /artifacts/ +/target/ \ No newline at end of file From bd15f5625af5d3032f386d8c67cfb9a4d5f8330d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 15 Jun 2022 13:31:51 +0200 Subject: [PATCH 08/18] Fix compiler warning --- filter-parser/src/condition.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index 6a5ecbe0a..c63f1d926 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -12,7 +12,7 @@ use nom::combinator::cut; use nom::sequence::{terminated, tuple}; use Condition::*; -use crate::{parse_value, ws, FilterCondition, IResult, Span, Token}; +use crate::{parse_value, FilterCondition, IResult, Span, Token}; #[derive(Debug, Clone, PartialEq, Eq)] pub enum Condition<'a> { From 392472f4bb9f1f16aa27e6b9feef249f2d1f31f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 06:19:33 +0200 Subject: [PATCH 09/18] Apply suggestions from code review Co-authored-by: Tamo --- filter-parser/src/lib.rs | 6 +++--- infos/src/main.rs | 2 +- milli/src/index.rs | 3 +-- .../index_documents/extract/extract_facet_exists_docids.rs | 2 -- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index e40519e87..5cce5f4c3 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -170,7 +170,7 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } -/// or = and ("OR" and) +/// or = and ("OR" and)* fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; // if we found a `OR` then we MUST find something next @@ -200,7 +200,7 @@ fn parse_not(input: Span) -> IResult { alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input) } -/// geoRadius = WS* "_geoRadius(float "," float "," float) +/// geoRadius = WS* "_geoRadius(float WS* "," WS* float WS* "," WS* float) /// If we parse `_geoRadius` we MUST parse the rest of the expression. 
fn parse_geo_radius(input: Span) -> IResult { // we want to forbid space BEFORE the _geoRadius but not after @@ -224,7 +224,7 @@ fn parse_geo_radius(input: Span) -> IResult { Ok((input, res)) } -/// geoPoint = WS* "_geoPoint(float "," float "," float) +/// geoPoint = WS* "_geoPoint(float WS* "," WS* float WS* "," WS* float) fn parse_geo_point(input: Span) -> IResult { // we want to forbid space BEFORE the _geoPoint but not after tuple(( diff --git a/infos/src/main.rs b/infos/src/main.rs index 89aec6182..feec17557 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -551,7 +551,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho let db = facet_id_exists_docids.remap_data_type::(); for result in facet_values_iter(rtxn, db, facet_id)? { let (_fid, value) = result?; - let key = format!("{}", facet_name); + let key = facet_name.to_string(); heap.push(Reverse((value.len(), key, facet_id_exists_docids_name))); if heap.len() > limit { heap.pop(); diff --git a/milli/src/index.rs b/milli/src/index.rs index 816112178..b0897271e 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -156,8 +156,7 @@ impl Index { let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; - let facet_id_exists_docids: Database = - env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; + let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; let field_id_docid_facet_strings = diff --git a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs index e7a001c08..d25c57aea 100644 --- a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs @@ -32,9 +32,7 @@ pub fn extract_facet_exists_docids( let mut cursor = docid_fid_facet_number.into_cursor()?; while let Some((key_bytes, _)) = cursor.move_on_next()? 
{ let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap(); - let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap(); - facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } From 30bd4db0fcbcf31d80baec5892d28076c16b8577 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 08:24:16 +0200 Subject: [PATCH 10/18] Simplify indexing task for facet_exists_docids database --- milli/src/heed_codec/facet/field_id_codec.rs | 25 +++++++++++ milli/src/heed_codec/facet/mod.rs | 43 +------------------ milli/src/lib.rs | 1 + .../extract/extract_facet_exists_docids.rs | 40 ----------------- .../extract/extract_fid_docid_facet_values.rs | 24 +++++++---- .../src/update/index_documents/extract/mod.rs | 22 +++------- 6 files changed, 50 insertions(+), 105 deletions(-) create mode 100644 milli/src/heed_codec/facet/field_id_codec.rs delete mode 100644 milli/src/update/index_documents/extract/extract_facet_exists_docids.rs diff --git a/milli/src/heed_codec/facet/field_id_codec.rs b/milli/src/heed_codec/facet/field_id_codec.rs new file mode 100644 index 000000000..d147423f2 --- /dev/null +++ b/milli/src/heed_codec/facet/field_id_codec.rs @@ -0,0 +1,25 @@ +use crate::{FieldId, BEU16}; +use heed::zerocopy::AsBytes; +use std::{borrow::Cow, convert::TryInto}; + +pub struct FieldIdCodec; + +impl<'a> heed::BytesDecode<'a> for FieldIdCodec { + type DItem = FieldId; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let bytes: [u8; 2] = bytes[..2].try_into().ok()?; + let field_id = BEU16::from(bytes).get(); + Some(field_id) + } +} + +impl<'a> heed::BytesEncode<'a> for FieldIdCodec { + type EItem = FieldId; + + fn bytes_encode(field_id: &Self::EItem) -> Option> { + let field_id = BEU16::new(*field_id); + let bytes = field_id.as_bytes(); + Some(Cow::Owned(bytes.to_vec())) + } +} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 8c5a4c118..384991fd7 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -5,6 +5,7 @@ mod facet_string_level_zero_value_codec; mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; +mod field_id_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; @@ -15,6 +16,7 @@ pub use self::facet_string_level_zero_value_codec::{ pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; +pub use self::field_id_codec::FieldIdCodec; /// Tries to split a slice in half at the given middle point, /// `None` if the slice is too short. 
@@ -25,44 +27,3 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { None } } - -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::{try_split_array_at, DocumentId, FieldId}; - -pub struct FieldIdCodec; - -impl<'a> heed::BytesDecode<'a> for FieldIdCodec { - type DItem = FieldId; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, _) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - Some(field_id) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldIdCodec { - type EItem = FieldId; - - fn bytes_encode(field_id: &Self::EItem) -> Option> { - Some(Cow::Owned(field_id.to_be_bytes().to_vec())) - } -} - -pub struct FieldIdDocIdCodec; - -impl<'a> heed::BytesDecode<'a> for FieldIdDocIdCodec { - type DItem = (FieldId, DocumentId); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - - let document_id_bytes = bytes[..4].try_into().ok()?; - let document_id = u32::from_be_bytes(document_id_bytes); - - Some((field_id, document_id)) - } -} diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 81cd057d5..20fdceaec 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -49,6 +49,7 @@ pub type SmallString32 = smallstr::SmallString<[u8; 32]>; pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; +pub type BEU16 = heed::zerocopy::U16; pub type BEU32 = heed::zerocopy::U32; pub type BEU64 = heed::zerocopy::U64; pub type Attribute = u32; diff --git a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs b/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs deleted file mode 100644 index d25c57aea..000000000 --- a/milli/src/update/index_documents/extract/extract_facet_exists_docids.rs +++ /dev/null @@ -1,40 +0,0 @@ -use std::fs::File; -use std::io; - -use heed::{BytesDecode, BytesEncode}; - -use super::helpers::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, -}; -use crate::heed_codec::facet::{FieldIdCodec, FieldIdDocIdCodec}; -use crate::Result; - -/// Extracts the documents ids where this field appears. -/// -/// Returns a grenad reader whose key is the field id encoded -/// with `FieldIdCodec` and the value is a document_id (u32) -/// encoded as native-endian bytes. -#[logging_timer::time] -pub fn extract_facet_exists_docids( - docid_fid_facet_number: grenad::Reader, - indexer: GrenadParameters, -) -> Result> { - let max_memory = indexer.max_memory_by_thread(); - - let mut facet_exists_docids_sorter = create_sorter( - merge_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory, - ); - - let mut cursor = docid_fid_facet_number.into_cursor()?; - while let Some((key_bytes, _)) = cursor.move_on_next()? 
{ - let (field_id, document_id) = FieldIdDocIdCodec::bytes_decode(key_bytes).unwrap(); - let key_bytes = FieldIdCodec::bytes_encode(&field_id).unwrap(); - facet_exists_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; - } - - sorter_into_reader(facet_exists_docids_sorter, indexer) -} diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index d93bde500..c83ac49e0 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,15 +1,16 @@ +use heed::zerocopy::AsBytes; +use serde_json::Value; use std::collections::HashSet; +use std::convert::TryInto; use std::fs::File; use std::io; use std::mem::size_of; -use heed::zerocopy::AsBytes; -use serde_json::Value; - use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::{DocumentId, FieldId, Result}; +use crate::update::index_documents::merge_cbo_roaring_bitmaps; +use crate::{DocumentId, FieldId, Result, BEU32}; /// Extracts the facet values of each faceted field of each document. /// @@ -40,7 +41,7 @@ pub fn extract_fid_docid_facet_values( ); let mut fid_docid_facet_exists_sorter = create_sorter( - keep_first, + merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -56,12 +57,17 @@ pub fn extract_fid_docid_facet_values( if faceted_fields.contains(&field_id) { key_buffer.clear(); - // here, we know already that the document must be added to the “field id exists” database - // prefix key with the field_id and the document_id - + // Set key to the field_id + // Note: this encoding is consistent with FieldIdCodec key_buffer.extend_from_slice(&field_id.to_be_bytes()); + + // Here, we know already that the document must be added to the “field id exists” database + let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); + let document = BEU32::from(document).get(); + fid_docid_facet_exists_sorter.insert(&key_buffer, document.to_ne_bytes())?; + + // For the other extraction tasks, prefix the key with the field_id and the document_id key_buffer.extend_from_slice(&docid_bytes); - fid_docid_facet_exists_sorter.insert(&key_buffer, ().as_bytes())?; let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 7d26e0984..bb695a99f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -1,5 +1,4 @@ mod extract_docid_word_positions; -mod extract_facet_exists_docids; mod extract_facet_number_docids; mod extract_facet_string_docids; mod extract_fid_docid_facet_values; @@ -17,7 +16,6 @@ use log::debug; use rayon::prelude::*; use self::extract_docid_word_positions::extract_docid_word_positions; -use self::extract_facet_exists_docids::extract_facet_exists_docids; use self::extract_facet_number_docids::extract_facet_number_docids; use self::extract_facet_string_docids::extract_facet_string_docids; use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; @@ -142,15 +140,12 @@ pub(crate) fn data_from_obkv_documents( TypedChunk::FieldIdFacetNumberDocids, "field-id-facet-number-docids", ); - spawn_extraction_task::<_, _, Vec>>( - 
docid_fid_facet_exists_chunks.clone(), - indexer.clone(), - lmdb_writer_sx.clone(), - extract_facet_exists_docids, - merge_cbo_roaring_bitmaps, - TypedChunk::FieldIdFacetExistsDocids, - "field-id-facet-exists-docids", - ); + + // spawn extraction task for field-id-facet-exists-docids + rayon::spawn(move || { + let reader = docid_fid_facet_exists_chunks.merge(merge_cbo_roaring_bitmaps, &indexer); + let _ = lmdb_writer_sx.send(reader.map(TypedChunk::FieldIdFacetExistsDocids)); + }); Ok(()) } @@ -226,7 +221,7 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, grenad::Reader), + (grenad::Reader, grenad::Reader), ), )> { let flattened_documents_chunk = @@ -294,9 +289,6 @@ fn send_and_extract_flattened_documents_data( docid_fid_facet_strings_chunk.clone(), ))); - let docid_fid_facet_exists_chunk = - unsafe { as_cloneable_grenad(&docid_fid_facet_exists_chunk)? }; - Ok(( docid_fid_facet_numbers_chunk, (docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk), From c17d616250cee38d34dbad51173188ae8cc54ead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 08:41:33 +0200 Subject: [PATCH 11/18] Refactor index_documents_check_exists_database tests --- milli/src/update/index_documents/mod.rs | 222 +++++++++--------------- 1 file changed, 86 insertions(+), 136 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 82457762e..99f474eb6 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1932,152 +1932,102 @@ mod tests { assert_eq!(ids.len(), map.len()); } - #[test] - fn index_documents_check_exists_database_reindex() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let content = documents!([ - { - "id": 0, - "colour": 0, - }, - { - "id": 1, - "colour": [] - }, - { - "id": 2, - "colour": {} - }, - { - "id": 3, - "colour": null - }, - { - "id": 4, - "colour": [1] - }, - { - "id": 5 - }, - { - "id": 6, - "colour": { - "green": 1 - } - } - ]); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - builder.add_documents(content).unwrap(); - builder.execute().unwrap(); - - wtxn.commit().unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - - let faceted_fields = hashset!(S("colour")); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); - - let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); - let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); - - let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); - assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); - - let bitmap_colour_green = - index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); - assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); - } - #[test] fn 
index_documents_check_exists_database() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - let config = IndexerConfig::default(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = update::Settings::new(&mut wtxn, &index, &config); - - let faceted_fields = hashset!(S("colour")); - builder.set_filterable_fields(faceted_fields); - builder.execute(|_| ()).unwrap(); - wtxn.commit().unwrap(); - - let content = documents!([ - { - "id": 0, - "colour": 0, - }, - { - "id": 1, - "colour": [] - }, - { - "id": 2, - "colour": {} - }, - { - "id": 3, - "colour": null - }, - { - "id": 4, - "colour": [1] - }, - { - "id": 5 - }, - { - "id": 6, - "colour": { - "green": 1 - } - } - ]); - let indexing_config = IndexDocumentsConfig::default(); - let mut wtxn = index.write_txn().unwrap(); + let faceted_fields = hashset!(S("colour")); + let content = || { + documents!([ + { + "id": 0, + "colour": 0, + }, + { + "id": 1, + "colour": [] + }, + { + "id": 2, + "colour": {} + }, + { + "id": 3, + "colour": null + }, + { + "id": 4, + "colour": [1] + }, + { + "id": 5 + }, + { + "id": 6, + "colour": { + "green": 1 + } + }, + { + "id": 7, + "colour": { + "green": { + "blue": [] + } + } + } + ]) + }; + let make_index = || { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + Index::new(options, &path).unwrap() + }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) - .unwrap(); - builder.add_documents(content).unwrap(); - builder.execute().unwrap(); + let set_filterable_fields = |index: &Index| { + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + builder.set_filterable_fields(faceted_fields.clone()); + builder.execute(|_| ()).unwrap(); + wtxn.commit().unwrap(); + }; + let add_documents = |index: &Index| { + let mut wtxn = index.write_txn().unwrap(); + let mut builder = + IndexDocuments::new(&mut wtxn, index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + builder.add_documents(content()).unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + }; - wtxn.commit().unwrap(); + let check_ok = |index: &Index| { + let rtxn = index.read_txn().unwrap(); + let facets = index.faceted_fields(&rtxn).unwrap(); + assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); - let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"))); + let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); + let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); - let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); - let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); + let bitmap_colour = + index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6, 7]); - let bitmap_colour = index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); - assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6]); + let bitmap_colour_green = + index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + 
assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6, 7]); + }; - let bitmap_colour_green = - index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); - assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6]); + let index = make_index(); + add_documents(&index); + set_filterable_fields(&index); + check_ok(&index); + + let index = make_index(); + set_filterable_fields(&index); + add_documents(&index); + check_ok(&index); } } From ea0642c32d799b201c61cc81d8f979e33cc74380 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 09:12:37 +0200 Subject: [PATCH 12/18] Make filter parser more strict regarding spacing around operators OR, AND, NOT, TO must now be followed by spaces --- filter-parser/src/condition.rs | 11 ++++----- filter-parser/src/lib.rs | 45 ++++++++++++++++++++-------------- filter-parser/src/value.rs | 2 +- 3 files changed, 32 insertions(+), 26 deletions(-) diff --git a/filter-parser/src/condition.rs b/filter-parser/src/condition.rs index c63f1d926..cbf73b96a 100644 --- a/filter-parser/src/condition.rs +++ b/filter-parser/src/condition.rs @@ -44,8 +44,7 @@ impl<'a> Condition<'a> { } } } - -/// condition = value ("==" | ">" ...) value +/// condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value pub fn parse_condition(input: Span) -> IResult { let operator = alt((tag("<="), tag(">="), tag("!="), tag("<"), tag(">"), tag("="))); let (input, (fid, op, value)) = tuple((parse_value, operator, cut(parse_value)))(input)?; @@ -69,7 +68,7 @@ pub fn parse_exists(input: Span) -> IResult { Ok((input, FilterCondition::Condition { fid: key.into(), op: Exists })) } -/// exist = value "NOT" WS* "EXISTS" +/// exist = value "NOT" WS+ "EXISTS" pub fn parse_not_exists(input: Span) -> IResult { let (input, key) = parse_value(input)?; @@ -77,10 +76,10 @@ pub fn parse_not_exists(input: Span) -> IResult { Ok((input, FilterCondition::Condition { fid: key.into(), op: NotExists })) } -/// to = value value TO value +/// to = value value "TO" WS+ value pub fn parse_to(input: Span) -> IResult { - let (input, (key, from, _, to)) = - tuple((parse_value, parse_value, tag("TO"), cut(parse_value)))(input)?; + let (input, (key, from, _, _, to)) = + tuple((parse_value, parse_value, tag("TO"), multispace1, cut(parse_value)))(input)?; Ok((input, FilterCondition::Condition { fid: key, op: Between { from, to } })) } diff --git a/filter-parser/src/lib.rs b/filter-parser/src/lib.rs index 5cce5f4c3..01be432d7 100644 --- a/filter-parser/src/lib.rs +++ b/filter-parser/src/lib.rs @@ -3,19 +3,19 @@ //! ```text //! filter = expression EOF //! expression = or -//! or = and ("OR" and) -//! and = not ("AND" not)* -//! not = ("NOT" not) | primary -//! primary = (WS* "(" expression ")" WS*) | geoRadius | condition | exists | not_exists | to -//! condition = value ("==" | ">" ...) value +//! or = and ("OR" WS+ and)* +//! and = not ("AND" WS+ not)* +//! not = ("NOT" WS+ not) | primary +//! primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to +//! condition = value ("=" | "!=" | ">" | ">=" | "<" | "<=") value //! exists = value "EXISTS" -//! not_exists = value "NOT" WS* "EXISTS" -//! to = value value "TO" value -//! value = WS* ( word | singleQuoted | doubleQuoted) WS* +//! not_exists = value "NOT" WS+ "EXISTS" +//! to = value value "TO" WS+ value +//! value = WS* ( word | singleQuoted | doubleQuoted) WS+ //! singleQuoted = "'" .* all but quotes "'" //! doubleQuoted = "\"" .* all but double quotes "\"" //! 
word = (alphanumeric | _ | - | .)+ -//! geoRadius = WS* "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" +//! geoRadius = "_geoRadius(" WS* float WS* "," WS* float WS* "," float WS* ")" //! ``` //! //! Other BNF grammar used to handle some specific errors: @@ -50,7 +50,7 @@ use error::{cut_with_err, NomErrorExt}; pub use error::{Error, ErrorKind}; use nom::branch::alt; use nom::bytes::complete::tag; -use nom::character::complete::{char, multispace0}; +use nom::character::complete::{char, multispace0, multispace1}; use nom::combinator::{cut, eof, map}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; @@ -170,11 +170,11 @@ fn ws<'a, O>(inner: impl FnMut(Span<'a>) -> IResult) -> impl FnMut(Span<'a>) delimited(multispace0, inner, multispace0) } -/// or = and ("OR" and)* +/// or = and ("OR" WS+ and)* fn parse_or(input: Span) -> IResult { let (input, lhs) = parse_and(input)?; // if we found a `OR` then we MUST find something next - let (input, ors) = many0(preceded(ws(tag("OR")), cut(parse_and)))(input)?; + let (input, ors) = many0(preceded(ws(tuple((tag("OR"), multispace1))), cut(parse_and)))(input)?; let expr = ors .into_iter() @@ -186,24 +186,28 @@ fn parse_or(input: Span) -> IResult { fn parse_and(input: Span) -> IResult { let (input, lhs) = parse_not(input)?; // if we found a `AND` then we MUST find something next - let (input, ors) = many0(preceded(ws(tag("AND")), cut(parse_not)))(input)?; + let (input, ors) = + many0(preceded(ws(tuple((tag("AND"), multispace1))), cut(parse_not)))(input)?; let expr = ors .into_iter() .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); Ok((input, expr)) } -/// not = ("NOT" not) | primary -/// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. +/// not = ("NOT" WS+ not) | primary +/// We can have multiple consecutive not, eg: `NOT NOT channel = mv`. /// If we parse a `NOT` we MUST parse something behind. fn parse_not(input: Span) -> IResult { - alt((map(preceded(tag("NOT"), cut(parse_not)), |e| e.negate()), parse_primary))(input) + alt(( + map(preceded(ws(tuple((tag("NOT"), multispace1))), cut(parse_not)), |e| e.negate()), + parse_primary, + ))(input) } /// geoRadius = WS* "_geoRadius(float WS* "," WS* float WS* "," WS* float) /// If we parse `_geoRadius` we MUST parse the rest of the expression. 
fn parse_geo_radius(input: Span) -> IResult { - // we want to forbid space BEFORE the _geoRadius but not after + // we want to allow space BEFORE the _geoRadius but not after let parsed = preceded( tuple((multispace0, tag("_geoRadius"))), // if we were able to parse `_geoRadius` and can't parse the rest of the input we return a failure @@ -238,7 +242,7 @@ fn parse_geo_point(input: Span) -> IResult { Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) } -/// primary = (WS* "(" expression ")" WS*) | geoRadius | condition | to +/// primary = (WS* "(" WS* expression WS* ")" WS*) | geoRadius | condition | exists | not_exists | to fn parse_primary(input: Span) -> IResult { alt(( // if we find a first parenthesis, then we must parse an expression and find the closing parenthesis @@ -620,7 +624,7 @@ pub mod tests { ("OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `OR`."), ("AND", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `AND`."), ("channel Ponce", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `channel Ponce`."), - ("channel = Ponce OR", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` but instead got nothing."), + ("channel = Ponce OR", "Found unexpected characters at the end of the filter: `OR`. You probably forgot an `OR` or an `AND` rule."), ("_geoRadius", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoRadius = 12", "The `_geoRadius` filter expects three arguments: `_geoRadius(latitude, longitude, radius)`."), ("_geoPoint(12, 13, 14)", "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance) built-in rule to filter on `_geo` coordinates."), @@ -631,6 +635,9 @@ pub mod tests { ("channel = mv OR (followers >= 1000", "Expression `(followers >= 1000` is missing the following closing delimiter: `)`."), ("channel = mv OR followers >= 1000)", "Found unexpected characters at the end of the filter: `)`. You probably forgot an `OR` or an `AND` rule."), ("colour NOT EXIST", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `colour NOT EXIST`."), + ("subscribers 100 TO1000", "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `TO`, `EXISTS`, `NOT EXISTS`, or `_geoRadius` at `subscribers 100 TO1000`."), + ("channel = ponce ORdog != 'bernese mountain'", "Found unexpected characters at the end of the filter: `ORdog != \\'bernese mountain\\'`. You probably forgot an `OR` or an `AND` rule."), + ("channel = ponce AND'dog' != 'bernese mountain'", "Found unexpected characters at the end of the filter: `AND\\'dog\\' != \\'bernese mountain\\'`. 
You probably forgot an `OR` or an `AND` rule."), ]; for (input, expected) in test_case { diff --git a/filter-parser/src/value.rs b/filter-parser/src/value.rs index 18ae58ae5..22da6a0df 100644 --- a/filter-parser/src/value.rs +++ b/filter-parser/src/value.rs @@ -48,7 +48,7 @@ fn quoted_by(quote: char, input: Span) -> IResult { )) } -/// value = WS* ( word | singleQuoted | doubleQuoted) WS* +/// value = WS* ( word | singleQuoted | doubleQuoted) WS+ pub fn parse_value<'a>(input: Span<'a>) -> IResult> { // to get better diagnostic message we are going to strip the left whitespaces from the input right now let (input, _) = take_while(char::is_whitespace)(input)?; From 80b962b4f4aeacaec68e22279e3349adf3138746 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 16 Jun 2022 09:16:53 +0200 Subject: [PATCH 13/18] Run cargo fmt --- milli/src/heed_codec/facet/field_id_codec.rs | 7 +++++-- .../extract/extract_fid_docid_facet_values.rs | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/milli/src/heed_codec/facet/field_id_codec.rs b/milli/src/heed_codec/facet/field_id_codec.rs index d147423f2..871b05a09 100644 --- a/milli/src/heed_codec/facet/field_id_codec.rs +++ b/milli/src/heed_codec/facet/field_id_codec.rs @@ -1,6 +1,9 @@ -use crate::{FieldId, BEU16}; +use std::borrow::Cow; +use std::convert::TryInto; + use heed::zerocopy::AsBytes; -use std::{borrow::Cow, convert::TryInto}; + +use crate::{FieldId, BEU16}; pub struct FieldIdCodec; diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index c83ac49e0..6d66a7a64 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,11 +1,12 @@ -use heed::zerocopy::AsBytes; -use serde_json::Value; use std::collections::HashSet; use std::convert::TryInto; use std::fs::File; use std::io; use std::mem::size_of; +use heed::zerocopy::AsBytes; +use serde_json::Value; + use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; From 4f0bd317dff811a74a86649d4c6e064f31949c15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 4 Jul 2022 08:41:54 +0200 Subject: [PATCH 14/18] Remove custom implementation of BytesEncode/Decode for the FieldId --- milli/src/heed_codec/facet/field_id_codec.rs | 28 -------------------- milli/src/heed_codec/facet/mod.rs | 7 +++-- milli/src/index.rs | 4 +-- 3 files changed, 7 insertions(+), 32 deletions(-) delete mode 100644 milli/src/heed_codec/facet/field_id_codec.rs diff --git a/milli/src/heed_codec/facet/field_id_codec.rs b/milli/src/heed_codec/facet/field_id_codec.rs deleted file mode 100644 index 871b05a09..000000000 --- a/milli/src/heed_codec/facet/field_id_codec.rs +++ /dev/null @@ -1,28 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use heed::zerocopy::AsBytes; - -use crate::{FieldId, BEU16}; - -pub struct FieldIdCodec; - -impl<'a> heed::BytesDecode<'a> for FieldIdCodec { - type DItem = FieldId; - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let bytes: [u8; 2] = bytes[..2].try_into().ok()?; - let field_id = BEU16::from(bytes).get(); - Some(field_id) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldIdCodec { - type EItem = FieldId; - - fn bytes_encode(field_id: &Self::EItem) -> Option> { - let field_id = 
BEU16::new(*field_id); - let bytes = field_id.as_bytes(); - Some(Cow::Owned(bytes.to_vec())) - } -} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 384991fd7..51812d97a 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -5,7 +5,9 @@ mod facet_string_level_zero_value_codec; mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; -mod field_id_codec; + +use crate::BEU16; +use heed::types::OwnedType; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; @@ -16,7 +18,8 @@ pub use self::facet_string_level_zero_value_codec::{ pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; -pub use self::field_id_codec::FieldIdCodec; + +pub type FieldIdCodec = OwnedType; /// Tries to split a slice in half at the given middle point, /// `None` if the slice is too short. diff --git a/milli/src/index.rs b/milli/src/index.rs index b0897271e..aec7aa396 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -21,7 +21,7 @@ use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, - Search, StrBEU32Codec, StrStrU8Codec, BEU32, + Search, StrBEU32Codec, StrStrU8Codec, BEU16, BEU32, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -819,7 +819,7 @@ impl Index { rtxn: &RoTxn, field_id: FieldId, ) -> heed::Result { - match self.facet_id_exists_docids.get(rtxn, &field_id)? { + match self.facet_id_exists_docids.get(rtxn, &BEU16::new(field_id))? 
{ Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), } From 1eb1e73bb3d909d2be54ff835126f008b812aa9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 4 Jul 2022 09:28:23 +0200 Subject: [PATCH 15/18] Add integration tests for the EXISTS filter --- milli/src/heed_codec/facet/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 11 ++++-- milli/tests/assets/test_set.ndjson | 30 +++++++------- milli/tests/search/filters.rs | 6 +++ milli/tests/search/mod.rs | 52 +++++++++++++++++++++++-- 5 files changed, 78 insertions(+), 23 deletions(-) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 51812d97a..0b2d9186f 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -6,7 +6,6 @@ mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; -use crate::BEU16; use heed::types::OwnedType; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; @@ -18,6 +17,7 @@ pub use self::facet_string_level_zero_value_codec::{ pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; +use crate::BEU16; pub type FieldIdCodec = OwnedType; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 99f474eb6..950b3a417 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -592,7 +592,7 @@ mod tests { use super::*; use crate::documents::DocumentBatchBuilder; use crate::update::DeleteDocuments; - use crate::HashMap; + use crate::{HashMap, BEU16}; #[test] fn simple_document_replacement() { @@ -2012,11 +2012,14 @@ mod tests { let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); let bitmap_colour = - index.facet_id_exists_docids.get(&rtxn, &colour_id).unwrap().unwrap(); + index.facet_id_exists_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap(); assert_eq!(bitmap_colour.into_iter().collect::>(), vec![0, 1, 2, 3, 4, 6, 7]); - let bitmap_colour_green = - index.facet_id_exists_docids.get(&rtxn, &colour_green_id).unwrap().unwrap(); + let bitmap_colour_green = index + .facet_id_exists_docids + .get(&rtxn, &BEU16::new(colour_green_id)) + .unwrap() + .unwrap(); assert_eq!(bitmap_colour_green.into_iter().collect::>(), vec![6, 7]); }; diff --git a/milli/tests/assets/test_set.ndjson b/milli/tests/assets/test_set.ndjson index 6383d274e..427daca8c 100644 --- a/milli/tests/assets/test_set.ndjson +++ b/milli/tests/assets/test_set.ndjson @@ -1,17 +1,17 @@ -{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":""} -{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":""} 
-{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":""} -{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":""} -{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":""} -{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":""} -{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":""} -{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":""} +{"id":"A","word_rank":0,"typo_rank":1,"proximity_rank":15,"attribute_rank":505,"exact_rank":5,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":43,"title":"hell o","description":"hell o is the fourteenth episode of the american television series glee performing songs with this word","tag":"blue","_geo": { "lat": 50.62984446145472, "lng": 3.085712705162039 },"":"", "opt1": [null]} +{"id":"B","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":191,"title":"hello","description":"hello is a song recorded by english singer songwriter adele","tag":"red","_geo": { "lat": 50.63047567664291, "lng": 3.088852230809636 },"":"", "opt1": []} +{"id":"C","word_rank":0,"typo_rank":1,"proximity_rank":8,"attribute_rank":336,"exact_rank":4,"asc_desc_rank":2,"sort_by_rank":0,"geo_rank":283,"title":"hell on earth","description":"hell on earth is the third studio album by american hip hop duo mobb deep","tag":"blue","_geo": { "lat": 50.6321800003937, "lng": 3.088331882262139 },"":"", "opt1": null} +{"id":"D","word_rank":0,"typo_rank":1,"proximity_rank":10,"attribute_rank":757,"exact_rank":4,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":1381,"title":"hell on wheels tv series","description":"the 
construction of the first transcontinental railroad across the united states in the world","tag":"red","_geo": { "lat": 50.63728851135729, "lng": 3.0703951595971626 },"":"", "opt1": 4} +{"id":"E","word_rank":2,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":4,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":1979,"title":"hello kitty","description":"also known by her full name kitty white is a fictional character produced by the japanese company sanrio","tag":"green","_geo": { "lat": 50.64264610511925, "lng": 3.0665099941857634 },"":"", "opt1": "E"} +{"id":"F","word_rank":2,"typo_rank":1,"proximity_rank":0,"attribute_rank":1017,"exact_rank":5,"asc_desc_rank":5,"sort_by_rank":0,"geo_rank":65022,"title":"laptop orchestra","description":"a laptop orchestra lork or lo is a chamber music ensemble consisting primarily of laptops like helo huddersfield experimental laptop orchestra","tag":"blue","_geo": { "lat": 51.05028653642387, "lng": 3.7301072771642096 },"":"", "opt1": ["F"]} +{"id":"G","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":5,"sort_by_rank":2,"geo_rank":34692,"title":"hello world film","description":"hello world is a 2019 japanese animated sci fi romantic drama film directed by tomohiko ito and produced by graphinica","tag":"red","_geo": { "lat": 50.78776041427129, "lng": 2.661201766290338 },"":"", "opt1": [7]} +{"id":"H","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":202182,"title":"world hello day","description":"holiday observed on november 21 to express that conflicts should be resolved through communication rather than the use of force","tag":"green","_geo": { "lat": 48.875617484531965, "lng": 2.346747821504194 },"":"", "opt1": ["H", 8]} {"id":"I","word_rank":0,"typo_rank":0,"proximity_rank":8,"attribute_rank":338,"exact_rank":3,"asc_desc_rank":3,"sort_by_rank":0,"geo_rank":740667,"title":"hello world song","description":"hello world is a song written by tom douglas tony lane and david lee and recorded by american country music group lady antebellum","tag":"blue","_geo": { "lat": 43.973998070351065, "lng": 3.4661837318345032 },"":""} -{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":""} -{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":""} -{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":""} -{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using 
the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":""} -{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":""} -{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":""} -{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":""} +{"id":"J","word_rank":1,"typo_rank":0,"proximity_rank":1,"attribute_rank":1,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":739020,"title":"hello cruel world","description":"hello cruel world is an album by new zealand band tall dwarfs","tag":"green","_geo": { "lat": 43.98920130353838, "lng": 3.480519311627928 },"":"", "opt1": {}} +{"id":"K","word_rank":0,"typo_rank":2,"proximity_rank":9,"attribute_rank":670,"exact_rank":5,"asc_desc_rank":1,"sort_by_rank":2,"geo_rank":738830,"title":"hallo creation system","description":"in few word hallo was a construction toy created by the american company mattel to engage girls in construction play","tag":"red","_geo": { "lat": 43.99155030238669, "lng": 3.503453528249425 },"":"", "opt1": [{"opt2": 11}] } +{"id":"L","word_rank":0,"typo_rank":0,"proximity_rank":2,"attribute_rank":250,"exact_rank":4,"asc_desc_rank":0,"sort_by_rank":0,"geo_rank":737861,"title":"good morning world","description":"good morning world is an american sitcom broadcast on cbs tv during the 1967 1968 season","tag":"blue","_geo": { "lat": 44.000507750283695, "lng": 3.5116812040621572 },"":"", "opt1": {"opt2": [12]}} +{"id":"M","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":0,"asc_desc_rank":0,"sort_by_rank":2,"geo_rank":739203,"title":"hello world america","description":"a perfect match for a perfect engine using the query hello world america","tag":"red","_geo": { "lat": 43.99150729038736, "lng": 3.606143957295055 },"":"", "opt1": [13, [{"opt2": null}]]} +{"id":"N","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":1,"asc_desc_rank":4,"sort_by_rank":1,"geo_rank":9499586,"title":"hello world america unleashed","description":"a very good match for a very good engine using the query hello world america","tag":"green","_geo": { "lat": 35.511540843367115, "lng": 138.764368875787 },"":"", "opt1": {"a": 1, "opt2": {"opt3": 14}} } +{"id":"O","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":10,"exact_rank":0,"asc_desc_rank":6,"sort_by_rank":0,"geo_rank":9425163,"title":"a perfect match for a perfect engine using the query hello world america","description":"hello world america","tag":"blue","_geo": { "lat": 35.00536702277189, "lng": 135.76118763940391 },"":"", "opt1": [[[[]]]] } 
+{"id":"P","word_rank":0,"typo_rank":0,"proximity_rank":0,"attribute_rank":12,"exact_rank":1,"asc_desc_rank":3,"sort_by_rank":2,"geo_rank":9422437,"title":"a very good match for a very good engine using the query hello world america","description":"hello world america unleashed","tag":"red","_geo": { "lat": 35.06462306367058, "lng": 135.8338440354251 },"":"", "opt1.opt2": 16} {"id":"Q","word_rank":1,"typo_rank":0,"proximity_rank":0,"attribute_rank":0,"exact_rank":3,"asc_desc_rank":2,"sort_by_rank":1,"geo_rank":9339230,"title":"hello world","description":"a hello world program generally is a computer program that outputs or displays the message hello world","tag":"green","_geo": { "lat": 34.39548365683149, "lng": 132.4535960928883 },"":""} diff --git a/milli/tests/search/filters.rs b/milli/tests/search/filters.rs index fe926d17a..1700a1478 100644 --- a/milli/tests/search/filters.rs +++ b/milli/tests/search/filters.rs @@ -80,3 +80,9 @@ test_filter!( lower_complex_filter_2, vec![Left(vec!["tag=red", "tag=green"]), Left(vec!["asc_desc_rank<3", "asc_desc_rank<1"])] ); +test_filter!(exists_filter_1, vec![Right("opt1 EXISTS")]); +test_filter!(exists_filter_1_not, vec![Right("opt1 NOT EXISTS")]); +test_filter!(exists_filter_1_not_alt, vec![Right("NOT opt1 EXISTS")]); +test_filter!(exists_filter_1_double_not, vec![Right("NOT opt1 NOT EXISTS")]); + +test_filter!(exists_filter_2, vec![Right("opt1.opt2 EXISTS")]); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 472fbafe0..ec784bfc0 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -9,7 +9,7 @@ use maplit::{hashmap, hashset}; use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, DocumentId, Index, Member}; -use serde::Deserialize; +use serde::{Deserialize, Deserializer}; use slice_group_by::GroupBy; mod distinct; @@ -43,6 +43,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { S("tag"), S("asc_desc_rank"), S("_geo"), + S("opt1"), + S("opt1.opt2") }); builder.set_sortable_fields(hashset! 
{ S("tag"), @@ -196,12 +198,44 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option { id = (document.geo_rank < 100000).then(|| document.id.clone()); } else if filter.starts_with("NOT _geoRadius") { id = (document.geo_rank > 1000000).then(|| document.id.clone()); + } else if matches!(filter, "opt1 EXISTS" | "NOT opt1 NOT EXISTS") { + id = document.opt1.is_some().then(|| document.id.clone()); + } else if matches!(filter, "NOT opt1 EXISTS" | "opt1 NOT EXISTS") { + id = document.opt1.is_none().then(|| document.id.clone()); + } else if matches!(filter, "opt1.opt2 EXISTS") { + if document.opt1opt2.is_some() { + id = Some(document.id.clone()); + } else if let Some(opt1) = &document.opt1 { + id = contains_key_rec(opt1, "opt2").then(|| document.id.clone()); + } } id } +pub fn contains_key_rec(v: &serde_json::Value, key: &str) -> bool { + match v { + serde_json::Value::Array(v) => { + for v in v.iter() { + if contains_key_rec(v, key) { + return true; + } + } + false + } + serde_json::Value::Object(v) => { + for (k, v) in v.iter() { + if k == key || contains_key_rec(v, key) { + return true; + } + } + false + } + _ => false, + } +} + pub fn expected_filtered_ids(filters: Vec, &str>>) -> HashSet { - let dataset: HashSet = + let dataset: Vec = serde_json::Deserializer::from_str(CONTENT).into_iter().map(|r| r.unwrap()).collect(); let mut filtered_ids: HashSet<_> = dataset.iter().map(|d| d.id.clone()).collect(); @@ -229,7 +263,7 @@ pub fn expected_filtered_ids(filters: Vec, &str>>) -> HashSet, + #[serde(default, deserialize_with = "some_option", rename = "opt1.opt2")] + pub opt1opt2: Option, +} + +fn some_option<'de, D>(deserializer: D) -> Result, D::Error> +where + D: Deserializer<'de>, +{ + let result = serde_json::Value::deserialize(deserializer)?; + Ok(Some(result)) } From aed8c69bcb94b83ccb2957b741584f908c22d094 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 09:57:28 +0200 Subject: [PATCH 16/18] Refactor indexation of the "facet-id-exists-docids" database The idea is to directly create a sorted and merged list of bitmaps in the form of a BTreeMap instead of creating a grenad::Reader where the keys are field_id and the values are docids. Then we send that BTreeMap to the thing that handles TypedChunks, which inserts its content into the database. --- .../extract/extract_fid_docid_facet_values.rs | 23 +++--- .../src/update/index_documents/extract/mod.rs | 35 +++++---- .../src/update/index_documents/typed_chunk.rs | 73 ++++++++++++++++--- 3 files changed, 92 insertions(+), 39 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 6d66a7a64..368378792 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -1,16 +1,16 @@ -use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::convert::TryInto; use std::fs::File; use std::io; use std::mem::size_of; use heed::zerocopy::AsBytes; +use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::{DocumentId, FieldId, Result, BEU32}; /// Extracts the facet values of each faceted field of each document. 
@@ -22,7 +22,7 @@ pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { +) -> Result<(grenad::Reader, grenad::Reader, BTreeMap)> { let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -30,7 +30,7 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 3), + max_memory.map(|m| m / 2), ); let mut fid_docid_facet_strings_sorter = create_sorter( @@ -38,16 +38,10 @@ pub fn extract_fid_docid_facet_values( indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, - max_memory.map(|m| m / 3), + max_memory.map(|m| m / 2), ); - let mut fid_docid_facet_exists_sorter = create_sorter( - merge_cbo_roaring_bitmaps, - indexer.chunk_compression_type, - indexer.chunk_compression_level, - indexer.max_nb_chunks, - max_memory.map(|m| m / 3), - ); + let mut facet_exists_docids = BTreeMap::::new(); let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; @@ -65,7 +59,8 @@ pub fn extract_fid_docid_facet_values( // Here, we know already that the document must be added to the “field id exists” database let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); let document = BEU32::from(document).get(); - fid_docid_facet_exists_sorter.insert(&key_buffer, document.to_ne_bytes())?; + + facet_exists_docids.entry(field_id).or_default().insert(document); // For the other extraction tasks, prefix the key with the field_id and the document_id key_buffer.extend_from_slice(&docid_bytes); @@ -99,7 +94,7 @@ pub fn extract_fid_docid_facet_values( Ok(( sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?, sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?, - sorter_into_reader(fid_docid_facet_exists_sorter, indexer)?, + facet_exists_docids, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index bb695a99f..76d968919 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -8,12 +8,13 @@ mod extract_word_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; -use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::fs::File; use crossbeam_channel::Sender; use log::debug; use rayon::prelude::*; +use roaring::RoaringBitmap; use self::extract_docid_word_positions::extract_docid_word_positions; use self::extract_facet_number_docids::extract_facet_number_docids; @@ -72,12 +73,24 @@ pub(crate) fn data_from_obkv_documents( let ( docid_word_positions_chunks, - ( - docid_fid_facet_numbers_chunks, - (docid_fid_facet_strings_chunks, docid_fid_facet_exists_chunks), - ), + (docid_fid_facet_numbers_chunks, (docid_fid_facet_strings_chunks, facet_exists_docids)), ) = result?; + // merge facet_exists_docids hashmaps and send them as a typed chunk + { + let lmdb_writer_sx = lmdb_writer_sx.clone(); + rayon::spawn(move || { + let mut all = BTreeMap::default(); + for facet_exists_docids in facet_exists_docids { + for (field_id, docids) in facet_exists_docids { + let docids0 = all.entry(field_id).or_default(); + *docids0 |= docids; + } + } + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(all))); + }); + } + spawn_extraction_task::<_, _, Vec>>( 
docid_word_positions_chunks.clone(), indexer.clone(), @@ -141,12 +154,6 @@ pub(crate) fn data_from_obkv_documents( "field-id-facet-number-docids", ); - // spawn extraction task for field-id-facet-exists-docids - rayon::spawn(move || { - let reader = docid_fid_facet_exists_chunks.merge(merge_cbo_roaring_bitmaps, &indexer); - let _ = lmdb_writer_sx.send(reader.map(TypedChunk::FieldIdFacetExistsDocids)); - }); - Ok(()) } @@ -221,7 +228,7 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, grenad::Reader), + (grenad::Reader, BTreeMap), ), )> { let flattened_documents_chunk = @@ -266,7 +273,7 @@ fn send_and_extract_flattened_documents_data( let ( docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk, - docid_fid_facet_exists_chunk, + facet_exists_docids, ) = extract_fid_docid_facet_values( flattened_documents_chunk.clone(), indexer.clone(), @@ -291,7 +298,7 @@ fn send_and_extract_flattened_documents_data( Ok(( docid_fid_facet_numbers_chunk, - (docid_fid_facet_strings_chunk, docid_fid_facet_exists_chunk), + (docid_fid_facet_strings_chunk, facet_exists_docids), )) }, ); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e501e5efd..e1fd8f98d 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -1,11 +1,12 @@ use std::borrow::Cow; +use std::collections::BTreeMap; use std::convert::TryInto; use std::fs::File; use std::io; use grenad::MergerBuilder; use heed::types::ByteSlice; -use heed::{BytesDecode, RwTxn}; +use heed::{BytesDecode, BytesEncode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ @@ -16,8 +17,8 @@ use super::{ClonableMmap, MergeFn}; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ - lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, - Result, + error, lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, + GeoPoint, Index, Result, BEU16, }; pub(crate) enum TypedChunk { @@ -35,7 +36,7 @@ pub(crate) enum TypedChunk { WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), - FieldIdFacetExistsDocids(grenad::Reader), + FieldIdFacetExistsDocids(BTreeMap), GeoPoints(grenad::Reader), } @@ -147,16 +148,14 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } - TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids_iter) => { - append_entries_into_database( - facet_id_exists_docids_iter, + TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => { + write_sorted_iterator_into_database( + facet_id_exists_docids.into_iter().map(|(k, v)| (BEU16::new(k), v)), &index.facet_id_exists_docids, + "facet-id-exists-docids", wtxn, - index_is_empty, - |value, _buffer| Ok(value), merge_cbo_roaring_bitmaps, - ) - .unwrap(); + )?; is_merged_database = true; } TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { @@ -270,6 +269,58 @@ fn merge_cbo_roaring_bitmaps( )?) 
} +fn write_sorted_iterator_into_database( + mut iterator: Iter, + database: &heed::Database, + database_name: &'static str, + wtxn: &mut RwTxn, + merge_values: Merge, +) -> Result<()> +where + for<'a> KeyCodec: BytesEncode<'a, EItem = Key>, + for<'a> ValueCodec: BytesEncode<'a, EItem = Value> + BytesDecode<'a, DItem = Value>, + Iter: Iterator, + Merge: Fn(&[u8], &[u8], &mut Vec) -> Result<()>, +{ + if database.is_empty(wtxn)? { + let mut database = database.iter_mut(wtxn)?.remap_types::(); + + while let Some((key, value)) = iterator.next() { + let key = KeyCodec::bytes_encode(&key) + .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; + if valid_lmdb_key(&key) { + let value = ValueCodec::bytes_encode(&value) + .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; + unsafe { database.append(&key, &value)? }; + } + } + + Ok(()) + } else { + let database = database.remap_types::(); + let mut buffer = Vec::new(); + while let Some((key, value)) = iterator.next() { + let key = KeyCodec::bytes_encode(&key) + .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; + if valid_lmdb_key(&key) { + let value = ValueCodec::bytes_encode(&value) + .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?; + let value = match database.get(wtxn, &key)? { + Some(prev_value) => { + merge_values(&value, &prev_value, &mut buffer)?; + &buffer[..] + } + None => &value, + }; + + database.put(wtxn, &key, value)?; + } + } + + Ok(()) + } +} + /// Write provided entries in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. fn write_entries_into_database( From d0eee5ff7a970b42b1a3723c1e627dc2f2ea8608 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 13:44:36 +0200 Subject: [PATCH 17/18] Fix compiler error --- milli/src/search/facet/filter.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 7f3b928dd..903f9644f 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -287,14 +287,7 @@ impl<'a> Filter<'a> { Condition::NotExists => { let all_ids = index.documents_ids(rtxn)?; - let exist = Self::evaluate_operator( - rtxn, - index, - numbers_db, - strings_db, - field_id, - &Condition::Exists, - )?; + let exist = Self::evaluate_operator(rtxn, index, field_id, &Condition::Exists)?; let notexist = all_ids - exist; return Ok(notexist); From 1506683705b0ee01afdef14d1f07a527d6dadc17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 19 Jul 2022 14:42:35 +0200 Subject: [PATCH 18/18] Avoid using too much memory when indexing facet-exists-docids --- .../extract/extract_fid_docid_facet_values.rs | 19 +++++- .../src/update/index_documents/extract/mod.rs | 29 ++++---- .../src/update/index_documents/typed_chunk.rs | 68 +++---------------- 3 files changed, 40 insertions(+), 76 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 368378792..cf116e6f5 100644 --- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -5,13 +5,15 @@ use std::io; use std::mem::size_of; use heed::zerocopy::AsBytes; +use heed::BytesEncode; use roaring::RoaringBitmap; use 
serde_json::Value; use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; use crate::error::InternalError; use crate::facet::value_encoding::f64_into_bytes; -use crate::{DocumentId, FieldId, Result, BEU32}; +use crate::update::index_documents::{create_writer, writer_into_reader}; +use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32}; /// Extracts the facet values of each faceted field of each document. /// @@ -22,7 +24,7 @@ pub fn extract_fid_docid_facet_values( obkv_documents: grenad::Reader, indexer: GrenadParameters, faceted_fields: &HashSet, -) -> Result<(grenad::Reader, grenad::Reader, BTreeMap)> { +) -> Result<(grenad::Reader, grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); let mut fid_docid_facet_numbers_sorter = create_sorter( @@ -91,10 +93,21 @@ pub fn extract_fid_docid_facet_values( } } + let mut facet_exists_docids_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + for (fid, bitmap) in facet_exists_docids.into_iter() { + let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); + facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; + } + let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; + Ok(( sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?, sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?, - facet_exists_docids, + facet_exists_docids_reader, )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 76d968919..157886e63 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -8,13 +8,12 @@ mod extract_word_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; -use std::collections::{BTreeMap, HashSet}; +use std::collections::HashSet; use std::fs::File; use crossbeam_channel::Sender; use log::debug; use rayon::prelude::*; -use roaring::RoaringBitmap; use self::extract_docid_word_positions::extract_docid_word_positions; use self::extract_facet_number_docids::extract_facet_number_docids; @@ -73,21 +72,25 @@ pub(crate) fn data_from_obkv_documents( let ( docid_word_positions_chunks, - (docid_fid_facet_numbers_chunks, (docid_fid_facet_strings_chunks, facet_exists_docids)), + ( + docid_fid_facet_numbers_chunks, + (docid_fid_facet_strings_chunks, facet_exists_docids_chunks), + ), ) = result?; - // merge facet_exists_docids hashmaps and send them as a typed chunk + // merge facet_exists_docids and send them as a typed chunk { let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { - let mut all = BTreeMap::default(); - for facet_exists_docids in facet_exists_docids { - for (field_id, docids) in facet_exists_docids { - let docids0 = all.entry(field_id).or_default(); - *docids0 |= docids; + debug!("merge {} database", "facet-id-exists-docids"); + match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { + Ok(reader) => { + let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader))); + } + Err(e) => { + let _ = lmdb_writer_sx.send(Err(e)); } } - let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(all))); }); } @@ -228,7 +231,7 @@ fn send_and_extract_flattened_documents_data( grenad::Reader, ( grenad::Reader, - (grenad::Reader, BTreeMap), + (grenad::Reader, grenad::Reader), ), )> { let flattened_documents_chunk = 
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 76d968919..157886e63 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -8,13 +8,12 @@ mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod extract_word_position_docids;
 
-use std::collections::{BTreeMap, HashSet};
+use std::collections::HashSet;
 use std::fs::File;
 
 use crossbeam_channel::Sender;
 use log::debug;
 use rayon::prelude::*;
-use roaring::RoaringBitmap;
 
 use self::extract_docid_word_positions::extract_docid_word_positions;
 use self::extract_facet_number_docids::extract_facet_number_docids;
@@ -73,21 +72,25 @@ pub(crate) fn data_from_obkv_documents(
     let (
         docid_word_positions_chunks,
-        (docid_fid_facet_numbers_chunks, (docid_fid_facet_strings_chunks, facet_exists_docids)),
+        (
+            docid_fid_facet_numbers_chunks,
+            (docid_fid_facet_strings_chunks, facet_exists_docids_chunks),
+        ),
     ) = result?;
 
-    // merge facet_exists_docids hashmaps and send them as a typed chunk
+    // merge facet_exists_docids and send them as a typed chunk
     {
         let lmdb_writer_sx = lmdb_writer_sx.clone();
         rayon::spawn(move || {
-            let mut all = BTreeMap::default();
-            for facet_exists_docids in facet_exists_docids {
-                for (field_id, docids) in facet_exists_docids {
-                    let docids0 = all.entry(field_id).or_default();
-                    *docids0 |= docids;
+            debug!("merge {} database", "facet-id-exists-docids");
+            match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
+                Ok(reader) => {
+                    let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
+                }
+                Err(e) => {
+                    let _ = lmdb_writer_sx.send(Err(e));
                 }
             }
-            let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(all)));
         });
     }
 
@@ -228,7 +231,7 @@ fn send_and_extract_flattened_documents_data(
     grenad::Reader<CursorClonableMmap>,
     (
         grenad::Reader<CursorClonableMmap>,
-        (grenad::Reader<CursorClonableMmap>, BTreeMap<FieldId, RoaringBitmap>),
+        (grenad::Reader<CursorClonableMmap>, grenad::Reader<File>),
     ),
 )> {
     let flattened_documents_chunk =
@@ -273,7 +276,7 @@ fn send_and_extract_flattened_documents_data(
                 let (
                     docid_fid_facet_numbers_chunk,
                     docid_fid_facet_strings_chunk,
-                    facet_exists_docids,
+                    fid_facet_exists_docids_chunk,
                 ) = extract_fid_docid_facet_values(
                     flattened_documents_chunk.clone(),
                     indexer.clone(),
@@ -298,7 +301,7 @@ fn send_and_extract_flattened_documents_data(
 
                 Ok((
                     docid_fid_facet_numbers_chunk,
-                    (docid_fid_facet_strings_chunk, facet_exists_docids),
+                    (docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk),
                 ))
             },
         );
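In the `data_from_obkv_documents` hunk above, each indexing chunk now contributes its own facet-exists-docids reader, and `merge_cbo_roaring_bitmaps` reconciles entries that share a key across chunks. Stripped of the CBO byte encoding, that merge is a plain bitmap union, roughly (helper name ours):

    use roaring::RoaringBitmap;

    // Bitmaps extracted for the same field id by different chunks are merged
    // by union: a document counts as having the field if any chunk saw it.
    fn union_bitmaps<'a>(bitmaps: impl IntoIterator<Item = &'a RoaringBitmap>) -> RoaringBitmap {
        bitmaps.into_iter().fold(RoaringBitmap::new(), |acc, b| acc | b)
    }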
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index e1fd8f98d..5b7b00c21 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -1,12 +1,11 @@
 use std::borrow::Cow;
-use std::collections::BTreeMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io;
 
 use grenad::MergerBuilder;
 use heed::types::ByteSlice;
-use heed::{BytesDecode, BytesEncode, RwTxn};
+use heed::{BytesDecode, RwTxn};
 use roaring::RoaringBitmap;
 
 use super::helpers::{
@@ -17,8 +16,8 @@ use super::{ClonableMmap, MergeFn};
 use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
 use crate::update::index_documents::helpers::as_cloneable_grenad;
 use crate::{
-    error, lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId,
-    GeoPoint, Index, Result, BEU16,
+    lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
+    Result,
 };
 
 pub(crate) enum TypedChunk {
@@ -36,7 +35,7 @@ pub(crate) enum TypedChunk {
     WordPairProximityDocids(grenad::Reader<File>),
     FieldIdFacetStringDocids(grenad::Reader<File>),
     FieldIdFacetNumberDocids(grenad::Reader<File>),
-    FieldIdFacetExistsDocids(BTreeMap<FieldId, RoaringBitmap>),
+    FieldIdFacetExistsDocids(grenad::Reader<File>),
     GeoPoints(grenad::Reader<File>),
 }
 
@@ -149,11 +148,12 @@ pub(crate) fn write_typed_chunk_into_index(
             is_merged_database = true;
         }
         TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
-            write_sorted_iterator_into_database(
-                facet_id_exists_docids.into_iter().map(|(k, v)| (BEU16::new(k), v)),
+            append_entries_into_database(
+                facet_id_exists_docids,
                 &index.facet_id_exists_docids,
-                "facet-id-exists-docids",
                 wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
                 merge_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
@@ -269,58 +269,6 @@ fn merge_cbo_roaring_bitmaps(
     )?)
 }
 
-fn write_sorted_iterator_into_database<Iter, Key, Value, KeyCodec, ValueCodec, Merge>(
-    mut iterator: Iter,
-    database: &heed::Database<KeyCodec, ValueCodec>,
-    database_name: &'static str,
-    wtxn: &mut RwTxn,
-    merge_values: Merge,
-) -> Result<()>
-where
-    for<'a> KeyCodec: BytesEncode<'a, EItem = Key>,
-    for<'a> ValueCodec: BytesEncode<'a, EItem = Value> + BytesDecode<'a, DItem = Value>,
-    Iter: Iterator<Item = (Key, Value)>,
-    Merge: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
-{
-    if database.is_empty(wtxn)? {
-        let mut database = database.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
-
-        while let Some((key, value)) = iterator.next() {
-            let key = KeyCodec::bytes_encode(&key)
-                .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-            if valid_lmdb_key(&key) {
-                let value = ValueCodec::bytes_encode(&value)
-                    .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-                unsafe { database.append(&key, &value)? };
-            }
-        }
-
-        Ok(())
-    } else {
-        let database = database.remap_types::<ByteSlice, ByteSlice>();
-        let mut buffer = Vec::new();
-        while let Some((key, value)) = iterator.next() {
-            let key = KeyCodec::bytes_encode(&key)
-                .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-            if valid_lmdb_key(&key) {
-                let value = ValueCodec::bytes_encode(&value)
-                    .ok_or(error::SerializationError::Encoding { db_name: Some(database_name) })?;
-                let value = match database.get(wtxn, &key)? {
-                    Some(prev_value) => {
-                        merge_values(&value, &prev_value, &mut buffer)?;
-                        &buffer[..]
-                    }
-                    None => &value,
-                };
-
-                database.put(wtxn, &key, value)?;
-            }
-        }
-
-        Ok(())
-    }
-}
-
 /// Write provided entries in database using serialize_value function.
 /// merge_values function is used if an entry already exist in the database.
 fn write_entries_into_database(