Merge pull request #217 from meilisearch/improve-exactness-criterion

Improve the exactness criterion
2025-01-18 08:48:32 +08:00 · 2019-10-15 11:37:33 +02:00 · 2019-10-15 11:37:33 +02:00 · fdc98f9ef3
commit fdc98f9ef3
parent b7ea812dcc 0de37819b4
14 changed files with 381 additions and 109 deletions
--- a/meilidb-core/src/criterion/exact.rs
+++ b/meilidb-core/src/criterion/exact.rs
@ -1,16 +1,38 @@
 use std::cmp::Ordering;
+
+use sdset::Set;
 use slice_group_by::GroupBy;
+use meilidb_schema::SchemaAttr;
+
 use crate::criterion::Criterion;
 use crate::RawDocument;

 #[inline]
-fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {
+fn number_exact_matches(
+    query_index: &[u32],
+    attribute: &[u16],
+    is_exact: &[bool],
+    fields_counts: &Set<(SchemaAttr, u64)>,
+) -> usize
+{
    let mut count = 0;
    let mut index = 0;

    for group in query_index.linear_group() {
        let len = group.len();
-        count += is_exact[index..index + len].contains(&true) as usize;
+        // enumerate before filtering so `pos` stays the match offset inside this group
+        let mut found_exact = false;
+        for (pos, _) in is_exact[index..index + len].iter().enumerate().filter(|(_, x)| **x) {
+            found_exact = true;
+            if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[index + pos], |(a, _)| a.0) {
+                let (_, count) = fields_counts[pos];
+                if count == 1 { // single-word field: the exact match covers the whole field
+                    return usize::max_value()
+                }
+            }
+        }
+
+        count += found_exact as usize;
        index += len;
    }

@ -25,13 +47,19 @@ impl Criterion for Exact {
        let lhs = {
            let query_index = lhs.query_index();
            let is_exact = lhs.is_exact();
-            number_exact_matches(query_index, is_exact)
+            let attribute = lhs.attribute();
+            let fields_counts = &lhs.fields_counts;
+
+            number_exact_matches(query_index, attribute, is_exact, fields_counts)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let is_exact = rhs.is_exact();
-            number_exact_matches(query_index, is_exact)
+            let attribute = rhs.attribute();
+            let fields_counts = &rhs.fields_counts;
+
+            number_exact_matches(query_index, attribute, is_exact, fields_counts)
        };

        lhs.cmp(&rhs).reverse()
@ -52,14 +80,51 @@ mod tests {
    // doc1: "souliereres rouge"
    #[test]
    fn easy_case() {
-        let query_index0 = &[0];
-        let is_exact0 = &[true];
+        let doc0 = {
+            let query_index   = &[0];
+            let attribute     = &[0];
+            let is_exact      = &[true];
+            let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();

-        let query_index1 = &[0];
-        let is_exact1 = &[false];
+            number_exact_matches(query_index, attribute, is_exact, fields_counts)
+        };
+
+        let doc1 = {
+            let query_index   = &[0];
+            let attribute     = &[0];
+            let is_exact      = &[false];
+            let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();
+
+            number_exact_matches(query_index, attribute, is_exact, fields_counts)
+        };
+
+        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
+    }
+
+    // typing: "soulier"
+    //
+    // doc0: { 0. "soulier" }
+    // doc1: { 0. "soulier bleu et blanc" }
+    #[test]
+    fn basic() {
+        let doc0 = {
+            let query_index   = &[0];
+            let attribute     = &[0];
+            let is_exact      = &[true];
+            let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();
+
+            number_exact_matches(query_index, attribute, is_exact, fields_counts)
+        };
+
+        let doc1 = {
+            let query_index   = &[0];
+            let attribute     = &[0];
+            let is_exact      = &[true];
+            let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();
+
+            number_exact_matches(query_index, attribute, is_exact, fields_counts)
+        };

-        let doc0 = number_exact_matches(query_index0, is_exact0);
-        let doc1 = number_exact_matches(query_index1, is_exact1);
        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
    }
 }
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@ -22,6 +22,7 @@ pub struct QueryBuilder<'c, FI = fn(DocumentId) -> bool> {
    timeout: Option<Duration>,
    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
    synonyms_store: store::Synonyms,
 }

@ -130,6 +131,7 @@ fn fetch_raw_documents(
    searchables: Option<&ReorderedAttrs>,
    main_store: &store::Main,
    postings_lists_store: &store::PostingsLists,
+    documents_fields_counts_store: &store::DocumentsFieldsCounts,
 ) -> MResult<Vec<RawDocument>>
 {
    let mut matches = Vec::new();
@ -187,22 +189,42 @@ fn fetch_raw_documents(
        SetBuf::new_unchecked(highlights)
    };

-    Ok(raw_documents_from(matches, highlights))
+    let fields_counts = {
+        let mut fields_counts = Vec::new();
+        for group in matches.linear_group_by_key(|(id, ..)| *id) {
+            let id = group[0].0;
+            for result in documents_fields_counts_store.document_fields_counts(reader, id)? {
+                let (attr, count) = result?;
+                fields_counts.push((id, attr, count));
+            }
+        }
+        SetBuf::new(fields_counts).unwrap()
+    };
+
+    Ok(raw_documents_from(matches, highlights, fields_counts))
 }

 impl<'c> QueryBuilder<'c> {
    pub fn new(
        main: store::Main,
        postings_lists: store::PostingsLists,
+        documents_fields_counts: store::DocumentsFieldsCounts,
        synonyms: store::Synonyms,
    ) -> QueryBuilder<'c>
    {
-        QueryBuilder::with_criteria(main, postings_lists, synonyms, Criteria::default())
+        QueryBuilder::with_criteria(
+            main,
+            postings_lists,
+            documents_fields_counts,
+            synonyms,
+            Criteria::default(),
+        )
    }

    pub fn with_criteria(
        main: store::Main,
        postings_lists: store::PostingsLists,
+        documents_fields_counts: store::DocumentsFieldsCounts,
        synonyms: store::Synonyms,
        criteria: Criteria<'c>,
    ) -> QueryBuilder<'c>
@ -214,6 +236,7 @@ impl<'c> QueryBuilder<'c> {
            timeout: None,
            main_store: main,
            postings_lists_store: postings_lists,
+            documents_fields_counts_store: documents_fields_counts,
            synonyms_store: synonyms,
        }
    }
@ -230,6 +253,7 @@ impl<'c, FI> QueryBuilder<'c, FI> {
            timeout: self.timeout,
            main_store: self.main_store,
            postings_lists_store: self.postings_lists_store,
+            documents_fields_counts_store: self.documents_fields_counts_store,
            synonyms_store: self.synonyms_store,
        }
    }
@ -292,6 +316,7 @@ impl<FI> QueryBuilder<'_, FI> where FI: Fn(DocumentId) -> bool {
                self.searchable_attrs.as_ref(),
                &self.main_store,
                &self.postings_lists_store,
+                &self.documents_fields_counts_store,
            )?;

            // stop processing when time is running out
@ -420,6 +445,7 @@ where FI: Fn(DocumentId) -> bool,
                self.inner.searchable_attrs.as_ref(),
                &self.inner.main_store,
                &self.inner.postings_lists_store,
+                &self.inner.documents_fields_counts_store,
            )?;

            // stop processing when time is running out
@ -549,6 +575,7 @@ mod tests {
    use fst::{Set, IntoStreamer};
    use sdset::SetBuf;
    use tempfile::TempDir;
+    use meilidb_schema::SchemaAttr;

    use crate::automaton::normalize_str;
    use crate::database::Database;
@ -653,11 +680,15 @@ mod tests {

            let mut words_fst = BTreeSet::new();
            let mut postings_lists = HashMap::new();
+            let mut fields_counts = HashMap::<_, u64>::new();

            for (word, indexes) in iter {
                let word = word.to_lowercase().into_bytes();
                words_fst.insert(word.clone());
                postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
+                for idx in indexes {
+                    fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1);
+                }
            }

            let words_fst = Set::from_iter(words_fst).unwrap();
@ -669,6 +700,25 @@ mod tests {
                index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap();
            }

+            for ((docid, attr, _), count) in fields_counts {
+                let prev = index.documents_fields_counts
+                    .document_field_count(
+                        &mut writer,
+                        docid,
+                        SchemaAttr(attr),
+                    ).unwrap();
+
+                let prev = prev.unwrap_or(0);
+
+                index.documents_fields_counts
+                    .put_document_field_count(
+                        &mut writer,
+                        docid,
+                        SchemaAttr(attr),
+                        prev + count,
+                    ).unwrap();
+            }
+
            writer.commit().unwrap();
            drop(rkv);

@ -1470,8 +1520,8 @@ mod tests {
    #[test]
    fn deunicoded_synonyms() {
        let mut store = TempDatabase::from_iter(vec![
-            ("telephone", &[doc_index(0, 0)][..]), // meilidb-data indexes the unidecoded
-            ("téléphone", &[doc_index(0, 0)][..]), // and the original words with the same DocIndex
+            ("telephone", &[doc_index(0, 0)][..]), // meilidb indexes the unidecoded
+            ("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex

            ("iphone",    &[doc_index(1, 0)][..]),
        ]);
--- a/meilidb-core/src/raw_document.rs
+++ b/meilidb-core/src/raw_document.rs
@ -1,7 +1,10 @@
 use std::sync::Arc;
 use std::fmt;
+
+use meilidb_schema::SchemaAttr;
 use sdset::SetBuf;
 use slice_group_by::GroupBy;
+
 use crate::{TmpMatch, DocumentId, Highlight};

 #[derive(Clone)]
@ -9,13 +12,10 @@ pub struct RawDocument {
    pub id: DocumentId,
    pub matches: SharedMatches,
    pub highlights: Vec<Highlight>,
+    pub fields_counts: SetBuf<(SchemaAttr, u64)>,
 }

 impl RawDocument {
-    fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
-        RawDocument { id, matches, highlights }
-    }
-
    pub fn query_index(&self) -> &[u32] {
        let r = self.matches.range;
        // it is safe because construction/modifications
@ -60,7 +60,7 @@ impl fmt::Debug for RawDocument {
        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n",  "distance",    self.distance()))?;
        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n",  "attribute",   self.attribute()))?;
        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n",  "word_index",  self.word_index()))?;
-        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact",    self.is_exact()))?;
+        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n",  "is_exact",    self.is_exact()))?;
        f.write_str("}")?;
        Ok(())
    }
@ -69,31 +69,34 @@ impl fmt::Debug for RawDocument {
 pub fn raw_documents_from(
    matches: SetBuf<(DocumentId, TmpMatch)>,
    highlights: SetBuf<(DocumentId, Highlight)>,
+    fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
 ) -> Vec<RawDocument>
 {
-    let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
+    let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
    let mut matches2 = Matches::with_capacity(matches.len());

    let matches = matches.linear_group_by_key(|(id, _)| *id);
    let highlights = highlights.linear_group_by_key(|(id, _)| *id);
+    let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);

-    for (mgroup, hgroup) in matches.zip(highlights) {
+    for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
        debug_assert_eq!(mgroup[0].0, hgroup[0].0);
+        debug_assert_eq!(mgroup[0].0, fgroup[0].0);

        let document_id = mgroup[0].0;
-        let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
+        let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
        let end = start + mgroup.len();
-
        let highlights = hgroup.iter().map(|(_, h)| *h).collect();
-        docs_ranges.push((document_id, Range { start, end }, highlights));
+        let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();

+        docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
        matches2.extend_from_slice(mgroup);
    }

    let matches = Arc::new(matches2);
-    docs_ranges.into_iter().map(|(id, range, highlights)| {
+    docs_ranges.into_iter().map(|(id, range, highlights, fields_counts)| {
        let matches = SharedMatches { range, matches: matches.clone() };
-        RawDocument::new(id, matches, highlights)
+        RawDocument { id, matches, highlights, fields_counts }
    }).collect()
 }

--- a/meilidb-core/src/raw_indexer.rs
+++ b/meilidb-core/src/raw_indexer.rs
@ -33,7 +33,8 @@ impl RawIndexer {
        }
    }

-    pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
+    pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) -> usize {
+        let mut number_of_words = 0;
        let lowercase_text = text.to_lowercase();
        let deunicoded = deunicode_with_tofu(&lowercase_text, "");

@ -46,6 +47,9 @@ impl RawIndexer {
        let iter = Some(lowercase_text).into_iter().chain(next);

        for text in iter {
+            // restart the count on every pass so the same words are not counted twice
+            number_of_words = 0;
+
            for token in Tokenizer::new(&text) {
                let must_continue = index_token(
                    token,
@ -57,8 +61,12 @@ impl RawIndexer {
                );

                if !must_continue { break }
+
+                number_of_words += 1;
            }
        }
+
+        number_of_words
    }

    pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
--- a/meilidb-core/src/serde/indexer.rs
+++ b/meilidb-core/src/serde/indexer.rs
@ -13,7 +13,7 @@ pub struct Indexer<'a> {
 }

 impl<'a> ser::Serializer for Indexer<'a> {
-    type Ok = ();
+    type Ok = Option<usize>;
    type Error = SerializerError;
    type SerializeSeq = SeqIndexer<'a>;
    type SerializeTuple = TupleIndexer<'a>;
@ -83,8 +83,8 @@ impl<'a> ser::Serializer for Indexer<'a> {
    }

    fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
-        self.indexer.index_text(self.document_id, self.attribute, text);
-        Ok(())
+        let number_of_words = self.indexer.index_text(self.document_id, self.attribute, text);
+        Ok(Some(number_of_words))
    }

    fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
@ -99,8 +99,8 @@ impl<'a> ser::Serializer for Indexer<'a> {
    where T: ser::Serialize,
    {
        let text = value.serialize(ConvertToString)?;
-        self.indexer.index_text(self.document_id, self.attribute, &text);
-        Ok(())
+        let number_of_words = self.indexer.index_text(self.document_id, self.attribute, &text);
+        Ok(Some(number_of_words))
    }

    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
@ -225,7 +225,7 @@ pub struct SeqIndexer<'a> {
 }

 impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
-    type Ok = ();
+    type Ok = Option<usize>;
    type Error = SerializerError;

    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
@ -239,7 +239,7 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
    fn end(self) -> Result<Self::Ok, Self::Error> {
        let texts = self.texts.iter().map(String::as_str);
        self.indexer.index_text_seq(self.document_id, self.attribute, texts);
-        Ok(())
+        Ok(None)
    }
 }

@ -251,7 +251,7 @@ pub struct MapIndexer<'a> {
 }

 impl<'a> ser::SerializeMap for MapIndexer<'a> {
-    type Ok = ();
+    type Ok = Option<usize>;
    type Error = SerializerError;

    fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
@ -273,7 +273,7 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> {
    fn end(self) -> Result<Self::Ok, Self::Error> {
        let texts = self.texts.iter().map(String::as_str);
        self.indexer.index_text_seq(self.document_id, self.attribute, texts);
-        Ok(())
+        Ok(None)
    }
 }

@ -285,7 +285,7 @@ pub struct StructSerializer<'a> {
 }

 impl<'a> ser::SerializeStruct for StructSerializer<'a> {
-    type Ok = ();
+    type Ok = Option<usize>;
    type Error = SerializerError;

    fn serialize_field<T: ?Sized>(
@ -305,7 +305,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
    fn end(self) -> Result<Self::Ok, Self::Error> {
        let texts = self.texts.iter().map(String::as_str);
        self.indexer.index_text_seq(self.document_id, self.attribute, texts);
-        Ok(())
+        Ok(None)
    }
 }

@ -317,7 +317,7 @@ pub struct TupleIndexer<'a> {
 }

 impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
-    type Ok = ();
+    type Ok = Option<usize>;
    type Error = SerializerError;

    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
@ -331,6 +331,6 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
    fn end(self) -> Result<Self::Ok, Self::Error> {
        let texts = self.texts.iter().map(String::as_str);
        self.indexer.index_text_seq(self.document_id, self.attribute, texts);
-        Ok(())
+        Ok(None)
    }
 }
--- a/meilidb-core/src/serde/serializer.rs
+++ b/meilidb-core/src/serde/serializer.rs
@ -1,4 +1,5 @@
-use meilidb_schema::Schema;
+use std::collections::HashMap;
+use meilidb_schema::{Schema, SchemaAttr};
 use serde::ser;

 use crate::{DocumentId, RankedMap};
@ -10,6 +11,7 @@ use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer};
 pub struct Serializer<'a> {
    pub schema: &'a Schema,
    pub document_store: &'a mut RamDocumentStore,
+    pub document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
    pub indexer: &'a mut RawIndexer,
    pub ranked_map: &'a mut RankedMap,
    pub document_id: DocumentId,
@ -135,6 +137,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
            schema: self.schema,
            document_id: self.document_id,
            document_store: self.document_store,
+            document_fields_counts: self.document_fields_counts,
            indexer: self.indexer,
            ranked_map: self.ranked_map,
            current_key_name: None,
@ -151,6 +154,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
            schema: self.schema,
            document_id: self.document_id,
            document_store: self.document_store,
+            document_fields_counts: self.document_fields_counts,
            indexer: self.indexer,
            ranked_map: self.ranked_map,
        })
@ -172,6 +176,7 @@ pub struct MapSerializer<'a> {
    schema: &'a Schema,
    document_id: DocumentId,
    document_store: &'a mut RamDocumentStore,
+    document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
    indexer: &'a mut RawIndexer,
    ranked_map: &'a mut RankedMap,
    current_key_name: Option<String>,
@ -209,6 +214,7 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> {
            self.schema,
            self.document_id,
            self.document_store,
+            self.document_fields_counts,
            self.indexer,
            self.ranked_map,
            &key,
@ -225,6 +231,7 @@ pub struct StructSerializer<'a> {
    schema: &'a Schema,
    document_id: DocumentId,
    document_store: &'a mut RamDocumentStore,
+    document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>,
    indexer: &'a mut RawIndexer,
    ranked_map: &'a mut RankedMap,
 }
@ -244,6 +251,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
            self.schema,
            self.document_id,
            self.document_store,
+            self.document_fields_counts,
            self.indexer,
            self.ranked_map,
            key,
@ -260,6 +268,7 @@ fn serialize_value<T: ?Sized>(
    schema: &Schema,
    document_id: DocumentId,
    document_store: &mut RamDocumentStore,
+    documents_fields_counts: &mut HashMap<(DocumentId, SchemaAttr), u64>,
    indexer: &mut RawIndexer,
    ranked_map: &mut RankedMap,
    key: &str,
@ -275,7 +284,9 @@ where T: ser::Serialize,

        if props.is_indexed() {
            let indexer = Indexer { attribute, indexer, document_id };
-            value.serialize(indexer)?;
+            if let Some(number_of_words) = value.serialize(indexer)? {
+                documents_fields_counts.insert((document_id, attribute), number_of_words as u64);
+            }
        }

        if props.is_ranked() {
--- a/meilidb-core/src/store/documents_fields.rs
+++ b/meilidb-core/src/store/documents_fields.rs
@ -1,37 +1,13 @@
 use std::convert::TryFrom;
 use meilidb_schema::SchemaAttr;
 use crate::DocumentId;
+use super::{document_attribute_into_key, document_attribute_from_key};

 #[derive(Copy, Clone)]
 pub struct DocumentsFields {
    pub(crate) documents_fields: rkv::SingleStore,
 }

-fn document_attribute_into_key(document_id: DocumentId, attribute: SchemaAttr) -> [u8; 10] {
-    let document_id_bytes = document_id.0.to_be_bytes();
-    let attr_bytes = attribute.0.to_be_bytes();
-
-    let mut key = [0u8; 10];
-    key[0..8].copy_from_slice(&document_id_bytes);
-    key[8..10].copy_from_slice(&attr_bytes);
-
-    key
-}
-
-fn document_attribute_from_key(key: [u8; 10]) -> (DocumentId, SchemaAttr) {
-    let document_id = {
-        let array = TryFrom::try_from(&key[0..8]).unwrap();
-        DocumentId(u64::from_be_bytes(array))
-    };
-
-    let schema_attr = {
-        let array = TryFrom::try_from(&key[8..8+2]).unwrap();
-        SchemaAttr(u16::from_be_bytes(array))
-    };
-
-    (document_id, schema_attr)
-}
-
 impl DocumentsFields {
    pub fn put_document_field(
        &self,
@ -100,15 +76,6 @@ impl DocumentsFields {
        let iter = self.documents_fields.iter_from(reader, document_id_bytes)?;
        Ok(DocumentFieldsIter { document_id, iter })
    }
-
-    pub fn documents_ids<'r, T: rkv::Readable>(
-        &self,
-        reader: &'r T,
-    ) -> Result<DocumentsIdsIter<'r>, rkv::StoreError>
-    {
-        let iter = self.documents_fields.iter_start(reader)?;
-        Ok(DocumentsIdsIter { last_seen_id: None, iter })
-    }
 }

 pub struct DocumentFieldsIter<'r> {
@ -134,30 +101,3 @@ impl<'r> Iterator for DocumentFieldsIter<'r> {
        }
    }
 }
-
-pub struct DocumentsIdsIter<'r> {
-    last_seen_id: Option<DocumentId>,
-    iter: rkv::store::single::Iter<'r>,
-}
-
-impl<'r> Iterator for DocumentsIdsIter<'r> {
-    type Item = Result<DocumentId, rkv::StoreError>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        for result in &mut self.iter {
-            match result {
-                Ok((key, _)) => {
-                    let array = TryFrom::try_from(key).unwrap();
-                    let (document_id, _) = document_attribute_from_key(array);
-                    if Some(document_id) != self.last_seen_id {
-                        self.last_seen_id = Some(document_id);
-                        return Some(Ok(document_id))
-                    }
-                },
-                Err(e) => return Some(Err(e)),
-            }
-        }
-
-        None
-    }
-}
--- a/meilidb-core/src/store/documents_fields_counts.rs
+++ b/meilidb-core/src/store/documents_fields_counts.rs
@ -0,0 +1,134 @@
+use std::convert::TryFrom;
+use meilidb_schema::SchemaAttr;
+use crate::DocumentId;
+use super::{document_attribute_into_key, document_attribute_from_key};
+
+#[derive(Copy, Clone)]
+pub struct DocumentsFieldsCounts {
+    pub(crate) documents_fields_counts: rkv::SingleStore,
+}
+
+impl DocumentsFieldsCounts {
+    pub fn put_document_field_count(
+        &self,
+        writer: &mut rkv::Writer,
+        document_id: DocumentId,
+        attribute: SchemaAttr,
+        value: u64,
+    ) -> Result<(), rkv::StoreError>
+    {
+        let key = document_attribute_into_key(document_id, attribute);
+        self.documents_fields_counts.put(writer, key, &rkv::Value::U64(value))
+    }
+
+    pub fn del_all_document_fields_counts(
+        &self,
+        writer: &mut rkv::Writer,
+        document_id: DocumentId,
+    ) -> Result<usize, rkv::StoreError>
+    {
+        let mut keys_to_delete = Vec::new();
+
+        // WARN: keys cannot be deleted while the iterator still borrows the writer,
+        //       so collect them first and delete them right after the loop
+        for result in self.document_fields_counts(writer, document_id)? {
+            let (attribute, _) = result?;
+            let key = document_attribute_into_key(document_id, attribute);
+            keys_to_delete.push(key);
+        }
+
+        let count = keys_to_delete.len();
+        for key in keys_to_delete {
+            self.documents_fields_counts.delete(writer, key)?;
+        }
+
+        Ok(count)
+    }
+
+    pub fn document_field_count(
+        &self,
+        reader: &impl rkv::Readable,
+        document_id: DocumentId,
+        attribute: SchemaAttr,
+    ) -> Result<Option<u64>, rkv::StoreError>
+    {
+        let key = document_attribute_into_key(document_id, attribute);
+
+        match self.documents_fields_counts.get(reader, key)? {
+            Some(rkv::Value::U64(count)) => Ok(Some(count)),
+            Some(value) => panic!("invalid type {:?}", value),
+            None => Ok(None),
+        }
+    }
+
+    pub fn document_fields_counts<'r, T: rkv::Readable>(
+        &self,
+        reader: &'r T,
+        document_id: DocumentId,
+    ) -> Result<DocumentFieldsCountsIter<'r>, rkv::StoreError>
+    {
+        let document_id_bytes = document_id.0.to_be_bytes();
+        let iter = self.documents_fields_counts.iter_from(reader, document_id_bytes)?;
+        Ok(DocumentFieldsCountsIter { document_id, iter })
+    }
+
+    pub fn documents_ids<'r, T: rkv::Readable>(
+        &self,
+        reader: &'r T,
+    ) -> Result<DocumentsIdsIter<'r>, rkv::StoreError>
+    {
+        let iter = self.documents_fields_counts.iter_start(reader)?;
+        Ok(DocumentsIdsIter { last_seen_id: None, iter })
+    }
+}
+
+pub struct DocumentFieldsCountsIter<'r> {
+    document_id: DocumentId,
+    iter: rkv::store::single::Iter<'r>,
+}
+
+impl Iterator for DocumentFieldsCountsIter<'_> {
+    type Item = Result<(SchemaAttr, u64), rkv::StoreError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.iter.next() {
+            Some(Ok((key, Some(rkv::Value::U64(count))))) => {
+                let array = TryFrom::try_from(key).unwrap();
+                let (current_document_id, attr) = document_attribute_from_key(array);
+                if current_document_id != self.document_id { return None; }
+
+                Some(Ok((attr, count)))
+            },
+            Some(Ok((key, data))) => panic!("{:?}, {:?}", key, data),
+            Some(Err(e)) => Some(Err(e)),
+            None => None,
+        }
+    }
+}
+
+pub struct DocumentsIdsIter<'r> {
+    last_seen_id: Option<DocumentId>,
+    iter: rkv::store::single::Iter<'r>,
+}
+
+impl Iterator for DocumentsIdsIter<'_> {
+    type Item = Result<DocumentId, rkv::StoreError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        for result in &mut self.iter {
+            match result {
+                Ok((key, _)) => {
+                    let array = TryFrom::try_from(key).unwrap();
+                    let (document_id, _) = document_attribute_from_key(array);
+                    if Some(document_id) != self.last_seen_id {
+                        self.last_seen_id = Some(document_id);
+                        return Some(Ok(document_id))
+                    }
+                },
+                Err(e) => return Some(Err(e)),
+            }
+        }
+
+        None
+    }
+}
--- a/meilidb-core/src/store/main.rs
+++ b/meilidb-core/src/store/main.rs
@ -3,7 +3,6 @@ use std::convert::TryInto;

 use meilidb_schema::Schema;
 use rkv::Value;
-use serde::de;
 use crate::{RankedMap, MResult};

 const CUSTOMS_KEY:             &str = "customs-key";
--- a/meilidb-core/src/store/mod.rs
+++ b/meilidb-core/src/store/mod.rs
@ -1,5 +1,6 @@
 mod docs_words;
 mod documents_fields;
+mod documents_fields_counts;
 mod main;
 mod postings_lists;
 mod synonyms;
@ -8,6 +9,7 @@ mod updates_results;

 pub use self::docs_words::DocsWords;
 pub use self::documents_fields::{DocumentsFields, DocumentFieldsIter};
+pub use self::documents_fields_counts::{DocumentsFieldsCounts, DocumentFieldsCountsIter, DocumentsIdsIter};
 pub use self::main::Main;
 pub use self::postings_lists::PostingsLists;
 pub use self::synonyms::Synonyms;
@ -15,8 +17,11 @@ pub use self::updates::Updates;
 pub use self::updates_results::UpdatesResults;

 use std::collections::HashSet;
+use std::convert::TryFrom;
+
 use meilidb_schema::{Schema, SchemaAttr};
-use serde::{ser, de};
+use serde::de;
+
 use crate::criterion::Criteria;
 use crate::serde::Deserializer;
 use crate::{update, query_builder::QueryBuilder, DocumentId, MResult, Error};
@ -25,6 +30,31 @@ fn aligned_to(bytes: &[u8], align: usize) -> bool {
    (bytes as *const _ as *const () as usize) % align == 0
 }

+fn document_attribute_into_key(document_id: DocumentId, attribute: SchemaAttr) -> [u8; 10] {
+    let document_id_bytes = document_id.0.to_be_bytes();
+    let attr_bytes = attribute.0.to_be_bytes();
+
+    let mut key = [0u8; 10];
+    key[0..8].copy_from_slice(&document_id_bytes);
+    key[8..10].copy_from_slice(&attr_bytes);
+
+    key
+}
+
+fn document_attribute_from_key(key: [u8; 10]) -> (DocumentId, SchemaAttr) {
+    let document_id = {
+        let array = TryFrom::try_from(&key[0..8]).unwrap();
+        DocumentId(u64::from_be_bytes(array))
+    };
+
+    let schema_attr = {
+        let array = TryFrom::try_from(&key[8..8+2]).unwrap();
+        SchemaAttr(u16::from_be_bytes(array))
+    };
+
+    (document_id, schema_attr)
+}
+
 fn main_name(name: &str) -> String {
    format!("store-{}", name)
 }
@ -37,6 +67,10 @@ fn documents_fields_name(name: &str) -> String {
    format!("store-{}-documents-fields", name)
 }

+fn documents_fields_counts_name(name: &str) -> String {
+    format!("store-{}-documents-fields-counts", name)
+}
+
 fn synonyms_name(name: &str) -> String {
    format!("store-{}-synonyms", name)
 }
@ -58,6 +92,7 @@ pub struct Index {
    pub main: Main,
    pub postings_lists: PostingsLists,
    pub documents_fields: DocumentsFields,
+    pub documents_fields_counts: DocumentsFieldsCounts,
    pub synonyms: Synonyms,
    pub docs_words: DocsWords,

@ -166,11 +201,22 @@ impl Index {
    }

    pub fn query_builder(&self) -> QueryBuilder {
-        QueryBuilder::new(self.main, self.postings_lists, self.synonyms)
+        QueryBuilder::new(
+            self.main,
+            self.postings_lists,
+            self.documents_fields_counts,
+            self.synonyms,
+        )
    }

    pub fn query_builder_with_criteria<'c>(&self, criteria: Criteria<'c>) -> QueryBuilder<'c> {
-        QueryBuilder::with_criteria(self.main, self.postings_lists, self.synonyms, criteria)
+        QueryBuilder::with_criteria(
+            self.main,
+            self.postings_lists,
+            self.documents_fields_counts,
+            self.synonyms,
+            criteria,
+        )
    }
 }

@ -205,6 +251,7 @@ fn open_options(
    let main_name = main_name(name);
    let postings_lists_name = postings_lists_name(name);
    let documents_fields_name = documents_fields_name(name);
+    let documents_fields_counts_name = documents_fields_counts_name(name);
    let synonyms_name = synonyms_name(name);
    let docs_words_name = docs_words_name(name);
    let updates_name = updates_name(name);
@@ -214,6 +261,7 @@ fn open_options(
    let main = env.open_single(main_name.as_str(), options)?;
    let postings_lists = env.open_single(postings_lists_name.as_str(), options)?;
    let documents_fields = env.open_single(documents_fields_name.as_str(), options)?;
+    let documents_fields_counts = env.open_single(documents_fields_counts_name.as_str(), options)?;
    let synonyms = env.open_single(synonyms_name.as_str(), options)?;
    let docs_words = env.open_single(docs_words_name.as_str(), options)?;
    let updates = env.open_single(updates_name.as_str(), options)?;
@@ -223,6 +271,7 @@ fn open_options(
        main: Main { main },
        postings_lists: PostingsLists { postings_lists },
        documents_fields: DocumentsFields { documents_fields },
+        documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
        synonyms: Synonyms { synonyms },
        docs_words: DocsWords { docs_words },
        updates: Updates { updates },
--- a/meilidb-core/src/update/customs_update.rs
+++ b/meilidb-core/src/update/customs_update.rs
@@ -1,5 +1,5 @@
-use crate::{store, error::UnsupportedOperation, MResult};
 use crate::update::{Update, next_update_id};
+use crate::{store, MResult};

 pub fn apply_customs_update(
    writer: &mut rkv::Writer,
--- a/meilidb-core/src/update/documents_addition.rs
+++ b/meilidb-core/src/update/documents_addition.rs
@@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};

 use fst::{SetBuilder, set::OpBuilder};
 use sdset::{SetOperation, duo::Union};
@@ -82,6 +82,7 @@ pub fn apply_documents_addition(
    writer: &mut rkv::Writer,
    main_store: store::Main,
    documents_fields_store: store::DocumentsFields,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
    postings_lists_store: store::PostingsLists,
    docs_words_store: store::DocsWords,
    mut ranked_map: RankedMap,
@@ -90,6 +91,7 @@ pub fn apply_documents_addition(
 {
    let mut document_ids = HashSet::new();
    let mut document_store = RamDocumentStore::new();
+    let mut document_fields_counts = HashMap::new();
    let mut indexer = RawIndexer::new();

    let schema = match main_store.schema(writer)? {
@@ -112,6 +114,7 @@ pub fn apply_documents_addition(
        let serializer = Serializer {
            schema: &schema,
            document_store: &mut document_store,
+            document_fields_counts: &mut document_fields_counts,
            indexer: &mut indexer,
            ranked_map: &mut ranked_map,
            document_id,
@@ -126,6 +129,7 @@ pub fn apply_documents_addition(
        writer,
        main_store,
        documents_fields_store,
+        documents_fields_counts_store,
        postings_lists_store,
        docs_words_store,
        ranked_map.clone(),
@@ -137,6 +141,11 @@ pub fn apply_documents_addition(
        documents_fields_store.put_document_field(writer, id, attr, &value)?;
    }

+    // 3. insert new document attributes counts
+    for ((id, attr), count) in document_fields_counts {
+        documents_fields_counts_store.put_document_field_count(writer, id, attr, count)?;
+    }
+
    let indexed = indexer.build();
    let mut delta_words_builder = SetBuilder::memory();

--- a/meilidb-core/src/update/documents_deletion.rs
+++ b/meilidb-core/src/update/documents_deletion.rs
@@ -86,6 +86,7 @@ pub fn apply_documents_deletion(
    writer: &mut rkv::Writer,
    main_store: store::Main,
    documents_fields_store: store::DocumentsFields,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
    postings_lists_store: store::PostingsLists,
    docs_words_store: store::DocsWords,
    mut ranked_map: RankedMap,
@@ -140,6 +141,7 @@ pub fn apply_documents_deletion(
        }

        for id in document_ids {
+            documents_fields_counts_store.del_all_document_fields_counts(writer, id)?;
            if documents_fields_store.del_all_document_fields(writer, id)? != 0 {
                deleted_documents.insert(id);
            }
--- a/meilidb-core/src/update/mod.rs
+++ b/meilidb-core/src/update/mod.rs
@@ -138,6 +138,7 @@ pub fn update_task(writer: &mut rkv::Writer, index: store::Index) -> MResult<Opt
                writer,
                index.main,
                index.documents_fields,
+                index.documents_fields_counts,
                index.postings_lists,
                index.docs_words,
                ranked_map,
@@ -160,6 +161,7 @@ pub fn update_task(writer: &mut rkv::Writer, index: store::Index) -> MResult<Opt
                writer,
                index.main,
                index.documents_fields,
+                index.documents_fields_counts,
                index.postings_lists,
                index.docs_words,
                ranked_map,