Use the documents_fields_counts store in the QueryBuilder

2024-11-23 10:37:41 +08:00 · 2019-10-14 18:48:32 +02:00 · 2019-10-14 18:48:32 +02:00 · e629f51af4
commit e629f51af4
parent b377003192
4 changed files with 85 additions and 18 deletions
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@ -22,6 +22,7 @@ pub struct QueryBuilder<'c, FI = fn(DocumentId) -> bool> {
    timeout: Option<Duration>,
    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
    synonyms_store: store::Synonyms,
 }

@ -130,6 +131,7 @@ fn fetch_raw_documents(
    searchables: Option<&ReorderedAttrs>,
    main_store: &store::Main,
    postings_lists_store: &store::PostingsLists,
+    documents_fields_counts_store: &store::DocumentsFieldsCounts,
 ) -> MResult<Vec<RawDocument>>
 {
    let mut matches = Vec::new();
@ -187,22 +189,42 @@ fn fetch_raw_documents(
        SetBuf::new_unchecked(highlights)
    };

-    Ok(raw_documents_from(matches, highlights))
+    let fields_counts = {
+        let mut fields_counts = Vec::new();
+        for group in matches.linear_group_by_key(|(id, ..)| *id) {
+            let id = group[0].0;
+            for result in documents_fields_counts_store.document_fields_counts(reader, id)? {
+                let (attr, count) = result?;
+                fields_counts.push((id, attr, count));
+            }
+        }
+        SetBuf::new(fields_counts).unwrap()
+    };
+
+    Ok(raw_documents_from(matches, highlights, fields_counts))
 }

 impl<'c> QueryBuilder<'c> {
    pub fn new(
        main: store::Main,
        postings_lists: store::PostingsLists,
+        documents_fields_counts: store::DocumentsFieldsCounts,
        synonyms: store::Synonyms,
    ) -> QueryBuilder<'c>
    {
-        QueryBuilder::with_criteria(main, postings_lists, synonyms, Criteria::default())
+        QueryBuilder::with_criteria(
+            main,
+            postings_lists,
+            documents_fields_counts,
+            synonyms,
+            Criteria::default(),
+        )
    }

    pub fn with_criteria(
        main: store::Main,
        postings_lists: store::PostingsLists,
+        documents_fields_counts: store::DocumentsFieldsCounts,
        synonyms: store::Synonyms,
        criteria: Criteria<'c>,
    ) -> QueryBuilder<'c>
@ -214,6 +236,7 @@ impl<'c> QueryBuilder<'c> {
            timeout: None,
            main_store: main,
            postings_lists_store: postings_lists,
+            documents_fields_counts_store: documents_fields_counts,
            synonyms_store: synonyms,
        }
    }
@ -230,6 +253,7 @@ impl<'c, FI> QueryBuilder<'c, FI> {
            timeout: self.timeout,
            main_store: self.main_store,
            postings_lists_store: self.postings_lists_store,
+            documents_fields_counts_store: self.documents_fields_counts_store,
            synonyms_store: self.synonyms_store,
        }
    }
@ -292,6 +316,7 @@ impl<FI> QueryBuilder<'_, FI> where FI: Fn(DocumentId) -> bool {
                self.searchable_attrs.as_ref(),
                &self.main_store,
                &self.postings_lists_store,
+                &self.documents_fields_counts_store,
            )?;

            // stop processing when time is running out
@ -420,6 +445,7 @@ where FI: Fn(DocumentId) -> bool,
                self.inner.searchable_attrs.as_ref(),
                &self.inner.main_store,
                &self.inner.postings_lists_store,
+                &self.inner.documents_fields_counts_store,
            )?;

            // stop processing when time is running out
@ -549,6 +575,7 @@ mod tests {
    use fst::{Set, IntoStreamer};
    use sdset::SetBuf;
    use tempfile::TempDir;
+    use meilidb_schema::SchemaAttr;

    use crate::automaton::normalize_str;
    use crate::database::Database;
@ -653,11 +680,15 @@ mod tests {

            let mut words_fst = BTreeSet::new();
            let mut postings_lists = HashMap::new();
+            let mut fields_counts = HashMap::<_, u64>::new();

            for (word, indexes) in iter {
                let word = word.to_lowercase().into_bytes();
                words_fst.insert(word.clone());
                postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes);
+                for idx in indexes {
+                    fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1);
+                }
            }

            let words_fst = Set::from_iter(words_fst).unwrap();
@ -669,6 +700,25 @@ mod tests {
                index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap();
            }

+            for ((docid, attr, _), count) in fields_counts {
+                let prev = index.documents_fields_counts
+                    .document_attribute_count(
+                        &mut writer,
+                        docid,
+                        SchemaAttr(attr),
+                    ).unwrap();
+
+                let prev = prev.unwrap_or(0);
+
+                index.documents_fields_counts
+                    .put_document_field_count(
+                        &mut writer,
+                        docid,
+                        SchemaAttr(attr),
+                        prev + count,
+                    ).unwrap();
+            }
+
            writer.commit().unwrap();
            drop(rkv);

@ -1470,8 +1520,8 @@ mod tests {
    #[test]
    fn deunicoded_synonyms() {
        let mut store = TempDatabase::from_iter(vec![
-            ("telephone", &[doc_index(0, 0)][..]), // meilidb-data indexes the unidecoded
-            ("téléphone", &[doc_index(0, 0)][..]), // and the original words with the same DocIndex
+            ("telephone", &[doc_index(0, 0)][..]), // meilidb indexes the unidecoded
+            ("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex

            ("iphone",    &[doc_index(1, 0)][..]),
        ]);
--- a/meilidb-core/src/raw_document.rs
+++ b/meilidb-core/src/raw_document.rs
@ -1,7 +1,10 @@
 use std::sync::Arc;
 use std::fmt;
+
+use meilidb_schema::SchemaAttr;
 use sdset::SetBuf;
 use slice_group_by::GroupBy;
+
 use crate::{TmpMatch, DocumentId, Highlight};

 #[derive(Clone)]
@ -9,13 +12,10 @@ pub struct RawDocument {
    pub id: DocumentId,
    pub matches: SharedMatches,
    pub highlights: Vec<Highlight>,
+    pub fields_counts: SetBuf<(SchemaAttr, u64)>,
 }

 impl RawDocument {
-    fn new(id: DocumentId, matches: SharedMatches, highlights: Vec<Highlight>) -> RawDocument {
-        RawDocument { id, matches, highlights }
-    }
-
    pub fn query_index(&self) -> &[u32] {
        let r = self.matches.range;
        // it is safe because construction/modifications
@ -60,7 +60,7 @@ impl fmt::Debug for RawDocument {
        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n",  "distance",    self.distance()))?;
        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n",  "attribute",   self.attribute()))?;
        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n",  "word_index",  self.word_index()))?;
-        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact",    self.is_exact()))?;
+        f.write_fmt(format_args!("{:>15}: {:^5?},\r\n",  "is_exact",    self.is_exact()))?;
        f.write_str("}")?;
        Ok(())
    }
@ -69,31 +69,34 @@ impl fmt::Debug for RawDocument {
 pub fn raw_documents_from(
    matches: SetBuf<(DocumentId, TmpMatch)>,
    highlights: SetBuf<(DocumentId, Highlight)>,
+    fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
 ) -> Vec<RawDocument>
 {
-    let mut docs_ranges: Vec<(_, Range, _)> = Vec::new();
+    let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
    let mut matches2 = Matches::with_capacity(matches.len());

    let matches = matches.linear_group_by_key(|(id, _)| *id);
    let highlights = highlights.linear_group_by_key(|(id, _)| *id);
+    let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);

-    for (mgroup, hgroup) in matches.zip(highlights) {
+    for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
        debug_assert_eq!(mgroup[0].0, hgroup[0].0);
+        debug_assert_eq!(mgroup[0].0, fgroup[0].0);

        let document_id = mgroup[0].0;
-        let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0);
+        let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
        let end = start + mgroup.len();
-
        let highlights = hgroup.iter().map(|(_, h)| *h).collect();
-        docs_ranges.push((document_id, Range { start, end }, highlights));
+        let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();

+        docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
        matches2.extend_from_slice(mgroup);
    }

    let matches = Arc::new(matches2);
-    docs_ranges.into_iter().map(|(id, range, highlights)| {
+    docs_ranges.into_iter().map(|(id, range, highlights, fields_counts)| {
        let matches = SharedMatches { range, matches: matches.clone() };
-        RawDocument::new(id, matches, highlights)
+        RawDocument { id, matches, highlights, fields_counts }
    }).collect()
 }

--- a/meilidb-core/src/raw_indexer.rs
+++ b/meilidb-core/src/raw_indexer.rs
@ -47,6 +47,9 @@ impl RawIndexer {
        let iter = Some(lowercase_text).into_iter().chain(next);

        for text in iter {
+            // we must not count the same words twice
+            number_of_words = 0;
+
            for token in Tokenizer::new(&text) {
                let must_continue = index_token(
                    token,
--- a/meilidb-core/src/store/mod.rs
+++ b/meilidb-core/src/store/mod.rs
@ -201,11 +201,22 @@ impl Index {
    }

    pub fn query_builder(&self) -> QueryBuilder {
-        QueryBuilder::new(self.main, self.postings_lists, self.synonyms)
+        QueryBuilder::new(
+            self.main,
+            self.postings_lists,
+            self.documents_fields_counts,
+            self.synonyms,
+        )
    }

    pub fn query_builder_with_criteria<'c>(&self, criteria: Criteria<'c>) -> QueryBuilder<'c> {
-        QueryBuilder::with_criteria(self.main, self.postings_lists, self.synonyms, criteria)
+        QueryBuilder::with_criteria(
+            self.main,
+            self.postings_lists,
+            self.documents_fields_counts,
+            self.synonyms,
+            criteria,
+        )
    }
 }