From 54370e228a34039ab01ab495d180ed4cf90346cc Mon Sep 17 00:00:00 2001
From: Kerollmops <clement@meilisearch.com>
Date: Tue, 29 Sep 2020 16:07:04 +0200
Subject: [PATCH] Search for documents with longer proximities until we find
 enough

---
 src/bin/indexer.rs |  1 +
 src/search.rs      | 27 ++++++++++++++++++++-------
 2 files changed, 21 insertions(+), 7 deletions(-)
diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index 688d9f40b..ae0f83767 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -535,6 +535,7 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
 
 // TODO merge with the previous values
 // TODO store the documents in a compressed MTBL
+// TODO prefer using iter.append when possible: it is way faster (4x) at injecting ordered entries.
 fn lmdb_writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> anyhow::Result<()> {
     if key == WORDS_FST_KEY {
         // Write the words fst
diff --git a/src/search.rs b/src/search.rs
index 36b67b502..831042c00 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -176,7 +176,7 @@ impl<'a> Search<'a> {
         &self,
         words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
         candidates: &RoaringBitmap,
-        parent_docids: Option<&RoaringBitmap>,
+        parent_docids: &RoaringBitmap,
         union_cache: &mut HashMap<(usize, u8), RoaringBitmap>,
     ) -> anyhow::Result<Option<RoaringBitmap>>
     {
@@ -202,15 +202,13 @@ impl<'a> Search<'a> {
                 }
             };
 
-            if let Some(parent_docids) = &parent_docids {
-                docids.intersect_with(parent_docids);
-            }
+            docids.intersect_with(parent_docids);
 
             if !docids.is_empty() {
                 let words = &words[1..];
                 // We are the last word.
                 if words.len() < 2 { return Ok(Some(docids)) }
-                if let Some(di) = self.depth_first_search(words, candidates, Some(&docids), union_cache)? {
+                if let Some(di) = self.depth_first_search(words, candidates, &docids, union_cache)? {
                     return Ok(Some(di))
                 }
             }
@@ -250,9 +248,24 @@ impl<'a> Search<'a> {
             return Ok(SearchResult { found_words, documents_ids });
         }
 
-        let mut union_cache = HashMap::new();
         let mut documents = Vec::new();
-        if let Some(answer) = answer {
+        let mut union_cache = HashMap::new();
+
+        // We run the DFS repeatedly until we have found enough documents: each run searches
+        // the remaining candidates, and the documents it finds are removed from that list.
+        while documents.iter().map(RoaringBitmap::len).sum::<u64>() < limit as u64 {
+            let answer = self.depth_first_search(&derived_words, &candidates, &candidates, &mut union_cache)?;
+
+            let answer = match answer {
+                Some(answer) if !answer.is_empty() => answer,
+                _ => break,
+            };
+
+            debug!("answer: {:?}", answer);
+
+            // We remove the answered documents from the list of
+            // candidates to be sure we don't search for them again.
+            candidates.difference_with(&answer);
             documents.push(answer);
         }