From 54370e228a34039ab01ab495d180ed4cf90346cc Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 29 Sep 2020 16:07:04 +0200 Subject: [PATCH] Search for documents with longer proximities until we find enough --- src/bin/indexer.rs | 1 + src/search.rs | 27 ++++++++++++++++++++------- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 688d9f40b..ae0f83767 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -535,6 +535,7 @@ fn merge(key: &[u8], values: &[Vec]) -> Result, ()> { // TODO merge with the previous values // TODO store the documents in a compressed MTBL +// TODO prefer using iter.append when possible, it is way faster (4x) to inject ordered entries. fn lmdb_writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> anyhow::Result<()> { if key == WORDS_FST_KEY { // Write the words fst diff --git a/src/search.rs b/src/search.rs index 36b67b502..831042c00 100644 --- a/src/search.rs +++ b/src/search.rs @@ -176,7 +176,7 @@ impl<'a> Search<'a> { &self, words: &[(HashMap, RoaringBitmap)], candidates: &RoaringBitmap, - parent_docids: Option<&RoaringBitmap>, + parent_docids: &RoaringBitmap, union_cache: &mut HashMap<(usize, u8), RoaringBitmap>, ) -> anyhow::Result> { @@ -202,15 +202,13 @@ impl<'a> Search<'a> { } }; - if let Some(parent_docids) = &parent_docids { - docids.intersect_with(parent_docids); - } + docids.intersect_with(parent_docids); if !docids.is_empty() { let words = &words[1..]; // We are the last word. if words.len() < 2 { return Ok(Some(docids)) } - if let Some(di) = self.depth_first_search(words, candidates, Some(&docids), union_cache)? { + if let Some(di) = self.depth_first_search(words, candidates, &docids, union_cache)? { return Ok(Some(di)) } } @@ -250,9 +248,24 @@ impl<'a> Search<'a> { return Ok(SearchResult { found_words, documents_ids }); } - let mut union_cache = HashMap::new(); let mut documents = Vec::new(); - if let Some(answer) = answer { + let mut union_cache = HashMap::new(); + + // We execute the DFS until we find enough documents, we run it with the + // candidates list and remove the found documents from this list at each iteration. + while documents.iter().map(RoaringBitmap::len).sum::() < limit as u64 { + let answer = self.depth_first_search(&derived_words, &candidates, &candidates, &mut union_cache)?; + + let answer = match answer { + Some(answer) if !answer.is_empty() => answer, + _ => break, + }; + + debug!("answer: {:?}", answer); + + // We remove the answered documents from the list of + // candidates to be sure we don't search for them again. + candidates.difference_with(&answer); documents.push(answer); }