Search for documents with longer proximities until we find enough

This commit is contained in:
Kerollmops 2020-09-29 16:07:04 +02:00
parent f277ea134f
commit 54370e228a
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 21 additions and 7 deletions

View File

@@ -535,6 +535,7 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
// TODO merge with the previous values // TODO merge with the previous values
// TODO store the documents in a compressed MTBL // TODO store the documents in a compressed MTBL
// TODO prefer using iter.append when possible, it is way faster (4x) to inject ordered entries.
fn lmdb_writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> anyhow::Result<()> { fn lmdb_writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> anyhow::Result<()> {
if key == WORDS_FST_KEY { if key == WORDS_FST_KEY {
// Write the words fst // Write the words fst

View File

@@ -176,7 +176,7 @@ impl<'a> Search<'a> {
&self, &self,
words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)], words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
candidates: &RoaringBitmap, candidates: &RoaringBitmap,
parent_docids: Option<&RoaringBitmap>, parent_docids: &RoaringBitmap,
union_cache: &mut HashMap<(usize, u8), RoaringBitmap>, union_cache: &mut HashMap<(usize, u8), RoaringBitmap>,
) -> anyhow::Result<Option<RoaringBitmap>> ) -> anyhow::Result<Option<RoaringBitmap>>
{ {
@@ -202,15 +202,13 @@ impl<'a> Search<'a> {
} }
}; };
if let Some(parent_docids) = &parent_docids { docids.intersect_with(parent_docids);
docids.intersect_with(parent_docids);
}
if !docids.is_empty() { if !docids.is_empty() {
let words = &words[1..]; let words = &words[1..];
// We are the last word. // We are the last word.
if words.len() < 2 { return Ok(Some(docids)) } if words.len() < 2 { return Ok(Some(docids)) }
if let Some(di) = self.depth_first_search(words, candidates, Some(&docids), union_cache)? { if let Some(di) = self.depth_first_search(words, candidates, &docids, union_cache)? {
return Ok(Some(di)) return Ok(Some(di))
} }
} }
@@ -250,9 +248,24 @@ impl<'a> Search<'a> {
return Ok(SearchResult { found_words, documents_ids }); return Ok(SearchResult { found_words, documents_ids });
} }
let mut union_cache = HashMap::new();
let mut documents = Vec::new(); let mut documents = Vec::new();
if let Some(answer) = answer { let mut union_cache = HashMap::new();
// We execute the DFS until we find enough documents; we run it with the
// candidates list and remove the found documents from this list at each iteration.
while documents.iter().map(RoaringBitmap::len).sum::<u64>() < limit as u64 {
let answer = self.depth_first_search(&derived_words, &candidates, &candidates, &mut union_cache)?;
let answer = match answer {
Some(answer) if !answer.is_empty() => answer,
_ => break,
};
debug!("answer: {:?}", answer);
// We remove the answered documents from the list of
// candidates to be sure we don't search for them again.
candidates.difference_with(&answer);
documents.push(answer); documents.push(answer);
} }