Use the word pair proximities in the search algorithm

This commit is contained in:
Clément Renault 2020-09-22 18:19:28 +02:00
parent 31224a8425
commit 1f6e00878d
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@@ -28,13 +28,7 @@ pub struct Search<'a> {
impl<'a> Search<'a> { impl<'a> Search<'a> {
pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> { pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> {
Search { Search { query: None, offset: 0, limit: 20, rtxn, index }
query: None,
offset: 0,
limit: 20,
rtxn,
index,
}
} }
pub fn query(&mut self, query: impl Into<String>) -> &mut Search<'a> { pub fn query(&mut self, query: impl Into<String>) -> &mut Search<'a> {
@@ -85,8 +79,8 @@ impl<'a> Search<'a> {
.collect() .collect()
} }
/// Fetch the words from the given FST related to the /// Fetch the words from the given FST related to the given DFAs along with
/// given DFAs along with the associated documents ids. /// the associated documents ids.
fn fetch_words_docids( fn fetch_words_docids(
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
index: &Index, index: &Index,
@@ -194,26 +188,36 @@ impl<'a> Search<'a> {
let mut documents = Vec::new(); let mut documents = Vec::new();
// If there only is one word, no need to compute the best proximities. // If there is only one query word, no need to compute the best proximities.
if derived_words.len() == 1 { if derived_words.len() == 1 || candidates.is_empty() {
let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect();
let documents_ids = candidates.iter().take(limit).collect(); let documents_ids = candidates.iter().take(limit).collect();
return Ok(SearchResult { found_words, documents_ids }); return Ok(SearchResult { found_words, documents_ids });
} }
let mut paths = Vec::new(); let mut answer = RoaringBitmap::new();
for candidate in candidates { for (i, words) in derived_words.windows(2).enumerate() {
let keywords = Self::fecth_keywords(rtxn, index, &derived_words, candidate)?; let w1: Vec<_> = words[0].0.keys().collect();
near_proximity(keywords, &mut paths, path_proximity); let w2: Vec<_> = words[1].0.keys().collect();
if let Some((prox, _path)) = paths.first() {
documents.push((*prox, candidate)); let key = (w1[0].as_str(), w2[0].as_str(), 1);
match index.word_pair_proximity_docids.get(rtxn, &key)? {
Some(docids) => if i == 0 {
answer = docids;
} else {
answer.intersect_with(&docids);
},
None => {
answer = RoaringBitmap::new();
break;
},
} }
} }
documents.sort_unstable_by_key(|(prox, _)| *prox); documents.push(answer);
let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect();
let documents_ids = documents.into_iter().map(|(_, id)| id).take(limit).collect(); let documents_ids = documents.into_iter().flatten().take(limit).collect();
Ok(SearchResult { found_words, documents_ids }) Ok(SearchResult { found_words, documents_ids })
} }
} }