Introduce the final Fetcher criterion

This commit is contained in:
Kerollmops 2021-02-25 16:34:29 +01:00
parent 7ac09d7b7c
commit daf126a638
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
4 changed files with 201 additions and 83 deletions

View File

@ -0,0 +1,107 @@
use std::collections::HashMap;
use std::mem::take;
use log::debug;
use roaring::RoaringBitmap;
use crate::search::query_tree::Operation;
use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context};
/// Criterion placed at the end of a criteria chain.
///
/// It either yields its own explicitly allowed candidates, forwards the
/// buckets produced by its optional parent criterion, or, as a last resort,
/// resolves the documents itself (see the `Criterion` implementation).
pub struct Fetcher<'t> {
/// Context used to resolve the query tree and to list the document ids.
ctx: &'t dyn Context,
/// Query tree attached to the results built by this criterion; only set by
/// `Fetcher::initial`.
query_tree: Option<Operation>,
/// Candidates still to be returned by this criterion.
candidates: Candidates,
/// Criterion whose results are forwarded, if any; `None` when the fetcher is
/// the root of the chain.
parent: Option<Box<dyn Criterion + 't>>,
/// `true` until a first result has been attempted; while `true`, the fetcher
/// is allowed to fall back to resolving the documents by itself.
should_get_documents_ids: bool,
}
impl<'t> Fetcher<'t> {
    /// Builds a root `Fetcher`, i.e. one that has no parent criterion.
    ///
    /// When `candidates` is `Some`, the bitmap is stored as
    /// `Candidates::Allowed`; otherwise `Candidates::default()` is used.
    /// The given `query_tree` is kept and attached to the results produced
    /// by this criterion.
    pub fn initial(
        ctx: &'t dyn Context,
        query_tree: Option<Operation>,
        candidates: Option<RoaringBitmap>,
    ) -> Self {
        let candidates = match candidates {
            Some(docids) => Candidates::Allowed(docids),
            None => Candidates::default(),
        };

        Self {
            ctx,
            query_tree,
            candidates,
            parent: None,
            should_get_documents_ids: true,
        }
    }

    /// Builds a `Fetcher` that sits on top of the `parent` criterion.
    ///
    /// It starts without any query tree and with `Candidates::default()`.
    pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
        Self {
            ctx,
            query_tree: None,
            candidates: Candidates::default(),
            parent: Some(parent),
            should_get_documents_ids: true,
        }
    }
}
impl<'t> Criterion for Fetcher<'t> {
    /// Returns the next bucket of candidates.
    ///
    /// * If some explicitly allowed candidates remain, they are returned as a
    ///   single bucket and the internal candidates are reset to their default.
    /// * Otherwise the parent criterion, if any, is polled and its result is
    ///   forwarded unchanged.
    /// * If there is no parent, or the parent is exhausted, and
    ///   `should_get_documents_ids` was still `true` on entry, the documents
    ///   are resolved from the query tree (or, without a query tree, all the
    ///   document ids of the context are fetched) and returned as one bucket.
    /// * In every other case `Ok(None)` is returned.
    ///
    /// # Errors
    ///
    /// Propagates any error coming from the parent criterion, from
    /// `resolve_query_tree` or from `Context::documents_ids`.
    fn next(&mut self) -> anyhow::Result<Option<CriterionResult>> {
        use Candidates::{Allowed, Forbidden};

        loop {
            debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})",
                self.should_get_documents_ids, self.candidates,
            );

            match &mut self.candidates {
                Allowed(candidates) => if candidates.is_empty() {
                    // NOTE(review): an empty allowed set falls through to the
                    // `Forbidden` arm on the next iteration, which may then
                    // fetch documents on its own; confirm this is intended.
                    self.candidates = Candidates::default();
                } else {
                    // A bucket is about to be returned: the fallback fetch
                    // below must never run afterwards.
                    self.should_get_documents_ids = false;
                    let candidates = take(&mut self.candidates).into_inner();
                    return Ok(Some(CriterionResult {
                        query_tree: self.query_tree.clone(),
                        candidates: candidates.clone(),
                        bucket_candidates: Some(candidates),
                    }));
                },
                Forbidden(_) => {
                    // Reading the flag also resets it to `false`, so the
                    // fallback can only ever be considered on the first visit.
                    let should_get_documents_ids = take(&mut self.should_get_documents_ids);

                    // Results of the parent always take precedence.
                    if let Some(parent) = self.parent.as_mut() {
                        if let Some(result) = parent.next()? {
                            return Ok(Some(result));
                        }
                    }

                    if !should_get_documents_ids {
                        return Ok(None);
                    }

                    // Nothing came from the parent (or there is none): resolve
                    // the documents ourselves, exactly once.
                    let candidates = match &self.query_tree {
                        Some(qt) => resolve_query_tree(self.ctx, qt, &mut HashMap::new())?,
                        None => self.ctx.documents_ids()?,
                    };

                    return Ok(Some(CriterionResult {
                        query_tree: self.query_tree.clone(),
                        candidates: candidates.clone(),
                        bucket_candidates: Some(candidates),
                    }));
                },
            }
        }
    }
}

View File

@ -1,16 +1,19 @@
use std::collections::HashMap;
use std::borrow::Cow; use std::borrow::Cow;
use anyhow::bail;
use roaring::RoaringBitmap;
use crate::Index; use crate::Index;
use crate::search::word_derivations; use crate::search::word_derivations;
use roaring::RoaringBitmap;
use super::query_tree::{Operation, Query, QueryKind}; use super::query_tree::{Operation, Query, QueryKind};
pub mod typo; pub mod typo;
pub mod words; pub mod words;
pub mod asc_desc; pub mod asc_desc;
pub mod proximity; pub mod proximity;
pub mod fetcher;
pub trait Criterion { pub trait Criterion {
fn next(&mut self) -> anyhow::Result<Option<CriterionResult>>; fn next(&mut self) -> anyhow::Result<Option<CriterionResult>>;
@ -51,6 +54,7 @@ impl Default for Candidates {
} }
} }
pub trait Context { pub trait Context {
fn documents_ids(&self) -> heed::Result<RoaringBitmap>;
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>; fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
@ -66,6 +70,10 @@ pub struct HeedContext<'t> {
} }
impl<'a> Context for HeedContext<'a> { impl<'a> Context for HeedContext<'a> {
/// Delegates to the `documents_ids` method of the wrapped index, passing
/// this context's read transaction (`self.rtxn`).
fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
self.index.documents_ids(self.rtxn)
}
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> { fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
self.index.word_docids.get(self.rtxn, &word) self.index.word_docids.get(self.rtxn, &word)
} }
@ -107,6 +115,80 @@ impl<'t> HeedContext<'t> {
} }
} }
/// Resolves `query_tree` into the set of document ids that match it.
///
/// * `And`: every child is resolved, then the results are intersected,
///   smallest set first. An empty list of children gives an empty bitmap.
/// * `Consecutive`: every adjacent pair of children must be two `Query`
///   nodes; the documents of each pair (fetched with a proximity of `1`)
///   are intersected. An empty pair result short-circuits to an empty bitmap.
/// * `Or`: the results of all the children are unioned.
/// * `Query`: resolved through `query_docids`.
///
/// `cache` is only threaded through the recursive calls here; this function
/// neither reads from nor writes to it.
///
/// # Errors
///
/// Fails with "invalid consecutive query type" when a `Consecutive` node
/// contains a child that is not a `Query`, and propagates any error raised
/// while reading from `ctx`.
pub fn resolve_query_tree<'t>(
    ctx: &'t dyn Context,
    query_tree: &Operation,
    cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
) -> anyhow::Result<RoaringBitmap>
{
    use Operation::{And, Consecutive, Or, Query};

    match query_tree {
        And(ops) => {
            let mut resolved: Vec<RoaringBitmap> = Vec::with_capacity(ops.len());
            for op in ops.iter() {
                resolved.push(resolve_query_tree(ctx, op, cache)?);
            }

            // Start from the smallest set so that intersections stay cheap.
            resolved.sort_unstable_by_key(|docids| docids.len());

            let mut resolved = resolved.into_iter();
            let mut candidates = resolved.next().unwrap_or_else(RoaringBitmap::new);
            for docids in resolved {
                candidates.intersect_with(&docids);
            }

            Ok(candidates)
        },
        Consecutive(ops) => {
            // `None` until the first pair has been resolved.
            let mut candidates: Option<RoaringBitmap> = None;

            for pair in ops.windows(2) {
                let (left, right) = match (&pair[0], &pair[1]) {
                    (Operation::Query(left), Operation::Query(right)) => (left, right),
                    _ => bail!("invalid consecutive query type"),
                };

                let pair_docids = query_pair_proximity_docids(ctx, left, right, 1)?;
                if pair_docids.is_empty() {
                    // One missing pair is enough to make the whole phrase unmatched.
                    return Ok(RoaringBitmap::new());
                }

                candidates = Some(match candidates.take() {
                    Some(mut acc) => {
                        acc.intersect_with(&pair_docids);
                        acc
                    },
                    None => pair_docids,
                });
            }

            Ok(candidates.unwrap_or_else(RoaringBitmap::new))
        },
        Or(_, ops) => {
            let mut union = RoaringBitmap::new();
            for op in ops {
                union.union_with(&resolve_query_tree(ctx, op, cache)?);
            }

            Ok(union)
        },
        Query(q) => Ok(query_docids(ctx, q)?),
    }
}
fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>( fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
ctx: &dyn Context, ctx: &dyn Context,
left_words: &[(T, u8)], left_words: &[(T, u8)],
@ -218,6 +300,10 @@ pub mod test {
} }
impl<'a> Context for TestContext<'a> { impl<'a> Context for TestContext<'a> {
/// Returns the union of the document ids of every entry of `word_docids`.
fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
    let mut all_docids = RoaringBitmap::new();
    for (_, docids) in self.word_docids.iter() {
        all_docids.union_with(docids);
    }
    Ok(all_docids)
}
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> { fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
Ok(self.word_docids.get(&word.to_string()).cloned()) Ok(self.word_docids.get(&word.to_string()).cloned())
} }

View File

@ -1,12 +1,11 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::mem::take; use std::mem::take;
use anyhow::bail;
use log::debug; use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::search::query_tree::Operation; use crate::search::query_tree::Operation;
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context};
pub struct Words<'t> { pub struct Words<'t> {
ctx: &'t dyn Context, ctx: &'t dyn Context,
@ -62,7 +61,7 @@ impl<'t> Criterion for Words<'t> {
self.candidates = Candidates::default(); self.candidates = Candidates::default();
}, },
(Some(qt), Allowed(candidates)) => { (Some(qt), Allowed(candidates)) => {
let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?;
found_candidates.intersect_with(&candidates); found_candidates.intersect_with(&candidates);
candidates.difference_with(&found_candidates); candidates.difference_with(&found_candidates);
@ -78,7 +77,7 @@ impl<'t> Criterion for Words<'t> {
})); }));
}, },
(Some(qt), Forbidden(candidates)) => { (Some(qt), Forbidden(candidates)) => {
let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?;
found_candidates.difference_with(&candidates); found_candidates.difference_with(&candidates);
candidates.union_with(&found_candidates); candidates.union_with(&found_candidates);
@ -127,76 +126,3 @@ fn explode_query_tree(query_tree: Operation) -> Vec<Operation> {
otherwise => vec![otherwise], otherwise => vec![otherwise],
} }
} }
/// Resolves `query_tree` into the set of document ids that match it.
///
/// * `And`: every child is resolved, then the results are intersected,
///   smallest set first. An empty list of children gives an empty bitmap.
/// * `Consecutive`: every adjacent pair of children must be two `Query`
///   nodes; the documents of each pair (fetched with a proximity of `1`)
///   are intersected. An empty pair result short-circuits to an empty bitmap.
/// * `Or`: the results of all the children are unioned.
/// * `Query`: resolved through `query_docids`.
///
/// `cache` is only threaded through the recursive calls here; this function
/// neither reads from nor writes to it.
///
/// # Errors
///
/// Fails with "invalid consecutive query type" when a `Consecutive` node
/// contains a child that is not a `Query`, and propagates any error raised
/// while reading from `ctx`.
fn resolve_candidates<'t>(
    ctx: &'t dyn Context,
    query_tree: &Operation,
    cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
) -> anyhow::Result<RoaringBitmap>
{
    use Operation::{And, Consecutive, Or, Query};

    match query_tree {
        And(ops) => {
            let mut resolved: Vec<RoaringBitmap> = Vec::with_capacity(ops.len());
            for op in ops.iter() {
                resolved.push(resolve_candidates(ctx, op, cache)?);
            }

            // Start from the smallest set so that intersections stay cheap.
            resolved.sort_unstable_by_key(|docids| docids.len());

            let mut resolved = resolved.into_iter();
            let mut candidates = resolved.next().unwrap_or_else(RoaringBitmap::new);
            for docids in resolved {
                candidates.intersect_with(&docids);
            }

            Ok(candidates)
        },
        Consecutive(ops) => {
            // `None` until the first pair has been resolved.
            let mut candidates: Option<RoaringBitmap> = None;

            for pair in ops.windows(2) {
                let (left, right) = match (&pair[0], &pair[1]) {
                    (Operation::Query(left), Operation::Query(right)) => (left, right),
                    _ => bail!("invalid consecutive query type"),
                };

                let pair_docids = query_pair_proximity_docids(ctx, left, right, 1)?;
                if pair_docids.is_empty() {
                    // One missing pair is enough to make the whole phrase unmatched.
                    return Ok(RoaringBitmap::new());
                }

                candidates = Some(match candidates.take() {
                    Some(mut acc) => {
                        acc.intersect_with(&pair_docids);
                        acc
                    },
                    None => pair_docids,
                });
            }

            Ok(candidates.unwrap_or_else(RoaringBitmap::new))
        },
        Or(_, ops) => {
            let mut union = RoaringBitmap::new();
            for op in ops {
                union.union_with(&resolve_candidates(ctx, op, cache)?);
            }

            Ok(union)
        },
        Query(q) => Ok(query_docids(ctx, q)?),
    }
}

View File

@ -10,7 +10,7 @@ use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap; use roaring::bitmap::RoaringBitmap;
use crate::search::criteria::{Criterion, CriterionResult}; use crate::search::criteria::{Criterion, CriterionResult};
use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity}; use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity, fetcher::Fetcher};
use crate::{Index, DocumentId}; use crate::{Index, DocumentId};
pub use self::facet::FacetIter; pub use self::facet::FacetIter;
@ -92,13 +92,12 @@ impl<'a> Search<'a> {
None => MatchingWords::default(), None => MatchingWords::default(),
}; };
// We are testing the typo criteria but there will be more of them soon.
let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?;
let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?;
let words_criterion = Words::new(&criteria_ctx, Box::new(typo_criterion))?; let words_criterion = Words::new(&criteria_ctx, Box::new(typo_criterion))?;
let proximity_criterion = Proximity::new(&criteria_ctx, Box::new(words_criterion))?; let proximity_criterion = Proximity::new(&criteria_ctx, Box::new(words_criterion))?;
// let proximity_criterion = Proximity::initial(&criteria_ctx, query_tree, facet_candidates)?; let fetcher_criterion = Fetcher::new(&criteria_ctx, Box::new(proximity_criterion));
let mut criteria = proximity_criterion; let mut criteria = fetcher_criterion;
// // We sort in descending order on a specific field *by hand*, don't do that at home. // // We sort in descending order on a specific field *by hand*, don't do that at home.
// let attr_name = "released-timestamp"; // let attr_name = "released-timestamp";