Use the Typo criteria in the search module

Clément Renault 2021-02-17 10:29:28 +01:00 committed by Kerollmops
parent ad20d72a39
commit f091f370d0


@@ -18,10 +18,13 @@ use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}
use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
use crate::mdfs::Mdfs;
use crate::query_tokens::{query_tokens, QueryToken};
use crate::{Index, FieldId, DocumentId, Criterion};
use crate::search::criteria::Criterion;
use crate::search::criteria::typo::Typo;
use crate::{Index, FieldId, DocumentId};
pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
pub use self::facet::{FacetIter};
use self::query_tree::QueryTreeBuilder;
// Building these factories is not free.
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
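// (Assuming `LevBuilder` aliases `levenshtein_automata::LevenshteinAutomatonBuilder`:
// constructing one precomputes the parametric automaton tables, which is the costly part;
// deriving a per-word DFA from a cached builder, e.g. `LEVDIST0.build_dfa("word")`, is then cheap.)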
@@ -30,6 +33,7 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
mod facet;
mod query_tree;
mod criteria;
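// The new `criteria` module hosts the `Criterion` trait and its first implementation,
// `Typo`, both imported at the top of this file.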
pub struct Search<'a> {
query: Option<String>,
@@ -258,15 +262,22 @@ impl<'a> Search<'a> {
}
pub fn execute(&self) -> anyhow::Result<SearchResult> {
let limit = self.limit;
let fst = self.index.words_fst(self.rtxn)?;
// Construct the DFAs related to the query words.
let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) {
Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?),
_otherwise => None,
// We create the query tree by splitting the query into tokens.
let before = Instant::now();
let query_tree = match self.query.as_ref() {
Some(query) => {
let builder = QueryTreeBuilder::new(self.rtxn, self.index);
let stop_words = &Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
let result = analyzer.analyze(query);
let tokens = result.tokens();
builder.build(false, true, tokens)
},
None => None,
};
debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed());
// We create the original candidates with the facet conditions results.
let before = Instant::now();
let facet_candidates = match &self.facet_condition {
@@ -276,100 +287,129 @@ impl<'a> Search<'a> {
debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed());
let order_by_facet = {
let criteria = self.index.criteria(self.rtxn)?;
let result = criteria.into_iter().flat_map(|criterion| {
match criterion {
Criterion::Asc(fid) => Some((fid, true)),
Criterion::Desc(fid) => Some((fid, false)),
_ => None
}
}).next();
match result {
Some((attr_name, is_ascending)) => {
let field_id_map = self.index.fields_ids_map(self.rtxn)?;
let fid = field_id_map.id(&attr_name).with_context(|| format!("unknown field: {:?}", attr_name))?;
let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?;
let ftype = *faceted_fields.get(&fid)
.with_context(|| format!("{:?} not found in the faceted fields.", attr_name))
.expect("corrupted data: ");
Some((fid, ftype, is_ascending))
},
None => None,
}
};
// We are testing the typo criteria but there will be more of them soon.
let mut criteria = Typo::initial(self.index, self.rtxn, query_tree, facet_candidates)?;
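// Each `criteria.next()?` call is expected to yield one bucket: an optional query tree and
// the `RoaringBitmap` of documents it matches, presumably ordered from fewest to most typos.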
let before = Instant::now();
let (candidates, derived_words) = match (facet_candidates, derived_words) {
(Some(mut facet_candidates), Some(derived_words)) => {
let words_candidates = Self::compute_candidates(&derived_words);
facet_candidates.intersect_with(&words_candidates);
(facet_candidates, derived_words)
},
(None, Some(derived_words)) => {
(Self::compute_candidates(&derived_words), derived_words)
},
(Some(facet_candidates), None) => {
// If the query is not set or results in no DFAs but
// there are some facet conditions, we return a placeholder.
let documents_ids = match order_by_facet {
Some((fid, ftype, is_ascending)) => {
self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)?
},
None => facet_candidates.iter().take(limit).collect(),
};
return Ok(SearchResult {
documents_ids,
candidates: facet_candidates,
..Default::default()
})
},
(None, None) => {
// If the query is not set or results in no DFAs we return a placeholder.
let all_docids = self.index.documents_ids(self.rtxn)?;
let documents_ids = match order_by_facet {
Some((fid, ftype, is_ascending)) => {
self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)?
},
None => all_docids.iter().take(limit).collect(),
};
return Ok(SearchResult { documents_ids, candidates: all_docids, ..Default::default() })
},
};
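// The loop below pages through the criterion buckets: it first skips `offset` documents,
// then extends `documents_ids` until `limit` documents have been collected.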
let mut offset = self.offset;
let mut limit = self.limit;
let mut documents_ids = Vec::new();
while let Some((_qt, docids)) = criteria.next()? {
debug!("candidates: {:?} took {:.02?}", candidates, before.elapsed());
let mut len = docids.len() as usize;
let mut docids = docids.into_iter();
// The mana depth first search is a revised DFS that explores
// solutions in the order of their proximities.
let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone());
let mut documents = Vec::new();
// We execute the Mdfs iterator until we find enough documents.
while documents.iter().map(RoaringBitmap::len).sum::<u64>() < limit as u64 {
match mdfs.next().transpose()? {
Some((proximity, answer)) => {
debug!("answer with a proximity of {}: {:?}", proximity, answer);
documents.push(answer);
},
None => break,
}
if offset != 0 {
docids.by_ref().skip(offset).for_each(drop);
offset = offset.saturating_sub(len.min(offset));
len = len.saturating_sub(len.min(offset));
}
let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect();
let documents_ids = match order_by_facet {
Some((fid, ftype, order)) => {
let mut ordered_documents = Vec::new();
for documents_ids in documents {
let docids = self.facet_ordered(fid, ftype, order, documents_ids, limit)?;
ordered_documents.push(docids);
if ordered_documents.iter().map(Vec::len).sum::<usize>() >= limit { break }
if len != 0 {
documents_ids.extend(docids.take(limit));
limit = limit.saturating_sub(len.min(limit));
}
ordered_documents.into_iter().flatten().take(limit).collect()
},
None => documents.into_iter().flatten().take(limit).collect(),
};
if limit == 0 { break }
}
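// `found_words` and `candidates` are returned empty for now; presumably the criteria will
// report the matched words and the global candidates in a later iteration.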
let found_words = HashSet::new();
let candidates = RoaringBitmap::new();
Ok(SearchResult { found_words, candidates, documents_ids })
// let order_by_facet = {
// let criteria = self.index.criteria(self.rtxn)?;
// let result = criteria.into_iter().flat_map(|criterion| {
// match criterion {
// Criterion::Asc(fid) => Some((fid, true)),
// Criterion::Desc(fid) => Some((fid, false)),
// _ => None
// }
// }).next();
// match result {
// Some((attr_name, is_ascending)) => {
// let field_id_map = self.index.fields_ids_map(self.rtxn)?;
// let fid = field_id_map.id(&attr_name).with_context(|| format!("unknown field: {:?}", attr_name))?;
// let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?;
// let ftype = *faceted_fields.get(&fid)
// .with_context(|| format!("{:?} not found in the faceted fields.", attr_name))
// .expect("corrupted data: ");
// Some((fid, ftype, is_ascending))
// },
// None => None,
// }
// };
// let before = Instant::now();
// let (candidates, derived_words) = match (facet_candidates, derived_words) {
// (Some(mut facet_candidates), Some(derived_words)) => {
// let words_candidates = Self::compute_candidates(&derived_words);
// facet_candidates.intersect_with(&words_candidates);
// (facet_candidates, derived_words)
// },
// (None, Some(derived_words)) => {
// (Self::compute_candidates(&derived_words), derived_words)
// },
// (Some(facet_candidates), None) => {
// // If the query is not set or results in no DFAs but
// // there are some facet conditions, we return a placeholder.
// let documents_ids = match order_by_facet {
// Some((fid, ftype, is_ascending)) => {
// self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)?
// },
// None => facet_candidates.iter().take(limit).collect(),
// };
// return Ok(SearchResult {
// documents_ids,
// candidates: facet_candidates,
// ..Default::default()
// })
// },
// (None, None) => {
// // If the query is not set or results in no DFAs we return a placeholder.
// let all_docids = self.index.documents_ids(self.rtxn)?;
// let documents_ids = match order_by_facet {
// Some((fid, ftype, is_ascending)) => {
// self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)?
// },
// None => all_docids.iter().take(limit).collect(),
// };
// return Ok(SearchResult { documents_ids, candidates: all_docids, ..Default::default() })
// },
// };
// debug!("candidates: {:?} took {:.02?}", candidates, before.elapsed());
// // The mana depth first search is a revised DFS that explores
// // solutions in the order of their proximities.
// let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone());
// let mut documents = Vec::new();
// // We execute the Mdfs iterator until we find enough documents.
// while documents.iter().map(RoaringBitmap::len).sum::<u64>() < limit as u64 {
// match mdfs.next().transpose()? {
// Some((proximity, answer)) => {
// debug!("answer with a proximity of {}: {:?}", proximity, answer);
// documents.push(answer);
// },
// None => break,
// }
// }
// let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect();
// let documents_ids = match order_by_facet {
// Some((fid, ftype, order)) => {
// let mut ordered_documents = Vec::new();
// for documents_ids in documents {
// let docids = self.facet_ordered(fid, ftype, order, documents_ids, limit)?;
// ordered_documents.push(docids);
// if ordered_documents.iter().map(Vec::len).sum::<usize>() >= limit { break }
// }
// ordered_documents.into_iter().flatten().take(limit).collect()
// },
// None => documents.into_iter().flatten().take(limit).collect(),
// };
// Ok(SearchResult { found_words, candidates, documents_ids })
}
}
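
For context, here is a minimal usage sketch of the search path this commit rewires. The `Index::search` builder and its `query`, `offset`, `limit` and `execute` methods are assumed from the surrounding milli crate rather than shown in this diff, so treat it as illustrative:

use milli::Index;

// Illustrative only: drives the new execute() path, which builds a query tree,
// ranks candidates with the Typo criterion, and pages through its buckets.
fn typo_tolerant_search(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> {
    let result = index.search(rtxn)
        .query("hello wrold") // a one-typo query the Typo criterion should still match
        .offset(0)
        .limit(20)
        .execute()?;
    // `documents_ids` is filled bucket by bucket, fewest typos first.
    println!("found {} documents: {:?}", result.documents_ids.len(), result.documents_ids);
    Ok(())
}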