mirror of https://github.com/meilisearch/meilisearch.git (synced 2024-11-30 09:04:59 +08:00)

Compute edges of proximity graph lazily

This commit is contained in:
parent 272cd7ebbd
commit 83e5b4ed0d
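Note (summary inferred from the diff below, not part of the original commit message): the proximity graph's `build_edges` no longer resolves word pairs eagerly. It now emits one cheap `ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost }` edge per cost, and the corresponding document ids are only computed when a condition is first resolved in `compute_docids`. Resolution also reports which words and phrases were actually used; these are cached in the new `ComputedCondition` entry and replace the `words_used_by_condition` / `phrases_used_by_condition` trait methods. A large block of ad-hoc benchmark tests is also removed from `Search`.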
@@ -367,6 +367,7 @@ pub fn word_derivations<'c>(
     match cache.entry((word.to_string(), is_prefix, max_typo)) {
         Entry::Occupied(entry) => Ok(entry.into_mut()),
         Entry::Vacant(entry) => {
+            // println!("word derivations {word} {is_prefix} {max_typo}");
            let mut derived_words = Vec::new();
            if max_typo == 0 {
                if is_prefix {
@@ -318,9 +318,10 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
         let mut used_words = HashSet::new();
         let mut used_phrases = HashSet::new();
         for condition in used_conditions.iter() {
-            let condition = graph.conditions_interner.get(condition);
-            used_words.extend(G::words_used_by_condition(ctx, condition)?);
-            used_phrases.extend(G::phrases_used_by_condition(ctx, condition)?);
+            let (ws, ps) =
+                condition_docids_cache.get_condition_used_words_and_phrases(condition);
+            used_words.extend(ws);
+            used_phrases.extend(ps);
         }
         // 2. Remove the unused words and phrases from all the nodes in the graph
         let mut nodes_to_remove = vec![];
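Note: the hunk above is the consumer side of the change — the words and phrases used by a condition are now read back from the condition docids cache, where they were recorded as a by-product of resolving the condition. A minimal standalone sketch of that pattern, with `u32` ids standing in for interned conditions and plain `String`s for interned words (none of this is the real milli code):

```rust
use std::collections::{HashMap, HashSet};

// What resolving a condition leaves behind in the cache.
struct ComputedCondition {
    docids: Vec<u32>,            // stand-in for a RoaringBitmap
    used_words: HashSet<String>, // recorded as a by-product of resolution
}

#[derive(Default)]
struct ConditionCache {
    cache: HashMap<u32, ComputedCondition>, // keyed by an interned condition id
}

impl ConditionCache {
    // The pruning pass reads the recorded words back instead of recomputing them.
    fn get_condition_used_words(&self, condition: u32) -> &HashSet<String> {
        &self.cache[&condition].used_words
    }
}

fn main() {
    let mut cache = ConditionCache::default();
    cache.cache.insert(
        7,
        ComputedCondition { docids: vec![1, 2, 3], used_words: HashSet::from(["sun".to_owned()]) },
    );
    let mut used_words: HashSet<String> = HashSet::new();
    used_words.extend(cache.get_condition_used_words(7).iter().cloned());
    assert!(used_words.contains("sun"));
    assert_eq!(cache.cache[&7].docids, vec![1, 2, 3]);
}
```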
@@ -30,7 +30,7 @@ impl<T> Interned<T> {
 #[derive(Clone)]
 pub struct DedupInterner<T> {
     stable_store: Vec<T>,
-    lookup: FxHashMap<T, Interned<T>>,
+    lookup: FxHashMap<T, Interned<T>>, // TODO: Arc
 }
 impl<T> Default for DedupInterner<T> {
     fn default() -> Self {
@@ -287,368 +287,3 @@ impl<'a> Search<'a> {
         todo!()
     }
 }
-
-#[cfg(test)]
-mod tests {
-    // use crate::allocator::ALLOC;
-    use std::fs::File;
-    use std::io::{BufRead, BufReader, Cursor, Seek};
-    use std::time::Instant;
-
-    use big_s::S;
-    use heed::EnvOpenOptions;
-    use maplit::hashset;
-
-    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-    // use crate::search::new::logger::detailed::DetailedSearchLogger;
-    use crate::search::new::logger::DefaultSearchLogger;
-    use crate::search::new::{execute_search, SearchContext};
-    use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
-    use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy};
-
-    #[test]
-    fn search_wiki_new() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_wiki").unwrap();
-        let txn = index.read_txn().unwrap();
-
-        println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len());
-
-        loop {
-            let start = Instant::now();
-
-            // let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
-            let mut ctx = SearchContext::new(&index, &txn);
-            let results = execute_search(
-                &mut ctx,
-                "released from prison by the government",
-                // "which a the releases from poison by the government",
-                // "sun flower s are the best",
-                // "zero config",
-                TermsMatchingStrategy::Last,
-                None,
-                0,
-                20,
-                &mut DefaultSearchLogger,
-                &mut DefaultSearchLogger,
-                // &mut logger,
-            )
-            .unwrap();
-
-            // logger.write_d2_description(&mut ctx);
-
-            let elapsed = start.elapsed();
-            println!("{}us", elapsed.as_micros());
-
-            let _documents = index
-                .documents(&txn, results.documents_ids.iter().copied())
-                .unwrap()
-                .into_iter()
-                .map(|(id, obkv)| {
-                    let mut object = serde_json::Map::default();
-                    for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
-                        let value = obkv.get(fid).unwrap();
-                        let value: serde_json::Value = serde_json::from_slice(value).unwrap();
-                        object.insert(fid_name.to_owned(), value);
-                    }
-                    (id, serde_json::to_string_pretty(&object).unwrap())
-                })
-                .collect::<Vec<_>>();
-
-            println!("{}us: {:?}", elapsed.as_micros(), results);
-        }
-        // for (id, document) in documents {
-        //     println!("{id}:");
-        //     // println!("{document}");
-        // }
-    }
-
-    #[test]
-    fn search_wiki_old() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_wiki").unwrap();
-
-        let txn = index.read_txn().unwrap();
-
-        let rr = index.criteria(&txn).unwrap();
-        println!("{rr:?}");
-
-        let start = Instant::now();
-
-        let mut s = Search::new(&txn, &index);
-        s.query(
-            // "which a the releases from poison by the government",
-            // "sun flower s are the best",
-            "zero config",
-        );
-        s.terms_matching_strategy(TermsMatchingStrategy::Last);
-        // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlyIterative);
-        let docs = s.execute().unwrap();
-
-        let elapsed = start.elapsed();
-
-        let documents = index
-            .documents(&txn, docs.documents_ids.iter().copied())
-            .unwrap()
-            .into_iter()
-            .map(|(id, obkv)| {
-                let mut object = serde_json::Map::default();
-                for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
-                    let value = obkv.get(fid).unwrap();
-                    let value: serde_json::Value = serde_json::from_slice(value).unwrap();
-                    object.insert(fid_name.to_owned(), value);
-                }
-                (id, serde_json::to_string_pretty(&object).unwrap())
-            })
-            .collect::<Vec<_>>();
-
-        println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
-        for (id, _document) in documents {
-            println!("{id}:");
-            // println!("{document}");
-        }
-    }
-    #[test]
-    fn search_movies_new() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-        let txn = index.read_txn().unwrap();
-
-        // let primary_key = index.primary_key(&txn).unwrap().unwrap();
-        // let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
-        // loop {
-        let start = Instant::now();
-
-        let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
-        let mut ctx = SearchContext::new(&index, &txn);
-        let results = execute_search(
-            &mut ctx,
-            "releases from poison by the government",
-            TermsMatchingStrategy::Last,
-            None,
-            0,
-            20,
-            &mut DefaultSearchLogger,
-            &mut logger,
-        )
-        .unwrap();
-
-        logger.write_d2_description(&mut ctx);
-
-        let elapsed = start.elapsed();
-
-        // let ids = index
-        //     .documents(&txn, results.iter().copied())
-        //     .unwrap()
-        //     .into_iter()
-        //     .map(|x| {
-        //         let obkv = &x.1;
-        //         let id = obkv.get(primary_key).unwrap();
-        //         let id: serde_json::Value = serde_json::from_slice(id).unwrap();
-        //         id.as_str().unwrap().to_owned()
-        //     })
-        //     .collect::<Vec<_>>();
-
-        println!("{}us: {results:?}", elapsed.as_micros());
-        // println!("external ids: {ids:?}");
-        // }
-    }
-
-    #[test]
-    fn search_movies_old() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-
-        let txn = index.read_txn().unwrap();
-
-        let rr = index.criteria(&txn).unwrap();
-        println!("{rr:?}");
-
-        let primary_key = index.primary_key(&txn).unwrap().unwrap();
-        let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
-
-        let start = Instant::now();
-
-        let mut s = Search::new(&txn, &index);
-        s.query("which a the releases from poison by the government");
-        s.terms_matching_strategy(TermsMatchingStrategy::Last);
-        s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
-        let docs = s.execute().unwrap();
-
-        let elapsed = start.elapsed();
-
-        let ids = index
-            .documents(&txn, docs.documents_ids.iter().copied())
-            .unwrap()
-            .into_iter()
-            .map(|x| {
-                let obkv = &x.1;
-                let id = obkv.get(primary_key).unwrap();
-                let id: serde_json::Value = serde_json::from_slice(id).unwrap();
-                id.as_str().unwrap().to_owned()
-            })
-            .collect::<Vec<_>>();
-
-        println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
-        println!("external ids: {ids:?}");
-    }
-
-    #[test]
-    fn _settings_movies() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-        let mut wtxn = index.write_txn().unwrap();
-
-        let config = IndexerConfig::default();
-        let mut builder = Settings::new(&mut wtxn, &index, &config);
-
-        builder.set_min_word_len_one_typo(5);
-        builder.set_min_word_len_two_typos(100);
-        builder.set_sortable_fields(hashset! { S("release_date") });
-        builder.set_criteria(vec![
-            Criterion::Words,
-            Criterion::Typo,
-            Criterion::Proximity,
-            Criterion::Asc("release_date".to_owned()),
-        ]);
-
-        builder.execute(|_| (), || false).unwrap();
-        wtxn.commit().unwrap();
-    }
-
-    #[test]
-    fn _index_movies() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-        let mut wtxn = index.write_txn().unwrap();
-
-        let primary_key = "id";
-        let searchable_fields = vec!["title", "overview"];
-        let filterable_fields = vec!["release_date", "genres"];
-
-        let config = IndexerConfig::default();
-        let mut builder = Settings::new(&mut wtxn, &index, &config);
-        builder.set_primary_key(primary_key.to_owned());
-        let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
-        builder.set_searchable_fields(searchable_fields);
-        let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
-        builder.set_filterable_fields(filterable_fields);
-
-        builder.set_min_word_len_one_typo(5);
-        builder.set_min_word_len_two_typos(100);
-        builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
-        builder.execute(|_| (), || false).unwrap();
-
-        let config = IndexerConfig::default();
-        let indexing_config = IndexDocumentsConfig::default();
-        let builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
-                .unwrap();
-
-        let documents = documents_from(
-            "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json",
-            "json",
-        );
-        let (builder, user_error) = builder.add_documents(documents).unwrap();
-        user_error.unwrap();
-        builder.execute().unwrap();
-        wtxn.commit().unwrap();
-
-        index.prepare_for_closing().wait();
-    }
-    #[test]
-    fn _index_wiki() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_wiki").unwrap();
-        let mut wtxn = index.write_txn().unwrap();
-
-        // let primary_key = "id";
-        let searchable_fields = vec!["body", "title", "url"];
-        // let filterable_fields = vec![];
-        let config = IndexerConfig::default();
-        let mut builder = Settings::new(&mut wtxn, &index, &config);
-        // builder.set_primary_key(primary_key.to_owned());
-        let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
-        builder.set_searchable_fields(searchable_fields);
-        // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
-        // builder.set_filterable_fields(filterable_fields);
-
-        // builder.set_min_word_len_one_typo(5);
-        // builder.set_min_word_len_two_typos(100);
-        builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
-        builder.execute(|_| (), || false).unwrap();
-
-        let config = IndexerConfig::default();
-        let indexing_config =
-            IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
-        let builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
-                .unwrap();
-
-        let documents = documents_from(
-            "/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv",
-            "csv",
-        );
-        let (builder, user_error) = builder.add_documents(documents).unwrap();
-        user_error.unwrap();
-        builder.execute().unwrap();
-        wtxn.commit().unwrap();
-
-        index.prepare_for_closing().wait();
-    }
-
-    fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
-        let reader = File::open(filename)
-            .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
-        let reader = BufReader::new(reader);
-        let documents = match filetype {
-            "csv" => documents_from_csv(reader).unwrap(),
-            "json" => documents_from_json(reader).unwrap(),
-            "jsonl" => documents_from_jsonl(reader).unwrap(),
-            otherwise => panic!("invalid update format {:?}", otherwise),
-        };
-        DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
-    }
-
-    fn documents_from_jsonl(reader: impl BufRead) -> crate::Result<Vec<u8>> {
-        let mut documents = DocumentsBatchBuilder::new(Vec::new());
-
-        for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
-            let object = result.unwrap();
-            documents.append_json_object(&object)?;
-        }
-
-        documents.into_inner().map_err(Into::into)
-    }
-
-    fn documents_from_json(reader: impl BufRead) -> crate::Result<Vec<u8>> {
-        let mut documents = DocumentsBatchBuilder::new(Vec::new());
-
-        documents.append_json_array(reader)?;
-
-        documents.into_inner().map_err(Into::into)
-    }
-
-    fn documents_from_csv(reader: impl BufRead) -> crate::Result<Vec<u8>> {
-        let csv = csv::Reader::from_reader(reader);
-
-        let mut documents = DocumentsBatchBuilder::new(Vec::new());
-        documents.append_csv(csv)?;
-
-        documents.into_inner().map_err(Into::into)
-    }
-}
@@ -1,19 +1,28 @@
 use std::marker::PhantomData;
 
-use fxhash::FxHashMap;
+use fxhash::{FxHashMap, FxHashSet};
 use roaring::RoaringBitmap;
 
 use super::{RankingRuleGraph, RankingRuleGraphTrait};
 use crate::search::new::interner::Interned;
+use crate::search::new::query_term::Phrase;
 use crate::search::new::SearchContext;
 use crate::Result;
 
 // TODO: give a generation to each universe, then be able to get the exact
 // delta of docids between two universes of different generations!
 
+#[derive(Default)]
+pub struct ComputedCondition {
+    docids: RoaringBitmap,
+    universe_len: u64,
+    used_words: FxHashSet<Interned<String>>,
+    used_phrases: FxHashSet<Interned<Phrase>>,
+}
+
 /// A cache storing the document ids associated with each ranking rule edge
 pub struct ConditionDocIdsCache<G: RankingRuleGraphTrait> {
-    pub cache: FxHashMap<Interned<G::Condition>, (u64, RoaringBitmap)>,
+    pub cache: FxHashMap<Interned<G::Condition>, ComputedCondition>,
     _phantom: PhantomData<G>,
 }
 impl<G: RankingRuleGraphTrait> Default for ConditionDocIdsCache<G> {
@@ -22,6 +31,14 @@ impl<G: RankingRuleGraphTrait> Default for ConditionDocIdsCache<G> {
     }
 }
 impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
+    pub fn get_condition_used_words_and_phrases(
+        &mut self,
+        interned_condition: Interned<G::Condition>,
+    ) -> (&FxHashSet<Interned<String>>, &FxHashSet<Interned<Phrase>>) {
+        let ComputedCondition { used_words, used_phrases, .. } = &self.cache[&interned_condition];
+        (used_words, used_phrases)
+    }
+
     /// Retrieve the document ids for the given edge condition.
     ///
     /// If the cache does not yet contain these docids, they are computed
@@ -30,14 +47,14 @@ impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
         &'s mut self,
         ctx: &mut SearchContext<'ctx>,
         interned_condition: Interned<G::Condition>,
-        graph: &RankingRuleGraph<G>,
-        // TODO: maybe universe doesn't belong here
+        graph: &mut RankingRuleGraph<G>,
         universe: &RoaringBitmap,
     ) -> Result<&'s RoaringBitmap> {
         if self.cache.contains_key(&interned_condition) {
             // TODO compare length of universe compared to the one in self
             // if it is smaller, then update the value
-            let (universe_len, docids) = self.cache.entry(interned_condition).or_default();
+            let ComputedCondition { docids, universe_len, .. } =
+                self.cache.entry(interned_condition).or_default();
             if *universe_len == universe.len() {
                 return Ok(docids);
             } else {
@@ -46,12 +63,13 @@ impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
                 return Ok(docids);
             }
         }
-        // TODO: maybe universe doesn't belong here
-        let condition = graph.conditions_interner.get(interned_condition);
-        // TODO: faster way to do this?
-        let docids = G::resolve_condition(ctx, condition, universe)?;
-        let _ = self.cache.insert(interned_condition, (universe.len(), docids));
-        let (_, docids) = &self.cache[&interned_condition];
+        let condition = graph.conditions_interner.get_mut(interned_condition);
+        let (docids, used_words, used_phrases) = G::resolve_condition(ctx, condition, universe)?;
+        let _ = self.cache.insert(
+            interned_condition,
+            ComputedCondition { docids, universe_len: universe.len(), used_words, used_phrases },
+        );
+        let ComputedCondition { docids, .. } = &self.cache[&interned_condition];
         Ok(docids)
     }
 }
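Note: the hunk above is the heart of the laziness — a condition's docids are computed at most once per universe size and memoized. A compilable sketch of the same control flow under stated assumptions (`u32` ids for interned conditions, `Vec<u32>` for `RoaringBitmap`, and a dummy filter in place of `G::resolve_condition`):

```rust
use std::collections::HashMap;

#[derive(Default)]
struct ComputedCondition {
    docids: Vec<u32>,
    universe_len: u64,
}

fn get_condition_docids<'s>(
    cache: &'s mut HashMap<u32, ComputedCondition>,
    condition: u32,
    universe: &[u32],
) -> &'s Vec<u32> {
    // Reuse the entry only if it was computed for a universe of the same length
    // (the TODO in the diff suggests updating it when the universe shrank).
    if let Some(len) = cache.get(&condition).map(|e| e.universe_len) {
        if len == universe.len() as u64 {
            return &cache[&condition].docids;
        }
    }
    // Lazily (re)compute; a dummy filter stands in for G::resolve_condition.
    let docids: Vec<u32> = universe.iter().copied().filter(|d| d % 2 == 0).collect();
    cache.insert(condition, ComputedCondition { docids, universe_len: universe.len() as u64 });
    &cache[&condition].docids
}

fn main() {
    let mut cache = HashMap::new();
    let universe = [1, 2, 3, 4];
    assert_eq!(get_condition_docids(&mut cache, 0, &universe), &vec![2, 4]);
    assert_eq!(get_condition_docids(&mut cache, 0, &universe), &vec![2, 4]); // cache hit
}
```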
@@ -15,11 +15,11 @@ mod proximity;
 /// Implementation of the `typo` ranking rule
 mod typo;
 
-use std::collections::HashSet;
 use std::hash::Hash;
 
 pub use condition_docids_cache::ConditionDocIdsCache;
 pub use dead_ends_cache::DeadEndsCache;
+use fxhash::FxHashSet;
 pub use proximity::{ProximityCondition, ProximityGraph};
 use roaring::RoaringBitmap;
 pub use typo::{TypoCondition, TypoGraph};
@@ -80,23 +80,13 @@ pub trait RankingRuleGraphTrait: Sized {
         condition: &Self::Condition,
     ) -> Result<String>;
 
-    fn words_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<String>>>;
-
-    fn phrases_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<Phrase>>>;
-
     /// Compute the document ids associated with the given edge condition,
     /// restricted to the given universe.
     fn resolve_condition<'ctx>(
         ctx: &mut SearchContext<'ctx>,
         condition: &Self::Condition,
         universe: &RoaringBitmap,
-    ) -> Result<RoaringBitmap>;
+    ) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)>;
 
     /// Return the costs and conditions of the edges going from the source node to the destination node
     fn build_edges<'ctx>(
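Note: after this hunk a graph implementation exposes a single resolution entry point. A rough sketch of the reshaped contract with simplified types (these are not milli's real signatures):

```rust
use std::collections::HashSet;

// One call now returns the docids together with the words and phrases the
// condition actually used, instead of three separate trait methods.
trait RankingRuleGraphSketch {
    type Condition;
    fn resolve_condition(
        condition: &Self::Condition,
        universe: &[u32],
    ) -> (Vec<u32>, HashSet<String>, HashSet<String>); // (docids, words, phrases)
}

struct WordGraph;
impl RankingRuleGraphSketch for WordGraph {
    type Condition = String;
    fn resolve_condition(
        condition: &String,
        universe: &[u32],
    ) -> (Vec<u32>, HashSet<String>, HashSet<String>) {
        // Dummy resolution: every docid matches, the only used word is the
        // condition itself, and no phrases are involved.
        (universe.to_vec(), HashSet::from([condition.clone()]), HashSet::new())
    }
}

fn main() {
    let (docids, words, phrases) = WordGraph::resolve_condition(&"sun".to_owned(), &[1, 2]);
    assert_eq!(docids, vec![1, 2]);
    assert!(words.contains("sun") && phrases.is_empty());
}
```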
@@ -1,56 +1,18 @@
 #![allow(clippy::too_many_arguments)]
-use std::collections::BTreeMap;
-
-use heed::RoTxn;
 
 use super::ProximityCondition;
-use crate::search::new::db_cache::DatabaseCache;
 use crate::search::new::interner::{DedupInterner, Interned};
 use crate::search::new::query_graph::QueryNodeData;
-use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm};
-use crate::search::new::ranking_rule_graph::proximity::WordPair;
+use crate::search::new::query_term::LocatedQueryTerm;
 use crate::search::new::{QueryNode, SearchContext};
 use crate::Result;
 
-fn last_word_of_term_iter<'t>(
-    t: &'t QueryTerm,
-    phrase_interner: &'t DedupInterner<Phrase>,
-) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
-    t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
-        move |p| {
-            let phrase = phrase_interner.get(p);
-            phrase.words.last().unwrap().map(|last| (Some(p), last))
-        },
-    ))
-}
-fn first_word_of_term_iter<'t>(
-    t: &'t QueryTerm,
-    phrase_interner: &'t DedupInterner<Phrase>,
-) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
-    t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
-        move |p| {
-            let phrase = phrase_interner.get(p);
-            phrase.words.first().unwrap().map(|first| (first, Some(p)))
-        },
-    ))
-}
-
 pub fn build_edges<'ctx>(
-    ctx: &mut SearchContext<'ctx>,
+    _ctx: &mut SearchContext<'ctx>,
     conditions_interner: &mut DedupInterner<ProximityCondition>,
     from_node: &QueryNode,
     to_node: &QueryNode,
 ) -> Result<Vec<(u8, Option<Interned<ProximityCondition>>)>> {
-    let SearchContext {
-        index,
-        txn,
-        db_cache,
-        word_interner,
-        phrase_interner,
-        term_interner,
-        term_docids: _,
-    } = ctx;
-
     let right_term = match &to_node.data {
         QueryNodeData::End => return Ok(vec![(0, None)]),
         QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]),
@@ -59,13 +21,11 @@ pub fn build_edges<'ctx>(
 
     let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term;
 
-    let (right_term, right_start_position, right_ngram_length) =
-        (term_interner.get(*right_term_interned), *right_positions.start(), right_positions.len());
+    let (right_start_position, right_ngram_length) =
+        (*right_positions.start(), right_positions.len());
 
-    let (left_term, left_end_position) = match &from_node.data {
-        QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
-            (term_interner.get(*value), *positions.end())
-        }
+    let (left_term_interned, left_end_position) = match &from_node.data {
+        QueryNodeData::Term(LocatedQueryTerm { value, positions }) => (*value, *positions.end()),
         QueryNodeData::Deleted => return Ok(vec![]),
         QueryNodeData::Start => {
             return Ok(vec![(
@@ -94,175 +54,24 @@ pub fn build_edges<'ctx>(
         )]);
     }
 
-    let mut cost_word_pairs = BTreeMap::<u8, Vec<WordPair>>::new();
-    if let Some(right_prefix) = right_term.use_prefix_db {
-        for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
-            add_prefix_edges(
-                index,
-                txn,
-                db_cache,
-                word_interner,
-                right_ngram_length,
-                left_word,
-                right_prefix,
-                &mut cost_word_pairs,
-                left_phrase,
-            )?;
-        }
-    }
-
-    // TODO: add safeguard in case the cartesian product is too large!
-    // even if we restrict the word derivations to a maximum of 100, the size of the
-    // caterisan product could reach a maximum of 10_000 derivations, which is way too much.
-    // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo
-    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
-    // reached
-
-    for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
-        for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) {
-            add_non_prefix_edges(
-                index,
-                txn,
-                db_cache,
-                word_interner,
-                right_ngram_length,
-                left_word,
-                right_word,
-                &mut cost_word_pairs,
-                &[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
-            )?;
-        }
-    }
-
-    let mut new_edges = cost_word_pairs
-        .into_iter()
-        .map(|(cost, word_pairs)| {
-            (
-                cost,
-                Some(
-                    conditions_interner
-                        .insert(ProximityCondition::Pairs { pairs: word_pairs.into_boxed_slice() }),
-                ),
-            )
-        })
-        .collect::<Vec<_>>();
-    new_edges.push((
-        8 + (right_ngram_length - 1) as u8,
+    let mut conditions = vec![];
+    for cost in right_ngram_length..(7 + right_ngram_length) {
+        let cost = cost as u8;
+        conditions.push((
+            cost,
+            Some(conditions_interner.insert(ProximityCondition::Uninit {
+                left_term: left_term_interned,
+                right_term: *right_term_interned,
+                right_term_ngram_len: right_ngram_length as u8,
+                cost,
+            })),
+        ))
+    }
+
+    conditions.push((
+        (7 + right_ngram_length) as u8,
         Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })),
     ));
-    Ok(new_edges)
-}
-
-fn add_prefix_edges<'ctx>(
-    index: &mut &crate::Index,
-    txn: &'ctx RoTxn,
-    db_cache: &mut DatabaseCache<'ctx>,
-    word_interner: &mut DedupInterner<String>,
-    right_ngram_length: usize,
-    left_word: Interned<String>,
-    right_prefix: Interned<String>,
-    cost_proximity_word_pairs: &mut BTreeMap<u8, Vec<WordPair>>,
-    left_phrase: Option<Interned<Phrase>>,
-) -> Result<()> {
-    for proximity in 1..=(8 - right_ngram_length) {
-        let cost = (proximity + right_ngram_length - 1) as u8;
-        // TODO: if we had access to the universe here, we could already check whether
-        // the bitmap corresponding to this word pair is disjoint with the universe or not
-        if db_cache
-            .get_word_prefix_pair_proximity_docids(
-                index,
-                txn,
-                word_interner,
-                left_word,
-                right_prefix,
-                proximity as u8,
-            )?
-            .is_some()
-        {
-            cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefix {
-                phrases: left_phrase.into_iter().collect(),
-                left: left_word,
-                right_prefix,
-                proximity: proximity as u8,
-            });
-        }
-
-        // No swapping when computing the proximity between a phrase and a word
-        if left_phrase.is_none()
-            && db_cache
-                .get_prefix_word_pair_proximity_docids(
-                    index,
-                    txn,
-                    word_interner,
-                    right_prefix,
-                    left_word,
-                    proximity as u8 - 1,
-                )?
-                .is_some()
-        {
-            cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefixSwapped {
-                left_prefix: right_prefix,
-                right: left_word,
-                proximity: proximity as u8 - 1,
-            });
-        }
-    }
-    Ok(())
-}
-
-fn add_non_prefix_edges<'ctx>(
-    index: &mut &crate::Index,
-    txn: &'ctx RoTxn,
-    db_cache: &mut DatabaseCache<'ctx>,
-    word_interner: &mut DedupInterner<String>,
-    right_ngram_length: usize,
-    word1: Interned<String>,
-    word2: Interned<String>,
-    cost_proximity_word_pairs: &mut BTreeMap<u8, Vec<WordPair>>,
-    phrases: &[Interned<Phrase>],
-) -> Result<()> {
-    for proximity in 1..=(8 - right_ngram_length) {
-        let cost = (proximity + right_ngram_length - 1) as u8;
-        if db_cache
-            .get_word_pair_proximity_docids(
-                index,
-                txn,
-                word_interner,
-                word1,
-                word2,
-                proximity as u8,
-            )?
-            .is_some()
-        {
-            cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words {
-                phrases: phrases.to_vec(),
-                left: word1,
-                right: word2,
-                proximity: proximity as u8,
-            });
-        }
-        if proximity > 1
-            // no swapping when either term is a phrase
-            && phrases.is_empty()
-            && db_cache
-                .get_word_pair_proximity_docids(
-                    index,
-                    txn,
-                    word_interner,
-                    word2,
-                    word1,
-                    proximity as u8 - 1,
-                )?
-                .is_some()
-        {
-            cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words {
-                phrases: vec![],
-                left: word2,
-                right: word1,
-                proximity: proximity as u8 - 1,
-            });
-        }
-    }
-    Ok(())
+
+    Ok(conditions)
 }
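Note: a worked example of the costs produced by the loop above. For a right term that is an n-gram of length n, one lazy `Uninit` condition is created per cost in n..=n+6, plus a final `Term` ("the term merely exists") edge at cost n+7 — the same top cost the removed code expressed as `8 + (right_ngram_length - 1)`:

```rust
// Mirrors the loop bounds in `build_edges` above.
fn edge_costs(right_ngram_length: usize) -> Vec<u8> {
    let mut costs: Vec<u8> =
        (right_ngram_length..(7 + right_ngram_length)).map(|c| c as u8).collect();
    costs.push((7 + right_ngram_length) as u8); // the final `Term` edge
    costs
}

fn main() {
    assert_eq!(edge_costs(1), vec![1, 2, 3, 4, 5, 6, 7, 8]); // simple word
    assert_eq!(edge_costs(2), vec![2, 3, 4, 5, 6, 7, 8, 9]); // 2-gram
}
```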
@@ -1,6 +1,15 @@
+#![allow(clippy::too_many_arguments)]
+
+use std::iter::FromIterator;
+
+use fxhash::FxHashSet;
+use heed::RoTxn;
 use roaring::RoaringBitmap;
 
-use super::{ProximityCondition, WordPair};
+use super::ProximityCondition;
+use crate::search::new::db_cache::DatabaseCache;
+use crate::search::new::interner::{DedupInterner, Interned};
+use crate::search::new::query_term::{Phrase, QueryTerm};
 use crate::search::new::SearchContext;
 use crate::{CboRoaringBitmapCodec, Result};
 
@@ -8,7 +17,7 @@ pub fn compute_docids<'ctx>(
     ctx: &mut SearchContext<'ctx>,
     condition: &ProximityCondition,
     universe: &RoaringBitmap,
-) -> Result<RoaringBitmap> {
+) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
     let SearchContext {
         index,
         txn,
@@ -18,96 +27,238 @@ pub fn compute_docids<'ctx>(
         phrase_interner,
         term_interner,
     } = ctx;
-    let pairs = match condition {
-        ProximityCondition::Term { term } => {
-            return term_docids
-                .get_query_term_docids(
-                    index,
-                    txn,
-                    db_cache,
-                    word_interner,
-                    term_interner,
-                    phrase_interner,
-                    *term,
-                )
-                .cloned()
+    let (left_term, right_term, right_term_ngram_len, cost) = match condition {
+        ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => {
+            (*left_term, *right_term, *right_term_ngram_len, *cost)
+        }
+        ProximityCondition::Term { term } => {
+            let term_v = term_interner.get(*term);
+            return Ok((
+                term_docids
+                    .get_query_term_docids(
+                        index,
+                        txn,
+                        db_cache,
+                        word_interner,
+                        term_interner,
+                        phrase_interner,
+                        *term,
+                    )?
+                    .clone(),
+                FxHashSet::from_iter(term_v.all_single_words_except_prefix_db()),
+                FxHashSet::from_iter(term_v.all_phrases()),
+            ));
         }
-        ProximityCondition::Pairs { pairs } => pairs,
     };
-    let mut pair_docids = RoaringBitmap::new();
-    for pair in pairs.iter() {
-        let pair = match pair {
-            WordPair::Words { phrases, left, right, proximity } => {
-                let mut docids = db_cache
-                    .get_word_pair_proximity_docids(
-                        index,
-                        txn,
-                        word_interner,
-                        *left,
-                        *right,
-                        *proximity,
-                    )?
-                    .map(CboRoaringBitmapCodec::deserialize_from)
-                    .transpose()?
-                    .unwrap_or_default();
-                if !docids.is_empty() {
-                    for phrase in phrases {
-                        docids &= ctx.term_docids.get_phrase_docids(
-                            index,
-                            txn,
-                            db_cache,
-                            word_interner,
-                            &ctx.phrase_interner,
-                            *phrase,
-                        )?;
-                    }
-                }
-                docids
-            }
-            WordPair::WordPrefix { phrases, left, right_prefix, proximity } => {
-                let mut docids = db_cache
-                    .get_word_prefix_pair_proximity_docids(
-                        index,
-                        txn,
-                        word_interner,
-                        *left,
-                        *right_prefix,
-                        *proximity,
-                    )?
-                    .map(CboRoaringBitmapCodec::deserialize_from)
-                    .transpose()?
-                    .unwrap_or_default();
-                if !docids.is_empty() {
-                    for phrase in phrases {
-                        docids &= ctx.term_docids.get_phrase_docids(
-                            index,
-                            txn,
-                            db_cache,
-                            word_interner,
-                            &ctx.phrase_interner,
-                            *phrase,
-                        )?;
-                    }
-                }
-                docids
-            }
-            WordPair::WordPrefixSwapped { left_prefix, right, proximity } => db_cache
-                .get_prefix_word_pair_proximity_docids(
-                    index,
-                    txn,
-                    word_interner,
-                    *left_prefix,
-                    *right,
-                    *proximity,
-                )?
-                .map(CboRoaringBitmapCodec::deserialize_from)
-                .transpose()?
-                .unwrap_or_default(),
-        };
-        // TODO: deserialize bitmap within a universe
-        let bitmap = universe & pair;
-        pair_docids |= bitmap;
+    let left_term = term_interner.get(left_term);
+    let right_term = term_interner.get(right_term);
+
+    // e.g. for the simple words `sun .. flower`
+    // the cost is 5
+    // the forward proximity is 5
+    // the backward proximity is 4
+    //
+    // for the 2gram `the sunflower`
+    // the cost is 5
+    // the forward proximity is 4
+    // the backward proximity is 3
+    let forward_proximity = 1 + cost - right_term_ngram_len;
+    let backward_proximity = cost - right_term_ngram_len;
+
+    let mut used_words = FxHashSet::default();
+    let mut used_phrases = FxHashSet::default();
+
+    let mut docids = RoaringBitmap::new();
+
+    if let Some(right_prefix) = right_term.use_prefix_db {
+        for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
+            compute_prefix_edges(
+                index,
+                txn,
+                db_cache,
+                word_interner,
+                left_word,
+                right_prefix,
+                left_phrase,
+                forward_proximity,
+                backward_proximity,
+                &mut docids,
+                universe,
+                &mut used_words,
+                &mut used_phrases,
+            )?;
+        }
     }
 
-    Ok(pair_docids)
+    // TODO: add safeguard in case the cartesian product is too large!
+    // even if we restrict the word derivations to a maximum of 100, the size of the
+    // caterisan product could reach a maximum of 10_000 derivations, which is way too much.
+    // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo
+    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
+    // reached
+
+    for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
+        for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) {
+            compute_non_prefix_edges(
+                index,
+                txn,
+                db_cache,
+                word_interner,
+                left_word,
+                right_word,
+                &[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
+                forward_proximity,
+                backward_proximity,
+                &mut docids,
+                universe,
+                &mut used_words,
+                &mut used_phrases,
+            )?;
+        }
+    }
+
+    Ok((docids, used_words, used_phrases))
+}
+
+fn compute_prefix_edges<'ctx>(
+    index: &mut &crate::Index,
+    txn: &'ctx RoTxn,
+    db_cache: &mut DatabaseCache<'ctx>,
+    word_interner: &mut DedupInterner<String>,
+    left_word: Interned<String>,
+    right_prefix: Interned<String>,
+    left_phrase: Option<Interned<Phrase>>,
+    forward_proximity: u8,
+    backward_proximity: u8,
+    docids: &mut RoaringBitmap,
+    universe: &RoaringBitmap,
+    used_words: &mut FxHashSet<Interned<String>>,
+    used_phrases: &mut FxHashSet<Interned<Phrase>>,
+) -> Result<()> {
+    if let Some(phrase) = left_phrase {
+        // TODO: compute the phrase, take the intersection between
+        // the phrase and the docids
+        used_phrases.insert(phrase); // This is not fully correct
+    }
+
+    if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids(
+        index,
+        txn,
+        word_interner,
+        left_word,
+        right_prefix,
+        forward_proximity,
+    )? {
+        let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+        if !new_docids.is_empty() {
+            used_words.insert(left_word);
+            used_words.insert(right_prefix);
+            *docids |= new_docids;
+        }
+    }
+
+    // No swapping when computing the proximity between a phrase and a word
+    if left_phrase.is_none() {
+        if let Some(new_docids) = db_cache.get_prefix_word_pair_proximity_docids(
+            index,
+            txn,
+            word_interner,
+            right_prefix,
+            left_word,
+            backward_proximity,
+        )? {
+            let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+            if !new_docids.is_empty() {
+                used_words.insert(left_word);
+                used_words.insert(right_prefix);
+                *docids |= new_docids;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+fn compute_non_prefix_edges<'ctx>(
+    index: &mut &crate::Index,
+    txn: &'ctx RoTxn,
+    db_cache: &mut DatabaseCache<'ctx>,
+    word_interner: &mut DedupInterner<String>,
+    word1: Interned<String>,
+    word2: Interned<String>,
+    phrases: &[Interned<Phrase>],
+    forward_proximity: u8,
+    backward_proximity: u8,
+    docids: &mut RoaringBitmap,
+    universe: &RoaringBitmap,
+    used_words: &mut FxHashSet<Interned<String>>,
+    used_phrases: &mut FxHashSet<Interned<Phrase>>,
+) -> Result<()> {
+    if !phrases.is_empty() {
+        // TODO: compute the docids associated with these phrases
+        // take their intersection with the new docids
+        used_phrases.extend(phrases); // This is not fully correct
+    }
+    if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
+        index,
+        txn,
+        word_interner,
+        word1,
+        word2,
+        forward_proximity,
+    )? {
+        let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+        if !new_docids.is_empty() {
+            used_words.insert(word1);
+            used_words.insert(word2);
+            *docids |= new_docids;
+        }
+    }
+    if backward_proximity >= 1
+        // no swapping when either term is a phrase
+        && phrases.is_empty()
+    {
+        if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
+            index,
+            txn,
+            word_interner,
+            word2,
+            word1,
+            backward_proximity,
+        )? {
+            let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+            if !new_docids.is_empty() {
+                used_words.insert(word1);
+                used_words.insert(word2);
+                *docids |= new_docids;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+fn last_word_of_term_iter<'t>(
+    t: &'t QueryTerm,
+    phrase_interner: &'t DedupInterner<Phrase>,
+) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
+    t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
+        move |p| {
+            let phrase = phrase_interner.get(p);
+            phrase.words.last().unwrap().map(|last| (Some(p), last))
+        },
+    ))
+}
+fn first_word_of_term_iter<'t>(
+    t: &'t QueryTerm,
+    phrase_interner: &'t DedupInterner<Phrase>,
+) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
+    t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
+        move |p| {
+            let phrase = phrase_interner.get(p);
+            phrase.words.first().unwrap().map(|first| (first, Some(p)))
+        },
+    ))
 }
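Note: the cost-to-proximity arithmetic introduced above can be checked against the worked example in the diff's own comments — with cost 5, the simple pair `sun .. flower` (ngram length 1) gets forward proximity 5 and backward proximity 4, while the 2gram `the sunflower` gets 4 and 3:

```rust
// Same arithmetic as in `compute_docids` above.
fn proximities(cost: u8, right_term_ngram_len: u8) -> (u8, u8) {
    let forward_proximity = 1 + cost - right_term_ngram_len;
    let backward_proximity = cost - right_term_ngram_len;
    (forward_proximity, backward_proximity)
}

fn main() {
    assert_eq!(proximities(5, 1), (5, 4)); // `sun .. flower`
    assert_eq!(proximities(5, 2), (4, 3)); // 2gram `the sunflower`
}
```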
@@ -1,9 +1,7 @@
 pub mod build;
 pub mod compute_docids;
 
-use std::collections::HashSet;
-use std::iter::FromIterator;
-
+use fxhash::FxHashSet;
 use roaring::RoaringBitmap;
 
 use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
@@ -13,31 +11,17 @@ use crate::search::new::query_term::{Phrase, QueryTerm};
 use crate::search::new::{QueryGraph, QueryNode, SearchContext};
 use crate::Result;
 
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum WordPair {
-    Words {
-        phrases: Vec<Interned<Phrase>>,
-        left: Interned<String>,
-        right: Interned<String>,
-        proximity: u8,
-    },
-    WordPrefix {
-        phrases: Vec<Interned<Phrase>>,
-        left: Interned<String>,
-        right_prefix: Interned<String>,
-        proximity: u8,
-    },
-    WordPrefixSwapped {
-        left_prefix: Interned<String>,
-        right: Interned<String>,
-        proximity: u8,
-    },
-}
-
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub enum ProximityCondition {
-    Term { term: Interned<QueryTerm> },
-    Pairs { pairs: Box<[WordPair]> },
+    Uninit {
+        left_term: Interned<QueryTerm>,
+        right_term: Interned<QueryTerm>,
+        right_term_ngram_len: u8,
+        cost: u8,
+    },
+    Term {
+        term: Interned<QueryTerm>,
+    },
 }
 
 pub enum ProximityGraph {}
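Note: with `WordPair` gone, a `ProximityCondition::Uninit` carries only the two interned terms, the right term's ngram length, and the cost — exactly the inputs `compute_docids` needs to derive the word pairs later. The potentially large cartesian product of word derivations is therefore no longer materialized while the graph is being built, only when a condition's docids are first requested.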
@@ -49,7 +33,8 @@ impl RankingRuleGraphTrait for ProximityGraph {
         ctx: &mut SearchContext<'ctx>,
         condition: &Self::Condition,
         universe: &RoaringBitmap,
-    ) -> Result<roaring::RoaringBitmap> {
+    ) -> Result<(roaring::RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)>
+    {
         compute_docids::compute_docids(ctx, condition, universe)
     }
 
@@ -79,107 +64,14 @@ impl RankingRuleGraphTrait for ProximityGraph {
         condition: &Self::Condition,
     ) -> Result<String> {
         match condition {
+            ProximityCondition::Uninit { cost, .. } => {
+                // TODO
+                Ok(format!("{cost}: cost"))
+            }
             ProximityCondition::Term { term } => {
                 let term = ctx.term_interner.get(*term);
                 Ok(format!("{} : exists", ctx.word_interner.get(term.original)))
             }
-            ProximityCondition::Pairs { pairs } => {
-                let mut s = String::new();
-                for pair in pairs.iter() {
-                    match pair {
-                        WordPair::Words { phrases, left, right, proximity } => {
-                            let left = ctx.word_interner.get(*left);
-                            let right = ctx.word_interner.get(*right);
-                            if !phrases.is_empty() {
-                                s.push_str(&format!("{} phrases + ", phrases.len()));
-                            }
-                            s.push_str(&format!("\"{left} {right}\": {proximity}\n"));
-                        }
-                        WordPair::WordPrefix { phrases, left, right_prefix, proximity } => {
-                            let left = ctx.word_interner.get(*left);
-                            let right = ctx.word_interner.get(*right_prefix);
-                            if !phrases.is_empty() {
-                                s.push_str(&format!("{} phrases + ", phrases.len()));
-                            }
-                            s.push_str(&format!("\"{left} {right}...\" : {proximity}\n"));
-                        }
-                        WordPair::WordPrefixSwapped { left_prefix, right, proximity } => {
-                            let left = ctx.word_interner.get(*left_prefix);
-                            let right = ctx.word_interner.get(*right);
-                            s.push_str(&format!("\"{left}... {right}\" : {proximity}\n"));
-                        }
-                    }
-                }
-                Ok(s)
-            }
-        }
-    }
-
-    fn words_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<String>>> {
-        match condition {
-            ProximityCondition::Term { term } => {
-                let term = ctx.term_interner.get(*term);
-                Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
-            }
-            ProximityCondition::Pairs { pairs } => {
-                let mut set = HashSet::new();
-                for pair in pairs.iter() {
-                    match pair {
-                        WordPair::Words { phrases: _, left, right, proximity: _ } => {
-                            set.insert(*left);
-                            set.insert(*right);
-                        }
-                        WordPair::WordPrefix { phrases: _, left, right_prefix, proximity: _ } => {
-                            set.insert(*left);
-                            // TODO: this is not correct, there should be another trait method for collecting the prefixes
-                            // to be used with the prefix DBs
-                            set.insert(*right_prefix);
-                        }
-                        WordPair::WordPrefixSwapped { left_prefix, right, proximity: _ } => {
-                            // TODO: this is not correct, there should be another trait method for collecting the prefixes
-                            // to be used with the prefix DBs
-                            set.insert(*left_prefix);
-                            set.insert(*right);
-                        }
-                    }
-                }
-                Ok(set)
-            }
-        }
-    }
-
-    fn phrases_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<Phrase>>> {
-        match condition {
-            ProximityCondition::Term { term } => {
-                let term = ctx.term_interner.get(*term);
-                Ok(HashSet::from_iter(term.all_phrases()))
-            }
-            ProximityCondition::Pairs { pairs } => {
-                let mut set = HashSet::new();
-                for pair in pairs.iter() {
-                    match pair {
-                        WordPair::Words { phrases, left: _, right: _, proximity: _ } => {
-                            set.extend(phrases.iter().copied());
-                        }
-                        WordPair::WordPrefix {
-                            phrases,
-                            left: _,
-                            right_prefix: _,
-                            proximity: _,
-                        } => {
-                            set.extend(phrases.iter().copied());
-                        }
-                        WordPair::WordPrefixSwapped { left_prefix: _, right: _, proximity: _ } => {}
-                    }
-                }
-                Ok(set)
-            }
-        }
-    }
+        }
+    }
 }
@@ -1,7 +1,8 @@
-use std::collections::HashSet;
+// use std::collections::HashSet;
 use std::fmt::Write;
 use std::iter::FromIterator;
 
+use fxhash::FxHashSet;
 use roaring::RoaringBitmap;
 
 use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
@@ -26,7 +27,7 @@ impl RankingRuleGraphTrait for TypoGraph {
         ctx: &mut SearchContext<'ctx>,
         condition: &Self::Condition,
         universe: &RoaringBitmap,
-    ) -> Result<RoaringBitmap> {
+    ) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
         let SearchContext {
             index,
             txn,
@@ -48,7 +49,12 @@ impl RankingRuleGraphTrait for TypoGraph {
             condition.term,
         )?;
 
-        Ok(docids)
+        let term = term_interner.get(condition.term);
+        Ok((
+            docids,
+            FxHashSet::from_iter(term.all_single_words_except_prefix_db()),
+            FxHashSet::from_iter(term.all_phrases()),
+        ))
     }
 
     fn build_edges<'ctx>(
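Note: for the typo graph the used words and phrases fall out directly from the condition's interned `QueryTerm`, so `resolve_condition` simply reads them off the term it already fetched; the old `words_used_by_condition` / `phrases_used_by_condition` implementations are kept below only as commented-out code.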
@@ -202,21 +208,21 @@ impl RankingRuleGraphTrait for TypoGraph {
         Ok(s)
     }
 
-    fn words_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<String>>> {
-        let TypoCondition { term, .. } = condition;
-        let term = ctx.term_interner.get(*term);
-        Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
-    }
-
-    fn phrases_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<Phrase>>> {
-        let TypoCondition { term, .. } = condition;
-        let term = ctx.term_interner.get(*term);
-        Ok(HashSet::from_iter(term.all_phrases()))
-    }
+    // fn words_used_by_condition<'ctx>(
+    //     ctx: &mut SearchContext<'ctx>,
+    //     condition: &Self::Condition,
+    // ) -> Result<HashSet<Interned<String>>> {
+    //     let TypoCondition { term, .. } = condition;
+    //     let term = ctx.term_interner.get(*term);
+    //     Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
+    // }
+
+    // fn phrases_used_by_condition<'ctx>(
+    //     ctx: &mut SearchContext<'ctx>,
+    //     condition: &Self::Condition,
+    // ) -> Result<HashSet<Interned<Phrase>>> {
+    //     let TypoCondition { term, .. } = condition;
+    //     let term = ctx.term_interner.get(*term);
+    //     Ok(HashSet::from_iter(term.all_phrases()))
+    // }
 }
@@ -125,7 +125,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
     let mut results = vec![];
     let mut cur_offset = 0usize;
 
-    /// Add the candidates to the results. Take `distinct`, `from`, `limit`, and `cur_offset`
+    /// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset`
     /// into account and inform the logger.
     macro_rules! maybe_add_to_results {
         ($candidates:expr) => {
@@ -181,6 +181,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
             cur_offset += len as usize;
         };
     }
+
     while results.len() < length {
         // The universe for this bucket is zero or one element, so we don't need to sort
         // anything, just extend the results and go back to the parent ranking rule.
@@ -9,9 +9,9 @@ use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
 use crate::{Result, TermsMatchingStrategy};
 
 pub struct Words {
-    exhausted: bool,
+    exhausted: bool, // TODO: remove
     query_graph: Option<QueryGraph>,
-    iterating: bool,
+    iterating: bool, // TODO: remove
     positions_to_remove: Vec<i8>,
     terms_matching_strategy: TermsMatchingStrategy,
 }