mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-26 20:15:07 +08:00
Fix bug in computation of query term at a position
This commit is contained in:
parent
11f814821d
commit
1b514517f5
@ -100,8 +100,6 @@ impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static mut COUNT_PATHS: usize = 0;
|
|
||||||
|
|
||||||
/// The internal state of a graph-based ranking rule during iteration
|
/// The internal state of a graph-based ranking rule during iteration
|
||||||
pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
|
pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
|
||||||
/// The current graph
|
/// The current graph
|
||||||
@ -219,34 +217,11 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
|
|||||||
// the number of future candidate paths given by that same function.
|
// the number of future candidate paths given by that same function.
|
||||||
|
|
||||||
let mut subpaths_docids: Vec<(Interned<G::Condition>, RoaringBitmap)> = vec![];
|
let mut subpaths_docids: Vec<(Interned<G::Condition>, RoaringBitmap)> = vec![];
|
||||||
let mut at_least_one = false;
|
|
||||||
|
|
||||||
// unsafe {
|
|
||||||
// if COUNT_PATHS >= 1489 && COUNT_PATHS < 1491 {
|
|
||||||
// println!("COUNT_PATHS {COUNT_PATHS} COST {cost}, NODES {COUNT_VISITED_NODES}, UNIVERSE {}", universe.len());
|
|
||||||
// // let all_costs = all_costs.get(graph.query_graph.root_node);
|
|
||||||
// // println!("{all_costs:?}");
|
|
||||||
// dead_ends_cache.debug_print(0);
|
|
||||||
// println!("{universe:?}");
|
|
||||||
|
|
||||||
// println!("==================");
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
let mut nodes_with_removed_outgoing_conditions = BTreeSet::new();
|
let mut nodes_with_removed_outgoing_conditions = BTreeSet::new();
|
||||||
let visitor = PathVisitor::new(cost, graph, all_costs, dead_ends_cache);
|
let visitor = PathVisitor::new(cost, graph, all_costs, dead_ends_cache);
|
||||||
|
|
||||||
visitor.visit_paths(&mut |path, graph, dead_ends_cache| {
|
visitor.visit_paths(&mut |path, graph, dead_ends_cache| {
|
||||||
unsafe {
|
|
||||||
COUNT_PATHS += 1;
|
|
||||||
}
|
|
||||||
// if self.id == "position" {
|
|
||||||
// at_least_one = true;
|
|
||||||
// print!(".");
|
|
||||||
// }
|
|
||||||
// if self.id == "fid" {
|
|
||||||
at_least_one = true;
|
|
||||||
// print!("!");
|
|
||||||
// }
|
|
||||||
considered_paths.push(path.to_vec());
|
considered_paths.push(path.to_vec());
|
||||||
// If the universe is empty, stop exploring the graph, since no docids will ever be found anymore.
|
// If the universe is empty, stop exploring the graph, since no docids will ever be found anymore.
|
||||||
if universe.is_empty() {
|
if universe.is_empty() {
|
||||||
|
@ -35,7 +35,6 @@ impl RankingRuleGraphTrait for PositionGraph {
|
|||||||
*position,
|
*position,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(ComputedCondition {
|
Ok(ComputedCondition {
|
||||||
docids,
|
docids,
|
||||||
universe_len: universe.len(),
|
universe_len: universe.len(),
|
||||||
@ -91,15 +90,12 @@ impl RankingRuleGraphTrait for PositionGraph {
|
|||||||
};
|
};
|
||||||
positions_for_costs.entry(cost).or_default().push(position);
|
positions_for_costs.entry(cost).or_default().push(position);
|
||||||
}
|
}
|
||||||
println!(
|
|
||||||
"positions for cost {} : {positions_for_costs:?}",
|
|
||||||
term.term_subset.description(ctx)
|
|
||||||
);
|
|
||||||
let mut edges = vec![];
|
let mut edges = vec![];
|
||||||
|
|
||||||
for (cost, positions) in positions_for_costs {
|
for (cost, positions) in positions_for_costs {
|
||||||
// TODO: We can improve performances and relevancy by storing
|
// TODO: We can improve performances and relevancy by storing
|
||||||
// the term subsets associated to each position fetched.
|
// the term subsets associated to each position fetched
|
||||||
edges.push((
|
edges.push((
|
||||||
cost,
|
cost,
|
||||||
conditions_interner.insert(PositionCondition {
|
conditions_interner.insert(PositionCondition {
|
||||||
|
@ -69,16 +69,14 @@ pub fn compute_query_term_subset_docids_within_field_id(
|
|||||||
}
|
}
|
||||||
|
|
||||||
for phrase in term.all_phrases(ctx)? {
|
for phrase in term.all_phrases(ctx)? {
|
||||||
let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone();
|
|
||||||
// There may be false positives when resolving a phrase, so we're not
|
// There may be false positives when resolving a phrase, so we're not
|
||||||
// guaranteed that all of its words are within a single fid.
|
// guaranteed that all of its words are within a single fid.
|
||||||
// TODO: fix this?
|
// TODO: fix this?
|
||||||
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
|
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
|
||||||
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? {
|
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? {
|
||||||
phrase_docids &= word_fid_docids;
|
docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
docids |= phrase_docids;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(word_prefix) = term.use_prefix_db(ctx) {
|
if let Some(word_prefix) = term.use_prefix_db(ctx) {
|
||||||
@ -98,7 +96,6 @@ pub fn compute_query_term_subset_docids_within_position(
|
|||||||
position: u16,
|
position: u16,
|
||||||
) -> Result<RoaringBitmap> {
|
) -> Result<RoaringBitmap> {
|
||||||
// TODO Use the roaring::MultiOps trait
|
// TODO Use the roaring::MultiOps trait
|
||||||
|
|
||||||
let mut docids = RoaringBitmap::new();
|
let mut docids = RoaringBitmap::new();
|
||||||
for word in term.all_single_words_except_prefix_db(ctx)? {
|
for word in term.all_single_words_except_prefix_db(ctx)? {
|
||||||
if let Some(word_position_docids) =
|
if let Some(word_position_docids) =
|
||||||
@ -109,16 +106,14 @@ pub fn compute_query_term_subset_docids_within_position(
|
|||||||
}
|
}
|
||||||
|
|
||||||
for phrase in term.all_phrases(ctx)? {
|
for phrase in term.all_phrases(ctx)? {
|
||||||
let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone();
|
|
||||||
// It's difficult to know the expected position of the words in the phrase,
|
// It's difficult to know the expected position of the words in the phrase,
|
||||||
// so instead we just check the first one.
|
// so instead we just check the first one.
|
||||||
// TODO: fix this?
|
// TODO: fix this?
|
||||||
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
|
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
|
||||||
if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? {
|
if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? {
|
||||||
phrase_docids &= word_position_docids;
|
docids |= ctx.get_phrase_docids(phrase)? & word_position_docids
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
docids |= phrase_docids;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(word_prefix) = term.use_prefix_db(ctx) {
|
if let Some(word_prefix) = term.use_prefix_db(ctx) {
|
||||||
@ -128,7 +123,6 @@ pub fn compute_query_term_subset_docids_within_position(
|
|||||||
docids |= word_position_docids;
|
docids |= word_position_docids;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(docids)
|
Ok(docids)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
78
milli/src/search/new/tests/integration.rs
Normal file
78
milli/src/search/new/tests/integration.rs
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
use std::io::Cursor;
|
||||||
|
|
||||||
|
use big_s::S;
|
||||||
|
use heed::EnvOpenOptions;
|
||||||
|
use maplit::{hashmap, hashset};
|
||||||
|
|
||||||
|
use crate::{
|
||||||
|
db_snap,
|
||||||
|
documents::{DocumentsBatchBuilder, DocumentsBatchReader},
|
||||||
|
update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings},
|
||||||
|
Criterion, Index, Object,
|
||||||
|
};
|
||||||
|
pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson");
|
||||||
|
|
||||||
|
pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let config = IndexerConfig::default();
|
||||||
|
|
||||||
|
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||||
|
|
||||||
|
builder.set_criteria(criteria.to_vec());
|
||||||
|
builder.set_filterable_fields(hashset! {
|
||||||
|
S("tag"),
|
||||||
|
S("asc_desc_rank"),
|
||||||
|
S("_geo"),
|
||||||
|
S("opt1"),
|
||||||
|
S("opt1.opt2"),
|
||||||
|
S("tag_in")
|
||||||
|
});
|
||||||
|
builder.set_sortable_fields(hashset! {
|
||||||
|
S("tag"),
|
||||||
|
S("asc_desc_rank"),
|
||||||
|
});
|
||||||
|
builder.set_synonyms(hashmap! {
|
||||||
|
S("hello") => vec![S("good morning")],
|
||||||
|
S("world") => vec![S("earth")],
|
||||||
|
S("america") => vec![S("the united states")],
|
||||||
|
});
|
||||||
|
builder.set_searchable_fields(vec![S("title"), S("description")]);
|
||||||
|
builder.execute(|_| (), || false).unwrap();
|
||||||
|
|
||||||
|
// index documents
|
||||||
|
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
||||||
|
let indexing_config = IndexDocumentsConfig::default();
|
||||||
|
|
||||||
|
let builder =
|
||||||
|
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
|
||||||
|
let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
let reader = Cursor::new(CONTENT.as_bytes());
|
||||||
|
|
||||||
|
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
||||||
|
let object = result.unwrap();
|
||||||
|
documents_builder.append_json_object(&object).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
let vector = documents_builder.into_inner().unwrap();
|
||||||
|
|
||||||
|
// index documents
|
||||||
|
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
|
||||||
|
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||||
|
user_error.unwrap();
|
||||||
|
builder.execute().unwrap();
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn snapshot_integration_dataset() {
|
||||||
|
let index = setup_search_index_with_criteria(&[Criterion::Attribute]);
|
||||||
|
db_snap!(index, word_position_docids, @"3c9347a767bceef3beb31465f1e5f3ae");
|
||||||
|
}
|
@ -3,16 +3,17 @@ pub mod attribute_position;
|
|||||||
pub mod distinct;
|
pub mod distinct;
|
||||||
pub mod exactness;
|
pub mod exactness;
|
||||||
pub mod geo_sort;
|
pub mod geo_sort;
|
||||||
|
pub mod integration;
|
||||||
#[cfg(feature = "default")]
|
#[cfg(feature = "default")]
|
||||||
pub mod language;
|
pub mod language;
|
||||||
pub mod ngram_split_words;
|
pub mod ngram_split_words;
|
||||||
pub mod proximity;
|
pub mod proximity;
|
||||||
pub mod proximity_typo;
|
pub mod proximity_typo;
|
||||||
pub mod sort;
|
pub mod sort;
|
||||||
|
pub mod stop_words;
|
||||||
pub mod typo;
|
pub mod typo;
|
||||||
pub mod typo_proximity;
|
pub mod typo_proximity;
|
||||||
pub mod words_tms;
|
pub mod words_tms;
|
||||||
pub mod stop_words;
|
|
||||||
|
|
||||||
fn collect_field_values(
|
fn collect_field_values(
|
||||||
index: &crate::Index,
|
index: &crate::Index,
|
||||||
|
Loading…
Reference in New Issue
Block a user