From 1b514517f52efb833d4dc98361d70827d760ad6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 1 May 2023 16:26:01 +0200 Subject: [PATCH] Fix bug in computation of query term at a position --- .../search/new/graph_based_ranking_rule.rs | 25 ------ .../new/ranking_rule_graph/position/mod.rs | 8 +- milli/src/search/new/resolve_query_graph.rs | 10 +-- milli/src/search/new/tests/integration.rs | 78 +++++++++++++++++++ milli/src/search/new/tests/mod.rs | 3 +- 5 files changed, 84 insertions(+), 40 deletions(-) create mode 100644 milli/src/search/new/tests/integration.rs diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 379a0b2ab..d8f6836e7 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -100,8 +100,6 @@ impl GraphBasedRankingRule { } } -static mut COUNT_PATHS: usize = 0; - /// The internal state of a graph-based ranking rule during iteration pub struct GraphBasedRankingRuleState { /// The current graph @@ -219,34 +217,11 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase // the number of future candidate paths given by that same function. let mut subpaths_docids: Vec<(Interned, RoaringBitmap)> = vec![]; - let mut at_least_one = false; - // unsafe { - // if COUNT_PATHS >= 1489 && COUNT_PATHS < 1491 { - // println!("COUNT_PATHS {COUNT_PATHS} COST {cost}, NODES {COUNT_VISITED_NODES}, UNIVERSE {}", universe.len()); - // // let all_costs = all_costs.get(graph.query_graph.root_node); - // // println!("{all_costs:?}"); - // dead_ends_cache.debug_print(0); - // println!("{universe:?}"); - - // println!("=================="); - // } - // } let mut nodes_with_removed_outgoing_conditions = BTreeSet::new(); let visitor = PathVisitor::new(cost, graph, all_costs, dead_ends_cache); visitor.visit_paths(&mut |path, graph, dead_ends_cache| { - unsafe { - COUNT_PATHS += 1; - } - // if self.id == "position" { - // at_least_one = true; - // print!("."); - // } - // if self.id == "fid" { - at_least_one = true; - // print!("!"); - // } considered_paths.push(path.to_vec()); // If the universe is empty, stop exploring the graph, since no docids will ever be found anymore. if universe.is_empty() { diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index c3d33174b..d4640097e 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -35,7 +35,6 @@ impl RankingRuleGraphTrait for PositionGraph { *position, )?; } - Ok(ComputedCondition { docids, universe_len: universe.len(), @@ -91,15 +90,12 @@ impl RankingRuleGraphTrait for PositionGraph { }; positions_for_costs.entry(cost).or_default().push(position); } - println!( - "positions for cost {} : {positions_for_costs:?}", - term.term_subset.description(ctx) - ); + let mut edges = vec![]; for (cost, positions) in positions_for_costs { // TODO: We can improve performances and relevancy by storing - // the term subsets associated to each position fetched. + // the term subsets associated to each position fetched edges.push(( cost, conditions_interner.insert(PositionCondition { diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 131dcd856..2bcb4d3ac 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -69,16 +69,14 @@ pub fn compute_query_term_subset_docids_within_field_id( } for phrase in term.all_phrases(ctx)? { - let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone(); // There may be false positives when resolving a phrase, so we're not // guaranteed that all of its words are within a single fid. // TODO: fix this? if let Some(word) = phrase.words(ctx).iter().flatten().next() { if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? { - phrase_docids &= word_fid_docids; + docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids; } } - docids |= phrase_docids; } if let Some(word_prefix) = term.use_prefix_db(ctx) { @@ -98,7 +96,6 @@ pub fn compute_query_term_subset_docids_within_position( position: u16, ) -> Result { // TODO Use the roaring::MultiOps trait - let mut docids = RoaringBitmap::new(); for word in term.all_single_words_except_prefix_db(ctx)? { if let Some(word_position_docids) = @@ -109,16 +106,14 @@ pub fn compute_query_term_subset_docids_within_position( } for phrase in term.all_phrases(ctx)? { - let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone(); // It's difficult to know the expected position of the words in the phrase, // so instead we just check the first one. // TODO: fix this? if let Some(word) = phrase.words(ctx).iter().flatten().next() { if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? { - phrase_docids &= word_position_docids; + docids |= ctx.get_phrase_docids(phrase)? & word_position_docids } } - docids |= phrase_docids; } if let Some(word_prefix) = term.use_prefix_db(ctx) { @@ -128,7 +123,6 @@ pub fn compute_query_term_subset_docids_within_position( docids |= word_position_docids; } } - Ok(docids) } diff --git a/milli/src/search/new/tests/integration.rs b/milli/src/search/new/tests/integration.rs new file mode 100644 index 000000000..153a80343 --- /dev/null +++ b/milli/src/search/new/tests/integration.rs @@ -0,0 +1,78 @@ +use std::io::Cursor; + +use big_s::S; +use heed::EnvOpenOptions; +use maplit::{hashmap, hashset}; + +use crate::{ + db_snap, + documents::{DocumentsBatchBuilder, DocumentsBatchReader}, + update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}, + Criterion, Index, Object, +}; +pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson"); + +pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let config = IndexerConfig::default(); + + let mut builder = Settings::new(&mut wtxn, &index, &config); + + builder.set_criteria(criteria.to_vec()); + builder.set_filterable_fields(hashset! { + S("tag"), + S("asc_desc_rank"), + S("_geo"), + S("opt1"), + S("opt1.opt2"), + S("tag_in") + }); + builder.set_sortable_fields(hashset! { + S("tag"), + S("asc_desc_rank"), + }); + builder.set_synonyms(hashmap! { + S("hello") => vec![S("good morning")], + S("world") => vec![S("earth")], + S("america") => vec![S("the united states")], + }); + builder.set_searchable_fields(vec![S("title"), S("description")]); + builder.execute(|_| (), || false).unwrap(); + + // index documents + let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; + let indexing_config = IndexDocumentsConfig::default(); + + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); + let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); + let reader = Cursor::new(CONTENT.as_bytes()); + + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { + let object = result.unwrap(); + documents_builder.append_json_object(&object).unwrap(); + } + + let vector = documents_builder.into_inner().unwrap(); + + // index documents + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + + wtxn.commit().unwrap(); + + index +} + +#[test] +fn snapshot_integration_dataset() { + let index = setup_search_index_with_criteria(&[Criterion::Attribute]); + db_snap!(index, word_position_docids, @"3c9347a767bceef3beb31465f1e5f3ae"); +} diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index 2ad806a87..906aeda83 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -3,16 +3,17 @@ pub mod attribute_position; pub mod distinct; pub mod exactness; pub mod geo_sort; +pub mod integration; #[cfg(feature = "default")] pub mod language; pub mod ngram_split_words; pub mod proximity; pub mod proximity_typo; pub mod sort; +pub mod stop_words; pub mod typo; pub mod typo_proximity; pub mod words_tms; -pub mod stop_words; fn collect_field_values( index: &crate::Index,