diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index ef4880cfb..a8e3f3916 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -77,6 +77,10 @@ impl RankingRuleGraphTrait for PositionGraph { let cost = { let mut cost = 0; for i in 0..term.term_ids.len() { + // This is actually not fully correct and slightly penalises ngrams unfairly. + // Because if two words are in the same bucketed position (e.g. 32) and consecutive, + // then their position cost will be 32+32=64, but an ngram of these two words at the + // same position will have a cost of 32+32+1=65 cost += position as u32 + i as u32; } cost diff --git a/milli/src/search/new/tests/attribute_fid.rs b/milli/src/search/new/tests/attribute_fid.rs index ec7b7a69e..d71c57f2c 100644 --- a/milli/src/search/new/tests/attribute_fid.rs +++ b/milli/src/search/new/tests/attribute_fid.rs @@ -88,7 +88,25 @@ fn create_index() -> TempIndex { "title": "the quick", "description": "", "plot": "brown fox jumps over the lazy dog", - } + }, + { + "id": 12, + "title": "", + "description": "the quickbrownfox", + "plot": "jumps over the lazy dog", + }, + { + "id": 13, + "title": "", + "description": "the quick brown fox", + "plot": "jumps over the lazy dog", + }, + { + "id": 14, + "title": "", + "description": "the quickbrownfox", + "plot": "jumps overthelazy dog", + }, ])) .unwrap(); index @@ -104,5 +122,18 @@ fn test_attribute_fid_simple() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 6, 5, 4, 3, 9, 7, 8, 11, 10, 0]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 6, 5, 4, 3, 9, 7, 8, 11, 10, 12, 13, 14, 0]"); +} + +#[test] +fn test_attribute_fid_ngrams() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 6, 5, 4, 3, 9, 7, 8, 11, 10, 12, 13, 14, 0]"); } diff --git a/milli/src/search/new/tests/attribute_position.rs b/milli/src/search/new/tests/attribute_position.rs index e4ed8b5ff..08b38684b 100644 --- a/milli/src/search/new/tests/attribute_position.rs +++ b/milli/src/search/new/tests/attribute_position.rs @@ -8,7 +8,11 @@ fn create_index() -> TempIndex { index .update_settings(|s| { s.set_primary_key("id".to_owned()); - s.set_searchable_fields(vec!["text".to_owned(), "other".to_owned()]); + s.set_searchable_fields(vec![ + "text".to_owned(), + "text2".to_owned(), + "other".to_owned(), + ]); s.set_criteria(vec![Criterion::Attribute]); }) .unwrap(); @@ -83,8 +87,41 @@ fn create_index() -> TempIndex { a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + quickbrown", + }, + { + "id": 8, + "text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a quick brown", }, + { + "id": 9, + "text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a + quickbrown", + }, + { + "id": 10, + "text": "quick brown", + "text2": "brown quick", + }, + { + "id": 11, + "text": "quickbrown", + }, + { + "id": 12, + "text": "quick brown", + }, + { + "id": 13, + "text": "quickbrown", + }, ])) .unwrap(); index @@ -94,7 +131,7 @@ fn create_index() -> TempIndex { fn test_attribute_position_simple() { let index = create_index(); - db_snap!(index, word_position_docids, @"fe86911166fa4c0903c512fd86ec65e4"); + db_snap!(index, word_position_docids, @"1ad58847d772924d8aab5e92be8cf0cc"); let txn = index.read_txn().unwrap(); @@ -102,7 +139,7 @@ fn test_attribute_position_simple() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("quick brown"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 2, 1, 0, 6, 7, 5]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 3, 4, 2, 1, 0, 6, 8, 7, 9, 5]"); } #[test] fn test_attribute_position_repeated() { @@ -114,5 +151,31 @@ fn test_attribute_position_repeated() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("a a a a a"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 7, 6]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 7, 8, 9, 6]"); +} + +#[test] +fn test_attribute_position_different_fields() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("quick brown"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 3, 4, 2, 1, 0, 6, 8, 7, 9, 5]"); +} + +#[test] +fn test_attribute_position_ngrams() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("quick brown"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 11, 12, 13, 3, 4, 2, 1, 0, 6, 8, 7, 9, 5]"); }