diff --git a/.gitignore b/.gitignore index 107b5bb36..02c4fcd79 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,10 @@ *.csv *.mmdb *.svg + +# Snapshots +## ... large +*.full.snap + +## ... unreviewed +*.snap.new diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 37c7b7c84..2bb6a50a1 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -51,7 +51,9 @@ csv = "1.1.6" [dev-dependencies] big_s = "1.0.2" +insta = "1.18.1" maplit = "1.0.2" +md5 = "0.7.0" rand = "0.8.5" [features] diff --git a/milli/src/error.rs b/milli/src/error.rs index 80c923bd9..c817f64fa 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -99,7 +99,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco )] InvalidDocumentId { document_id: Value }, #[error("Invalid facet distribution, the fields `{}` are not set as filterable.", - .invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ") + .invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ") )] InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> }, #[error(transparent)] @@ -111,7 +111,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco match .valid_fields.is_empty() { true => "This index does not have configured sortable attributes.".to_string(), false => format!("Available sortable attributes are: `{}`.", - valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ") + valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ") ), } )] diff --git a/milli/src/index.rs b/milli/src/index.rs index 43888a177..36e15c181 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1183,13 +1183,12 @@ pub(crate) mod tests { use big_s::S; use heed::{EnvOpenOptions, RwTxn}; - use maplit::btreemap; use tempfile::TempDir; use crate::documents::DocumentsBatchReader; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{self, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; - use crate::Index; + use crate::{db_snap, Index}; pub(crate) struct TempIndex { pub inner: Index, @@ -1288,17 +1287,30 @@ pub(crate) mod tests { ])) .unwrap(); - let rtxn = index.read_txn().unwrap(); - let field_distribution = index.field_distribution(&rtxn).unwrap(); - assert_eq!( - field_distribution, - btreemap! { - "id".to_string() => 2, - "name".to_string() => 2, - "age".to_string() => 1, - } + db_snap!(index, field_distribution, 1); + + db_snap!(index, word_docids, + @r###" + 1 [0, ] + 2 [1, ] + 20 [1, ] + bob [1, ] + kevin [0, ] + "### ); + db_snap!(index, field_distribution); + + db_snap!(index, field_distribution, + @" + age 1 + id 2 + name 2 + " + ); + + // snapshot_index!(&index, "1", include: "^field_distribution$"); + + // we add all the documents a second time. we are supposed to get the same + // field_distribution in the end index .add_documents(documents!([ ... ])) .unwrap(); @@ -1309,16 +1321,12 @@ - let rtxn = index.read_txn().unwrap(); - - let field_distribution = index.field_distribution(&rtxn).unwrap(); - assert_eq!( - field_distribution, - btreemap! { - "id".to_string() => 2, - "name".to_string() => 2, - "age".to_string() => 1, - } + db_snap!(index, field_distribution, + @r###" + age 1 + id 2 + name 2 + "### ); // then we update a document by removing one field and another by adding one field index .add_documents(documents!([ ... ])) .unwrap(); @@ -1329,16 +1337,12 @@ - let rtxn = index.read_txn().unwrap(); - - let field_distribution = index.field_distribution(&rtxn).unwrap(); - assert_eq!( - field_distribution, - btreemap! 
{ - "id".to_string() => 2, - "name".to_string() => 2, - "has_dog".to_string() => 1, - } + db_snap!(index, field_distribution, + @r###" + has_dog 1 + id 2 + name 2 + "### ); } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 09cecb228..85b25cad1 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -13,6 +13,10 @@ pub mod proximity; mod search; pub mod update; +#[cfg(test)] +#[macro_use] +pub mod snapshot_tests; + use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; use std::hash::BuildHasherDefault; diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 3d67b60c0..d8feeeee9 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -632,25 +632,59 @@ mod tests { ]), ], ); - - let expected = vec![ - vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]], - vec![ - vec![Query { prefix: false, kind: QueryKind::exact(S("manythe")) }], - vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }], - ], - vec![ - vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }], - vec![Query { prefix: false, kind: QueryKind::exact(S("thefish")) }], - ], - vec![ - vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }], - vec![Query { prefix: false, kind: QueryKind::exact(S("the")) }], - vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }], - ], - ]; - let result = flatten_query_tree(&query_tree); - assert_eq!(expected, result); + + insta::assert_debug_snapshot!(result, @r###" + [ + [ + [ + Exact { + word: "manythefish", + }, + ], + ], + [ + [ + Exact { + word: "manythe", + }, + ], + [ + Exact { + word: "fish", + }, + ], + ], + [ + [ + Exact { + word: "many", + }, + ], + [ + Exact { + word: "thefish", + }, + ], + ], + [ + [ + Exact { + word: "many", + }, + ], + [ + Exact { + word: "the", + }, + ], + [ + Exact { + word: "fish", + }, + ], + ], + ] + "###); } } diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 97a9b4e4b..e9e6fb2f5 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -349,22 +349,33 @@ mod test { use super::super::test::TestContext; use super::*; + fn display_criteria(mut criteria: Typo, mut parameters: CriterionParameters) -> String { + let mut result = String::new(); + while let Some(criterion) = criteria.next(&mut parameters).unwrap() { + result.push_str(&format!("{criterion:?}\n\n")); + } + result + } + #[test] fn initial_placeholder_no_facets() { let context = TestContext::default(); let query_tree = None; let facet_candidates = None; - let mut criterion_parameters = CriterionParameters { + let criterion_parameters = CriterionParameters { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; let parent = Initial::new(query_tree, facet_candidates); - let mut criteria = Typo::new(&context, Box::new(parent)); + let criteria = Typo::new(&context, Box::new(parent)); - assert!(criteria.next(&mut criterion_parameters).unwrap().unwrap().candidates.is_none()); - assert!(criteria.next(&mut criterion_parameters).unwrap().is_none()); + let result = display_criteria(criteria, criterion_parameters); + insta::assert_snapshot!(result, @r###" + CriterionResult { query_tree: None, candidates: None, filtered_candidates: None, bucket_candidates: None } + + "###); } #[test] @@ -390,78 +401,32 @@ mod test { let facet_candidates = None; - let mut criterion_parameters = CriterionParameters { + let criterion_parameters = 
CriterionParameters { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; let parent = Initial::new(Some(query_tree), facet_candidates); - let mut criteria = Typo::new(&context, Box::new(parent)); + let criteria = Typo::new(&context, Box::new(parent)); - let candidates_1 = context.word_docids("split").unwrap().unwrap() - & context.word_docids("this").unwrap().unwrap() - & context.word_docids("world").unwrap().unwrap(); - let expected_1 = CriterionResult { - query_tree: Some(Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("world".to_string()), - }), - ])], - )), - candidates: Some(candidates_1.clone()), - bucket_candidates: Some(candidates_1), - filtered_candidates: None, - }; + let result = display_criteria(criteria, criterion_parameters); + insta::assert_snapshot!(result, @r###" + CriterionResult { query_tree: Some(OR + AND + Exact { word: "split" } + Exact { word: "this" } + Exact { word: "world" } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } - assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); + CriterionResult { query_tree: Some(OR + AND + Exact { word: "split" } + Exact { word: "this" } + OR + Exact { word: "word" } + Exact { word: "world" } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } - let candidates_2 = (context.word_docids("split").unwrap().unwrap() - & context.word_docids("this").unwrap().unwrap() - & context.word_docids("word").unwrap().unwrap()) - - context.word_docids("world").unwrap().unwrap(); - let expected_2 = CriterionResult { - query_tree: Some(Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact_with_typo(1, "word".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("world".to_string()), - }), - ], - ), - ])], - )), - candidates: Some(candidates_2.clone()), - bucket_candidates: Some(candidates_2), - filtered_candidates: None, - }; - - assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); + "###); } #[test] @@ -470,25 +435,18 @@ mod test { let query_tree = None; let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut criterion_parameters = CriterionParameters { + let criterion_parameters = CriterionParameters { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; let parent = Initial::new(query_tree, Some(facet_candidates.clone())); - let mut criteria = Typo::new(&context, Box::new(parent)); + let criteria = Typo::new(&context, Box::new(parent)); - let expected = CriterionResult { - query_tree: None, - candidates: None, - bucket_candidates: None, - filtered_candidates: Some(facet_candidates.clone()), - }; + let result = display_criteria(criteria, criterion_parameters); + insta::assert_snapshot!(result, @r###" + CriterionResult { query_tree: None, candidates: None, filtered_candidates: 
Some(RoaringBitmap<8000 values between 986424 and 4294786076>), bucket_candidates: None } - // first iteration, returns the facet candidates - assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected)); - - // second iteration, returns None because there is no more things to do - assert!(criteria.next(&mut criterion_parameters).unwrap().is_none()); + "###); } #[test] @@ -514,77 +472,31 @@ mod test { let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - let mut criterion_parameters = CriterionParameters { + let criterion_parameters = CriterionParameters { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; let parent = Initial::new(Some(query_tree), Some(facet_candidates.clone())); - let mut criteria = Typo::new(&context, Box::new(parent)); + let criteria = Typo::new(&context, Box::new(parent)); - let candidates_1 = context.word_docids("split").unwrap().unwrap() - & context.word_docids("this").unwrap().unwrap() - & context.word_docids("world").unwrap().unwrap(); - let expected_1 = CriterionResult { - query_tree: Some(Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("world".to_string()), - }), - ])], - )), - candidates: Some(&candidates_1 & &facet_candidates), - bucket_candidates: Some(&candidates_1 & &facet_candidates), - filtered_candidates: None, - }; + let result = display_criteria(criteria, criterion_parameters); + insta::assert_snapshot!(result, @r###" + CriterionResult { query_tree: Some(OR + AND + Exact { word: "split" } + Exact { word: "this" } + Exact { word: "world" } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } - assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); + CriterionResult { query_tree: Some(OR + AND + Exact { word: "split" } + Exact { word: "this" } + OR + Exact { word: "word" } + Exact { word: "world" } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } - let candidates_2 = (context.word_docids("split").unwrap().unwrap() - & context.word_docids("this").unwrap().unwrap() - & context.word_docids("word").unwrap().unwrap()) - - context.word_docids("world").unwrap().unwrap(); - let expected_2 = CriterionResult { - query_tree: Some(Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact_with_typo(1, "word".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("world".to_string()), - }), - ], - ), - ])], - )), - candidates: Some(&candidates_2 & &facet_candidates), - bucket_candidates: Some(&candidates_2 & &facet_candidates), - filtered_candidates: None, - }; - - assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); + "###); } } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 225d3ea8d..90aab826a 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -45,7 +45,7 @@ 
impl<'a> Display for FilterError<'a> { attribute, ) } else { - let filterables_list = filterable_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(" "); + let filterables_list = filterable_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(" "); write!( f, diff --git a/milli/src/search/matches/mod.rs index 72592c4cb..2697405be 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -573,15 +573,18 @@ mod tests { let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let mut matcher = builder.build(text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); + insta::assert_snapshot!( + matcher.format(format_options), + @"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World." + ); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; let mut matcher = builder.build(text); // no crop should return complete text with highlighted matches. - assert_eq!( - &matcher.format(format_options), - "Natalie risk her future to build a world with the boy she loves." + insta::assert_snapshot!( + matcher.format(format_options), + @"Natalie risk her future to build a world with the boy she loves." ); } @@ -602,19 +605,28 @@ mod tests { let text = "Ŵôřlḑôle"; let mut matcher = builder.build(text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options), "Ŵôřlḑôle"); + insta::assert_snapshot!( + matcher.format(format_options), + @"Ŵôřlḑôle" + ); // Text containing unicode match. let text = "Ŵôřlḑ"; let mut matcher = builder.build(text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options), "Ŵôřlḑ"); + insta::assert_snapshot!( + matcher.format(format_options), + @"Ŵôřlḑ" + ); // Text containing unicode match. let text = "Westfália"; let mut matcher = builder.build(text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options), "Westfália"); + insta::assert_snapshot!( + matcher.format(format_options), + @"Westfália" + ); } #[test] @@ -628,83 +640,89 @@ mod tests { // empty text. let text = ""; let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ""); + insta::assert_snapshot!( + matcher.format(format_options), + @"" + ); // text containing only separators. let text = ":-)"; let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ":-)"); + insta::assert_snapshot!( + matcher.format(format_options), + @":-)" + ); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let mut matcher = builder.build(text); // no highlight should return 10 first words with a marker at the end. - assert_eq!( - &matcher.format(format_options), - "A quick brown fox can not jump 32 feet, right…" + insta::assert_snapshot!( + matcher.format(format_options), + @"A quick brown fox can not jump 32 feet, right…" ); // Text without any match starting by a separator. let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; let mut matcher = builder.build(text); // no highlight should return 10 first words with a marker at the end. 
- assert_eq!( - &matcher.format(format_options), - "(A quick brown fox can not jump 32 feet, right…" + insta::assert_snapshot!( + matcher.format(format_options), + @"(A quick brown fox can not jump 32 feet, right…" ); // Test phrase propagation let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; let mut matcher = builder.build(text); // should crop the phrase instead of croping around the match. - assert_eq!( - &matcher.format(format_options), - "… Split The World is a book written by Emily Henry…", + insta::assert_snapshot!( + matcher.format(format_options), + @"… Split The World is a book written by Emily Henry…" ); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; let mut matcher = builder.build(text); // no highlight should return 10 last words with a marker at the start. - assert_eq!( - &matcher.format(format_options), - "…future to build a world with the boy she loves…" + insta::assert_snapshot!( + matcher.format(format_options), + @"…future to build a world with the boy she loves…" ); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let mut matcher = builder.build(text); // no highlight should return 10 last words with a marker at the start. - assert_eq!( - &matcher.format(format_options), - "…she loves. Emily Henry: The Love That Split The World." + insta::assert_snapshot!( + matcher.format(format_options), + @"…she loves. Emily Henry: The Love That Split The World." ); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; let mut matcher = builder.build(text); // crop should return 10 last words with a marker at the start. - assert_eq!( - &matcher.format(format_options), - "…void void void void void split the world void void" + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" ); // Text containing matches with diferent density. let text = "split void the void void world void void void void void void void void void void split the world void void"; let mut matcher = builder.build(text); // crop should return 10 last words with a marker at the start. - assert_eq!( - &matcher.format(format_options), - "…void void void void void split the world void void" + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" ); // Text containing matches with same word. let text = "split split split split split split void void void void void void void void void void split the world void void"; let mut matcher = builder.build(text); // crop should return 10 last words with a marker at the start. - assert_eq!( - &matcher.format(format_options), - "…void void void void void split the world void void" + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" ); } @@ -719,44 +737,53 @@ mod tests { // empty text. let text = ""; let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ""); + insta::assert_snapshot!( + matcher.format(format_options), + @"" + ); // text containing only separators. 
let text = ":-)"; let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ":-)"); + insta::assert_snapshot!( + matcher.format(format_options), + @":-)" + ); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let mut matcher = builder.build(text); // both should return 10 first words with a marker at the end. - assert_eq!( - &matcher.format(format_options), - "A quick brown fox can not jump 32 feet, right…" + insta::assert_snapshot!( + matcher.format(format_options), + @"A quick brown fox can not jump 32 feet, right…" ); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; let mut matcher = builder.build(text); // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!( - &matcher.format(format_options), - "…future to build a world with the boy she loves…" + insta::assert_snapshot!( + matcher.format(format_options), + @"…future to build a world with the boy she loves…" ); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let mut matcher = builder.build(text); // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: The Love That Split The World."); + insta::assert_snapshot!( + matcher.format(format_options), + @"…she loves. Emily Henry: The Love That Split The World." + ); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; let mut matcher = builder.build(text); // crop should return 10 last words with a marker at the start. - assert_eq!( - &matcher.format(format_options), - "…void void void void void split the world void void" + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" ); } @@ -773,19 +800,28 @@ mod tests { let format_options = FormatOptions { highlight: false, crop: Some(2) }; let mut matcher = builder.build(text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(format_options), "…split the…"); + insta::assert_snapshot!( + matcher.format(format_options), + @"…split the…" + ); // set a smaller crop size let format_options = FormatOptions { highlight: false, crop: Some(1) }; let mut matcher = builder.build(text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(format_options), "…split…"); + insta::assert_snapshot!( + matcher.format(format_options), + @"…split…" + ); // set crop size to 0 let format_options = FormatOptions { highlight: false, crop: Some(0) }; let mut matcher = builder.build(text); // because crop size is 0, crop is ignored. - assert_eq!(&matcher.format(format_options), "void void split the world void void."); + insta::assert_snapshot!( + matcher.format(format_options), + @"void void split the world void void." 
+ ); } #[test] @@ -820,11 +856,9 @@ mod tests { let text = "the do or die can't be he do and or isn't he"; let mut matcher = builder.build(text); - assert_eq!( - &matcher.format(format_options), - "_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_", - "matches: {:?}", - &matcher.matches + insta::assert_snapshot!( + matcher.format(format_options), + @"_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_" ); } } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index e0fac0f43..617d9e4d9 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -99,11 +99,6 @@ impl QueryKind { QueryKind::Exact { original_typo: 0, word } } - #[cfg(test)] - pub fn exact_with_typo(original_typo: u8, word: String) -> Self { - QueryKind::Exact { original_typo, word } - } - pub fn tolerant(typo: u8, word: String) -> Self { QueryKind::Tolerant { typo, word } } @@ -857,30 +852,16 @@ mod test { let query = "hey friends"; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "friends".to_string()), - }), - ]), - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "heyfriends".to_string()), - }), - ], - ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "hey" } + PrefixTolerant { word: "friends", max typo: 1 } + PrefixTolerant { word: "heyfriends", max typo: 1 } + "###); } #[test] @@ -888,30 +869,16 @@ mod test { let query = "hey friends "; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "friends".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "heyfriends".to_string()), - }), - ], - ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "hey" } + Tolerant { word: "friends", max typo: 1 } + Tolerant { word: "heyfriends", max typo: 1 } + "###); } #[test] @@ -919,62 +886,24 @@ mod test { let query = "hello world "; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hi".to_string()), - }), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("good".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("morning".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "hello".to_string()), - }), - ], - ), - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("earth".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("nature".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ], - ), - ]), - 
Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "helloworld".to_string()), - }), - ], - ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + OR + Exact { word: "hi" } + AND + Exact { word: "good" } + Exact { word: "morning" } + Tolerant { word: "hello", max typo: 1 } + OR + Exact { word: "earth" } + Exact { word: "nature" } + Tolerant { word: "world", max typo: 1 } + Tolerant { word: "helloworld", max typo: 1 } + "###); } #[test] @@ -982,97 +911,34 @@ mod test { let query = "new york city "; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("new".to_string()), - }), - Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("york".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("city".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "yorkcity".to_string()), - }), - ], - ), - ]), - Operation::And(vec![ - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("nyc".to_string()), - }), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("new".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("york".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("city".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "newyork".to_string()), - }), - ], - ), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("city".to_string()), - }), - ]), - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("nyc".to_string()), - }), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("new".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("york".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "newyorkcity".to_string()), - }), - ], - ), - ], - ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "new" } + OR + AND + Exact { word: "york" } + Exact { word: "city" } + Tolerant { word: "yorkcity", max typo: 1 } + AND + OR + Exact { word: "nyc" } + AND + Exact { word: "new" } + Exact { word: "york" } + Exact { word: "city" } + Tolerant { word: "newyork", max typo: 1 } + Exact { word: "city" } + OR + Exact { word: "nyc" } + AND + Exact { word: "new" } + Exact { word: "york" } + Tolerant { word: "newyorkcity", max typo: 1 } + "###); } #[test] @@ -1080,30 +946,16 @@ mod test { let query = "n grams "; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("n".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "grams".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "ngrams".to_string()), - }), - ], - ); - let 
(query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "n" } + Tolerant { word: "grams", max typo: 1 } + Tolerant { word: "ngrams", max typo: 1 } + "###); } #[test] @@ -1111,36 +963,18 @@ mod test { let query = "wordsplit fish "; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Or( - false, - vec![ - Operation::Phrase(vec!["word".to_string(), "split".to_string()]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(2, "wordsplit".to_string()), - }), - ], - ), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("fish".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "wordsplitfish".to_string()), - }), - ], - ); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + OR + PHRASE ["word", "split"] + Tolerant { word: "wordsplit", max typo: 2 } + Exact { word: "fish" } + Tolerant { word: "wordsplitfish", max typo: 1 } + "###); } #[test] @@ -1148,15 +982,14 @@ mod test { let query = "\"hey friends\" \" \" \"wooop"; let tokens = query.tokenize(); - let expected = Operation::And(vec![ - Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), - ]); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + AND + PHRASE ["hey", "friends"] + Exact { word: "wooop" } + "###); } #[test] @@ -1164,15 +997,14 @@ mod test { let query = "\"hey friends. 
wooop wooop\""; let tokens = query.tokenize(); - let expected = Operation::And(vec![ - Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]), - Operation::Phrase(vec!["wooop".to_string(), "wooop".to_string()]), - ]); - let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + AND + PHRASE ["hey", "friends"] + PHRASE ["wooop", "wooop"] + "###); } #[test] @@ -1180,82 +1012,30 @@ mod test { let query = "hey my friend "; let tokens = query.tokenize(); - let expected = Operation::Or( - true, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("my".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "heymy".to_string()), - }), - ], - ), - Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("my".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "friend".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "myfriend".to_string()), - }), - ], - ), - ]), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "heymy".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "friend".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "heymyfriend".to_string()), - }), - ], - ), - ], - ); let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR(WORD) + Exact { word: "hey" } + OR + AND + Exact { word: "hey" } + Exact { word: "my" } + Tolerant { word: "heymy", max typo: 1 } + OR + AND + Exact { word: "hey" } + OR + AND + Exact { word: "my" } + Tolerant { word: "friend", max typo: 1 } + Tolerant { word: "myfriend", max typo: 1 } + AND + Tolerant { word: "heymy", max typo: 1 } + Tolerant { word: "friend", max typo: 1 } + Tolerant { word: "heymyfriend", max typo: 1 } + "###); } #[test] @@ -1263,11 +1043,12 @@ mod test { let query = "\"hey my\""; let tokens = query.tokenize(); - let expected = Operation::Phrase(vec!["hey".to_string(), "my".to_string()]); let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + PHRASE ["hey", "my"] + "###); } #[test] @@ -1275,68 +1056,27 @@ mod test { let query = r#""hey" my good "friend""#; let tokens = query.tokenize(); - let expected = Operation::Or( - true, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("friend".to_string()), - }), - ]), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Query(Query { - 
prefix: false, - kind: QueryKind::exact("my".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("friend".to_string()), - }), - ]), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("my".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("good".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "mygood".to_string()), - }), - ], - ), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("friend".to_string()), - }), - ]), - ], - ); let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR(WORD) + AND + Exact { word: "hey" } + Exact { word: "friend" } + AND + Exact { word: "hey" } + Exact { word: "my" } + Exact { word: "friend" } + AND + Exact { word: "hey" } + OR + AND + Exact { word: "my" } + Exact { word: "good" } + Tolerant { word: "mygood", max typo: 1 } + Exact { word: "friend" } + "###); } #[test] @@ -1344,29 +1084,16 @@ mod test { let query = "hey friends "; let tokens = query.tokenize(); - let expected = Operation::Or( - false, - vec![ - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("hey".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("friends".to_string()), - }), - ]), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("heyfriends".to_string()), - }), - ], - ); let (query_tree, _) = TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + OR + AND + Exact { word: "hey" } + Exact { word: "friends" } + Exact { word: "heyfriends" } + "###); } #[test] @@ -1374,15 +1101,14 @@ mod test { let query = "\"hey my\" good friend"; let tokens = query.tokenize(); - let expected = Operation::And(vec![ - Operation::Phrase(vec!["hey".to_string(), "my".to_string()]), - Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), - ]); - let (query_tree, _) = TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); - assert_eq!(expected, query_tree); + insta::assert_debug_snapshot!(query_tree, @r###" + AND + PHRASE ["hey", "my"] + Exact { word: "good" } + "###); } #[test] diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs new file mode 100644 index 000000000..eac3340fd --- /dev/null +++ b/milli/src/snapshot_tests.rs @@ -0,0 +1,527 @@ +use std::borrow::Cow; +use std::fmt::Write; +use std::path::Path; + +use heed::types::ByteSlice; +use heed::BytesDecode; +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::{ + FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FacetStringZeroBoundsValueCodec, +}; +use crate::{make_db_snap_from_iter, CboRoaringBitmapCodec, ExternalDocumentsIds, Index}; + +#[track_caller] +pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> insta::Settings { + let mut settings = insta::Settings::clone_current(); + settings.set_prepend_module_to_snapshot(false); + let path = Path::new(std::panic::Location::caller().file()); + let filename = path.file_name().unwrap().to_str().unwrap(); + 
settings.set_omit_expression(true); + let test_name = std::thread::current().name().unwrap().rsplit("::").next().unwrap().to_owned(); + + if let Some(name) = name { + settings + .set_snapshot_path(Path::new("snapshots").join(filename).join(test_name).join(name)); + } else { + settings.set_snapshot_path(Path::new("snapshots").join(filename).join(test_name)); + } + + settings +} + +/** +Create a snapshot test of the given database. + +## Arguments +1. The identifier for the `Index` +2. The content of the index to snapshot. Available options are: + - `settings` + - `word_docids` + - `exact_word_docids` + - `word_prefix_docids` + - `exact_word_prefix_docids` + - `docid_word_positions` + - `word_pair_proximity_docids` + - `word_prefix_pair_proximity_docids` + - `word_position_docids` + - `field_id_word_count_docids` + - `word_prefix_position_docids` + - `facet_id_f64_docids` + - `facet_id_string_docids` + - `documents_ids` + - `stop_words` + - `soft_deleted_documents_ids` + - `field_distribution` + - `fields_ids_map` + - `geo_faceted_documents_ids` + - `external_documents_ids` + - `number_faceted_documents_ids` + - `string_faceted_documents_ids` + - `words_fst` + - `words_prefixes_fst` + +3. The identifier for the snapshot test (optional) +4. `@""` to write the snapshot inline (optional) + +## Behaviour +The content of the database will be printed either inline or to the file system +at `test_directory/test_file.rs/test_name/db_name.snap`. + +If the database is too large, then only the hash of the database will be saved, with +the name `db_name.hash.snap`. To *also* save the full content of the database anyway, +set the `MILLI_TEST_FULL_SNAPS` environment variable to `true`. The full snapshot will +be saved with the name `db_name.full.snap` but will not be saved to the git repository. + +Running `cargo test` will check whether the old snapshot is identical to the +current one. If they are equal, the test passes. Otherwise, the test fails. + +Use the command line `cargo insta` to approve or reject new snapshots. + +## Example +```ignore +let index = TempIndex::new(); + +// basic usages +db_snap!(index, word_docids); + +// named snapshot to avoid conflicts +db_snap!(index, word_docids, "some_identifier"); + +// write the snapshot inline +db_snap!(index, word_docids, @""); // will be autocompleted by running `cargo insta review` + +// give a name to the inline snapshot +db_snap!(index, word_docids, "some_identifier", @""); +``` +*/ +#[macro_export] +macro_rules! 
db_snap { + ($index:ident, $db_name:ident, $name:expr) => { + let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some( + &format!("{}", $name), + )); + settings.bind(|| { + let snap = $crate::full_snap_of_db!($index, $db_name); + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($index:ident, $db_name:ident) => { + let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $crate::full_snap_of_db!($index, $db_name); + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($index:ident, $db_name:ident, @$inline:literal) => { + let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $crate::full_snap_of_db!($index, $db_name); + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); + } else { + insta::assert_snapshot!(name, snap); + } + } + }); + }; + ($index:ident, $db_name:ident, $name:literal, @$inline:literal) => { + let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(&format!("{}", $name))); + settings.bind(|| { + let snap = $crate::full_snap_of_db!($index, $db_name); + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); + } else { + insta::assert_snapshot!(name, snap); + } + } + }); + }; +} + +pub fn snap_word_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_exact_word_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, exact_word_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_prefix_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_prefix_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_exact_word_prefix_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, exact_word_prefix_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_docid_word_positions(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, docid_word_positions, |((idx, s), b)| { + &format!("{idx:<6} {s:<16} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_pair_proximity_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_pair_proximity_docids, |( + (word1, word2, proximity), + b, + )| { + &format!("{word1:<16} {word2:<16} {proximity:<2} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |( + (word1, prefix, proximity), + b, + )| { + &format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_position_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, 
word_position_docids, |((word, position), b)| { + &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_field_id_word_count_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, field_id_word_count_docids, |( + (field_id, word_count), + b, + )| { + &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_word_prefix_position_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, word_prefix_position_docids, |( + (word_prefix, position), + b, + )| { + &format!("{word_prefix:<4} {position:<6} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_facet_id_f64_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( + (facet_id, level, left, right), + b, + )| { + &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) + }); + snap +} +pub fn snap_facet_id_string_docids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let bytes_db = index.facet_id_string_docids.remap_types::<ByteSlice, ByteSlice>(); + let iter = bytes_db.iter(&rtxn).unwrap(); + let mut snap = String::new(); + + for x in iter { + let (key, value) = x.unwrap(); + if let Some((field_id, normalized_str)) = FacetStringLevelZeroCodec::bytes_decode(key) { + let (orig_string, docids) = + FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); + snap.push_str(&format!( + "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", + display_bitmap(&docids) + )); + } else if let Some((field_id, level, left, right)) = + FacetLevelValueU32Codec::bytes_decode(key) + { + snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); + let (bounds, docids) = + FacetStringZeroBoundsValueCodec::<CboRoaringBitmapCodec>::bytes_decode(value) + .unwrap(); + if let Some((left, right)) = bounds { + snap.push_str(&format!("{left:<8} {right:<8} ")); + } + snap.push_str(&display_bitmap(&docids)); + snap.push('\n'); + } else { + panic!(); + } + } + snap +} +pub fn snap_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let documents_ids = index.documents_ids(&rtxn).unwrap(); + let snap = display_bitmap(&documents_ids); + snap +} +pub fn snap_stop_words(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let stop_words = index.stop_words(&rtxn).unwrap(); + let snap = format!("{stop_words:?}"); + snap +} +pub fn snap_soft_deleted_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); + let soft_deleted_documents_ids = display_bitmap(&soft_deleted_documents_ids); + soft_deleted_documents_ids +} +pub fn snap_field_distributions(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let mut snap = String::new(); + for (field, count) in index.field_distribution(&rtxn).unwrap() { + writeln!(&mut snap, "{field:<16} {count:<6}").unwrap(); + } + snap +} +pub fn snap_fields_ids_map(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let name = fields_ids_map.name(field_id).unwrap(); + writeln!(&mut snap, "{field_id:<3} {name:<16}").unwrap(); + } + snap +} +pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); + let snap = 
display_bitmap(&geo_faceted_documents_ids); + snap +} +pub fn snap_external_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let ExternalDocumentsIds { soft, hard, .. } = index.external_documents_ids(&rtxn).unwrap(); + let mut snap = String::new(); + let soft_bytes = soft.into_fst().as_bytes().to_owned(); + let mut hex_soft = String::new(); + for byte in soft_bytes { + write!(&mut hex_soft, "{:x}", byte).unwrap(); + } + writeln!(&mut snap, "soft: {hex_soft}").unwrap(); + let hard_bytes = hard.into_fst().as_bytes().to_owned(); + let mut hex_hard = String::new(); + for byte in hard_bytes { + write!(&mut hex_hard, "{:x}", byte).unwrap(); + } + writeln!(&mut snap, "hard: {hex_hard}").unwrap(); + snap +} +pub fn snap_number_faceted_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let number_faceted_documents_ids = + index.number_faceted_documents_ids(&rtxn, field_id).unwrap(); + writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids)) + .unwrap(); + } + snap +} +pub fn snap_string_faceted_documents_ids(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let string_faceted_documents_ids = + index.string_faceted_documents_ids(&rtxn, field_id).unwrap(); + writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids)) + .unwrap(); + } + snap +} +pub fn snap_words_fst(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let words_fst = index.words_fst(&rtxn).unwrap(); + let bytes = words_fst.into_fst().as_bytes().to_owned(); + let mut snap = String::new(); + for byte in bytes { + write!(&mut snap, "{:x}", byte).unwrap(); + } + snap +} +pub fn snap_words_prefixes_fst(index: &Index) -> String { + let rtxn = index.read_txn().unwrap(); + let words_prefixes_fst = index.words_prefixes_fst(&rtxn).unwrap(); + let bytes = words_prefixes_fst.into_fst().as_bytes().to_owned(); + let mut snap = String::new(); + for byte in bytes { + write!(&mut snap, "{:x}", byte).unwrap(); + } + snap +} + +pub fn snap_settings(index: &Index) -> String { + let mut snap = String::new(); + let rtxn = index.read_txn().unwrap(); + + macro_rules! write_setting_to_snap { + ($name:ident) => { + let $name = index.$name(&rtxn).unwrap(); + writeln!(&mut snap, "{}: {:?}", stringify!($name), $name).unwrap(); + }; + } + + write_setting_to_snap!(primary_key); + write_setting_to_snap!(criteria); + write_setting_to_snap!(displayed_fields); + write_setting_to_snap!(distinct_field); + write_setting_to_snap!(filterable_fields); + write_setting_to_snap!(sortable_fields); + write_setting_to_snap!(synonyms); + write_setting_to_snap!(authorize_typos); + write_setting_to_snap!(min_word_len_one_typo); + write_setting_to_snap!(min_word_len_two_typos); + write_setting_to_snap!(exact_words); + write_setting_to_snap!(exact_attributes); + write_setting_to_snap!(max_values_per_facet); + write_setting_to_snap!(pagination_max_total_hits); + write_setting_to_snap!(searchable_fields); + write_setting_to_snap!(user_defined_searchable_fields); + + snap +} + +#[macro_export] +macro_rules! 
full_snap_of_db { + ($index:ident, settings) => {{ + $crate::snapshot_tests::snap_settings(&$index) + }}; + ($index:ident, word_docids) => {{ + $crate::snapshot_tests::snap_word_docids(&$index) + }}; + ($index:ident, exact_word_docids) => {{ + $crate::snapshot_tests::snap_exact_word_docids(&$index) + }}; + ($index:ident, word_prefix_docids) => {{ + $crate::snapshot_tests::snap_word_prefix_docids(&$index) + }}; + ($index:ident, exact_word_prefix_docids) => {{ + $crate::snapshot_tests::snap_exact_word_prefix_docids(&$index) + }}; + ($index:ident, docid_word_positions) => {{ + $crate::snapshot_tests::snap_docid_word_positions(&$index) + }}; + ($index:ident, word_pair_proximity_docids) => {{ + $crate::snapshot_tests::snap_word_pair_proximity_docids(&$index) + }}; + ($index:ident, word_prefix_pair_proximity_docids) => {{ + $crate::snapshot_tests::snap_word_prefix_pair_proximity_docids(&$index) + }}; + ($index:ident, word_position_docids) => {{ + $crate::snapshot_tests::snap_word_position_docids(&$index) + }}; + ($index:ident, field_id_word_count_docids) => {{ + $crate::snapshot_tests::snap_field_id_word_count_docids(&$index) + }}; + ($index:ident, word_prefix_position_docids) => {{ + $crate::snapshot_tests::snap_word_prefix_position_docids(&$index) + }}; + ($index:ident, facet_id_f64_docids) => {{ + $crate::snapshot_tests::snap_facet_id_f64_docids(&$index) + }}; + ($index:ident, facet_id_string_docids) => {{ + $crate::snapshot_tests::snap_facet_id_string_docids(&$index) + }}; + ($index:ident, documents_ids) => {{ + $crate::snapshot_tests::snap_documents_ids(&$index) + }}; + ($index:ident, stop_words) => {{ + $crate::snapshot_tests::snap_stop_words(&$index) + }}; + ($index:ident, soft_deleted_documents_ids) => {{ + $crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index) + }}; + ($index:ident, field_distribution) => {{ + $crate::snapshot_tests::snap_field_distributions(&$index) + }}; + ($index:ident, fields_ids_map) => {{ + $crate::snapshot_tests::snap_fields_ids_map(&$index) + }}; + ($index:ident, geo_faceted_documents_ids) => {{ + $crate::snapshot_tests::snap_geo_faceted_documents_ids(&$index) + }}; + ($index:ident, external_documents_ids) => {{ + $crate::snapshot_tests::snap_external_documents_ids(&$index) + }}; + ($index:ident, number_faceted_documents_ids) => {{ + $crate::snapshot_tests::snap_number_faceted_documents_ids(&$index) + }}; + ($index:ident, string_faceted_documents_ids) => {{ + $crate::snapshot_tests::snap_string_faceted_documents_ids(&$index) + }}; + ($index:ident, words_fst) => {{ + $crate::snapshot_tests::snap_words_fst(&$index) + }}; + ($index:ident, words_prefixes_fst) => {{ + $crate::snapshot_tests::snap_words_prefixes_fst(&$index) + }}; +} + +pub fn convert_snap_to_hash_if_needed<'snap>( + name: &str, + snap: &'snap str, + inline: bool, +) -> Vec<(String, Cow<'snap, str>)> { + let store_whole_snapshot = std::env::var("MILLI_TEST_FULL_SNAPS").unwrap_or("false".to_owned()); + let store_whole_snapshot: bool = store_whole_snapshot.parse().unwrap(); + + let max_len = if inline { 256 } else { 2048 }; + + if snap.len() < max_len { + vec![(name.to_owned(), Cow::Borrowed(snap))] + } else { + let mut r = vec![]; + if store_whole_snapshot { + r.push((format!("{name}.full"), Cow::Borrowed(snap))); + } + let hash = md5::compute(snap.as_bytes()); + let hash_str = format!("{hash:x}"); + r.push((format!("{name}.hash"), Cow::Owned(hash_str))); + r + } +} + +#[macro_export] +macro_rules! 
make_db_snap_from_iter { + ($index:ident, $name:ident, |$vars:pat| $push:block) => {{ + let rtxn = $index.read_txn().unwrap(); + let iter = $index.$name.iter(&rtxn).unwrap(); + let mut snap = String::new(); + for x in iter { + let $vars = x.unwrap(); + snap.push_str($push); + snap.push('\n'); + } + snap + }}; +} + +pub fn display_bitmap(b: &RoaringBitmap) -> String { + let mut s = String::new(); + s.push('['); + for x in b.into_iter() { + write!(&mut s, "{x}, ").unwrap(); + } + s.push(']'); + s +} diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap new file mode 100644 index 000000000..9b074fb59 --- /dev/null +++ b/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/index.rs +--- +age 1 +id 2 +name 2 + diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap new file mode 100644 index 000000000..9b074fb59 --- /dev/null +++ b/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/index.rs +--- +age 1 +id 2 +name 2 + diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 5892123eb..4c4963b56 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -342,3 +342,93 @@ fn write_string_entry( writer.insert(&key, &data)?; Ok(()) } + +#[cfg(test)] +mod tests { + use std::num::NonZeroUsize; + + use crate::db_snap; + use crate::documents::documents_batch_reader_from_objects; + use crate::index::tests::TempIndex; + + #[test] + fn test_facets_number() { + let test = + |name: &str, group_size: Option, min_level_size: Option| { + let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB + index.index_documents_config.autogenerate_docids = true; + index.index_documents_config.facet_level_group_size = group_size; + index.index_documents_config.facet_min_level_size = min_level_size; + + index + .update_settings(|settings| { + settings.set_filterable_fields( + IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) + .collect(), + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1_000 { + documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); + } + for i in 0..100 { + documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); + } + let documents = documents_batch_reader_from_objects(documents); + + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, name); + }; + + test("default", None, None); + test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + test("small_groups_small_levels", NonZeroUsize::new(2), NonZeroUsize::new(2)); + test("small_groups_large_levels", NonZeroUsize::new(2), NonZeroUsize::new(128)); + test("large_groups_small_levels", NonZeroUsize::new(16), NonZeroUsize::new(2)); + test("large_groups_large_levels", NonZeroUsize::new(16), NonZeroUsize::new(256)); + } + + #[test] + fn test_facets_string() { + let test = |name: &str, + group_size: Option, + min_level_size: Option| { + let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB + index.index_documents_config.autogenerate_docids = true; + index.index_documents_config.facet_level_group_size = group_size; + index.index_documents_config.facet_min_level_size = min_level_size; + 
+
+            index
+                .update_settings(|settings| {
+                    settings.set_filterable_fields(
+                        IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()])
+                            .collect(),
+                    );
+                })
+                .unwrap();
+
+            let mut documents = vec![];
+            for i in 0..100 {
+                documents.push(
+                    serde_json::json!({ "facet": format!("s{i:X}") }).as_object().unwrap().clone(),
+                );
+            }
+            for i in 0..10 {
+                documents.push(
+                    serde_json::json!({ "facet2": format!("s{i:X}") }).as_object().unwrap().clone(),
+                );
+            }
+            let documents = documents_batch_reader_from_objects(documents);
+
+            index.add_documents(documents).unwrap();
+
+            db_snap!(index, facet_id_string_docids, name);
+        };
+
+        test("default", None, None);
+        test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1));
+    }
+}
diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap
new file mode 100644
index 000000000..373455db6
--- /dev/null
+++ b/milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facets.rs
+---
+587899707db2848da3f18399e14ed4d0
diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap
new file mode 100644
index 000000000..c3415c320
--- /dev/null
+++ b/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facets.rs
+---
+02bbf2ca1663cccea0e4c06d5ad06a45
diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap
new file mode 100644
index 000000000..78dad29f1
--- /dev/null
+++ b/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facets.rs
+---
+e68ea591e1af3e53e544dff9a1648e88
diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap
new file mode 100644
index 000000000..61a5908f4
--- /dev/null
+++ b/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facets.rs
+---
+12a4bb0f5b95d7629c2b9a915150c0cf
diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap
new file mode 100644
index 000000000..961346de5
--- /dev/null
+++ b/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facets.rs
+---
+6438e94bc7fada13022e0efccdf294e0
diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap
new file mode 100644
index 000000000..2b7c1ef9c
--- /dev/null
+++ b/milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facets.rs
+---
+5348bbc46b5384455b6a900666d2a502
diff --git a/milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap
new file mode 100644
index 000000000..901b86255
--- /dev/null
+++ b/milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facets.rs
+---
+faddef9eae5f2efacfec51f20f2e8cd6
diff --git a/milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap
new file mode 100644
index 000000000..aa6c85461
--- /dev/null
+++ b/milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facets.rs
+---
+ddb8fc987c5dc892337682595043858e
diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap
new file mode 100644
index 000000000..0a61cf4e8
--- /dev/null
+++ b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap
@@ -0,0 +1,46 @@
+---
+source: milli/src/update/word_prefix_pair_proximity_docids.rs
+---
+5 a 1 [101, ]
+5 a 2 [101, ]
+5 b 4 [101, ]
+5 be 4 [101, ]
+am a 3 [101, ]
+amazing a 1 [100, ]
+amazing a 2 [100, ]
+amazing a 3 [100, ]
+amazing b 2 [100, ]
+amazing be 2 [100, ]
+an a 1 [100, ]
+an a 2 [100, ]
+an b 3 [100, ]
+an be 3 [100, ]
+and a 2 [100, ]
+and a 3 [100, ]
+and a 4 [100, ]
+and b 1 [100, ]
+and be 1 [100, ]
+at a 1 [100, ]
+at a 2 [100, 101, ]
+at a 3 [100, ]
+at b 3 [101, ]
+at b 4 [100, ]
+at be 3 [101, ]
+at be 4 [100, ]
+beautiful a 2 [100, ]
+beautiful a 3 [100, ]
+beautiful a 4 [100, ]
+bell a 2 [101, ]
+bell a 4 [101, ]
+house a 3 [100, ]
+house a 4 [100, ]
+house b 2 [100, ]
+house be 2 [100, ]
+rings a 1 [101, ]
+rings a 3 [101, ]
+rings b 2 [101, ]
+rings be 2 [101, ]
+the a 3 [101, ]
+the b 1 [101, ]
+the be 1 [101, ]
+
diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap
new file mode 100644
index 000000000..aabd9ddec
--- /dev/null
+++ b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.snap
@@ -0,0 +1,56 @@
+---
+source: milli/src/update/word_prefix_pair_proximity_docids.rs
+---
+5 a 1 [101, ]
+5 a 2 [101, ]
+5 am 1 [101, ]
+5 b 4 [101, ]
+5 be 4 [101, ]
+am a 3 [101, ]
+amazing a 1 [100, ]
+amazing a 2 [100, ]
+amazing a 3 [100, ]
+amazing b 2 [100, ]
+amazing be 2 [100, ]
+an a 1 [100, ]
+an a 2 [100, 202, ]
+an am 1 [100, ]
+an b 3 [100, ]
+an be 3 [100, ]
+and a 2 [100, ]
+and a 3 [100, ]
+and a 4 [100, ]
+and am 2 [100, ]
+and b 1 [100, ]
+and be 1 [100, ]
+at a 1 [100, 202, ]
+at a 2 [100, 101, ]
+at a 3 [100, ]
+at am 2 [100, 101, ]
+at b 3 [101, ]
+at b 4 [100, ]
+at be 3 [101, ]
+at be 4 [100, ]
+beautiful a 2 [100, ]
+beautiful a 3 [100, ]
+beautiful a 4 [100, ]
+beautiful am 3 [100, ]
+bell a 2 [101, ]
+bell a 4 [101, ]
+bell am 4 [101, ]
+extraordinary a 2 [202, ]
+extraordinary a 3 [202, ]
+house a 3 [100, 202, ]
+house a 4 [100, 202, ]
+house am 4 [100, ]
+house b 2 [100, ]
+house be 2 [100, ]
+rings a 1 [101, ]
+rings a 3 [101, ]
+rings am 3 [101, ]
+rings b 2 [101, ]
+rings be 2 [101, ]
+the a 3 [101, ]
+the b 1 [101, ]
+the be 1 [101, ]
+
diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs
index 72b41c472..574b49e97 100644
--- a/milli/src/update/word_prefix_pair_proximity_docids.rs
+++ b/milli/src/update/word_prefix_pair_proximity_docids.rs
@@ -244,3 +244,88 @@ fn insert_current_prefix_data_in_sorter<'a>(
 
     Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use std::io::Cursor;
+
+    use crate::db_snap;
+    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
+    use crate::index::tests::TempIndex;
+
+    fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<serde_json::Map<String, serde_json::Value>> {
+        let mut documents = Vec::new();
+        for prefix in prefixes {
+            for i in 0..50 {
+                documents.push(
+                    serde_json::json!({
+                        "text": format!("{prefix}{i:x}"),
+                    })
+                    .as_object()
+                    .unwrap()
+                    .clone(),
+                )
+            }
+        }
+        documents
+    }
+
+    #[test]
+    fn test_update() {
+        let mut index = TempIndex::new();
+        index.index_documents_config.words_prefix_threshold = Some(50);
+        index.index_documents_config.autogenerate_docids = true;
+
+        index
+            .update_settings(|settings| {
+                settings.set_searchable_fields(vec!["text".to_owned()]);
+            })
+            .unwrap();
+
+        let batch_reader_from_documents = |documents| {
+            let mut builder = DocumentsBatchBuilder::new(Vec::new());
+            for object in documents {
+                builder.append_json_object(&object).unwrap();
+            }
+            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
+        };
+
+        let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]);
+        // now we add some documents whose text should populate the word_prefix_pair_proximity_docids database
+        documents.push(
+            serde_json::json!({
+                "text": "At an amazing and beautiful house"
+            })
+            .as_object()
+            .unwrap()
+            .clone(),
+        );
+        documents.push(
+            serde_json::json!({
+                "text": "The bell rings at 5 am"
+            })
+            .as_object()
+            .unwrap()
+            .clone(),
+        );
+
+        let documents = batch_reader_from_documents(documents);
+        index.add_documents(documents).unwrap();
+
+        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
+
+        let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]);
+        documents.push(
+            serde_json::json!({
+                "text": "At an extraordinary house"
+            })
+            .as_object()
+            .unwrap()
+            .clone(),
+        );
+        let documents = batch_reader_from_documents(documents);
+        index.add_documents(documents).unwrap();
+
+        db_snap!(index, word_prefix_pair_proximity_docids, "update");
+    }
+}
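
For reviewers, a minimal sketch of how a future test could adopt the `db_snap!` helper introduced by this patch. It only uses items the patch itself adds (`TempIndex`, `db_snap!`, `documents_batch_reader_from_objects`); the module name, document contents, and snapshot name below are illustrative, not part of the patch:

#[cfg(test)]
mod example {
    use crate::db_snap;
    use crate::documents::documents_batch_reader_from_objects;
    use crate::index::tests::TempIndex;

    #[test]
    fn snapshot_word_docids() {
        // Index two tiny documents; autogenerate_docids lets us omit an id field.
        let mut index = TempIndex::new();
        index.index_documents_config.autogenerate_docids = true;

        let documents = documents_batch_reader_from_objects(vec![
            serde_json::json!({ "text": "the quick fox" }).as_object().unwrap().clone(),
            serde_json::json!({ "text": "the lazy dog" }).as_object().unwrap().clone(),
        ]);
        index.add_documents(documents).unwrap();

        // A named snapshot is written as an unreviewed `.snap.new` file on the
        // first run; once accepted (e.g. with `cargo insta review`), the
        // committed `.snap` file becomes the reference for later runs.
        db_snap!(index, word_docids, "example");
    }
}

The inline form used in the index.rs tests, `db_snap!(index, word_docids, @"...")`, works the same way but stores the expected value in the source file rather than in a separate snapshot file.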