diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 37c7b7c84..318a2604a 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -51,8 +51,11 @@ csv = "1.1.6" [dev-dependencies] big_s = "1.0.2" +insta = "1.17.1" maplit = "1.0.2" +md5 = "0.7.0" rand = "0.8.5" +regex = "1.6.0" [features] default = [] diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 09cecb228..85b25cad1 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -13,6 +13,10 @@ pub mod proximity; mod search; pub mod update; +#[cfg(test)] +#[macro_use] +pub mod snapshot_tests; + use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; use std::hash::BuildHasherDefault; diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs new file mode 100644 index 000000000..aa1d5cf27 --- /dev/null +++ b/milli/src/snapshot_tests.rs @@ -0,0 +1,320 @@ +use heed::BytesDecode; +use roaring::RoaringBitmap; +use std::path::Path; + +use crate::{ + heed_codec::facet::{ + FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FacetStringZeroBoundsValueCodec, + }, + CboRoaringBitmapCodec, ExternalDocumentsIds, Index, +}; + +macro_rules! snapshot_index { + ($index:expr, $name:expr) => { + $crate::index::tests::snapshot_index($index, $name, None, None) + }; + ($index:expr, $name:expr, include: $regex:literal) => { + $crate::index::tests::snapshot_index( + $index, + $name, + Some(regex::Regex::new($regex).unwrap()), + None, + ) + }; + ($index:expr, $name:expr, exclude: $regex:literal) => { + $crate::index::tests::snapshot_index( + $index, + $name, + None, + Some(regex::Regex::new($regex).unwrap()), + ) + }; +} + +#[track_caller] +pub fn snapshot_index( + index: &Index, + name: &str, + include: Option, + exclude: Option, +) { + use std::fmt::Write; + + let should_snapshot = |name: &str| -> bool { + include.as_ref().map(|f| f.is_match(name)).unwrap_or(true) + && !exclude.as_ref().map(|f| f.is_match(name)).unwrap_or(false) + }; + + let mut settings = insta::Settings::clone_current(); + settings.set_prepend_module_to_snapshot(false); + let path = Path::new(std::panic::Location::caller().file()); + let path = path.strip_prefix("milli/src").unwrap(); + settings.set_omit_expression(true); + settings.set_snapshot_path(Path::new("snapshots").join(path).join(name)); + let rtxn = index.read_txn().unwrap(); + + let store_whole_snapshot = std::env::var("MILLI_TEST_FULL_SNAPS").unwrap_or("false".to_owned()); + let store_whole_snapshot: bool = store_whole_snapshot.parse().unwrap(); + + macro_rules! snapshot_db { + ($name:ident, |$vars:pat| $push:block) => { + let name_str = stringify!($name); + if should_snapshot(name_str) { + let iter = index.$name.iter(&rtxn).unwrap(); + let mut snap = String::new(); + for x in iter { + let $vars = x.unwrap(); + snap.push_str($push); + snap.push('\n'); + } + if snap.len() < 512 { + insta::assert_snapshot!(name_str, snap); + } else { + if store_whole_snapshot { + insta::assert_snapshot!(format!("{name_str}.full"), snap); + } + let hash = md5::compute(snap.as_bytes()); + let hash_str = format!("{hash:x}"); + insta::assert_snapshot!(format!("{name_str}.hash"), hash_str); + } + } + }; + } + + fn display_bitmap(b: &RoaringBitmap) -> String { + let mut s = String::new(); + s.push('['); + for x in b.into_iter() { + write!(&mut s, "{x}, ").unwrap(); + } + s.push(']'); + s + } + + settings.bind(|| { + snapshot_db!(word_docids, |(s, b)| { &format!("{s:<16} {}", display_bitmap(&b)) }); + snapshot_db!(exact_word_docids, |(s, b)| { &format!("{s:<16} {}", display_bitmap(&b)) }); + snapshot_db!(word_prefix_docids, |(s, b)| { &format!("{s:<16} {}", display_bitmap(&b)) }); + snapshot_db!(exact_word_prefix_docids, |(s, b)| { + &format!("{s:<16} {}", display_bitmap(&b)) + }); + + snapshot_db!(docid_word_positions, |((idx, s), b)| { + &format!("{idx:<6} {s:<16} {}", display_bitmap(&b)) + }); + + snapshot_db!(word_pair_proximity_docids, |((word1, word2, proximity), b)| { + &format!("{word1:<16} {word2:<16} {proximity:<2} {}", display_bitmap(&b)) + }); + + snapshot_db!(word_prefix_pair_proximity_docids, |((word1, prefix, proximity), b)| { + &format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b)) + }); + + snapshot_db!(word_position_docids, |((word, position), b)| { + &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) + }); + + snapshot_db!(field_id_word_count_docids, |((field_id, word_count), b)| { + &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) + }); + + snapshot_db!(word_prefix_position_docids, |((word_prefix, position), b)| { + &format!("{word_prefix:<4} {position:<6} {}", display_bitmap(&b)) + }); + + snapshot_db!(facet_id_f64_docids, |((facet_id, level, left, right), b)| { + &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) + }); + { + let name_str = stringify!(facet_id_string_docids); + if should_snapshot(name_str) { + let bytes_db = index.facet_id_string_docids.remap_types::(); + let iter = bytes_db.iter(&rtxn).unwrap(); + let mut snap = String::new(); + + for x in iter { + let (key, value) = x.unwrap(); + if let Some((field_id, normalized_str)) = + FacetStringLevelZeroCodec::bytes_decode(key) + { + let (orig_string, docids) = + FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); + snap.push_str(&format!( + "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", + display_bitmap(&docids) + )); + } else if let Some((field_id, level, left, right)) = + FacetLevelValueU32Codec::bytes_decode(key) + { + snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); + let (bounds, docids) = FacetStringZeroBoundsValueCodec::< + CboRoaringBitmapCodec, + >::bytes_decode(value) + .unwrap(); + if let Some((left, right)) = bounds { + snap.push_str(&format!("{left:<8} {right:<8} ")); + } + snap.push_str(&display_bitmap(&docids)); + snap.push('\n'); + } else { + panic!(); + } + } + insta::assert_snapshot!(name_str, snap); + } + } + + // Main - computed settings + { + let mut snap = String::new(); + + macro_rules! write_setting_to_snap { + ($name:ident) => { + if should_snapshot(&format!("settings.{}", stringify!($name))) { + let $name = index.$name(&rtxn).unwrap(); + writeln!(&mut snap, "{}: {:?}", stringify!($name), $name).unwrap(); + } + }; + } + write_setting_to_snap!(primary_key); + write_setting_to_snap!(criteria); + write_setting_to_snap!(displayed_fields); + write_setting_to_snap!(distinct_field); + write_setting_to_snap!(filterable_fields); + write_setting_to_snap!(sortable_fields); + write_setting_to_snap!(synonyms); + write_setting_to_snap!(authorize_typos); + write_setting_to_snap!(min_word_len_one_typo); + write_setting_to_snap!(min_word_len_two_typos); + write_setting_to_snap!(exact_words); + write_setting_to_snap!(exact_attributes); + write_setting_to_snap!(max_values_per_facet); + write_setting_to_snap!(pagination_max_total_hits); + write_setting_to_snap!(searchable_fields); + write_setting_to_snap!(user_defined_searchable_fields); + + if !snap.is_empty() { + insta::assert_snapshot!("settings", snap); + } + } + // Main - others + { + macro_rules! snapshot_string { + ($name:ident) => { + if should_snapshot(&format!("{}", stringify!($name))) { + insta::assert_snapshot!(stringify!($name), $name); + } + }; + } + { + let documents_ids = index.documents_ids(&rtxn).unwrap(); + let documents_ids = display_bitmap(&documents_ids); + snapshot_string!(documents_ids); + } + { + let stop_words = index.stop_words(&rtxn).unwrap(); + let stop_words = format!("{stop_words:?}"); + snapshot_string!(stop_words); + } + { + let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); + let soft_deleted_documents_ids = display_bitmap(&soft_deleted_documents_ids); + snapshot_string!(soft_deleted_documents_ids); + } + + { + let mut field_distribution = String::new(); + for (field, count) in index.field_distribution(&rtxn).unwrap() { + writeln!(&mut field_distribution, "{field:<16} {count:<6}").unwrap(); + } + snapshot_string!(field_distribution); + } + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + { + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let name = fields_ids_map.name(field_id).unwrap(); + writeln!(&mut snap, "{field_id:<3} {name:<16}").unwrap(); + } + let fields_ids_map = snap; + snapshot_string!(fields_ids_map); + } + + { + let geo_faceted_documents_ids = index.geo_faceted_documents_ids(&rtxn).unwrap(); + let geo_faceted_documents_ids = display_bitmap(&geo_faceted_documents_ids); + snapshot_string!(geo_faceted_documents_ids); + } + // let geo_rtree = index.geo_rtree(&rtxn).unwrap(); + { + let ExternalDocumentsIds { soft, hard, .. } = + index.external_documents_ids(&rtxn).unwrap(); + let mut external_documents_ids = String::new(); + let soft_bytes = soft.into_fst().as_bytes().to_owned(); + let mut hex_soft = String::new(); + for byte in soft_bytes { + write!(&mut hex_soft, "{:x}", byte).unwrap(); + } + writeln!(&mut external_documents_ids, "soft: {hex_soft}").unwrap(); + let hard_bytes = hard.into_fst().as_bytes().to_owned(); + let mut hex_hard = String::new(); + for byte in hard_bytes { + write!(&mut hex_hard, "{:x}", byte).unwrap(); + } + writeln!(&mut external_documents_ids, "hard: {hex_hard}").unwrap(); + + snapshot_string!(external_documents_ids); + } + { + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let number_faceted_documents_ids = + index.number_faceted_documents_ids(&rtxn, field_id).unwrap(); + writeln!( + &mut snap, + "{field_id:<3} {}", + display_bitmap(&number_faceted_documents_ids) + ) + .unwrap(); + } + let number_faceted_documents_ids = snap; + snapshot_string!(number_faceted_documents_ids); + } + { + let mut snap = String::new(); + for field_id in fields_ids_map.ids() { + let string_faceted_documents_ids = + index.string_faceted_documents_ids(&rtxn, field_id).unwrap(); + writeln!( + &mut snap, + "{field_id:<3} {}", + display_bitmap(&string_faceted_documents_ids) + ) + .unwrap(); + } + let string_faceted_documents_ids = snap; + snapshot_string!(string_faceted_documents_ids); + } + { + let words_fst = index.words_fst(&rtxn).unwrap(); + let bytes = words_fst.into_fst().as_bytes().to_owned(); + let mut words_fst = String::new(); + for byte in bytes { + write!(&mut words_fst, "{:x}", byte).unwrap(); + } + snapshot_string!(words_fst); + } + { + let words_prefixes_fst = index.words_prefixes_fst(&rtxn).unwrap(); + let bytes = words_prefixes_fst.into_fst().as_bytes().to_owned(); + let mut words_prefixes_fst = String::new(); + for byte in bytes { + write!(&mut words_prefixes_fst, "{:x}", byte).unwrap(); + } + snapshot_string!(words_prefixes_fst); + } + } + }); +}