#![cfg_attr(all(test, fuzzing), feature(no_coverage))] #![allow(clippy::type_complexity)] #[cfg(test)] #[global_allocator] pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; // #[cfg(test)] // pub mod allocator { // use std::alloc::{GlobalAlloc, System}; // use std::sync::atomic::{self, AtomicI64}; // #[global_allocator] // pub static ALLOC: CountingAlloc = CountingAlloc { // max_resident: AtomicI64::new(0), // resident: AtomicI64::new(0), // allocated: AtomicI64::new(0), // }; // pub struct CountingAlloc { // pub max_resident: AtomicI64, // pub resident: AtomicI64, // pub allocated: AtomicI64, // } // unsafe impl GlobalAlloc for CountingAlloc { // unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { // self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); // let old_resident = // self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); // let resident = old_resident + layout.size() as i64; // self.max_resident.fetch_max(resident, atomic::Ordering::SeqCst); // // if layout.size() > 1_000_000 { // // eprintln!( // // "allocating {} with new resident size: {resident}", // // layout.size() / 1_000_000 // // ); // // // let trace = std::backtrace::Backtrace::capture(); // // // let t = trace.to_string(); // // // eprintln!("{t}"); // // } // System.alloc(layout) // } // unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { // self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed); // System.dealloc(ptr, layout) // } // } // } #[macro_use] pub mod documents; mod asc_desc; mod criterion; mod error; mod external_documents_ids; pub mod facet; mod fields_ids_map; pub mod heed_codec; pub mod index; pub mod proximity; mod search; pub mod update; #[cfg(test)] #[macro_use] pub mod snapshot_tests; use std::collections::{BTreeMap, HashMap}; use std::convert::{TryFrom, TryInto}; use std::hash::BuildHasherDefault; use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer}; pub use filter_parser::{Condition, FilterCondition, Span, Token}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; pub use search::new::{ execute_search, DefaultSearchLogger, SearchContext, SearchLogger, VisualSearchLogger, }; use serde_json::Value; pub use {charabia as tokenizer, heed}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; pub use self::criterion::{default_criteria, Criterion, CriterionError}; pub use self::error::{ Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, }; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec, UncheckedU8StrStrCodec, }; pub use self::index::Index; pub use self::search::{ FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, }; pub type Result = std::result::Result; pub type Attribute = u32; pub type BEU16 = heed::zerocopy::U16; pub type BEU32 = heed::zerocopy::U32; pub type BEU64 = heed::zerocopy::U64; pub type DocumentId = u32; pub type FastMap4 = HashMap>; pub type FastMap8 = HashMap>; pub type FieldDistribution = BTreeMap; pub type FieldId = u16; pub type Object = serde_json::Map; pub type Position = u32; pub type RelativePosition = u16; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; /// A GeoPoint is a point in cartesian plan, called xyz_point in the code. Its metadata /// is a tuple composed of 1. the DocumentId of the associated document and 2. the original point /// expressed in term of latitude and longitude. pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 3], (DocumentId, [f64; 2])>; /// The maximum length a LMDB key can be. /// /// Note that the actual allowed length is a little bit higher, but /// we keep a margin of safety. const MAX_LMDB_KEY_LENGTH: usize = 500; /// The maximum length a field value can be when inserted in an LMDB key. /// /// This number is determined by the keys of the different facet databases /// and adding a margin of safety. pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20; /// The maximum length a word can be pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2; pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1; // Convert an absolute word position into a relative position. // Return the field id of the attribute related to the absolute position // and the relative position in the attribute. pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, RelativePosition) { ((absolute >> 16) as u16, (absolute & 0xFFFF) as u16) } // Compute the absolute word position with the field id of the attribute and relative position in the attribute. pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position { (field_id as u32) << 16 | (relative as u32) } // TODO: this is wrong, but will do for now /// Compute the "bucketed" absolute position from the field id and relative position in the field. /// /// In a bucketed position, the accuracy of the relative position is reduced exponentially as it gets larger. pub fn bucketed_position(relative: u16) -> u16 { // The first few relative positions are kept intact. if relative < 16 { relative } else if relative < 24 { // Relative positions between 16 and 24 all become equal to 24 24 } else { // Then, groups of positions that have the same base-2 logarithm are reduced to // the same relative position: the smallest power of 2 that is greater than them (relative as f64).log2().ceil().exp2() as u16 } } /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, obkv: obkv::KvReaderU16, ) -> Result { displayed_fields .iter() .copied() .flat_map(|id| obkv.get(id).map(|value| (id, value))) .map(|(id, value)| { let name = fields_ids_map.name(id).ok_or(error::FieldIdMapMissingEntry::FieldId { field_id: id, process: "obkv_to_json", })?; let value = serde_json::from_slice(value).map_err(error::InternalError::SerdeJson)?; Ok((name.to_owned(), value)) }) .collect() } /// Transform every field of a raw obkv store into a JSON Object. pub fn all_obkv_to_json(obkv: obkv::KvReaderU16, fields_ids_map: &FieldsIdsMap) -> Result { let all_keys = obkv.iter().map(|(k, _v)| k).collect::>(); obkv_to_json(all_keys.as_slice(), fields_ids_map, obkv) } /// Transform a JSON value into a string that can be indexed. pub fn json_to_string(value: &Value) -> Option { fn inner(value: &Value, output: &mut String) -> bool { use std::fmt::Write; match value { Value::Null => false, Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(), Value::Number(number) => write!(output, "{}", number).is_ok(), Value::String(string) => write!(output, "{}", string).is_ok(), Value::Array(array) => { let mut count = 0; for value in array { if inner(value, output) { output.push_str(". "); count += 1; } } // check that at least one value was written count != 0 } Value::Object(object) => { let mut buffer = String::new(); let mut count = 0; for (key, value) in object { buffer.clear(); let _ = write!(&mut buffer, "{}: ", key); if inner(value, &mut buffer) { buffer.push_str(". "); // We write the "key: value. " pair only when // we are sure that the value can be written. output.push_str(&buffer); count += 1; } } // check that at least one value was written count != 0 } } } let mut string = String::new(); if inner(value, &mut string) { Some(string) } else { None } } /// Divides one slice into two at an index, returns `None` if mid is out of bounds. fn try_split_at(slice: &[T], mid: usize) -> Option<(&[T], &[T])> { if mid <= slice.len() { Some(slice.split_at(mid)) } else { None } } /// Divides one slice into an array and the tail at an index, /// returns `None` if `N` is out of bounds. fn try_split_array_at(slice: &[T]) -> Option<([T; N], &[T])> where [T; N]: for<'a> TryFrom<&'a [T]>, { let (head, tail) = try_split_at(slice, N)?; let head = head.try_into().ok()?; Some((head, tail)) } /// Return the distance between two points in meters. Each points are composed of two f64, /// one latitude and one longitude. pub fn distance_between_two_points(a: &[f64; 2], b: &[f64; 2]) -> f64 { let a = geoutils::Location::new(a[0], a[1]); let b = geoutils::Location::new(b[0], b[1]); a.haversine_distance_to(&b).meters() } /// Convert a point expressed in terms of latitude and longitude to a point in the /// cartesian coordinate expressed in terms of x, y and z. pub fn lat_lng_to_xyz(coord: &[f64; 2]) -> [f64; 3] { let [lat, lng] = coord.map(|f| f.to_radians()); let x = lat.cos() * lng.cos(); let y = lat.cos() * lng.sin(); let z = lat.sin(); [x, y, z] } /// Returns `true` if the field match one of the faceted fields. /// See the function [`is_faceted_by`] below to see what “matching” means. pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator>) -> bool { faceted_fields.into_iter().any(|facet| is_faceted_by(field, facet.as_ref())) } /// Returns `true` if the field match the facet. /// ``` /// use milli::is_faceted_by; /// // -- the valid basics /// assert!(is_faceted_by("animaux", "animaux")); /// assert!(is_faceted_by("animaux.chien", "animaux")); /// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux")); /// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien")); /// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois")); /// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure")); /// assert!(is_faceted_by("animaux.chien.race.bouvier bernois.fourrure.couleur", "animaux.chien.race.bouvier bernois.fourrure.couleur")); /// /// // -- the wrongs /// assert!(!is_faceted_by("chien", "chat")); /// assert!(!is_faceted_by("animaux", "animaux.chien")); /// assert!(!is_faceted_by("animaux.chien", "animaux.chat")); /// /// // -- the strange edge cases /// assert!(!is_faceted_by("animaux.chien", "anima")); /// assert!(!is_faceted_by("animaux.chien", "animau")); /// assert!(!is_faceted_by("animaux.chien", "animaux.")); /// assert!(!is_faceted_by("animaux.chien", "animaux.c")); /// assert!(!is_faceted_by("animaux.chien", "animaux.ch")); /// assert!(!is_faceted_by("animaux.chien", "animaux.chi")); /// assert!(!is_faceted_by("animaux.chien", "animaux.chie")); /// ``` pub fn is_faceted_by(field: &str, facet: &str) -> bool { field.starts_with(facet) && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true) } pub fn normalize_facet(original: &str) -> String { CompatibilityDecompositionNormalizer.normalize_str(original.trim()).to_lowercase() } #[cfg(test)] mod tests { use serde_json::json; use super::*; #[test] fn json_to_string_object() { let value = json!({ "name": "John Doe", "age": 43, "not_there": null, }); let string = json_to_string(&value).unwrap(); assert_eq!(string, "name: John Doe. age: 43. "); } #[test] fn json_to_string_array() { let value = json!([ { "name": "John Doe" }, 43, "hello", [ "I", "am", "fine" ], null, ]); let string = json_to_string(&value).unwrap(); // We don't care about having two point (.) after the other as // the distance of hard separators is clamped to 8 anyway. assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . "); } #[test] fn test_relative_position_conversion() { assert_eq!((0x0000, 0x0000), relative_from_absolute_position(0x00000000)); assert_eq!((0x0000, 0xFFFF), relative_from_absolute_position(0x0000FFFF)); assert_eq!((0xFFFF, 0x0000), relative_from_absolute_position(0xFFFF0000)); assert_eq!((0xFF00, 0xFF00), relative_from_absolute_position(0xFF00FF00)); assert_eq!((0xFF00, 0x00FF), relative_from_absolute_position(0xFF0000FF)); assert_eq!((0x1234, 0x5678), relative_from_absolute_position(0x12345678)); assert_eq!((0xFFFF, 0xFFFF), relative_from_absolute_position(0xFFFFFFFF)); } #[test] fn test_absolute_position_conversion() { assert_eq!(0x00000000, absolute_from_relative_position(0x0000, 0x0000)); assert_eq!(0x0000FFFF, absolute_from_relative_position(0x0000, 0xFFFF)); assert_eq!(0xFFFF0000, absolute_from_relative_position(0xFFFF, 0x0000)); assert_eq!(0xFF00FF00, absolute_from_relative_position(0xFF00, 0xFF00)); assert_eq!(0xFF0000FF, absolute_from_relative_position(0xFF00, 0x00FF)); assert_eq!(0x12345678, absolute_from_relative_position(0x1234, 0x5678)); assert_eq!(0xFFFFFFFF, absolute_from_relative_position(0xFFFF, 0xFFFF)); } #[test] fn test_all_obkv_to_json() { let mut fields_ids_map = FieldsIdsMap::new(); let id1 = fields_ids_map.insert("field1").unwrap(); let id2 = fields_ids_map.insert("field2").unwrap(); let mut writer = obkv::KvWriterU16::memory(); writer.insert(id1, b"1234").unwrap(); writer.insert(id2, b"4321").unwrap(); let contents = writer.into_inner().unwrap(); let obkv = obkv::KvReaderU16::new(&contents); let expected = json!({ "field1": 1234, "field2": 4321, }); let expected = expected.as_object().unwrap(); let actual = all_obkv_to_json(obkv, &fields_ids_map).unwrap(); assert_eq!(&actual, expected); } }