use std::cmp::Ordering; use itertools::Itertools; use serde::Serialize; use crate::distance_between_two_points; #[derive(Debug, Clone, PartialEq)] pub enum ScoreDetails { Words(Words), Typo(Typo), Proximity(Rank), Fid(Rank), Position(Rank), ExactAttribute(ExactAttribute), ExactWords(ExactWords), Sort(Sort), Vector(Vector), GeoSort(GeoSort), } #[derive(Clone, Copy)] pub enum ScoreValue<'a> { Score(f64), Sort(&'a Sort), GeoSort(&'a GeoSort), } enum RankOrValue<'a> { Rank(Rank), Sort(&'a Sort), GeoSort(&'a GeoSort), Score(f64), } impl ScoreDetails { pub fn local_score(&self) -> Option { self.rank().map(Rank::local_score) } pub fn rank(&self) -> Option { match self { ScoreDetails::Words(details) => Some(details.rank()), ScoreDetails::Typo(details) => Some(details.rank()), ScoreDetails::Proximity(details) => Some(*details), ScoreDetails::Fid(details) => Some(*details), ScoreDetails::Position(details) => Some(*details), ScoreDetails::ExactAttribute(details) => Some(details.rank()), ScoreDetails::ExactWords(details) => Some(details.rank()), ScoreDetails::Sort(_) => None, ScoreDetails::GeoSort(_) => None, ScoreDetails::Vector(_) => None, } } pub fn global_score<'a>(details: impl Iterator + 'a) -> f64 { Self::score_values(details) .find_map(|x| { let ScoreValue::Score(score) = x else { return None; }; Some(score) }) .unwrap_or(1.0f64) } pub fn score_values<'a>( details: impl Iterator + 'a, ) -> impl Iterator> + 'a { details .map(ScoreDetails::rank_or_value) .coalesce(|left, right| match (left, right) { (RankOrValue::Rank(left), RankOrValue::Rank(right)) => { Ok(RankOrValue::Rank(Rank::merge(left, right))) } (left, right) => Err((left, right)), }) .map(|rank_or_value| match rank_or_value { RankOrValue::Rank(r) => ScoreValue::Score(r.local_score()), RankOrValue::Sort(s) => ScoreValue::Sort(s), RankOrValue::GeoSort(g) => ScoreValue::GeoSort(g), RankOrValue::Score(s) => ScoreValue::Score(s), }) } fn rank_or_value(&self) -> RankOrValue<'_> { match self { ScoreDetails::Words(w) => RankOrValue::Rank(w.rank()), ScoreDetails::Typo(t) => RankOrValue::Rank(t.rank()), ScoreDetails::Proximity(p) => RankOrValue::Rank(*p), ScoreDetails::Fid(f) => RankOrValue::Rank(*f), ScoreDetails::Position(p) => RankOrValue::Rank(*p), ScoreDetails::ExactAttribute(e) => RankOrValue::Rank(e.rank()), ScoreDetails::ExactWords(e) => RankOrValue::Rank(e.rank()), ScoreDetails::Sort(sort) => RankOrValue::Sort(sort), ScoreDetails::GeoSort(geosort) => RankOrValue::GeoSort(geosort), ScoreDetails::Vector(vector) => RankOrValue::Score( vector.value_similarity.as_ref().map(|(_, s)| *s as f64).unwrap_or(0.0f64), ), } } /// Panics /// /// - If Position is not preceded by Fid /// - If Exactness is not preceded by ExactAttribute pub fn to_json_map<'a>( details: impl Iterator, ) -> serde_json::Map { let mut order = 0; let mut fid_details = None; let mut details_map = serde_json::Map::default(); for details in details { match details { ScoreDetails::Words(words) => { let words_details = serde_json::json!({ "order": order, "matchingWords": words.matching_words, "maxMatchingWords": words.max_matching_words, "score": words.rank().local_score(), }); details_map.insert("words".into(), words_details); order += 1; } ScoreDetails::Typo(typo) => { let typo_details = serde_json::json!({ "order": order, "typoCount": typo.typo_count, "maxTypoCount": typo.max_typo_count, "score": typo.rank().local_score(), }); details_map.insert("typo".into(), typo_details); order += 1; } ScoreDetails::Proximity(proximity) => { let proximity_details = serde_json::json!({ "order": order, "score": proximity.local_score(), }); details_map.insert("proximity".into(), proximity_details); order += 1; } ScoreDetails::Fid(fid) => { // copy the rank for future use in Position. fid_details = Some(*fid); // For now, fid is a virtual rule always followed by the "position" rule let fid_details = serde_json::json!({ "order": order, "attributeRankingOrderScore": fid.local_score(), }); details_map.insert("attribute".into(), fid_details); order += 1; } ScoreDetails::Position(position) => { // For now, position is a virtual rule always preceded by the "fid" rule let attribute_details = details_map .get_mut("attribute") .expect("position not preceded by attribute"); let attribute_details = attribute_details .as_object_mut() .expect("attribute details was not an object"); let Some(fid_details) = fid_details else { unimplemented!("position not preceded by attribute"); }; attribute_details .insert("queryWordDistanceScore".into(), position.local_score().into()); let score = Rank::global_score([fid_details, *position].iter().copied()); attribute_details.insert("score".into(), score.into()); // do not update the order since this was already done by fid } ScoreDetails::ExactAttribute(exact_attribute) => { let exactness_details = serde_json::json!({ "order": order, "matchType": exact_attribute, "score": exact_attribute.rank().local_score(), }); details_map.insert("exactness".into(), exactness_details); order += 1; } ScoreDetails::ExactWords(details) => { // For now, exactness is a virtual rule always preceded by the "ExactAttribute" rule let exactness_details = details_map .get_mut("exactness") .expect("Exactness not preceded by exactAttribute"); let exactness_details = exactness_details .as_object_mut() .expect("exactness details was not an object"); if exactness_details.get("matchType").expect("missing 'matchType'") == &serde_json::json!(ExactAttribute::NoExactMatch) { let score = Rank::global_score( [ExactAttribute::NoExactMatch.rank(), details.rank()].iter().copied(), ); // tiny detail, but we want the score to be the last displayed field, // so we're removing it here, adding the other fields, then adding the new score exactness_details.remove("score"); exactness_details .insert("matchingWords".into(), details.matching_words.into()); exactness_details .insert("maxMatchingWords".into(), details.max_matching_words.into()); exactness_details.insert("score".into(), score.into()); } // do not update the order since this was already done by exactAttribute } ScoreDetails::Sort(details) => { let sort = if details.redacted { format!("") } else { format!( "{}:{}", details.field_name, if details.ascending { "asc" } else { "desc" } ) }; let value = if details.redacted { "".into() } else { details.value.clone() }; let sort_details = serde_json::json!({ "order": order, "value": value, }); details_map.insert(sort, sort_details); order += 1; } ScoreDetails::GeoSort(details) => { let sort = format!( "_geoPoint({}, {}):{}", details.target_point[0], details.target_point[1], if details.ascending { "asc" } else { "desc" } ); let point = if let Some(value) = details.value { serde_json::json!({ "lat": value[0], "lng": value[1]}) } else { serde_json::Value::Null }; let sort_details = serde_json::json!({ "order": order, "value": point, "distance": details.distance(), }); details_map.insert(sort, sort_details); order += 1; } ScoreDetails::Vector(s) => { let vector = format!("vectorSort({:?})", s.target_vector); let value = s.value_similarity.as_ref().map(|(v, _)| v); let similarity = s.value_similarity.as_ref().map(|(_, s)| s); let details = serde_json::json!({ "order": order, "value": value, "similarity": similarity, }); details_map.insert(vector, details); order += 1; } } } details_map } } /// The strategy to compute scores. /// /// It makes sense to pass down this strategy to the internals of the search, because /// some optimizations (today, mainly skipping ranking rules for universes of a single document) /// are not correct to do when computing the scores. /// /// This strategy could feasibly be extended to differentiate between the normalized score and the /// detailed scores, but it is not useful today as the normalized score is *derived from* the /// detailed scores. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] pub enum ScoringStrategy { /// Don't compute scores #[default] Skip, /// Compute detailed scores Detailed, } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Words { pub matching_words: u32, pub max_matching_words: u32, } impl Words { pub fn rank(&self) -> Rank { Rank { rank: self.matching_words, max_rank: self.max_matching_words } } pub(crate) fn from_rank(rank: Rank) -> Self { Self { matching_words: rank.rank, max_matching_words: rank.max_rank } } } /// Structure that is super similar to [`Words`], but whose semantics is a bit distinct. /// /// In exactness, the number of matching words can actually be 0 with a non-zero score, /// if no words from the query appear exactly in the document. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct ExactWords { pub matching_words: u32, pub max_matching_words: u32, } impl ExactWords { pub fn rank(&self) -> Rank { // 0 matching words means last rank (1) Rank { rank: self.matching_words + 1, max_rank: self.max_matching_words + 1 } } pub(crate) fn from_rank(rank: Rank) -> Self { // last rank (1) means that 0 words from the query appear exactly in the document. // first rank (max_rank) means that (max_rank - 1) words from the query appear exactly in the document. Self { matching_words: rank.rank.saturating_sub(1), max_matching_words: rank.max_rank.saturating_sub(1), } } } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Typo { pub typo_count: u32, pub max_typo_count: u32, } impl Typo { pub fn rank(&self) -> Rank { Rank { rank: (self.max_typo_count + 1).saturating_sub(self.typo_count), max_rank: (self.max_typo_count + 1), } } // max_rank = max_typo + 1 // max_typo = max_rank - 1 // // rank = max_typo - typo + 1 // rank = max_rank - 1 - typo + 1 // rank + typo = max_rank // typo = max_rank - rank pub fn from_rank(rank: Rank) -> Typo { Typo { typo_count: rank.max_rank.saturating_sub(rank.rank), max_typo_count: rank.max_rank.saturating_sub(1), } } } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Rank { /// The ordinal rank, such that `max_rank` is the first rank, and 0 is the last rank. /// /// The higher the better. Documents with a rank of 0 have a score of 0 and are typically never returned /// (they don't match the query). pub rank: u32, /// The maximum possible rank. Documents with this rank have a score of 1. /// /// The max rank should not be 0. pub max_rank: u32, } impl Rank { pub fn local_score(self) -> f64 { self.rank as f64 / self.max_rank as f64 } pub fn global_score(details: impl Iterator) -> f64 { let mut rank = Rank { rank: 1, max_rank: 1 }; for inner_rank in details { rank = Rank::merge(rank, inner_rank); } rank.local_score() } pub fn merge(mut outer: Rank, inner: Rank) -> Rank { outer.rank = outer.rank.saturating_sub(1); outer.rank *= inner.max_rank; outer.max_rank *= inner.max_rank; outer.rank += inner.rank; outer } } #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize)] #[serde(rename_all = "camelCase")] pub enum ExactAttribute { ExactMatch, MatchesStart, NoExactMatch, } impl ExactAttribute { pub fn rank(&self) -> Rank { let rank = match self { ExactAttribute::ExactMatch => 3, ExactAttribute::MatchesStart => 2, ExactAttribute::NoExactMatch => 1, }; Rank { rank, max_rank: 3 } } } #[derive(Debug, Clone, PartialEq)] pub struct Sort { pub field_name: String, pub ascending: bool, pub redacted: bool, pub value: serde_json::Value, } impl PartialOrd for Sort { fn partial_cmp(&self, other: &Self) -> Option { if self.field_name != other.field_name { return None; } if self.ascending != other.ascending { return None; } match (&self.value, &other.value) { (serde_json::Value::Null, serde_json::Value::Null) => Some(Ordering::Equal), (serde_json::Value::Null, _) => Some(Ordering::Less), (_, serde_json::Value::Null) => Some(Ordering::Greater), // numbers are always before strings (serde_json::Value::Number(_), serde_json::Value::String(_)) => Some(Ordering::Greater), (serde_json::Value::String(_), serde_json::Value::Number(_)) => Some(Ordering::Less), (serde_json::Value::Number(left), serde_json::Value::Number(right)) => { // FIXME: unwrap permitted here? let order = left.as_f64().unwrap().partial_cmp(&right.as_f64().unwrap())?; // 12 < 42, and when ascending, we want to see 12 first, so the smallest. // Hence, when ascending, smaller is better Some(if self.ascending { order.reverse() } else { order }) } (serde_json::Value::String(left), serde_json::Value::String(right)) => { let order = left.cmp(right); // Taking e.g. "a" and "z" // "a" < "z", and when ascending, we want to see "a" first, so the smallest. // Hence, when ascending, smaller is better Some(if self.ascending { order.reverse() } else { order }) } _ => None, } } } #[derive(Debug, Clone, Copy, PartialEq)] pub struct GeoSort { pub target_point: [f64; 2], pub ascending: bool, pub value: Option<[f64; 2]>, } impl PartialOrd for GeoSort { fn partial_cmp(&self, other: &Self) -> Option { if self.target_point != other.target_point { return None; } if self.ascending != other.ascending { return None; } Some(match (self.distance(), other.distance()) { (None, None) => Ordering::Equal, (None, Some(_)) => Ordering::Less, (Some(_), None) => Ordering::Greater, (Some(left), Some(right)) => { let order = left.partial_cmp(&right)?; if self.ascending { // when ascending, the one with the smallest distance has the best score order.reverse() } else { order } } }) } } #[derive(Debug, Clone, PartialEq, PartialOrd)] pub struct Vector { pub target_vector: Vec, pub value_similarity: Option<(Vec, f32)>, } impl GeoSort { pub fn distance(&self) -> Option { self.value.map(|value| distance_between_two_points(&self.target_point, &value)) } }