From 6dcec4f473460847a7c35a34edb7af7d7ce985bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 24 Sep 2018 14:50:33 +0200 Subject: [PATCH] feat: Correct the sum of typos criterion Thanks to @tpayet! --- raptor/src/rank/sum_of_typos.rs | 111 ++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 5 deletions(-) diff --git a/raptor/src/rank/sum_of_typos.rs b/raptor/src/rank/sum_of_typos.rs index 4d1c80eea..ab90595c4 100644 --- a/raptor/src/rank/sum_of_typos.rs +++ b/raptor/src/rank/sum_of_typos.rs @@ -4,13 +4,18 @@ use crate::Match; use crate::rank::{match_query_index, Document}; #[inline] -fn sum_matches_typos(matches: &[Match]) -> u8 { +fn sum_matches_typos(matches: &[Match]) -> i8 { + let mut sum_typos = 0; + let mut number_words = 0; + // note that GroupBy will never return an empty group // so we can do this assumption safely - // matches must and will never be empty - GroupBy::new(matches, match_query_index).map(|group| unsafe { - group.get_unchecked(0).distance - }).min().unwrap() + for group in GroupBy::new(matches, match_query_index) { + sum_typos += unsafe { group.get_unchecked(0).distance } as i8; + number_words += 1; + } + + sum_typos - number_words } #[inline] @@ -20,3 +25,99 @@ pub fn sum_of_typos(lhs: &Document, rhs: &Document) -> Ordering { lhs.cmp(&rhs) } + +#[cfg(test)] +mod tests { + use super::*; + + // typing: "Geox CEO" + // + // doc0: "Geox SpA: CEO and Executive" + // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation" + #[test] + fn one_typo_reference() { + let doc0 = { + let matches = vec![ + Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, + Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false }, + ]; + Document { + document_id: 0, + matches: matches, + } + }; + + let doc1 = { + let matches = vec![ + Match { query_index: 0, distance: 1, attribute: 0, attribute_index: 0, is_exact: false }, + Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 2, is_exact: false }, + ]; + Document { + document_id: 1, + matches: matches, + } + }; + + assert_eq!(sum_of_typos(&doc0, &doc1), Ordering::Less); + } + + // typing: "bouton manchette" + // + // doc0: "bouton manchette" + // doc1: "bouton" + #[test] + fn no_typo() { + let doc0 = { + let matches = vec![ + Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, + Match { query_index: 1, distance: 0, attribute: 0, attribute_index: 1, is_exact: false }, + ]; + Document { + document_id: 0, + matches: matches, + } + }; + + let doc1 = { + let matches = vec![ + Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, + ]; + Document { + document_id: 1, + matches: matches, + } + }; + + assert_eq!(sum_of_typos(&doc0, &doc1), Ordering::Less); + } + + // typing: "bouton manchztte" + // + // doc0: "bouton manchette" + // doc1: "bouton" + #[test] + fn one_typo() { + let doc0 = { + let matches = vec![ + Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, + Match { query_index: 1, distance: 1, attribute: 0, attribute_index: 1, is_exact: false }, + ]; + Document { + document_id: 0, + matches: matches, + } + }; + + let doc1 = { + let matches = vec![ + Match { query_index: 0, distance: 0, attribute: 0, attribute_index: 0, is_exact: false }, + ]; + Document { + document_id: 1, + matches: matches, + } + }; + + assert_eq!(sum_of_typos(&doc0, &doc1), Ordering::Equal); + } +}