feat: Make synonyms not be considered exact matches

This commit is contained in:
Clément Renault 2019-06-19 14:10:21 +02:00
parent 43f11e929d
commit 3dcbc737f3
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE

View File

@ -20,7 +20,24 @@ use crate::{Match, DocumentId, Store, RawDocument, Document};
const NGRAMS: usize = 3; const NGRAMS: usize = 3;
fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<(usize, DfaExt)>, S::Error> { struct Automaton {
index: usize,
is_synonym: bool,
number_words: usize,
dfa: DfaExt,
}
impl Automaton {
fn synonym(index: usize, number_words: usize, dfa: DfaExt) -> Automaton {
Automaton { index, is_synonym: true, number_words, dfa }
}
fn original(index: usize, number_words: usize, dfa: DfaExt) -> Automaton {
Automaton { index, is_synonym: false, number_words, dfa }
}
}
fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
let mut automatons = Vec::new(); let mut automatons = Vec::new();
@ -54,25 +71,28 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<(usize, D
while let Some(synonyms) = stream.next() { while let Some(synonyms) = stream.next() {
let synonyms = std::str::from_utf8(synonyms).unwrap(); let synonyms = std::str::from_utf8(synonyms).unwrap();
let nb_synonym_words = split_query_string(synonyms).count();
for synonym in split_query_string(synonyms) { for synonym in split_query_string(synonyms) {
let lev = build_dfa(synonym); let lev = build_dfa(synonym);
automatons.push((index, synonym.to_owned(), lev)); let automaton = Automaton::synonym(index, nb_synonym_words, lev);
automatons.push((automaton, synonym.to_owned()));
} }
} }
} }
} }
if n == 1 { if n == 1 {
automatons.push((index, ngram, lev)); let automaton = Automaton::original(index, ngram_nb_words, lev);
automatons.push((automaton, ngram));
} }
index += 1; index += 1;
} }
} }
automatons.sort_unstable_by(|a, b| (a.0, &a.1).cmp(&(b.0, &b.1))); automatons.sort_unstable_by(|a, b| (a.0.index, &a.1).cmp(&(b.0.index, &b.1)));
automatons.dedup_by(|a, b| (a.0, &a.1) == (b.0, &b.1)); automatons.dedup_by(|a, b| (a.0.index, &a.1) == (b.0.index, &b.1));
let automatons = automatons.into_iter().map(|(i, _, a)| (i, a)).collect(); let automatons = automatons.into_iter().map(|(a, _)| a).collect();
Ok(automatons) Ok(automatons)
} }
@ -129,8 +149,8 @@ where S: Store,
let mut stream = { let mut stream = {
let mut op_builder = fst::raw::OpBuilder::new(); let mut op_builder = fst::raw::OpBuilder::new();
for (_index, automaton) in &automatons { for Automaton { dfa, .. } in &automatons {
let stream = words.search(automaton); let stream = words.search(dfa);
op_builder.push(stream); op_builder.push(stream);
} }
op_builder.r#union() op_builder.r#union()
@ -140,9 +160,9 @@ where S: Store,
while let Some((input, indexed_values)) = stream.next() { while let Some((input, indexed_values)) = stream.next() {
for iv in indexed_values { for iv in indexed_values {
let (index, automaton) = &automatons[iv.index]; let Automaton { index, is_synonym, number_words, ref dfa } = automatons[iv.index];
let distance = automaton.eval(input).to_u8(); let distance = dfa.eval(input).to_u8();
let is_exact = distance == 0 && input.len() == automaton.query_len(); let is_exact = (is_synonym && number_words == 1) || (!is_synonym && distance == 0 && input.len() == dfa.query_len());
let doc_indexes = self.store.word_indexes(input)?; let doc_indexes = self.store.word_indexes(input)?;
let doc_indexes = match doc_indexes { let doc_indexes = match doc_indexes {
@ -153,8 +173,8 @@ where S: Store,
for di in doc_indexes.as_slice() { for di in doc_indexes.as_slice() {
if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) { if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) {
let match_ = Match { let match_ = Match {
query_index: *index as u32, query_index: index as u32,
distance, distance: distance,
attribute: di.attribute, attribute: di.attribute,
word_index: di.word_index, word_index: di.word_index,
is_exact, is_exact,
@ -716,6 +736,12 @@ mod tests {
let results = builder.query("NY subway", 0..20).unwrap(); let results = builder.query("NY subway", 0..20).unwrap();
let mut iter = results.into_iter(); let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
let mut iter = matches.into_iter(); let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
@ -724,18 +750,18 @@ mod tests {
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
assert_matches!(iter.next(), None); // position rewritten ^ assert_matches!(iter.next(), None); // position rewritten ^
}); });
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None); assert_matches!(iter.next(), None);
let builder = QueryBuilder::new(&store); let builder = QueryBuilder::new(&store);
let results = builder.query("NYC subway", 0..20).unwrap(); let results = builder.query("NYC subway", 0..20).unwrap();
let mut iter = results.into_iter(); let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
let mut iter = matches.into_iter(); let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
@ -744,12 +770,6 @@ mod tests {
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
assert_matches!(iter.next(), None); // position rewritten ^ assert_matches!(iter.next(), None); // position rewritten ^
}); });
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None); assert_matches!(iter.next(), None);
} }
@ -776,6 +796,12 @@ mod tests {
let results = builder.query("NY subway", 0..20).unwrap(); let results = builder.query("NY subway", 0..20).unwrap();
let mut iter = results.into_iter(); let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
let mut iter = matches.into_iter(); let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
@ -784,18 +810,18 @@ mod tests {
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
assert_matches!(iter.next(), None); // position rewritten ^ assert_matches!(iter.next(), None); // position rewritten ^
}); });
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None); assert_matches!(iter.next(), None);
let builder = QueryBuilder::new(&store); let builder = QueryBuilder::new(&store);
let results = builder.query("NYC subway", 0..20).unwrap(); let results = builder.query("NYC subway", 0..20).unwrap();
let mut iter = results.into_iter(); let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
let mut iter = matches.into_iter(); let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
@ -804,12 +830,6 @@ mod tests {
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
assert_matches!(iter.next(), None); // position rewritten ^ assert_matches!(iter.next(), None); // position rewritten ^
}); });
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None); assert_matches!(iter.next(), None);
} }
@ -860,6 +880,12 @@ mod tests {
let results = builder.query("NYC subway", 0..20).unwrap(); let results = builder.query("NYC subway", 0..20).unwrap();
let mut iter = results.into_iter(); let mut iter = results.into_iter();
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
let mut iter = matches.into_iter(); let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY
@ -869,12 +895,6 @@ mod tests {
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // train = subway assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // train = subway
assert_matches!(iter.next(), None); // position rewritten ^ assert_matches!(iter.next(), None); // position rewritten ^
}); });
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
let mut iter = matches.into_iter();
assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY
assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway
assert_matches!(iter.next(), None);
});
assert_matches!(iter.next(), None); assert_matches!(iter.next(), None);
} }