From 0633f16b4dca0fe2d61da4534862b69440b22e69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 17 Jun 2019 18:21:10 +0200 Subject: [PATCH] feat: Make multi-word support multi-word synonyms --- meilidb-core/src/query_builder.rs | 92 ++++++++++++++++++++++--------- 1 file changed, 67 insertions(+), 25 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index c93a7be9e..088d2bc2e 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -18,36 +18,46 @@ use crate::criterion::Criteria; use crate::raw_documents_from_matches; use crate::{Match, DocumentId, Store, RawDocument, Document}; +const NGRAMS: usize = 3; + fn generate_automatons(query: &str, store: &S) -> Result, S::Error> { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); - let mut groups = split_query_string(query).map(str::to_lowercase).peekable(); + let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect(); let mut automatons = Vec::new(); - let mut index = 0; let synonyms = store.synonyms()?; - while let Some(query_word) = groups.next() { - let query_word_str = query_word.as_str(); - let has_following_word = groups.peek().is_some(); - let not_prefix_dfa = has_following_word || has_end_whitespace || query_word_str.chars().all(is_cjk); + for n in 1..=NGRAMS { + let mut index = 0; + let mut ngrams = query_words.windows(n).peekable(); - let lev = if not_prefix_dfa { build_dfa(query_word_str) } else { build_prefix_dfa(query_word_str) }; - let mut stream = synonyms.search(&lev).into_stream(); - while let Some(word) = stream.next() { - if let Some(synonyms) = store.alternatives_to(word)? { - let mut stream = synonyms.into_stream(); - while let Some(synonyms) = stream.next() { - let synonyms = std::str::from_utf8(synonyms).unwrap(); - for synonym in split_query_string(synonyms) { - let lev = if not_prefix_dfa { build_dfa(synonym) } else { build_prefix_dfa(synonym) }; - automatons.push((index, synonym.to_owned(), lev)); + while let Some(ngram) = ngrams.next() { + let ngram = ngram.join(" "); + + let has_following_word = ngrams.peek().is_some(); + let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk); + + let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) }; + let mut stream = synonyms.search(&lev).into_stream(); + while let Some(word) = stream.next() { + if let Some(synonyms) = store.alternatives_to(word)? { + let mut stream = synonyms.into_stream(); + while let Some(synonyms) = stream.next() { + let synonyms = std::str::from_utf8(synonyms).unwrap(); + for synonym in split_query_string(synonyms) { + let lev = if not_prefix_dfa { build_dfa(synonym) } else { build_prefix_dfa(synonym) }; + automatons.push((index, synonym.to_owned(), lev)); + } } } } - } - automatons.push((index, query_word, lev)); - index += 1; + if n == 1 { + automatons.push((index, ngram, lev)); + } + + index += 1; + } } automatons.sort_unstable_by(|a, b| (a.0, &a.1).cmp(&(b.0, &b.1))); @@ -859,24 +869,56 @@ mod tests { } #[test] - #[ignore] /// Multi-word has multi-word synonyms fn multiword_to_multiword_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ - ("NY", &[doc_index(0, 0)][..]), - ("subway", &[doc_index(0, 1)][..]), + ("NY", &[doc_char_index(0, 0, 0)][..]), + ("subway", &[doc_char_index(0, 1, 1)][..]), + + ("NYC", &[doc_char_index(1, 0, 0)][..]), + ("blue", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + ("broken", &[doc_char_index(1, 3, 3)][..]), ]); store.add_synonym("new york", SetBuf::from_dirty(vec!["NYC", "NY", "new york city"])); + store.add_synonym("new york city", SetBuf::from_dirty(vec!["NYC", "NY", "new york"])); + store.add_synonym("underground train", SetBuf::from_dirty(vec!["subway"])); let builder = QueryBuilder::new(&store); - let results = builder.query("new york subway", 0..20).unwrap(); + let results = builder.query("new york underground train broken", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york + assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 2, .. })); // subway = underground train + assert_matches!(iter.next(), Some(Match { query_index: 4, word_index: 3, .. })); // broken + assert_matches!(iter.next(), None); + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY - assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 1, .. })); // subway + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY = new york + assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 1, .. })); // subway = underground train + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("new york city underground train broken", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NYC = new york city + assert_matches!(iter.next(), Some(Match { query_index: 3, word_index: 2, .. })); // subway = underground train + assert_matches!(iter.next(), Some(Match { query_index: 5, word_index: 3, .. })); // broken + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY = new york city + assert_matches!(iter.next(), Some(Match { query_index: 3, word_index: 1, .. })); // subway = underground train assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None);