From f478bbf826a50f31cb5e40cae6f0ea95eed0d504 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 7 Jul 2019 20:27:37 +0200
Subject: [PATCH] feat: Introduce the QueryEnhancer in the query synonym system

---
 meilidb-core/src/query_builder.rs | 88 +++++++++++++++++++------------
 1 file changed, 54 insertions(+), 34 deletions(-)

diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs
index 175430554..7e79ac15e 100644
--- a/meilidb-core/src/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -14,8 +14,9 @@ use sdset::SetBuf;
 use slice_group_by::GroupByMut;

 use crate::automaton::{build_dfa, build_prefix_dfa};
-use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
 use crate::criterion::Criteria;
+use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
+use crate::query_enhancer::{QueryEnhancerBuilder, QueryEnhancer};
 use crate::raw_documents_from_matches;
 use crate::reordered_attrs::ReorderedAttrs;
 use crate::{TmpMatch, Highlight, DocumentId, Store, RawDocument, Document};
@@ -91,18 +92,36 @@ fn split_best_frequency<'a, S: Store>(
     Ok(best.map(|(_, l, r)| (l, r)))
 }

-fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<(Automaton, String)>, S::Error> {
+fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<(Vec<Automaton>, QueryEnhancer), S::Error> {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
     let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
-    let mut automatons = Vec::new();
-
     let synonyms = store.synonyms()?;

-    for n in 1..=NGRAMS {
-        let mut query_index = 0;
-        let mut ngrams = query_words.windows(n).peekable();
+    let mut automatons = Vec::new();
+    let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);

-        while let Some(ngram_slice) = ngrams.next() {
+    // We must not declare the original words to the query enhancer
+    // *but* we need to push them in the automatons list first
+    let mut original_words = query_words.iter().enumerate().peekable();
+    while let Some((query_index, word)) = original_words.next() {
+
+        let has_following_word = original_words.peek().is_some();
+        let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
+
+        let automaton = if not_prefix_dfa {
+            Automaton::exact(query_index, word)
+        } else {
+            Automaton::prefix_exact(query_index, word)
+        };
+        automatons.push(automaton);
+    }
+
+    for n in 1..=NGRAMS {
+
+        let mut ngrams = query_words.windows(n).enumerate().peekable();
+        while let Some((query_index, ngram_slice)) = ngrams.next() {
+
+            let query_range = query_index..query_index + n;
             let ngram_nb_words = ngram_slice.len();
             let ngram = ngram_slice.join(" ");

@@ -127,15 +146,19 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<(Automaton, String)>, S::Error> {
+                    let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
+                    let nb_synonym_words = synonyms_words.len();

-                    for synonym in split_query_string(synonyms) {
+                    let real_query_index = automatons.len();
+                    enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
+
+                    for (i, synonym) in synonyms_words.into_iter().enumerate() {
                         let automaton = if nb_synonym_words == 1 {
-                            Automaton::exact(query_index, synonym)
+                            Automaton::exact(real_query_index + i, synonym)
                         } else {
-                            Automaton::non_exact(query_index, synonym)
+                            Automaton::non_exact(real_query_index + i, synonym)
                         };
-                        automatons.push((automaton, synonym.to_owned()));
+                        automatons.push(automaton);
                     }
                 }
             }
@@ -145,37 +168,34 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<(Automaton, String)>, S::Error> {
 impl<'c, S, FI> QueryBuilder<'c, S, FI>
 where S: Store,
 {
     fn query_all(&self, query: &str) -> Result<Vec<RawDocument>, S::Error> {
-        let automatons = generate_automatons(query, &self.store)?;
+        let (automatons, query_enhancer) = generate_automatons(query, &self.store)?;
         let words = self.store.words()?.as_fst();
         let searchables = self.searchable_attrs.as_ref();
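
Note on the bookkeeping introduced above: original query words are pushed first, so query word i owns automaton index i; each synonym group is then appended at the end of the list, which is why its first index is taken from automatons.len() and declared to the QueryEnhancerBuilder together with the query_range of original words it replaces. The sketch below only illustrates that index-remapping idea; ToyEnhancer, declare and origin are made-up names for this note and are not the actual QueryEnhancer API that this patch introduces.

use std::ops::Range;

/// Toy bookkeeping: for each synonym group appended starting at
/// `real_query_index`, remember which range of original query words it stands for.
struct ToyEnhancer {
    groups: Vec<(Range<usize>, Range<usize>)>, // (automaton indices, original word range)
}

impl ToyEnhancer {
    fn new() -> ToyEnhancer {
        ToyEnhancer { groups: Vec::new() }
    }

    /// Same spirit as `enhancer_builder.declare(query_range, real_query_index, words)`.
    fn declare(&mut self, original: Range<usize>, real_query_index: usize, nb_words: usize) {
        let replacements = real_query_index..real_query_index + nb_words;
        self.groups.push((replacements, original));
    }

    /// Map a matched automaton index back to the original query words it covers.
    fn origin(&self, automaton_index: usize) -> Range<usize> {
        for (replacements, original) in &self.groups {
            if replacements.contains(&automaton_index) {
                return original.clone();
            }
        }
        // Indices that were never declared are original words mapping to themselves.
        automaton_index..automaton_index + 1
    }
}

fn main() {
    // Query "new york subway": words 0, 1 and 2 own automaton indices 0, 1 and 2.
    // The synonym "nyc" of the ngram "new york" (query_range 0..2) is appended as
    // a single automaton at index 3, i.e. automatons.len() at declaration time.
    let mut enhancer = ToyEnhancer::new();
    enhancer.declare(0..2, 3, 1);

    assert_eq!(enhancer.origin(3), 0..2); // a match on "nyc" counts for "new york"
    assert_eq!(enhancer.origin(2), 2..3); // "subway" still maps to itself
}

The point of this indirection is that the matching code can keep reporting plain automaton indices, while translating them back to user-visible word positions is done once, afterwards, by the enhancer.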