mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-27 04:25:06 +08:00
Merge pull request #232 from meilisearch/support-splitted-words
Support splitted words
This commit is contained in:
commit
93306c2326
@ -2,7 +2,7 @@ mod dfa;
|
|||||||
mod query_enhancer;
|
mod query_enhancer;
|
||||||
|
|
||||||
use std::cmp::Reverse;
|
use std::cmp::Reverse;
|
||||||
use std::vec;
|
use std::{cmp, vec};
|
||||||
|
|
||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
use levenshtein_automata::DFA;
|
use levenshtein_automata::DFA;
|
||||||
@ -18,7 +18,7 @@ use self::query_enhancer::QueryEnhancerBuilder;
|
|||||||
const NGRAMS: usize = 3;
|
const NGRAMS: usize = 3;
|
||||||
|
|
||||||
pub struct AutomatonProducer {
|
pub struct AutomatonProducer {
|
||||||
automatons: Vec<Vec<Automaton>>,
|
automatons: Vec<AutomatonGroup>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AutomatonProducer {
|
impl AutomatonProducer {
|
||||||
@ -26,19 +26,47 @@ impl AutomatonProducer {
|
|||||||
reader: &heed::RoTxn,
|
reader: &heed::RoTxn,
|
||||||
query: &str,
|
query: &str,
|
||||||
main_store: store::Main,
|
main_store: store::Main,
|
||||||
|
postings_list_store: store::PostingsLists,
|
||||||
synonyms_store: store::Synonyms,
|
synonyms_store: store::Synonyms,
|
||||||
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
|
) -> MResult<(AutomatonProducer, QueryEnhancer)> {
|
||||||
let (automatons, query_enhancer) =
|
let (automatons, query_enhancer) = generate_automatons(
|
||||||
generate_automatons(reader, query, main_store, synonyms_store)?;
|
reader,
|
||||||
|
query,
|
||||||
|
main_store,
|
||||||
|
postings_list_store,
|
||||||
|
synonyms_store,
|
||||||
|
)?;
|
||||||
|
|
||||||
Ok((AutomatonProducer { automatons }, query_enhancer))
|
Ok((AutomatonProducer { automatons }, query_enhancer))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn into_iter(self) -> vec::IntoIter<Vec<Automaton>> {
|
pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> {
|
||||||
self.automatons.into_iter()
|
self.automatons.into_iter()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct AutomatonGroup {
|
||||||
|
pub is_phrase_query: bool,
|
||||||
|
pub automatons: Vec<Automaton>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AutomatonGroup {
|
||||||
|
fn normal(automatons: Vec<Automaton>) -> AutomatonGroup {
|
||||||
|
AutomatonGroup {
|
||||||
|
is_phrase_query: false,
|
||||||
|
automatons,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup {
|
||||||
|
AutomatonGroup {
|
||||||
|
is_phrase_query: true,
|
||||||
|
automatons,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct Automaton {
|
pub struct Automaton {
|
||||||
pub index: usize,
|
pub index: usize,
|
||||||
@ -102,12 +130,41 @@ pub fn normalize_str(string: &str) -> String {
|
|||||||
string
|
string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn split_best_frequency<'a>(
|
||||||
|
reader: &heed::RoTxn,
|
||||||
|
word: &'a str,
|
||||||
|
postings_lists_store: store::PostingsLists,
|
||||||
|
) -> MResult<Option<(&'a str, &'a str)>> {
|
||||||
|
let chars = word.char_indices().skip(1);
|
||||||
|
let mut best = None;
|
||||||
|
|
||||||
|
for (i, _) in chars {
|
||||||
|
let (left, right) = word.split_at(i);
|
||||||
|
|
||||||
|
let left_freq = postings_lists_store
|
||||||
|
.postings_list(reader, left.as_ref())?
|
||||||
|
.map_or(0, |i| i.len());
|
||||||
|
|
||||||
|
let right_freq = postings_lists_store
|
||||||
|
.postings_list(reader, right.as_ref())?
|
||||||
|
.map_or(0, |i| i.len());
|
||||||
|
|
||||||
|
let min_freq = cmp::min(left_freq, right_freq);
|
||||||
|
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
|
||||||
|
best = Some((min_freq, left, right));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(best.map(|(_, l, r)| (l, r)))
|
||||||
|
}
|
||||||
|
|
||||||
fn generate_automatons(
|
fn generate_automatons(
|
||||||
reader: &heed::RoTxn,
|
reader: &heed::RoTxn,
|
||||||
query: &str,
|
query: &str,
|
||||||
main_store: store::Main,
|
main_store: store::Main,
|
||||||
|
postings_lists_store: store::PostingsLists,
|
||||||
synonym_store: store::Synonyms,
|
synonym_store: store::Synonyms,
|
||||||
) -> MResult<(Vec<Vec<Automaton>>, QueryEnhancer)> {
|
) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> {
|
||||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||||
let synonyms = match main_store.synonyms_fst(reader)? {
|
let synonyms = match main_store.synonyms_fst(reader)? {
|
||||||
@ -136,7 +193,7 @@ fn generate_automatons(
|
|||||||
original_automatons.push(automaton);
|
original_automatons.push(automaton);
|
||||||
}
|
}
|
||||||
|
|
||||||
automatons.push(original_automatons);
|
automatons.push(AutomatonGroup::normal(original_automatons));
|
||||||
|
|
||||||
for n in 1..=NGRAMS {
|
for n in 1..=NGRAMS {
|
||||||
let mut ngrams = query_words.windows(n).enumerate().peekable();
|
let mut ngrams = query_words.windows(n).enumerate().peekable();
|
||||||
@ -188,13 +245,27 @@ fn generate_automatons(
|
|||||||
Automaton::non_exact(automaton_index, n, synonym)
|
Automaton::non_exact(automaton_index, n, synonym)
|
||||||
};
|
};
|
||||||
automaton_index += 1;
|
automaton_index += 1;
|
||||||
automatons.push(vec![automaton]);
|
automatons.push(AutomatonGroup::normal(vec![automaton]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if n != 1 {
|
if n == 1 {
|
||||||
|
if let Some((left, right)) =
|
||||||
|
split_best_frequency(reader, &normalized, postings_lists_store)?
|
||||||
|
{
|
||||||
|
let a = Automaton::exact(automaton_index, 1, left);
|
||||||
|
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
|
||||||
|
automaton_index += 1;
|
||||||
|
|
||||||
|
let b = Automaton::exact(automaton_index, 1, right);
|
||||||
|
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
|
||||||
|
automaton_index += 1;
|
||||||
|
|
||||||
|
automatons.push(AutomatonGroup::phrase_query(vec![a, b]));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
// automaton of concatenation of query words
|
// automaton of concatenation of query words
|
||||||
let concat = ngram_slice.concat();
|
let concat = ngram_slice.concat();
|
||||||
let normalized = normalize_str(&concat);
|
let normalized = normalize_str(&concat);
|
||||||
@ -204,15 +275,15 @@ fn generate_automatons(
|
|||||||
|
|
||||||
let automaton = Automaton::exact(automaton_index, n, &normalized);
|
let automaton = Automaton::exact(automaton_index, n, &normalized);
|
||||||
automaton_index += 1;
|
automaton_index += 1;
|
||||||
automatons.push(vec![automaton]);
|
automatons.push(AutomatonGroup::normal(vec![automaton]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// order automatons, the most important first,
|
// order automatons, the most important first,
|
||||||
// we keep the original automatons at the front.
|
// we keep the original automatons at the front.
|
||||||
automatons[1..].sort_by_key(|a| {
|
automatons[1..].sort_by_key(|group| {
|
||||||
let a = a.first().unwrap();
|
let a = group.automatons.first().unwrap();
|
||||||
(Reverse(a.is_exact), a.ngram)
|
(Reverse(a.is_exact), a.ngram)
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -8,7 +8,7 @@ use fst::{IntoStreamer, Streamer};
|
|||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
use slice_group_by::{GroupBy, GroupByMut};
|
use slice_group_by::{GroupBy, GroupByMut};
|
||||||
|
|
||||||
use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer};
|
use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
|
||||||
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
||||||
use crate::raw_document::{raw_documents_from, RawDocument};
|
use crate::raw_document::{raw_documents_from, RawDocument};
|
||||||
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
|
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
|
||||||
@ -138,7 +138,7 @@ fn multiword_rewrite_matches(
|
|||||||
|
|
||||||
fn fetch_raw_documents(
|
fn fetch_raw_documents(
|
||||||
reader: &heed::RoTxn,
|
reader: &heed::RoTxn,
|
||||||
automatons: &[Automaton],
|
automatons_groups: &[AutomatonGroup],
|
||||||
query_enhancer: &QueryEnhancer,
|
query_enhancer: &QueryEnhancer,
|
||||||
searchables: Option<&ReorderedAttrs>,
|
searchables: Option<&ReorderedAttrs>,
|
||||||
main_store: store::Main,
|
main_store: store::Main,
|
||||||
@ -148,55 +148,94 @@ fn fetch_raw_documents(
|
|||||||
let mut matches = Vec::new();
|
let mut matches = Vec::new();
|
||||||
let mut highlights = Vec::new();
|
let mut highlights = Vec::new();
|
||||||
|
|
||||||
for automaton in automatons {
|
for group in automatons_groups {
|
||||||
let Automaton {
|
let AutomatonGroup {
|
||||||
index,
|
is_phrase_query,
|
||||||
is_exact,
|
automatons,
|
||||||
query_len,
|
} = group;
|
||||||
..
|
let phrase_query_len = automatons.len();
|
||||||
} = automaton;
|
|
||||||
let dfa = automaton.dfa();
|
|
||||||
|
|
||||||
let words = match main_store.words_fst(reader)? {
|
let mut tmp_matches = Vec::new();
|
||||||
Some(words) => words,
|
for (id, automaton) in automatons.into_iter().enumerate() {
|
||||||
None => return Ok(Vec::new()),
|
let Automaton {
|
||||||
};
|
index,
|
||||||
|
is_exact,
|
||||||
|
query_len,
|
||||||
|
..
|
||||||
|
} = automaton;
|
||||||
|
let dfa = automaton.dfa();
|
||||||
|
|
||||||
let mut stream = words.search(&dfa).into_stream();
|
let words = match main_store.words_fst(reader)? {
|
||||||
while let Some(input) = stream.next() {
|
Some(words) => words,
|
||||||
let distance = dfa.eval(input).to_u8();
|
None => return Ok(Vec::new()),
|
||||||
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
|
|
||||||
|
|
||||||
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
|
|
||||||
Some(doc_indexes) => doc_indexes,
|
|
||||||
None => continue,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
matches.reserve(doc_indexes.len());
|
let mut stream = words.search(&dfa).into_stream();
|
||||||
highlights.reserve(doc_indexes.len());
|
while let Some(input) = stream.next() {
|
||||||
|
let distance = dfa.eval(input).to_u8();
|
||||||
|
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
|
||||||
|
|
||||||
for di in doc_indexes.as_ref() {
|
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
|
||||||
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
|
Some(doc_indexes) => doc_indexes,
|
||||||
if let Some(attribute) = attribute {
|
None => continue,
|
||||||
let match_ = TmpMatch {
|
};
|
||||||
query_index: *index as u32,
|
|
||||||
distance,
|
|
||||||
attribute,
|
|
||||||
word_index: di.word_index,
|
|
||||||
is_exact,
|
|
||||||
};
|
|
||||||
|
|
||||||
let highlight = Highlight {
|
tmp_matches.reserve(doc_indexes.len());
|
||||||
attribute: di.attribute,
|
|
||||||
char_index: di.char_index,
|
|
||||||
char_length: di.char_length,
|
|
||||||
};
|
|
||||||
|
|
||||||
matches.push((di.document_id, match_));
|
for di in doc_indexes.as_ref() {
|
||||||
highlights.push((di.document_id, highlight));
|
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
|
||||||
|
if let Some(attribute) = attribute {
|
||||||
|
let match_ = TmpMatch {
|
||||||
|
query_index: *index as u32,
|
||||||
|
distance,
|
||||||
|
attribute,
|
||||||
|
word_index: di.word_index,
|
||||||
|
is_exact,
|
||||||
|
};
|
||||||
|
|
||||||
|
let highlight = Highlight {
|
||||||
|
attribute: di.attribute,
|
||||||
|
char_index: di.char_index,
|
||||||
|
char_length: di.char_length,
|
||||||
|
};
|
||||||
|
|
||||||
|
tmp_matches.push((di.document_id, id, match_, highlight));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if *is_phrase_query {
|
||||||
|
tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
|
||||||
|
for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
|
||||||
|
for window in group.windows(2) {
|
||||||
|
let (ida, ia, ma, ha) = window[0];
|
||||||
|
let (idb, ib, mb, hb) = window[1];
|
||||||
|
|
||||||
|
debug_assert_eq!(ida, idb);
|
||||||
|
|
||||||
|
// keep the pair only if the two matches come from consecutive automatons of the group and sit at consecutive word positions
|
||||||
|
if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
|
||||||
|
// TODO we must make it work for phrase query longer than 2
|
||||||
|
// if the second match is the last phrase query word
|
||||||
|
if ib + 1 == phrase_query_len {
|
||||||
|
// insert first match
|
||||||
|
matches.push((ida, ma));
|
||||||
|
highlights.push((ida, ha));
|
||||||
|
|
||||||
|
// insert second match
|
||||||
|
matches.push((idb, mb));
|
||||||
|
highlights.push((idb, hb));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (id, _, match_, highlight) in tmp_matches {
|
||||||
|
matches.push((id, match_));
|
||||||
|
highlights.push((id, highlight));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let matches = multiword_rewrite_matches(matches, &query_enhancer);
|
let matches = multiword_rewrite_matches(matches, &query_enhancer);
|
||||||
@ -367,15 +406,20 @@ where
|
|||||||
let start_processing = Instant::now();
|
let start_processing = Instant::now();
|
||||||
let mut raw_documents_processed = Vec::with_capacity(range.len());
|
let mut raw_documents_processed = Vec::with_capacity(range.len());
|
||||||
|
|
||||||
let (automaton_producer, query_enhancer) =
|
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
|
||||||
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
|
reader,
|
||||||
|
query,
|
||||||
|
main_store,
|
||||||
|
postings_lists_store,
|
||||||
|
synonyms_store,
|
||||||
|
)?;
|
||||||
|
|
||||||
let automaton_producer = automaton_producer.into_iter();
|
let automaton_producer = automaton_producer.into_iter();
|
||||||
let mut automatons = Vec::new();
|
let mut automatons = Vec::new();
|
||||||
|
|
||||||
// aggregate automatons groups by groups after time
|
// aggregate automatons groups by groups after time
|
||||||
for auts in automaton_producer {
|
for auts in automaton_producer {
|
||||||
automatons.extend(auts);
|
automatons.push(auts);
|
||||||
|
|
||||||
// we must retrieve the documents associated
|
// we must retrieve the documents associated
|
||||||
// with the current automatons
|
// with the current automatons
|
||||||
@ -480,15 +524,20 @@ where
|
|||||||
let start_processing = Instant::now();
|
let start_processing = Instant::now();
|
||||||
let mut raw_documents_processed = Vec::new();
|
let mut raw_documents_processed = Vec::new();
|
||||||
|
|
||||||
let (automaton_producer, query_enhancer) =
|
let (automaton_producer, query_enhancer) = AutomatonProducer::new(
|
||||||
AutomatonProducer::new(reader, query, main_store, synonyms_store)?;
|
reader,
|
||||||
|
query,
|
||||||
|
main_store,
|
||||||
|
postings_lists_store,
|
||||||
|
synonyms_store,
|
||||||
|
)?;
|
||||||
|
|
||||||
let automaton_producer = automaton_producer.into_iter();
|
let automaton_producer = automaton_producer.into_iter();
|
||||||
let mut automatons = Vec::new();
|
let mut automatons = Vec::new();
|
||||||
|
|
||||||
// aggregate automatons groups by groups after time
|
// aggregate automatons groups by groups after time
|
||||||
for auts in automaton_producer {
|
for auts in automaton_producer {
|
||||||
automatons.extend(auts);
|
automatons.push(auts);
|
||||||
|
|
||||||
// we must retrieve the documents associated
|
// we must retrieve the documents associated
|
||||||
// with the current automatons
|
// with the current automatons
|
||||||
@ -1697,4 +1746,68 @@ mod tests {
|
|||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The single query word "searchengine" must only match the document in which
// "search" is immediately followed by "engine".
#[test]
fn simple_phrase_query_splitting() {
    let db = TempDatabase::from_iter(vec![
        ("search", &[doc_index(0, 0)][..]),
        ("engine", &[doc_index(0, 1)][..]),
        ("search", &[doc_index(1, 0)][..]),
        ("slow", &[doc_index(1, 1)][..]),
        ("engine", &[doc_index(1, 2)][..]),
    ]);

    let reader = db.database.env.read_txn().unwrap();

    let mut documents = db
        .query_builder()
        .query(&reader, "searchengine", 0..20)
        .unwrap()
        .into_iter();

    // Document 0 holds the two words side by side.
    assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
        let mut found = matches.into_iter();
        assert_matches!(found.next(), Some(TmpMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
        assert_matches!(found.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // engine
        assert_matches!(found.next(), None);
    });

    // Document 1 has "slow" between the two words and must not be returned.
    assert_matches!(documents.next(), None);
}
|
||||||
|
|
||||||
|
// Same as the simple case, but the documents also contain occurrences of
// "search" that are not directly followed by "engine"; only the adjacent
// pair of each document is expected in the matches.
#[test]
fn harder_phrase_query_splitting() {
    let db = TempDatabase::from_iter(vec![
        ("search", &[doc_index(0, 0)][..]),
        ("search", &[doc_index(0, 1)][..]),
        ("engine", &[doc_index(0, 2)][..]),
        ("search", &[doc_index(1, 0)][..]),
        ("slow", &[doc_index(1, 1)][..]),
        ("search", &[doc_index(1, 2)][..]),
        ("engine", &[doc_index(1, 3)][..]),
        ("search", &[doc_index(1, 0)][..]),
        ("search", &[doc_index(1, 1)][..]),
        ("slow", &[doc_index(1, 2)][..]),
        ("engine", &[doc_index(1, 3)][..]),
    ]);

    let reader = db.database.env.read_txn().unwrap();

    let mut documents = db
        .query_builder()
        .query(&reader, "searchengine", 0..20)
        .unwrap()
        .into_iter();

    // Document 0: the pair at word positions 1 and 2.
    assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
        let mut found = matches.into_iter();
        assert_matches!(found.next(), Some(TmpMatch { query_index: 0, word_index: 1, distance: 0, .. })); // search
        assert_matches!(found.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // engine
        assert_matches!(found.next(), None);
    });

    // Document 1: the pair at word positions 2 and 3.
    assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
        let mut found = matches.into_iter();
        assert_matches!(found.next(), Some(TmpMatch { query_index: 0, word_index: 2, distance: 0, .. })); // search
        assert_matches!(found.next(), Some(TmpMatch { query_index: 0, word_index: 3, distance: 0, .. })); // engine
        assert_matches!(found.next(), None);
    });

    assert_matches!(documents.next(), None);
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user