mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-20 01:48:25 +08:00
Introduce the new Query Tree creation supporting more operations
This commit is contained in:
parent
789e05304c
commit
0b9fe2c072
@ -13,7 +13,7 @@ use log::debug;
|
|||||||
|
|
||||||
use crate::database::MainT;
|
use crate::database::MainT;
|
||||||
use crate::{store, DocumentId, DocIndex, MResult};
|
use crate::{store, DocumentId, DocIndex, MResult};
|
||||||
use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
|
use crate::automaton::{normalize_str, build_dfa, build_prefix_dfa, build_exact_dfa};
|
||||||
use crate::QueryWordsMapper;
|
use crate::QueryWordsMapper;
|
||||||
|
|
||||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||||
@ -144,7 +144,7 @@ fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &'
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
|
fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
|
||||||
let words = words.join(" ");
|
let words = normalize_str(&words.join(" "));
|
||||||
let set = ctx.synonyms.synonyms(reader, words.as_bytes())?.unwrap_or_default();
|
let set = ctx.synonyms.synonyms(reader, words.as_bytes())?.unwrap_or_default();
|
||||||
|
|
||||||
let mut strings = Vec::new();
|
let mut strings = Vec::new();
|
||||||
@ -159,13 +159,6 @@ fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) ->
|
|||||||
Ok(strings)
|
Ok(strings)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn is_last<I: IntoIterator>(iter: I) -> impl Iterator<Item=(bool, I::Item)> {
|
|
||||||
let mut iter = iter.into_iter().peekable();
|
|
||||||
core::iter::from_fn(move || {
|
|
||||||
iter.next().map(|item| (iter.peek().is_none(), item))
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn create_operation<I, F>(iter: I, f: F) -> Operation
|
fn create_operation<I, F>(iter: I, f: F) -> Operation
|
||||||
where I: IntoIterator<Item=Operation>,
|
where I: IntoIterator<Item=Operation>,
|
||||||
F: Fn(Vec<Operation>) -> Operation,
|
F: Fn(Vec<Operation>) -> Operation,
|
||||||
@ -186,41 +179,40 @@ pub fn create_query_tree(
|
|||||||
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
|
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
|
||||||
{
|
{
|
||||||
let words = split_query_string(query).map(str::to_lowercase);
|
let words = split_query_string(query).map(str::to_lowercase);
|
||||||
let words: Vec<_> = words.filter(|s| !s.contains(char::is_whitespace)).enumerate().collect();
|
let words: Vec<_> = words.into_iter().enumerate().collect();
|
||||||
|
|
||||||
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
|
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
|
||||||
let mut ngrams = Vec::new();
|
|
||||||
for ngram in 1..=MAX_NGRAM {
|
|
||||||
|
|
||||||
let ngiter = words.windows(ngram).enumerate().map(|(i, group)| {
|
|
||||||
let before = words[0..i].windows(1).enumerate().map(|(i, g)| (i..i+1, g));
|
|
||||||
let after = words[i + ngram..].windows(1)
|
|
||||||
.enumerate()
|
|
||||||
.map(move |(j, g)| (i + j + ngram..i + j + ngram + 1, g));
|
|
||||||
before.chain(Some((i..i + ngram, group))).chain(after)
|
|
||||||
});
|
|
||||||
|
|
||||||
for group in ngiter {
|
|
||||||
|
|
||||||
let mut ops = Vec::new();
|
|
||||||
for (is_last, (range, words)) in is_last(group) {
|
|
||||||
|
|
||||||
|
fn create_inner(
|
||||||
|
reader: &heed::RoTxn<MainT>,
|
||||||
|
ctx: &Context,
|
||||||
|
mapper: &mut QueryWordsMapper,
|
||||||
|
words: &[(usize, String)],
|
||||||
|
) -> MResult<Vec<Operation>>
|
||||||
|
{
|
||||||
let mut alts = Vec::new();
|
let mut alts = Vec::new();
|
||||||
match words {
|
|
||||||
|
for ngram in 1..=MAX_NGRAM {
|
||||||
|
if let Some(group) = words.get(..ngram) {
|
||||||
|
let mut group_ops = Vec::new();
|
||||||
|
|
||||||
|
let tail = &words[ngram..];
|
||||||
|
let is_last = tail.is_empty();
|
||||||
|
|
||||||
|
let mut group_alts = Vec::new();
|
||||||
|
match group {
|
||||||
[(id, word)] => {
|
[(id, word)] => {
|
||||||
let mut idgen = ((id + 1) * 100)..;
|
let mut idgen = ((id + 1) * 100)..;
|
||||||
|
let range = (*id)..id+1;
|
||||||
|
|
||||||
let phrase = split_best_frequency(reader, ctx, word)?
|
let phrase = split_best_frequency(reader, ctx, word)?.map(|ws| {
|
||||||
.map(|ws| {
|
|
||||||
let id = idgen.next().unwrap();
|
let id = idgen.next().unwrap();
|
||||||
idgen.next().unwrap();
|
idgen.next().unwrap();
|
||||||
mapper.declare(range.clone(), id, &[ws.0, ws.1]);
|
mapper.declare(range.clone(), id, &[ws.0, ws.1]);
|
||||||
Operation::phrase2(id, is_last, ws)
|
Operation::phrase2(id, is_last, ws)
|
||||||
});
|
});
|
||||||
|
|
||||||
let synonyms = fetch_synonyms(reader, ctx, &[word])?
|
let synonyms = fetch_synonyms(reader, ctx, &[word])?.into_iter().map(|alts| {
|
||||||
.into_iter()
|
|
||||||
.map(|alts| {
|
|
||||||
let id = idgen.next().unwrap();
|
let id = idgen.next().unwrap();
|
||||||
mapper.declare(range.clone(), id, &alts);
|
mapper.declare(range.clone(), id, &alts);
|
||||||
|
|
||||||
@ -233,14 +225,15 @@ pub fn create_query_tree(
|
|||||||
create_operation(iter, Operation::And)
|
create_operation(iter, Operation::And)
|
||||||
});
|
});
|
||||||
|
|
||||||
let query = Operation::tolerant(*id, is_last, word);
|
let original = Operation::tolerant(*id, is_last, word);
|
||||||
|
|
||||||
alts.push(query);
|
group_alts.push(original);
|
||||||
alts.extend(synonyms.chain(phrase));
|
group_alts.extend(synonyms.chain(phrase));
|
||||||
},
|
},
|
||||||
words => {
|
words => {
|
||||||
let id = words[0].0;
|
let id = words[0].0;
|
||||||
let mut idgen = ((id + 1) * 100_usize.pow(ngram as u32))..;
|
let mut idgen = ((id + 1) * 100_usize.pow(ngram as u32))..;
|
||||||
|
let range = id..id+ngram;
|
||||||
|
|
||||||
let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect();
|
let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect();
|
||||||
|
|
||||||
@ -253,25 +246,32 @@ pub fn create_query_tree(
|
|||||||
let id = idgen.next().unwrap();
|
let id = idgen.next().unwrap();
|
||||||
Operation::exact(id, false, &s)
|
Operation::exact(id, false, &s)
|
||||||
});
|
});
|
||||||
alts.push(create_operation(synonym, Operation::And));
|
group_alts.push(create_operation(synonym, Operation::And));
|
||||||
}
|
}
|
||||||
|
|
||||||
let id = idgen.next().unwrap();
|
let id = idgen.next().unwrap();
|
||||||
let concat = words.concat();
|
let concat = words.concat();
|
||||||
alts.push(Operation::exact(id, is_last, &concat));
|
mapper.declare(range.clone(), id, &[&concat]);
|
||||||
mapper.declare(range.clone(), id, &[concat]);
|
group_alts.push(Operation::exact(id, is_last, &concat));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ops.push(create_operation(alts, Operation::Or));
|
group_ops.push(create_operation(group_alts, Operation::Or));
|
||||||
|
|
||||||
|
if !tail.is_empty() {
|
||||||
|
let tail_ops = create_inner(reader, ctx, mapper, tail)?;
|
||||||
|
group_ops.push(create_operation(tail_ops, Operation::Or));
|
||||||
}
|
}
|
||||||
|
|
||||||
ngrams.push(create_operation(ops, Operation::And));
|
alts.push(create_operation(group_ops, Operation::And));
|
||||||
if ngram == 1 { break }
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let operation = create_operation(ngrams, Operation::Or);
|
Ok(alts)
|
||||||
|
}
|
||||||
|
|
||||||
|
let alternatives = create_inner(reader, ctx, &mut mapper, &words)?;
|
||||||
|
let operation = Operation::Or(alternatives);
|
||||||
let mapping = mapper.mapping();
|
let mapping = mapper.mapping();
|
||||||
|
|
||||||
Ok((operation, mapping))
|
Ok((operation, mapping))
|
||||||
|
Loading…
Reference in New Issue
Block a user