Make the engine support non-exact multi-words synonyms

This commit is contained in:
Clément Renault 2020-01-22 18:11:58 +01:00
parent 0b9fe2c072
commit a9adbda2cd
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 39 additions and 31 deletions

View File

@ -65,7 +65,7 @@ fn highlights_from_raw_document<'a, 'tag, 'txn>(
for di in postings_list.iter() { for di in postings_list.iter() {
let covered_area = match kind { let covered_area = match kind {
Some(QueryKind::Exact(query)) | Some(QueryKind::Tolerant(query)) => { Some(QueryKind::NonTolerant(query)) | Some(QueryKind::Tolerant(query)) => {
let len = if query.len() > input.len() { let len = if query.len() > input.len() {
input.len() input.len()
} else { } else {

View File

@ -45,16 +45,16 @@ impl fmt::Debug for Operation {
impl Operation { impl Operation {
fn tolerant(id: QueryId, prefix: bool, s: &str) -> Operation { fn tolerant(id: QueryId, prefix: bool, s: &str) -> Operation {
Operation::Query(Query { id, prefix, kind: QueryKind::Tolerant(s.to_string()) }) Operation::Query(Query { id, prefix, exact: true, kind: QueryKind::Tolerant(s.to_string()) })
} }
fn exact(id: QueryId, prefix: bool, s: &str) -> Operation { fn non_tolerant(id: QueryId, prefix: bool, s: &str) -> Operation {
Operation::Query(Query { id, prefix, kind: QueryKind::Exact(s.to_string()) }) Operation::Query(Query { id, prefix, exact: true, kind: QueryKind::NonTolerant(s.to_string()) })
} }
fn phrase2(id: QueryId, prefix: bool, (left, right): (&str, &str)) -> Operation { fn phrase2(id: QueryId, prefix: bool, (left, right): (&str, &str)) -> Operation {
let kind = QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]); let kind = QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]);
Operation::Query(Query { id, prefix, kind }) Operation::Query(Query { id, prefix, exact: true, kind })
} }
} }
@ -64,6 +64,7 @@ pub type QueryId = usize;
pub struct Query { pub struct Query {
pub id: QueryId, pub id: QueryId,
pub prefix: bool, pub prefix: bool,
pub exact: bool,
pub kind: QueryKind, pub kind: QueryKind,
} }
@ -83,17 +84,17 @@ impl Hash for Query {
#[derive(Clone, PartialEq, Eq, Hash)] #[derive(Clone, PartialEq, Eq, Hash)]
pub enum QueryKind { pub enum QueryKind {
Tolerant(String), Tolerant(String),
Exact(String), NonTolerant(String),
Phrase(Vec<String>), Phrase(Vec<String>),
} }
impl fmt::Debug for Query { impl fmt::Debug for Query {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let Query { id, prefix, kind } = self; let Query { id, prefix, kind, .. } = self;
let prefix = if *prefix { String::from("Prefix") } else { String::default() }; let prefix = if *prefix { String::from("Prefix") } else { String::default() };
match kind { match kind {
QueryKind::Exact(word) => { QueryKind::NonTolerant(word) => {
f.debug_struct(&(prefix + "Exact")).field("id", &id).field("word", &word).finish() f.debug_struct(&(prefix + "NonTolerant")).field("id", &id).field("word", &word).finish()
}, },
QueryKind::Tolerant(word) => { QueryKind::Tolerant(word) => {
f.debug_struct(&(prefix + "Tolerant")).field("id", &id).field("word", &word).finish() f.debug_struct(&(prefix + "Tolerant")).field("id", &id).field("word", &word).finish()
@ -205,21 +206,26 @@ pub fn create_query_tree(
let mut idgen = ((id + 1) * 100)..; let mut idgen = ((id + 1) * 100)..;
let range = (*id)..id+1; let range = (*id)..id+1;
let phrase = split_best_frequency(reader, ctx, word)?.map(|ws| { let phrase = split_best_frequency(reader, ctx, word)?
.map(|ws| {
let id = idgen.next().unwrap(); let id = idgen.next().unwrap();
idgen.next().unwrap(); idgen.next().unwrap();
mapper.declare(range.clone(), id, &[ws.0, ws.1]); mapper.declare(range.clone(), id, &[ws.0, ws.1]);
Operation::phrase2(id, is_last, ws) Operation::phrase2(id, is_last, ws)
}); });
let synonyms = fetch_synonyms(reader, ctx, &[word])?.into_iter().map(|alts| { let synonyms = fetch_synonyms(reader, ctx, &[word])?
.into_iter()
.map(|alts| {
let exact = alts.len() == 1;
let id = idgen.next().unwrap(); let id = idgen.next().unwrap();
mapper.declare(range.clone(), id, &alts); mapper.declare(range.clone(), id, &alts);
let mut idgen = once(id).chain(&mut idgen); let mut idgen = once(id).chain(&mut idgen);
let iter = alts.into_iter().map(|w| { let iter = alts.into_iter().map(|w| {
let id = idgen.next().unwrap(); let id = idgen.next().unwrap();
Operation::exact(id, false, &w) let kind = QueryKind::NonTolerant(w);
Operation::Query(Query { id, prefix: false, exact, kind })
}); });
create_operation(iter, Operation::And) create_operation(iter, Operation::And)
@ -238,13 +244,15 @@ pub fn create_query_tree(
let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect(); let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect();
for synonym in fetch_synonyms(reader, ctx, &words)? { for synonym in fetch_synonyms(reader, ctx, &words)? {
let exact = synonym.len() == 1;
let id = idgen.next().unwrap(); let id = idgen.next().unwrap();
mapper.declare(range.clone(), id, &synonym); mapper.declare(range.clone(), id, &synonym);
let mut idgen = once(id).chain(&mut idgen); let mut idgen = once(id).chain(&mut idgen);
let synonym = synonym.into_iter().map(|s| { let synonym = synonym.into_iter().map(|s| {
let id = idgen.next().unwrap(); let id = idgen.next().unwrap();
Operation::exact(id, false, &s) let kind = QueryKind::NonTolerant(s);
Operation::Query(Query { id, prefix: false, exact, kind })
}); });
group_alts.push(create_operation(synonym, Operation::And)); group_alts.push(create_operation(synonym, Operation::And));
} }
@ -252,7 +260,7 @@ pub fn create_query_tree(
let id = idgen.next().unwrap(); let id = idgen.next().unwrap();
let concat = words.concat(); let concat = words.concat();
mapper.declare(range.clone(), id, &[&concat]); mapper.declare(range.clone(), id, &[&concat]);
group_alts.push(Operation::exact(id, is_last, &concat)); group_alts.push(Operation::non_tolerant(id, is_last, &concat));
} }
} }
@ -387,7 +395,7 @@ pub fn traverse_query_tree<'o, 'txn>(
{ {
let before = Instant::now(); let before = Instant::now();
let Query { prefix, kind, .. } = query; let Query { prefix, kind, exact, .. } = query;
let docids: Cow<Set<_>> = match kind { let docids: Cow<Set<_>> = match kind {
QueryKind::Tolerant(word) => { QueryKind::Tolerant(word) => {
if *prefix && word.len() <= 2 { if *prefix && word.len() <= 2 {
@ -434,7 +442,7 @@ pub fn traverse_query_tree<'o, 'txn>(
while let Some(input) = stream.next() { while let Some(input) = stream.next() {
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? { if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
let distance = dfa.eval(input).to_u8(); let distance = dfa.eval(input).to_u8();
let is_exact = distance == 0 && input.len() == word.len(); let is_exact = *exact && distance == 0 && input.len() == word.len();
results.push(result.docids); results.push(result.docids);
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact }; let key = PostingsKey { query, input: input.to_owned(), distance, is_exact };
postings.insert(key, result.matches); postings.insert(key, result.matches);
@ -459,7 +467,7 @@ pub fn traverse_query_tree<'o, 'txn>(
Cow::Owned(docids) Cow::Owned(docids)
} }
}, },
QueryKind::Exact(word) => { QueryKind::NonTolerant(word) => {
// TODO support prefix and non-prefix exact DFA // TODO support prefix and non-prefix exact DFA
let dfa = build_exact_dfa(word); let dfa = build_exact_dfa(word);
@ -476,7 +484,7 @@ pub fn traverse_query_tree<'o, 'txn>(
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? { if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
let distance = dfa.eval(input).to_u8(); let distance = dfa.eval(input).to_u8();
results.push(result.docids); results.push(result.docids);
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact: true }; let key = PostingsKey { query, input: input.to_owned(), distance, is_exact: *exact };
postings.insert(key, result.matches); postings.insert(key, result.matches);
} }
} }