From 59fafb8b30bf4d8d911cd41d0de7ab52795fd917 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 17 Jun 2019 16:01:31 +0200 Subject: [PATCH] feat: Support one word has multi-word alternatives --- meilidb-core/src/lib.rs | 8 +- meilidb-core/src/query_builder.rs | 230 ++++++++++++++++++++++++++---- 2 files changed, 206 insertions(+), 32 deletions(-) diff --git a/meilidb-core/src/lib.rs b/meilidb-core/src/lib.rs index bb2de2dec..3235cd6af 100644 --- a/meilidb-core/src/lib.rs +++ b/meilidb-core/src/lib.rs @@ -10,7 +10,7 @@ pub mod criterion; use std::fmt; use std::sync::Arc; -use rayon::slice::ParallelSliceMut; +use sdset::SetBuf; use serde::{Serialize, Deserialize}; use slice_group_by::GroupBy; use zerocopy::{AsBytes, FromBytes}; @@ -229,12 +229,10 @@ impl fmt::Debug for RawDocument { } } -pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec { - let mut docs_ranges = Vec::<(DocumentId, Range)>::new(); +pub fn raw_documents_from_matches(matches: SetBuf<(DocumentId, Match)>) -> Vec { + let mut docs_ranges = Vec::<(_, Range)>::new(); let mut matches2 = Matches::with_capacity(matches.len()); - matches.par_sort_unstable(); - for group in matches.linear_group_by(|(a, _), (b, _)| a == b) { let id = group[0].0; let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0); diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index d8065bba2..c93a7be9e 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -4,12 +4,13 @@ use std::rc::Rc; use std::time::Instant; use std::{cmp, mem}; -use rayon::slice::ParallelSliceMut; -use slice_group_by::GroupByMut; -use meilidb_tokenizer::{is_cjk, split_query_string}; -use hashbrown::{HashMap, HashSet}; use fst::{Streamer, IntoStreamer}; +use hashbrown::{HashMap, HashSet}; use log::info; +use meilidb_tokenizer::{is_cjk, split_query_string}; +use rayon::slice::ParallelSliceMut; +use sdset::SetBuf; +use slice_group_by::GroupByMut; use crate::automaton::{DfaExt, AutomatonExt, build_dfa, build_prefix_dfa}; use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; @@ -46,12 +47,11 @@ fn generate_automatons(query: &str, store: &S) -> Result QueryBuilder<'c, S, FI> store: self.store, criteria: self.criteria, searchable_attrs: self.searchable_attrs, - filter: Some(function) + filter: Some(function), } } @@ -147,8 +147,22 @@ where S: Store, } } + matches.par_sort_unstable(); + + for document_matches in matches.linear_group_by_mut(|(a, _), (b, _)| a == b) { + let mut offset = 0; + for query_indexes in document_matches.linear_group_by_mut(|(_, a), (_, b)| a.query_index == b.query_index) { + let word_index = query_indexes[0].1.word_index - offset as u16; + for (_, match_) in query_indexes.iter_mut() { + match_.word_index = word_index; + } + offset += query_indexes.len() - 1; + } + } + let total_matches = matches.len(); - let raw_documents = raw_documents_from_matches(matches); + let padded_matches = SetBuf::from_dirty(matches); + let raw_documents = raw_documents_from_matches(padded_matches); info!("{} total documents to classify", raw_documents.len()); info!("{} total matches to classify", total_matches); @@ -455,6 +469,16 @@ mod tests { } } + const fn doc_char_index(document_id: u64, word_index: u16, char_index: u16) -> DocIndex { + DocIndex { + document_id: DocumentId(document_id), + attribute: 0, + word_index, + char_index, + char_length: 0, + } + } + #[test] fn simple_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ @@ -652,35 +676,97 @@ mod tests { assert_matches!(iter.next(), None); } - /// Unique word has multi-word synonyms #[test] - fn multiword_synonyms() { + /// Unique word has multi-word synonyms + fn unique_to_multiword_synonyms() { let mut store = InMemorySetStore::from_iter(vec![ - ("new", &[doc_index(0, 0)][..]), - ("york", &[doc_index(0, 1)][..]), - ("subway", &[doc_index(0, 2)][..]), + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), + ("subway", &[doc_char_index(0, 3, 3)][..]), - ("NY", &[doc_index(1, 0)][..]), - ("subway", &[doc_index(1, 1)][..]), + ("NY", &[doc_char_index(1, 0, 0)][..]), + ("subway", &[doc_char_index(1, 1, 1)][..]), ]); - store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); - store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); + store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); + store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); let builder = QueryBuilder::new(&store); let results = builder.query("NY subway", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway assert_matches!(iter.next(), None); }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("NYC subway", 0..20).unwrap(); + let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 1, .. })); // york + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + /// Unique word has multi-word synonyms + fn harder_unique_to_multiword_synonyms() { + let mut store = InMemorySetStore::from_iter(vec![ + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), + ("yellow", &[doc_char_index(0, 3, 3)][..]), + ("subway", &[doc_char_index(0, 4, 4)][..]), + ("broken", &[doc_char_index(0, 5, 5)][..]), + + ("NY", &[doc_char_index(1, 0, 0)][..]), + ("blue", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + ]); + + store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); + store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("NY subway", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway assert_matches!(iter.next(), None); }); @@ -690,19 +776,109 @@ mod tests { let results = builder.query("NYC subway", 0..20).unwrap(); let mut iter = results.into_iter(); + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { let mut iter = matches.into_iter(); assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY - assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 1, .. })); // subway - assert_matches!(iter.next(), None); - }); - assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { - let mut iter = matches.into_iter(); - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new - assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 1, .. })); // york assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway assert_matches!(iter.next(), None); }); assert_matches!(iter.next(), None); } + + #[test] + /// Unique word has multi-word synonyms + fn even_harder_unique_to_multiword_synonyms() { + let mut store = InMemorySetStore::from_iter(vec![ + ("new", &[doc_char_index(0, 0, 0)][..]), + ("york", &[doc_char_index(0, 1, 1)][..]), + ("city", &[doc_char_index(0, 2, 2)][..]), + ("yellow", &[doc_char_index(0, 3, 3)][..]), + ("underground", &[doc_char_index(0, 4, 4)][..]), + ("train", &[doc_char_index(0, 5, 5)][..]), + ("broken", &[doc_char_index(0, 6, 6)][..]), + + ("NY", &[doc_char_index(1, 0, 0)][..]), + ("blue", &[doc_char_index(1, 1, 1)][..]), + ("subway", &[doc_char_index(1, 2, 2)][..]), + ]); + + store.add_synonym("NY", SetBuf::from_dirty(vec!["NYC", "new york", "new york city"])); + store.add_synonym("NYC", SetBuf::from_dirty(vec!["NY", "new york", "new york city"])); + store.add_synonym("subway", SetBuf::from_dirty(vec!["underground train"])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("NY subway broken", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // underground = subway + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // train = subway + assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 3, .. })); // broken + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + + let builder = QueryBuilder::new(&store); + let results = builder.query("NYC subway", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // new = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // york = NY + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // city = NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // underground = subway + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // train = subway + assert_matches!(iter.next(), None); // position rewritten ^ + }); + assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 1, word_index: 2, .. })); // subway + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } + + #[test] + #[ignore] + /// Multi-word has multi-word synonyms + fn multiword_to_multiword_synonyms() { + let mut store = InMemorySetStore::from_iter(vec![ + ("NY", &[doc_index(0, 0)][..]), + ("subway", &[doc_index(0, 1)][..]), + ]); + + store.add_synonym("new york", SetBuf::from_dirty(vec!["NYC", "NY", "new york city"])); + + let builder = QueryBuilder::new(&store); + let results = builder.query("new york subway", 0..20).unwrap(); + let mut iter = results.into_iter(); + + assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => { + let mut iter = matches.into_iter(); + assert_matches!(iter.next(), Some(Match { query_index: 0, word_index: 0, .. })); // NY + assert_matches!(iter.next(), Some(Match { query_index: 2, word_index: 1, .. })); // subway + assert_matches!(iter.next(), None); + }); + assert_matches!(iter.next(), None); + } }