From 00336c5154c8e1bb2f08ca88f0c09fa130541d00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 16 Jan 2020 14:24:45 +0100
Subject: [PATCH] Reintroduce a basic highlight display

---
 meilisearch-core/src/bucket_sort.rs   | 357 +-------------------------
 meilisearch-core/src/criterion/mod.rs |   2 +-
 meilisearch-core/src/lib.rs           |  45 ++--
 meilisearch-core/src/query_tree.rs    |   3 +-
 meilisearch-core/src/raw_document.rs  |   2 +-
 5 files changed, 23 insertions(+), 386 deletions(-)

diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs
index bd3aac6fd..413e9c732 100644
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -147,7 +147,7 @@ where
                 document_id: *id,
                 query_index: query.id,
                 distance,
-                is_exact: true, // TODO where can I find this info?
+                is_exact,
                 postings_list: posting_list_index,
             };
 
@@ -384,358 +384,3 @@ impl Deref for PostingsListView<'_> {
         }
     }
 }
-
-fn fetch_matches<'txn, 'tag>(
-    reader: &'txn heed::RoTxn<MainT>,
-    automatons: &[QueryWordAutomaton],
-    arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
-    main_store: store::Main,
-    postings_lists_store: store::PostingsLists,
-    pplc_store: store::PrefixPostingsListsCache,
-) -> MResult<Vec<BareMatch<'tag>>>
-{
-    let before_words_fst = Instant::now();
-    let words = match unsafe { main_store.static_words_fst(reader)? } {
-        Some(words) => words,
-        None => return Ok(Vec::new()),
-    };
-    debug!("words fst took {:.02?}", before_words_fst.elapsed());
-    debug!("words fst len {} and size {}", words.len(), words.as_fst().as_bytes().len());
-
-    let mut total_postings_lists = Vec::new();
-    let mut documents_ids = HashSet::<DocumentId>::new();
-
-    let mut dfa_time = Duration::default();
-    let mut postings_lists_fetching_time = Duration::default();
-    let automatons_loop = Instant::now();
-
-    for (query_index, automaton) in automatons.iter().enumerate() {
-        let QueryWordAutomaton { query, is_exact, is_prefix, .. } = automaton;
-
-        let before_word_postings_lists_fetching = Instant::now();
-        let mut stream_next_time = Duration::default();
-        let mut number_of_words = 0;
-        let mut postings_lists_original_length = 0;
-        let mut postings_lists_length = 0;
-
-        if *is_prefix && query.len() == 1 {
-            let prefix = [query.as_bytes()[0], 0, 0, 0];
-
-            number_of_words += 1;
-
-            let before_postings_lists_fetching = Instant::now();
-            if let Some(postings) = pplc_store.prefix_postings_list(reader, prefix)? {
-                debug!("Found cached postings list for {:?}", query);
-                postings_lists_original_length += postings.matches.len();
-
-                let input = Rc::from(&prefix[..]);
-                let postings_list = Rc::new(postings.matches);
-                let postings_list_view = PostingsListView::original(input, postings_list);
-
-                let mut offset = 0;
-                for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
-                    let document_id = group[0].document_id;
-
-                    if query_index != 0 && !documents_ids.contains(&document_id) {
-                        offset += group.len();
-                        continue
-                    }
-                    documents_ids.insert(document_id);
-
-                    postings_lists_length += group.len();
-
-                    let range = postings_list_view.range(offset, group.len());
-                    let posting_list_index = arena.add(range);
-                    let bare_match = BareMatch {
-                        document_id,
-                        query_index,
-                        distance: 0,
-                        is_exact: *is_exact,
-                        postings_list: posting_list_index,
-                    };
-
-
-                    total_postings_lists.push(bare_match);
-                    offset += group.len();
-                }
-            }
-            postings_lists_fetching_time += before_postings_lists_fetching.elapsed();
-        }
-        else {
-            let before_dfa = Instant::now();
-            let dfa = automaton.dfa();
-            dfa_time += before_dfa.elapsed();
-
-            let byte = query.as_bytes()[0];
-            let mut stream = if byte == u8::max_value() {
-                words.search(&dfa).ge(&[byte]).into_stream()
-            } else {
-                words.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
-            };
-
-            // while let Some(input) = stream.next() {
-            loop {
-                let before_stream_next = Instant::now();
-                let value = stream.next();
-                stream_next_time += before_stream_next.elapsed();
-
-                let input = match value {
-                    Some(input) => input,
-                    None => break,
-                };
-
-                number_of_words += 1;
-
-                let distance = dfa.eval(input).to_u8();
-                let is_exact = *is_exact && distance == 0 && input.len() == query.len();
-
-                let before_postings_lists_fetching = Instant::now();
-                if let Some(Postings { docids, matches }) = postings_lists_store.postings_list(reader, input)? {
-                    postings_lists_original_length += matches.len();
-
-                    let input = Rc::from(input);
-                    let matches = Rc::new(matches);
-                    let postings_list_view = PostingsListView::original(input, matches);
-
-                    let mut offset = 0;
-                    for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
-                        let document_id = group[0].document_id;
-
-                        if query_index != 0 && !documents_ids.contains(&document_id) {
-                            offset += group.len();
-                            continue
-                        }
-                        documents_ids.insert(document_id);
-
-                        postings_lists_length += group.len();
-
-                        let range = postings_list_view.range(offset, group.len());
-                        let posting_list_index = arena.add(range);
-                        let bare_match = BareMatch {
-                            document_id,
-                            query_index,
-                            distance,
-                            is_exact,
-                            postings_list: posting_list_index,
-                        };
-
-                        total_postings_lists.push(bare_match);
-                        offset += group.len();
-                    }
-                }
-                postings_lists_fetching_time += before_postings_lists_fetching.elapsed();
-            }
-        }
-
-        debug!("{:?} gives {} words", query, number_of_words);
-        debug!("{:?} gives postings lists of length {} (original was {})",
-            query, postings_lists_length, postings_lists_original_length);
-        debug!("{:?} took {:.02?} to fetch postings lists",
-            query, before_word_postings_lists_fetching.elapsed());
-        debug!("stream next took {:.02?}", stream_next_time);
-    }
-
-    debug!("automatons loop took {:.02?}", automatons_loop.elapsed());
-    debug!("postings lists fetching took {:.02?}", postings_lists_fetching_time);
-    debug!("dfa creation took {:.02?}", dfa_time);
-
-    Ok(total_postings_lists)
-}
-
-#[derive(Debug)]
-pub struct QueryWordAutomaton {
-    pub query: String,
-    /// Is it a word that must be considered exact
-    /// or is it some derived word (i.e. a synonym)
-    pub is_exact: bool,
-    pub is_prefix: bool,
-    /// If it's a phrase query and what is
-    /// its index an the length of the phrase
-    pub phrase_query: Option<(u16, u16)>,
-}
-
-impl QueryWordAutomaton {
-    pub fn exact(query: &str) -> QueryWordAutomaton {
-        QueryWordAutomaton {
-            query: query.to_string(),
-            is_exact: true,
-            is_prefix: false,
-            phrase_query: None,
-        }
-    }
-
-    pub fn exact_prefix(query: &str) -> QueryWordAutomaton {
-        QueryWordAutomaton {
-            query: query.to_string(),
-            is_exact: true,
-            is_prefix: true,
-            phrase_query: None,
-        }
-    }
-
-    pub fn non_exact(query: &str) -> QueryWordAutomaton {
-        QueryWordAutomaton {
-            query: query.to_string(),
-            is_exact: false,
-            is_prefix: false,
-            phrase_query: None,
-        }
-    }
-
-    pub fn dfa(&self) -> DFA {
-        if self.phrase_query.is_some() {
-            build_exact_dfa(&self.query)
-        } else if self.is_prefix {
-            build_prefix_dfa(&self.query)
-        } else {
-            build_dfa(&self.query)
-        }
-    }
-}
-
-fn split_best_frequency<'a>(
-    reader: &heed::RoTxn<MainT>,
-    word: &'a str,
-    postings_lists_store: store::PostingsLists,
-) -> MResult<Option<(&'a str, &'a str)>> {
-    let chars = word.char_indices().skip(1);
-    let mut best = None;
-
-    for (i, _) in chars {
-        let (left, right) = word.split_at(i);
-
-        let left_freq = postings_lists_store
-            .postings_list(reader, left.as_ref())?
-            .map_or(0, |p| p.docids.len());
-
-        let right_freq = postings_lists_store
-            .postings_list(reader, right.as_ref())?
-            .map_or(0, |p| p.docids.len());
-
-        let min_freq = cmp::min(left_freq, right_freq);
-        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
-            best = Some((min_freq, left, right));
-        }
-    }
-
-    Ok(best.map(|(_, l, r)| (l, r)))
-}
-
-fn construct_automatons(
-    reader: &heed::RoTxn<MainT>,
-    query: &str,
-    main_store: store::Main,
-    postings_lists_store: store::PostingsLists,
-    synonym_store: store::Synonyms,
-) -> MResult<(Vec<QueryWordAutomaton>, QueryEnhancer)> {
-    let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
-    let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
-    let synonyms = match main_store.synonyms_fst(reader)? {
-        Some(synonym) => synonym,
-        None => fst::Set::default(),
-    };
-
-    let mut automaton_index = 0;
-    let mut automatons = Vec::new();
-    let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
-
-    // We must not declare the original words to the query enhancer
-    // *but* we need to push them in the automatons list first
-    let mut original_words = query_words.iter().peekable();
-    while let Some(word) = original_words.next() {
-        let has_following_word = original_words.peek().is_some();
-        let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
-
-        let automaton = if not_prefix_dfa {
-            QueryWordAutomaton::exact(word)
-        } else {
-            QueryWordAutomaton::exact_prefix(word)
-        };
-        automaton_index += 1;
-        automatons.push(automaton);
-    }
-
-    for n in 1..=NGRAMS {
-        let mut ngrams = query_words.windows(n).enumerate().peekable();
-        while let Some((query_index, ngram_slice)) = ngrams.next() {
-            let query_range = query_index..query_index + n;
-            let ngram_nb_words = ngram_slice.len();
-            let ngram = ngram_slice.join(" ");
-
-            let has_following_word = ngrams.peek().is_some();
-            let not_prefix_dfa =
-                has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
-
-            // automaton of synonyms of the ngrams
-            let normalized = normalize_str(&ngram);
-            let lev = if not_prefix_dfa {
-                build_dfa(&normalized)
-            } else {
-                build_prefix_dfa(&normalized)
-            };
-
-            let mut stream = synonyms.search(&lev).into_stream();
-            while let Some(base) = stream.next() {
-                // only trigger alternatives when the last word has been typed
-                // i.e. "new " do not but "new yo" triggers alternatives to "new york"
-                let base = std::str::from_utf8(base).unwrap();
-                let base_nb_words = split_query_string(base).count();
-                if ngram_nb_words != base_nb_words {
-                    continue;
-                }
-
-                if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
-                    let mut stream = synonyms.into_stream();
-                    while let Some(synonyms) = stream.next() {
-                        let synonyms = std::str::from_utf8(synonyms).unwrap();
-                        let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
-                        let nb_synonym_words = synonyms_words.len();
-
-                        let real_query_index = automaton_index;
-                        enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
-
-                        for synonym in synonyms_words {
-                            let automaton = if nb_synonym_words == 1 {
-                                QueryWordAutomaton::exact(synonym)
-                            } else {
-                                QueryWordAutomaton::non_exact(synonym)
-                            };
-                            automaton_index += 1;
-                            automatons.push(automaton);
-                        }
-                    }
-                }
-            }
-
-            if n == 1 {
-                // automatons for splitted words
-                if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
-                    let mut left_automaton = QueryWordAutomaton::exact(left);
-                    left_automaton.phrase_query = Some((0, 2));
-                    enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
-                    automaton_index += 1;
-                    automatons.push(left_automaton);
-
-                    let mut right_automaton = QueryWordAutomaton::exact(right);
-                    right_automaton.phrase_query = Some((1, 2));
-                    enhancer_builder.declare(query_range.clone(), automaton_index, &[right]);
-                    automaton_index += 1;
-                    automatons.push(right_automaton);
-                }
-            } else {
-                // automaton of concatenation of query words
-                let concat = ngram_slice.concat();
-                let normalized = normalize_str(&concat);
-
-                let real_query_index = automaton_index;
-                enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
-
-                let automaton = QueryWordAutomaton::exact(&normalized);
-                automaton_index += 1;
-                automatons.push(automaton);
-            }
-        }
-    }
-
-    Ok((automatons, enhancer_builder.build()))
-}
diff --git a/meilisearch-core/src/criterion/mod.rs b/meilisearch-core/src/criterion/mod.rs
index 989d173e3..044a3943f 100644
--- a/meilisearch-core/src/criterion/mod.rs
+++ b/meilisearch-core/src/criterion/mod.rs
@@ -7,7 +7,7 @@ use sdset::SetBuf;
 use slice_group_by::GroupBy;
 
 use crate::automaton::QueryEnhancer;
-use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton};
+use crate::bucket_sort::{SimpleMatch, PostingsListView};
 use crate::database::MainT;
 use crate::query_tree::QueryId;
 use crate::{store, RawDocument, MResult};
diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs
index 6c0ac5be8..a2722488a 100644
--- a/meilisearch-core/src/lib.rs
+++ b/meilisearch-core/src/lib.rs
@@ -32,7 +32,7 @@ pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
 pub use query_words_mapper::QueryWordsMapper;
 
 use compact_arena::SmallArena;
-use crate::bucket_sort::{QueryWordAutomaton, PostingsListView};
+use crate::bucket_sort::PostingsListView;
 use crate::levenshtein::prefix_damerau_levenshtein;
 use crate::reordered_attrs::ReorderedAttrs;
 
@@ -47,7 +47,6 @@ pub struct Document {
 
 fn highlights_from_raw_document<'a, 'tag, 'txn>(
     raw_document: &RawDocument<'a, 'tag>,
-    automatons: &[QueryWordAutomaton],
     arena: &SmallArena<'tag, PostingsListView<'txn>>,
     searchable_attrs: Option<&ReorderedAttrs>,
 ) -> Vec<Highlight>
@@ -57,14 +56,14 @@
     for bm in raw_document.bare_matches.iter() {
         let postings_list = &arena[bm.postings_list];
         let input = postings_list.input();
-        let query = &automatons[bm.query_index as usize].query;
+        // let query = &automatons[bm.query_index as usize].query;
 
         for di in postings_list.iter() {
-            let covered_area = if query.len() > input.len() {
-                input.len()
-            } else {
-                prefix_damerau_levenshtein(query.as_bytes(), input).1
-            };
+            // let covered_area = if query.len() > input.len() {
+            //     input.len()
+            // } else {
+            //     prefix_damerau_levenshtein(query.as_bytes(), input).1
+            // };
 
             let attribute = searchable_attrs
                 .and_then(|sa| sa.reverse(di.attribute))
@@ -73,7 +72,7 @@
             let highlight = Highlight {
                 attribute: attribute,
                 char_index: di.char_index,
-                char_length: covered_area as u16,
+                char_length: di.char_length,
             };
 
             highlights.push(highlight);
@@ -97,19 +96,15 @@ impl Document {
     #[cfg(not(test))]
    pub fn from_raw<'a, 'tag, 'txn>(
         raw_document: RawDocument<'a, 'tag>,
-        // automatons: &[QueryWordAutomaton],
         arena: &SmallArena<'tag, PostingsListView<'txn>>,
         searchable_attrs: Option<&ReorderedAttrs>,
     ) -> Document
     {
-        // let highlights = highlights_from_raw_document(
-        //     &raw_document,
-        //     automatons,
-        //     arena,
-        //     searchable_attrs,
-        // );
-
-        let highlights = Vec::new();
+        let highlights = highlights_from_raw_document(
+            &raw_document,
+            arena,
+            searchable_attrs,
+        );
 
         Document { id: raw_document.id, highlights }
     }
@@ -117,21 +112,17 @@ impl Document {
     #[cfg(test)]
     pub fn from_raw<'a, 'tag, 'txn>(
         raw_document: RawDocument<'a, 'tag>,
-        // automatons: &[QueryWordAutomaton],
         arena: &SmallArena<'tag, PostingsListView<'txn>>,
         searchable_attrs: Option<&ReorderedAttrs>,
     ) -> Document
     {
         use crate::bucket_sort::SimpleMatch;
 
-        // let highlights = highlights_from_raw_document(
-        //     &raw_document,
-        //     automatons,
-        //     arena,
-        //     searchable_attrs,
-        // );
-
-        let highlights = Vec::new();
+        let highlights = highlights_from_raw_document(
+            &raw_document,
+            arena,
+            searchable_attrs,
+        );
 
         let mut matches = Vec::new();
         for sm in raw_document.processed_matches {
diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs
index d3a1ad0ec..089eaa3af 100644
--- a/meilisearch-core/src/query_tree.rs
+++ b/meilisearch-core/src/query_tree.rs
@@ -53,7 +53,8 @@ impl Operation {
     }
 
     fn phrase2(id: QueryId, prefix: bool, (left, right): (&str, &str)) -> Operation {
-        Operation::Query(Query { id, prefix, kind: QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]) })
+        let kind = QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]);
+        Operation::Query(Query { id, prefix, kind })
     }
 }
 
diff --git a/meilisearch-core/src/raw_document.rs b/meilisearch-core/src/raw_document.rs
index 56fde3e7b..17955824e 100644
--- a/meilisearch-core/src/raw_document.rs
+++ b/meilisearch-core/src/raw_document.rs
@@ -1,7 +1,7 @@
 use compact_arena::SmallArena;
 use sdset::SetBuf;
 use crate::DocIndex;
-use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};
+use crate::bucket_sort::{SimpleMatch, BareMatch, PostingsListView};
 use crate::reordered_attrs::ReorderedAttrs;
 
 pub struct RawDocument<'a, 'tag> {