use std::time::{Instant, Duration};
use std::ops::Range;
use std::{cmp, mem};

use fst::{IntoStreamer, Streamer};
use sdset::SetBuf;
use slice_group_by::{GroupBy, GroupByMut};

use crate::automaton::{Automaton, AutomatonProducer, QueryEnhancer};
use crate::raw_document::{RawDocument, raw_documents_from};
use crate::{Document, DocumentId, Highlight, TmpMatch, criterion::Criteria};
use crate::{store, reordered_attrs::ReorderedAttrs};

pub struct Automatons {
    // TODO better use Vec of SmallVec
    automatons: Vec<Vec<Automaton>>,
}

pub struct QueryBuilder<'a> {
    criteria: Criteria<'a>,
    searchables_attrs: Option<ReorderedAttrs>,
    timeout: Duration,
    words_store: store::Words,
    synonyms_store: store::Synonyms,
}

// rewrite matches that come from multi-word replacements by padding
// the word indices, so documents keep consistent relative positions
fn multiword_rewrite_matches(
    mut matches: Vec<(DocumentId, TmpMatch)>,
    query_enhancer: &QueryEnhancer,
) -> SetBuf<(DocumentId, TmpMatch)> {
    let mut padded_matches = Vec::with_capacity(matches.len());

    // we sort the matches by word index to make them rewritable
    matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index));

    let start = Instant::now();

    // for each attribute of each document
    for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
        // padding will only be applied
        // to word indices in the same attribute
        let mut padding = 0;
        let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index);

        // for each match at the same position
        // in this document attribute
        while let Some(same_word_index) = iter.next() {
            // find the biggest padding
            let mut biggest = 0;

            for (id, match_) in same_word_index {
                let mut replacement = query_enhancer.replacement(match_.query_index);
                let replacement_len = replacement.len();
                let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index);

                if let Some(query_index) = replacement.next() {
                    let word_index = match_.word_index + padding as u16;
                    let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
                    padded_matches.push((*id, match_));
                }

                let mut found = false;

                // look ahead and if there already is a match
                // corresponding to this padding word, abort the padding
                'padding: for (x, next_group) in nexts.enumerate() {
                    for (i, query_index) in replacement.clone().enumerate().skip(x) {
                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
                        let padmatch = TmpMatch { query_index, word_index, ..match_.clone() };

                        for (_, nmatch_) in next_group {
                            let mut rep = query_enhancer.replacement(nmatch_.query_index);
                            let query_index = rep.next().unwrap();

                            if query_index == padmatch.query_index {
                                if !found {
                                    // if we find a corresponding padding for the
                                    // first time we must push preceding paddings
                                    for (i, query_index) in replacement.clone().enumerate().take(i) {
                                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
                                        let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
                                        padded_matches.push((*id, match_));
                                        biggest = biggest.max(i + 1);
                                    }
                                }

                                padded_matches.push((*id, padmatch));
                                found = true;
                                continue 'padding;
                            }
                        }
                    }

                    // if we do not find a corresponding padding in the
                    // next groups we stop here and pad what was found
                    break;
                }

                if !found {
                    // if no padding was found in the following matches
                    // we must insert the entire padding
                    for (i, query_index) in replacement.enumerate() {
                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
                        let match_ = TmpMatch { query_index, word_index, ..match_.clone() };
                        padded_matches.push((*id, match_));
                    }

                    biggest = biggest.max(replacement_len - 1);
                }
            }

            padding += biggest;
        }
    }

    for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) {
        document_matches.sort_unstable();
    }

    SetBuf::new_unchecked(padded_matches)
}

// walk the words FST with each automaton's DFA and collect the
// matches and highlights of every document found along the way
fn fetch_raw_documents(
    reader: &rkv::Reader,
    automatons: &[Automaton],
    query_enhancer: &QueryEnhancer,
    searchables: Option<&ReorderedAttrs>,
    words_store: &store::Words,
) -> Result<Vec<RawDocument>, rkv::StoreError> {
    let mut matches = Vec::new();
    let mut highlights = Vec::new();

    for automaton in automatons {
        let Automaton { index, is_exact, query_len, .. } = automaton;
        let dfa = automaton.dfa();

        let words = words_store.words_fst(reader)?;
        let mut stream = words.search(&dfa).into_stream();

        while let Some(input) = stream.next() {
            let distance = dfa.eval(input).to_u8();
            let is_exact = *is_exact && distance == 0 && input.len() == *query_len;

            let doc_indexes = match words_store.word_indexes(reader, input)? {
                Some(doc_indexes) => doc_indexes,
                None => continue,
            };

            matches.reserve(doc_indexes.len());
            highlights.reserve(doc_indexes.len());

            for di in doc_indexes.as_ref() {
                let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));

                if let Some(attribute) = attribute {
                    let match_ = TmpMatch {
                        query_index: *index as u32,
                        distance,
                        attribute,
                        word_index: di.word_index,
                        is_exact,
                    };

                    let highlight = Highlight {
                        attribute: di.attribute,
                        char_index: di.char_index,
                        char_length: di.char_length,
                    };

                    matches.push((di.document_id, match_));
                    highlights.push((di.document_id, highlight));
                }
            }
        }
    }

    let matches = multiword_rewrite_matches(matches, query_enhancer);
    let highlights = {
        highlights.sort_unstable_by_key(|(id, _)| *id);
        SetBuf::new_unchecked(highlights)
    };

    Ok(raw_documents_from(matches, highlights))
}

impl<'a> QueryBuilder<'a> {
    pub fn new(words: store::Words, synonyms: store::Synonyms) -> QueryBuilder<'a> {
        QueryBuilder {
            criteria: Criteria::default(),
            searchables_attrs: None,
            timeout: Duration::from_secs(1),
            words_store: words,
            synonyms_store: synonyms,
        }
    }

    pub fn query(
        self,
        reader: &rkv::Reader,
        query: &str,
        range: Range<usize>,
    ) -> Result<Vec<Document>, rkv::StoreError> {
        let start_processing = Instant::now();
        let mut raw_documents_processed = Vec::new();

        let (automaton_producer, query_enhancer) =
            AutomatonProducer::new(reader, query, self.synonyms_store);
        let mut automaton_producer = automaton_producer.into_iter();
        let mut automatons = Vec::new();

        // aggregate the automatons group by group,
        // stopping once the timeout is reached
        while let Some(auts) = automaton_producer.next() {
            automatons.extend(auts);

            // we must retrieve the documents associated
            // with the current automatons
            let mut raw_documents = fetch_raw_documents(
                reader,
                &automatons,
                &query_enhancer,
                self.searchables_attrs.as_ref(),
                &self.words_store,
            )?;

            let mut groups = vec![raw_documents.as_mut_slice()];

            'criteria: for criterion in self.criteria.as_ref() {
                let tmp_groups = mem::replace(&mut groups, Vec::new());
                let mut documents_seen = 0;

                for group in tmp_groups {
                    // if this group does not overlap with the requested range,
                    // push it without sorting and splitting it
                    if documents_seen + group.len() < range.start {
                        documents_seen += group.len();
                        groups.push(group);
                        continue;
                    }

                    group.sort_unstable_by(|a, b| criterion.evaluate(a, b));

                    for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
                        documents_seen += group.len();
                        groups.push(group);

                        // we have sorted enough documents: if the last document sorted is after
                        // the end of the requested range, we can continue to the next criterion
                        if documents_seen >= range.end {
                            continue 'criteria;
                        }
                    }
                }
            }

            // once we have classified the documents related to the current
            // automatons we save them as the next valid result
            let iter = raw_documents.into_iter().skip(range.start).take(range.len());
            raw_documents_processed.clear();
            raw_documents_processed.extend(iter);

            // stop processing when there is no time left
            if start_processing.elapsed() > self.timeout {
                break;
            }
        }

        // make real documents now that we know
        // which ones must be returned
        let documents = raw_documents_processed
            .into_iter()
            .map(|d| Document::from_raw(d))
            .collect();

        Ok(documents)
    }
}
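// A minimal usage sketch of `QueryBuilder` (the `words`, `synonyms` and `reader`
// values below are assumed to come from the surrounding store/environment setup,
// which is not shown here):
//
//     let builder = QueryBuilder::new(words, synonyms);
//     let documents = builder.query(&reader, "hello world", 0..20)?;
//
// `query` returns at most `range.len()` documents, ranked by the configured
// criteria, and stops refining the result set once the default one-second
// timeout has elapsed.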