diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs
index 1abbb168b..bfb8910fa 100644
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -38,6 +38,7 @@ pub fn bucket_sort<'c, FI>(
     postings_lists_store: store::PostingsLists,
     documents_fields_counts_store: store::DocumentsFieldsCounts,
     synonyms_store: store::Synonyms,
+    prefix_cache_store: store::PrefixCache,
 ) -> MResult<Vec<Document>>
 where
     FI: Fn(DocumentId) -> bool,
@@ -60,12 +61,32 @@ where
             postings_lists_store,
             documents_fields_counts_store,
             synonyms_store,
+            prefix_cache_store,
         );
     }
 
     let (mut automatons, mut query_enhancer) =
         construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
 
+    if let [automaton] = &automatons[..] {
+        if automaton.is_prefix && automaton.query.len() <= 4 {
+            let mut prefix = [0; 4];
+            let len = cmp::min(4, automaton.query.len());
+            prefix[..len].copy_from_slice(&automaton.query.as_bytes()[..len]);
+
+            let mut documents = Vec::new();
+            let iter = prefix_cache_store.prefix_documents(reader, prefix)?;
+            for result in iter.skip(range.start).take(range.len()) {
+                let (docid, highlights) = result?;
+                documents.push(Document::from_highlights(docid, &highlights));
+            }
+
+            if !documents.is_empty() {
+                return Ok(documents);
+            }
+        }
+    }
+
     debug!("{:?}", query_enhancer);
 
     let before_postings_lists_fetching = Instant::now();
@@ -160,6 +181,7 @@ pub fn bucket_sort_with_distinct<'c, FI, FD>(
     postings_lists_store: store::PostingsLists,
     documents_fields_counts_store: store::DocumentsFieldsCounts,
     synonyms_store: store::Synonyms,
+    prefix_cache_store: store::PrefixCache,
 ) -> MResult<Vec<Document>>
 where
     FI: Fn(DocumentId) -> bool,
diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs
index ea36abd42..3d2dd4b67 100644
--- a/meilisearch-core/src/lib.rs
+++ b/meilisearch-core/src/lib.rs
@@ -81,6 +81,16 @@ fn highlights_from_raw_document<'a, 'tag, 'txn>(
 }
 
 impl Document {
+    #[cfg(not(test))]
+    pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
+        Document { id, highlights: highlights.to_owned() }
+    }
+
+    #[cfg(test)]
+    pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
+        Document { id, highlights: highlights.to_owned(), matches: Vec::new() }
+    }
+
     #[cfg(not(test))]
     pub fn from_raw<'a, 'tag, 'txn>(
         raw_document: RawDocument<'a, 'tag>,
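Note: the fast path added above only fires when the whole query reduces to a single prefix automaton of at most four bytes, and it then pages straight through the cache with `skip`/`take` instead of ranking. A standalone, runnable sketch of the fixed-width key derivation it performs (the `prefix_key` helper name is mine, not part of the patch):

    use std::cmp;

    fn prefix_key(query: &str) -> [u8; 4] {
        // Zero-pad the query bytes into the fixed 4-byte key used by the cache,
        // so "ab" and "abc" map to distinct keys.
        let mut prefix = [0; 4];
        let len = cmp::min(4, query.len());
        prefix[..len].copy_from_slice(&query.as_bytes()[..len]);
        prefix
    }

    fn main() {
        assert_eq!(prefix_key("ab"), [b'a', b'b', 0, 0]);
        assert_eq!(prefix_key("abcd"), *b"abcd");
    }

Because the cache is only consulted when the query is at most four bytes, the copy never truncates the query itself; a longer query simply falls through to the normal bucket sort below.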
diff --git a/meilisearch-core/src/query_builder.rs b/meilisearch-core/src/query_builder.rs
index e46858241..56aa038b7 100644
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@@ -16,6 +16,7 @@ pub struct QueryBuilder<'c, 'f, 'd> {
     postings_lists_store: store::PostingsLists,
     documents_fields_counts_store: store::DocumentsFieldsCounts,
     synonyms_store: store::Synonyms,
+    prefix_cache_store: store::PrefixCache,
 }
 
 impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
@@ -24,12 +25,14 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
         postings_lists: store::PostingsLists,
         documents_fields_counts: store::DocumentsFieldsCounts,
         synonyms: store::Synonyms,
+        prefix_cache: store::PrefixCache,
     ) -> QueryBuilder<'c, 'f, 'd> {
         QueryBuilder::with_criteria(
             main,
             postings_lists,
             documents_fields_counts,
             synonyms,
+            prefix_cache,
             Criteria::default(),
         )
     }
@@ -39,6 +42,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
         postings_lists: store::PostingsLists,
         documents_fields_counts: store::DocumentsFieldsCounts,
         synonyms: store::Synonyms,
+        prefix_cache: store::PrefixCache,
         criteria: Criteria<'c>,
     ) -> QueryBuilder<'c, 'f, 'd> {
         QueryBuilder {
@@ -51,6 +55,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
             postings_lists_store: postings_lists,
             documents_fields_counts_store: documents_fields_counts,
             synonyms_store: synonyms,
+            prefix_cache_store: prefix_cache,
         }
     }
 
@@ -97,6 +102,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
                 self.postings_lists_store,
                 self.documents_fields_counts_store,
                 self.synonyms_store,
+                self.prefix_cache_store,
             ),
             None => bucket_sort(
                 reader,
@@ -109,6 +115,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
                 self.postings_lists_store,
                 self.documents_fields_counts_store,
                 self.synonyms_store,
+                self.prefix_cache_store,
             ),
         }
     }
diff --git a/meilisearch-core/src/store/mod.rs b/meilisearch-core/src/store/mod.rs
index 198e250e4..072d92004 100644
--- a/meilisearch-core/src/store/mod.rs
+++ b/meilisearch-core/src/store/mod.rs
@@ -1,4 +1,5 @@
 mod docs_words;
+mod prefix_cache;
 mod documents_fields;
 mod documents_fields_counts;
 mod main;
@@ -8,6 +9,7 @@ mod updates;
 mod updates_results;
 
 pub use self::docs_words::DocsWords;
+pub use self::prefix_cache::PrefixCache;
 pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
 pub use self::documents_fields_counts::{
     DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
@@ -74,6 +76,10 @@ fn docs_words_name(name: &str) -> String {
     format!("store-{}-docs-words", name)
 }
 
+fn prefix_cache_name(name: &str) -> String {
+    format!("store-{}-prefix-cache", name)
+}
+
 fn updates_name(name: &str) -> String {
     format!("store-{}-updates", name)
 }
@@ -90,6 +96,7 @@ pub struct Index {
     pub documents_fields_counts: DocumentsFieldsCounts,
     pub synonyms: Synonyms,
     pub docs_words: DocsWords,
+    pub prefix_cache: PrefixCache,
     pub updates: Updates,
     pub updates_results: UpdatesResults,
 
@@ -252,6 +259,7 @@ impl Index {
             self.postings_lists,
             self.documents_fields_counts,
             self.synonyms,
+            self.prefix_cache,
         )
     }
 
@@ -264,6 +272,7 @@ impl Index {
             self.postings_lists,
             self.documents_fields_counts,
             self.synonyms,
+            self.prefix_cache,
             criteria,
         )
     }
@@ -282,6 +291,7 @@ pub fn create(
     let documents_fields_counts_name = documents_fields_counts_name(name);
     let synonyms_name = synonyms_name(name);
     let docs_words_name = docs_words_name(name);
+    let prefix_cache_name = prefix_cache_name(name);
     let updates_name = updates_name(name);
     let updates_results_name = updates_results_name(name);
 
@@ -292,6 +302,7 @@ pub fn create(
     let documents_fields_counts = env.create_database(Some(&documents_fields_counts_name))?;
     let synonyms = env.create_database(Some(&synonyms_name))?;
     let docs_words = env.create_database(Some(&docs_words_name))?;
+    let prefix_cache = env.create_database(Some(&prefix_cache_name))?;
     let updates = update_env.create_database(Some(&updates_name))?;
     let updates_results = update_env.create_database(Some(&updates_results_name))?;
 
@@ -304,6 +315,7 @@ pub fn create(
         },
         synonyms: Synonyms { synonyms },
         docs_words: DocsWords { docs_words },
+        prefix_cache: PrefixCache { prefix_cache },
         updates: Updates { updates },
         updates_results: UpdatesResults { updates_results },
         updates_notifier,
@@ -323,6 +335,7 @@ pub fn open(
     let documents_fields_counts_name = documents_fields_counts_name(name);
     let synonyms_name = synonyms_name(name);
     let docs_words_name = docs_words_name(name);
+    let prefix_cache_name = prefix_cache_name(name);
     let updates_name = updates_name(name);
     let updates_results_name = updates_results_name(name);
 
@@ -351,6 +364,10 @@ pub fn open(
         Some(docs_words) => docs_words,
         None => return Ok(None),
     };
+    let prefix_cache = match env.open_database(Some(&prefix_cache_name))? {
+        Some(prefix_cache) => prefix_cache,
+        None => return Ok(None),
+    };
     let updates = match update_env.open_database(Some(&updates_name))? {
         Some(updates) => updates,
         None => return Ok(None),
@@ -369,6 +386,7 @@ pub fn open(
         },
         synonyms: Synonyms { synonyms },
         docs_words: DocsWords { docs_words },
+        prefix_cache: PrefixCache { prefix_cache },
         updates: Updates { updates },
         updates_results: UpdatesResults { updates_results },
         updates_notifier,
@@ -387,6 +405,7 @@ pub fn clear(
     index.documents_fields_counts.clear(writer)?;
     index.synonyms.clear(writer)?;
     index.docs_words.clear(writer)?;
+    index.prefix_cache.clear(writer)?;
     index.updates.clear(update_writer)?;
     index.updates_results.clear(update_writer)?;
     Ok(())
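A note on the store wiring above: each store is its own named LMDB database under the existing `store-{index}-{kind}` convention, and `open` returns `Ok(None)` as soon as any one database is missing, so an index created before this patch will no longer open until it is recreated or migrated. A tiny runnable sketch of the naming scheme (the function body is copied from the hunk above):

    fn prefix_cache_name(name: &str) -> String {
        // One LMDB database per index and per store kind.
        format!("store-{}-prefix-cache", name)
    }

    fn main() {
        assert_eq!(prefix_cache_name("movies"), "store-movies-prefix-cache");
        assert_eq!(prefix_cache_name("songs"), "store-songs-prefix-cache");
    }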
diff --git a/meilisearch-core/src/store/prefix_cache.rs b/meilisearch-core/src/store/prefix_cache.rs
new file mode 100644
index 000000000..5b1621ca8
--- /dev/null
+++ b/meilisearch-core/src/store/prefix_cache.rs
@@ -0,0 +1,80 @@
+use std::borrow::Cow;
+
+use heed::types::{OwnedType, CowSlice};
+use heed::Result as ZResult;
+use zerocopy::{AsBytes, FromBytes};
+
+use super::BEU64;
+use crate::{DocumentId, Highlight};
+use crate::database::MainT;
+
+#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
+#[repr(C)]
+pub struct PrefixKey {
+    prefix: [u8; 4],
+    index: BEU64,
+    docid: BEU64,
+}
+
+impl PrefixKey {
+    pub fn new(prefix: [u8; 4], index: u64, docid: u64) -> PrefixKey {
+        PrefixKey {
+            prefix: prefix,
+            index: BEU64::new(index),
+            docid: BEU64::new(docid),
+        }
+    }
+}
+
+#[derive(Copy, Clone)]
+pub struct PrefixCache {
+    pub(crate) prefix_cache: heed::Database<OwnedType<PrefixKey>, CowSlice<Highlight>>,
+}
+
+impl PrefixCache {
+    pub fn put_prefix_document(
+        self,
+        writer: &mut heed::RwTxn<MainT>,
+        prefix: [u8; 4],
+        index: usize,
+        docid: DocumentId,
+        highlights: &[Highlight],
+    ) -> ZResult<()> {
+        let key = PrefixKey::new(prefix, index as u64, docid.0);
+        self.prefix_cache.put(writer, &key, highlights)
+    }
+
+    pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
+        self.prefix_cache.clear(writer)
+    }
+
+    pub fn prefix_documents<'txn>(
+        self,
+        reader: &'txn heed::RoTxn<MainT>,
+        prefix: [u8; 4],
+    ) -> ZResult<PrefixDocumentsIter<'txn>> {
+        let start = PrefixKey::new(prefix, 0, 0);
+        let end = PrefixKey::new(prefix, u64::max_value(), u64::max_value());
+        let iter = self.prefix_cache.range(reader, &(start..=end))?;
+        Ok(PrefixDocumentsIter { iter })
+    }
+}
+
+pub struct PrefixDocumentsIter<'txn> {
+    iter: heed::RoRange<'txn, OwnedType<PrefixKey>, CowSlice<Highlight>>,
+}
+
+impl<'txn> Iterator for PrefixDocumentsIter<'txn> {
+    type Item = ZResult<(DocumentId, Cow<'txn, [Highlight]>)>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.iter.next() {
+            Some(Ok((key, highlights))) => {
+                let docid = DocumentId(key.docid.get());
+                Some(Ok((docid, highlights)))
+            }
+            Some(Err(e)) => Some(Err(e)),
+            None => None,
+        }
+    }
+}
diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs
index 04f9942f1..eadb56392 100644
--- a/meilisearch-core/src/update/documents_addition.rs
+++ b/meilisearch-core/src/update/documents_addition.rs
@@ -109,6 +109,7 @@ pub fn apply_documents_addition<'a, 'b>(
     documents_fields_counts_store: store::DocumentsFieldsCounts,
     postings_lists_store: store::PostingsLists,
     docs_words_store: store::DocsWords,
+    prefix_cache_store: store::PrefixCache,
     addition: Vec<HashMap<String, serde_json::Value>>,
 ) -> MResult<()> {
     let mut documents_additions = HashMap::new();
@@ -175,6 +176,7 @@ pub fn apply_documents_addition<'a, 'b>(
         main_store,
         postings_lists_store,
         docs_words_store,
+        prefix_cache_store,
         &ranked_map,
         number_of_inserted_documents,
         indexer,
@@ -188,6 +190,7 @@ pub fn apply_documents_partial_addition<'a, 'b>(
     documents_fields_counts_store: store::DocumentsFieldsCounts,
     postings_lists_store: store::PostingsLists,
     docs_words_store: store::DocsWords,
+    prefix_cache_store: store::PrefixCache,
     addition: Vec<HashMap<String, serde_json::Value>>,
 ) -> MResult<()> {
     let mut documents_additions = HashMap::new();
@@ -271,6 +274,7 @@ pub fn apply_documents_partial_addition<'a, 'b>(
         main_store,
         postings_lists_store,
         docs_words_store,
+        prefix_cache_store,
         &ranked_map,
         number_of_inserted_documents,
         indexer,
@@ -284,6 +288,7 @@ pub fn reindex_all_documents(
     documents_fields_counts_store: store::DocumentsFieldsCounts,
     postings_lists_store: store::PostingsLists,
     docs_words_store: store::DocsWords,
+    prefix_cache_store: store::PrefixCache,
 ) -> MResult<()> {
     let schema = match main_store.schema(writer)? {
         Some(schema) => schema,
@@ -345,6 +350,7 @@ pub fn reindex_all_documents(
         main_store,
         postings_lists_store,
         docs_words_store,
+        prefix_cache_store,
         &ranked_map,
         number_of_inserted_documents,
         indexer,
@@ -359,6 +365,7 @@ pub fn write_documents_addition_index(
     main_store: store::Main,
     postings_lists_store: store::PostingsLists,
     docs_words_store: store::DocsWords,
+    prefix_cache_store: store::PrefixCache,
     ranked_map: &RankedMap,
     number_of_inserted_documents: usize,
     indexer: RawIndexer,
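Why `prefix_documents` can answer with a plain range query: `PrefixKey` is `#[repr(C)]` and its integers are big-endian (`BEU64`), so the raw key bytes sort exactly like the `(prefix, index, docid)` tuple under LMDB's default bytewise comparator, and `start..=end` spans exactly the entries of one prefix. Putting `index` (the rank position) before `docid` also means iteration yields documents in ranked order, which is what makes the `skip`/`take` paging in `bucket_sort` valid. A standalone sketch of that ordering argument, with `u64::to_be_bytes` standing in for the patch's `BEU64` wrapper:

    fn key_bytes(prefix: [u8; 4], index: u64, docid: u64) -> Vec<u8> {
        // Mirror PrefixKey's layout: 4 prefix bytes, then two big-endian u64s.
        let mut key = prefix.to_vec();
        key.extend_from_slice(&index.to_be_bytes());
        key.extend_from_slice(&docid.to_be_bytes());
        key
    }

    fn main() {
        let start = key_bytes(*b"caro", 0, 0);
        let hit = key_bytes(*b"caro", 3, 42);
        let end = key_bytes(*b"caro", u64::max_value(), u64::max_value());
        // Every entry for "caro" falls inside start..=end...
        assert!(start <= hit && hit <= end);
        // ...and entries for any other prefix sort outside of it.
        assert!(key_bytes(*b"carp", 0, 0) > end);
    }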
diff --git a/meilisearch-core/src/update/mod.rs b/meilisearch-core/src/update/mod.rs
index 239884a88..6136282cf 100644
--- a/meilisearch-core/src/update/mod.rs
+++ b/meilisearch-core/src/update/mod.rs
@@ -23,12 +23,15 @@
 use std::collections::{BTreeMap, BTreeSet, HashMap};
 use std::time::Instant;
 
 use chrono::{DateTime, Utc};
+use fst::{IntoStreamer, Streamer};
 use heed::Result as ZResult;
 use log::debug;
 use serde::{Deserialize, Serialize};
 
 use crate::{store, DocumentId, MResult};
 use crate::database::{MainT, UpdateT};
 
+use crate::bucket_sort::bucket_sort;
+use crate::criterion::Criteria;
 use meilisearch_schema::Schema;
@@ -278,6 +281,7 @@ pub fn update_task<'a, 'b>(
                 index.documents_fields_counts,
                 index.postings_lists,
                 index.docs_words,
+                index.prefix_cache,
             );
 
             (update_type, result, start.elapsed())
@@ -304,9 +308,63 @@ pub fn update_task<'a, 'b>(
                 index.documents_fields_counts,
                 index.postings_lists,
                 index.docs_words,
+                index.prefix_cache,
                 documents,
             );
 
+            let words_fst = index.main.words_fst(writer)?.unwrap();
+            let mut stream = words_fst.into_stream();
+            let mut previous_char = None;
+            while let Some(input) = stream.next() {
+                let (s, c) = match std::str::from_utf8(input) {
+                    Ok(s) => {
+                        let c = s.chars().next().unwrap();
+                        (&s[..c.len_utf8()], c)
+                    },
+                    Err(_) => continue,
+                };
+
+                match previous_char {
+                    Some(pc) if pc != c => {
+                        debug!("searching and caching {:?}", s);
+
+                        let documents = bucket_sort(
+                            writer,
+                            s,
+                            0..20,
+                            None as Option<fn(DocumentId) -> bool>,
+                            Criteria::default(),
+                            None,
+                            index.main,
+                            index.postings_lists,
+                            index.documents_fields_counts,
+                            index.synonyms,
+                            index.prefix_cache,
+                        ).unwrap();
+
+                        let mut prefix = [0; 4];
+                        let len = cmp::min(4, s.len());
+                        prefix[..len].copy_from_slice(&s.as_bytes()[..len]);
+
+                        for (i, document) in documents.into_iter().enumerate() {
+                            index.prefix_cache.put_prefix_document(
+                                writer,
+                                prefix,
+                                i,
+                                document.id,
+                                &document.highlights,
+                            ).unwrap();
+                        }
+
+                        previous_char = Some(c)
+                    },
+                    Some(_) => (),
+                    None => previous_char = Some(c),
+                }
+            }
+
+            // TODO we forget to do it for the last prefix char
+
             (update_type, result, start.elapsed())
         }
         UpdateData::DocumentsPartial(documents) => {
@@ -323,6 +381,7 @@ pub fn update_task<'a, 'b>(
                 index.documents_fields_counts,
                 index.postings_lists,
                 index.docs_words,
+                index.prefix_cache,
                 documents,
             );
 
@@ -384,6 +443,7 @@ pub fn update_task<'a, 'b>(
                 index.documents_fields_counts,
                 index.postings_lists,
                 index.docs_words,
+                index.prefix_cache,
                 stop_words,
             );
 
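About the caching loop above: words stream out of the FST in lexicographic order, so a change of first character marks a group boundary, and the patch searches and caches at each boundary it sees. As its own TODO concedes, a boundary-only trigger misses a group at the edge of the stream. A self-contained sketch (illustrative, not the patch's code) of boundary detection that flushes every group, including the trailing one; in the patch, "flush" would be the bucket_sort-and-put_prefix_document step:

    fn for_each_first_char_group(words: &[&str], mut flush: impl FnMut(char)) {
        let mut previous_char: Option<char> = None;
        for word in words {
            let c = match word.chars().next() {
                Some(c) => c,
                None => continue,
            };
            match previous_char {
                Some(pc) if pc != c => {
                    flush(pc); // the group that just ended
                    previous_char = Some(c);
                }
                Some(_) => (),
                None => previous_char = Some(c),
            }
        }
        if let Some(pc) = previous_char {
            flush(pc); // the trailing group the loop never closes
        }
    }

    fn main() {
        let mut flushed = Vec::new();
        for_each_first_char_group(&["apple", "avocado", "banana", "cherry"], |c| flushed.push(c));
        assert_eq!(flushed, vec!['a', 'b', 'c']);
    }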
diff --git a/meilisearch-core/src/update/schema_update.rs b/meilisearch-core/src/update/schema_update.rs
index f946175ad..9c1633b62 100644
--- a/meilisearch-core/src/update/schema_update.rs
+++ b/meilisearch-core/src/update/schema_update.rs
@@ -13,6 +13,7 @@ pub fn apply_schema_update(
     documents_fields_counts_store: store::DocumentsFieldsCounts,
     postings_lists_store: store::PostingsLists,
     docs_words_store: store::DocsWords,
+    prefix_cache_store: store::PrefixCache,
 ) -> MResult<()> {
     use UnsupportedOperation::{
         CanOnlyIntroduceNewSchemaAttributesAtEnd, CannotRemoveSchemaAttribute,
@@ -55,6 +56,7 @@ pub fn apply_schema_update(
             documents_fields_counts_store,
             postings_lists_store,
             docs_words_store,
+            prefix_cache_store,
         )?
     }
 
diff --git a/meilisearch-core/src/update/stop_words_deletion.rs b/meilisearch-core/src/update/stop_words_deletion.rs
index 9c799b402..f0ff58a2f 100644
--- a/meilisearch-core/src/update/stop_words_deletion.rs
+++ b/meilisearch-core/src/update/stop_words_deletion.rs
@@ -68,6 +68,7 @@ pub fn apply_stop_words_deletion(
     documents_fields_counts_store: store::DocumentsFieldsCounts,
     postings_lists_store: store::PostingsLists,
     docs_words_store: store::DocsWords,
+    prefix_cache_store: store::PrefixCache,
     deletion: BTreeSet<String>,
 ) -> MResult<()> {
     let mut stop_words_builder = SetBuilder::memory();
@@ -110,6 +111,7 @@ pub fn apply_stop_words_deletion(
             documents_fields_counts_store,
             postings_lists_store,
             docs_words_store,
+            prefix_cache_store,
         )?;
     }
 }
diff --git a/meilisearch-types/src/lib.rs b/meilisearch-types/src/lib.rs
index c02281a5f..ae714ccd8 100644
--- a/meilisearch-types/src/lib.rs
+++ b/meilisearch-types/src/lib.rs
@@ -46,6 +46,8 @@ pub struct DocIndex {
 /// The order of the field is important because it defines
 /// the way these structures are ordered between themselves.
 #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
+#[repr(C)]
 pub struct Highlight {
     /// The attribute in the document where the word was found
     /// along with the index in it.
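The `AsBytes`/`FromBytes` derives on `Highlight` are what let `CowSlice<Highlight>` store a `&[Highlight]` as raw LMDB bytes with no serialization step; the derives only compile for `#[repr(C)]` types without padding or pointers. A minimal illustration with a stand-in struct (`Pair` is hypothetical, and this assumes the zerocopy crate with derive support, which the new store module already relies on):

    use zerocopy::AsBytes;

    // #[repr(C)] with two u16 fields: a fixed layout and no padding,
    // so the derive is accepted.
    #[derive(AsBytes, Clone, Copy)]
    #[repr(C)]
    struct Pair {
        a: u16,
        b: u16,
    }

    fn main() {
        let pairs: &[Pair] = &[Pair { a: 1, b: 2 }, Pair { a: 3, b: 4 }];
        // Zero-copy view of the slice, ready to hand to an LMDB put.
        let bytes: &[u8] = pairs.as_bytes();
        assert_eq!(bytes.len(), std::mem::size_of::<Pair>() * pairs.len());
    }

The `cfg_attr` gate keeps meilisearch-types free of the zerocopy dependency for consumers that do not need byte-level access.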