From 9cdda8c46ac35d1745a8fc09c667ab561327c493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 14 Oct 2019 13:56:52 +0200 Subject: [PATCH 1/6] Make the RawIndexer index_text method return the number of words --- meilidb-core/src/raw_indexer.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs index 9c0399be5..06d82ed32 100644 --- a/meilidb-core/src/raw_indexer.rs +++ b/meilidb-core/src/raw_indexer.rs @@ -33,7 +33,8 @@ impl RawIndexer { } } - pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) { + pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) -> usize { + let mut number_of_words = 0; let lowercase_text = text.to_lowercase(); let deunicoded = deunicode_with_tofu(&lowercase_text, ""); @@ -57,8 +58,12 @@ impl RawIndexer { ); if !must_continue { break } + + number_of_words += 1; } } + + number_of_words } pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I) From a7e40a78c1f0f4b08306673abea5a425664997ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 14 Oct 2019 14:06:34 +0200 Subject: [PATCH 2/6] Introduce the DocumentsFieldsCounts store --- meilidb-core/src/store/documents_fields.rs | 62 +------- .../src/store/documents_fields_counts.rs | 139 ++++++++++++++++++ meilidb-core/src/store/main.rs | 1 - meilidb-core/src/store/mod.rs | 40 ++++- 4 files changed, 179 insertions(+), 63 deletions(-) create mode 100644 meilidb-core/src/store/documents_fields_counts.rs diff --git a/meilidb-core/src/store/documents_fields.rs b/meilidb-core/src/store/documents_fields.rs index e6acb87b6..d3c28f990 100644 --- a/meilidb-core/src/store/documents_fields.rs +++ b/meilidb-core/src/store/documents_fields.rs @@ -1,37 +1,13 @@ use std::convert::TryFrom; use meilidb_schema::SchemaAttr; use crate::DocumentId; +use super::{document_attribute_into_key, document_attribute_from_key}; #[derive(Copy, Clone)] pub struct DocumentsFields { pub(crate) documents_fields: rkv::SingleStore, } -fn document_attribute_into_key(document_id: DocumentId, attribute: SchemaAttr) -> [u8; 10] { - let document_id_bytes = document_id.0.to_be_bytes(); - let attr_bytes = attribute.0.to_be_bytes(); - - let mut key = [0u8; 10]; - key[0..8].copy_from_slice(&document_id_bytes); - key[8..10].copy_from_slice(&attr_bytes); - - key -} - -fn document_attribute_from_key(key: [u8; 10]) -> (DocumentId, SchemaAttr) { - let document_id = { - let array = TryFrom::try_from(&key[0..8]).unwrap(); - DocumentId(u64::from_be_bytes(array)) - }; - - let schema_attr = { - let array = TryFrom::try_from(&key[8..8+2]).unwrap(); - SchemaAttr(u16::from_be_bytes(array)) - }; - - (document_id, schema_attr) -} - impl DocumentsFields { pub fn put_document_field( &self, @@ -100,15 +76,6 @@ impl DocumentsFields { let iter = self.documents_fields.iter_from(reader, document_id_bytes)?; Ok(DocumentFieldsIter { document_id, iter }) } - - pub fn documents_ids<'r, T: rkv::Readable>( - &self, - reader: &'r T, - ) -> Result, rkv::StoreError> - { - let iter = self.documents_fields.iter_start(reader)?; - Ok(DocumentsIdsIter { last_seen_id: None, iter }) - } } pub struct DocumentFieldsIter<'r> { @@ -134,30 +101,3 @@ impl<'r> Iterator for DocumentFieldsIter<'r> { } } } - -pub struct DocumentsIdsIter<'r> { - last_seen_id: Option, - iter: rkv::store::single::Iter<'r>, -} - -impl<'r> Iterator for DocumentsIdsIter<'r> { - type Item = Result; - - fn next(&mut self) -> 
Option { - for result in &mut self.iter { - match result { - Ok((key, _)) => { - let array = TryFrom::try_from(key).unwrap(); - let (document_id, _) = document_attribute_from_key(array); - if Some(document_id) != self.last_seen_id { - self.last_seen_id = Some(document_id); - return Some(Ok(document_id)) - } - }, - Err(e) => return Some(Err(e)), - } - } - - None - } -} diff --git a/meilidb-core/src/store/documents_fields_counts.rs b/meilidb-core/src/store/documents_fields_counts.rs new file mode 100644 index 000000000..95aff986a --- /dev/null +++ b/meilidb-core/src/store/documents_fields_counts.rs @@ -0,0 +1,139 @@ +use std::convert::TryFrom; +use meilidb_schema::SchemaAttr; +use crate::DocumentId; +use super::{document_attribute_into_key, document_attribute_from_key}; + +#[derive(Copy, Clone)] +pub struct DocumentsFieldsCounts { + pub(crate) documents_fields_counts: rkv::SingleStore, +} + +impl DocumentsFieldsCounts { + pub fn put_document_field_count( + &self, + writer: &mut rkv::Writer, + document_id: DocumentId, + attribute: SchemaAttr, + value: u64, + ) -> Result<(), rkv::StoreError> + { + let key = document_attribute_into_key(document_id, attribute); + self.documents_fields_counts.put(writer, key, &rkv::Value::U64(value)) + } + + pub fn del_all_document_fields_counts( + &self, + writer: &mut rkv::Writer, + document_id: DocumentId, + ) -> Result + { + let document_id_bytes = document_id.0.to_be_bytes(); + let mut keys_to_delete = Vec::new(); + + // WARN we can not delete the keys using the iterator + // so we store them and delete them just after + let iter = self.documents_fields_counts.iter_from(writer, document_id_bytes)?; + for result in iter { + let (key, _) = result?; + let array = TryFrom::try_from(key).unwrap(); + let (current_document_id, _) = document_attribute_from_key(array); + if current_document_id != document_id { break } + + keys_to_delete.push(key.to_owned()); + } + + let count = keys_to_delete.len(); + for key in keys_to_delete { + self.documents_fields_counts.delete(writer, key)?; + } + + Ok(count) + } + + pub fn document_attribute_count<'a>( + &self, + reader: &'a impl rkv::Readable, + document_id: DocumentId, + attribute: SchemaAttr, + ) -> Result, rkv::StoreError> + { + let key = document_attribute_into_key(document_id, attribute); + + match self.documents_fields_counts.get(reader, key)? 
{ + Some(rkv::Value::U64(count)) => Ok(Some(count)), + Some(value) => panic!("invalid type {:?}", value), + None => Ok(None), + } + } + + pub fn document_fields_counts<'r, T: rkv::Readable>( + &self, + reader: &'r T, + document_id: DocumentId, + ) -> Result, rkv::StoreError> + { + let document_id_bytes = document_id.0.to_be_bytes(); + let iter = self.documents_fields_counts.iter_from(reader, document_id_bytes)?; + Ok(DocumentFieldsCountsIter { document_id, iter }) + } + + pub fn documents_ids<'r, T: rkv::Readable>( + &self, + reader: &'r T, + ) -> Result, rkv::StoreError> + { + let iter = self.documents_fields_counts.iter_start(reader)?; + Ok(DocumentsIdsIter { last_seen_id: None, iter }) + } +} + +pub struct DocumentFieldsCountsIter<'r> { + document_id: DocumentId, + iter: rkv::store::single::Iter<'r>, +} + +impl<'r> Iterator for DocumentFieldsCountsIter<'r> { + type Item = Result<(SchemaAttr, u64), rkv::StoreError>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok((key, Some(rkv::Value::U64(count))))) => { + let array = TryFrom::try_from(key).unwrap(); + let (current_document_id, attr) = document_attribute_from_key(array); + if current_document_id != self.document_id { return None; } + + Some(Ok((attr, count))) + }, + Some(Ok((key, data))) => panic!("{:?}, {:?}", key, data), + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} + +pub struct DocumentsIdsIter<'r> { + last_seen_id: Option, + iter: rkv::store::single::Iter<'r>, +} + +impl<'r> Iterator for DocumentsIdsIter<'r> { + type Item = Result; + + fn next(&mut self) -> Option { + for result in &mut self.iter { + match result { + Ok((key, _)) => { + let array = TryFrom::try_from(key).unwrap(); + let (document_id, _) = document_attribute_from_key(array); + if Some(document_id) != self.last_seen_id { + self.last_seen_id = Some(document_id); + return Some(Ok(document_id)) + } + }, + Err(e) => return Some(Err(e)), + } + } + + None + } +} diff --git a/meilidb-core/src/store/main.rs b/meilidb-core/src/store/main.rs index cb5266187..5beecdc5f 100644 --- a/meilidb-core/src/store/main.rs +++ b/meilidb-core/src/store/main.rs @@ -3,7 +3,6 @@ use std::convert::TryInto; use meilidb_schema::Schema; use rkv::Value; -use serde::de; use crate::{RankedMap, MResult}; const CUSTOMS_KEY: &str = "customs-key"; diff --git a/meilidb-core/src/store/mod.rs b/meilidb-core/src/store/mod.rs index 183ec7b0b..136b10bab 100644 --- a/meilidb-core/src/store/mod.rs +++ b/meilidb-core/src/store/mod.rs @@ -1,5 +1,6 @@ mod docs_words; mod documents_fields; +mod documents_fields_counts; mod main; mod postings_lists; mod synonyms; @@ -8,6 +9,7 @@ mod updates_results; pub use self::docs_words::DocsWords; pub use self::documents_fields::{DocumentsFields, DocumentFieldsIter}; +pub use self::documents_fields_counts::{DocumentsFieldsCounts, DocumentFieldsCountsIter, DocumentsIdsIter}; pub use self::main::Main; pub use self::postings_lists::PostingsLists; pub use self::synonyms::Synonyms; @@ -15,8 +17,11 @@ pub use self::updates::Updates; pub use self::updates_results::UpdatesResults; use std::collections::HashSet; +use std::convert::TryFrom; + use meilidb_schema::{Schema, SchemaAttr}; -use serde::{ser, de}; +use serde::de; + use crate::criterion::Criteria; use crate::serde::Deserializer; use crate::{update, query_builder::QueryBuilder, DocumentId, MResult, Error}; @@ -25,6 +30,31 @@ fn aligned_to(bytes: &[u8], align: usize) -> bool { (bytes as *const _ as *const () as usize) % align == 0 } +fn document_attribute_into_key(document_id: DocumentId, 
attribute: SchemaAttr) -> [u8; 10] { + let document_id_bytes = document_id.0.to_be_bytes(); + let attr_bytes = attribute.0.to_be_bytes(); + + let mut key = [0u8; 10]; + key[0..8].copy_from_slice(&document_id_bytes); + key[8..10].copy_from_slice(&attr_bytes); + + key +} + +fn document_attribute_from_key(key: [u8; 10]) -> (DocumentId, SchemaAttr) { + let document_id = { + let array = TryFrom::try_from(&key[0..8]).unwrap(); + DocumentId(u64::from_be_bytes(array)) + }; + + let schema_attr = { + let array = TryFrom::try_from(&key[8..8+2]).unwrap(); + SchemaAttr(u16::from_be_bytes(array)) + }; + + (document_id, schema_attr) +} + fn main_name(name: &str) -> String { format!("store-{}", name) } @@ -37,6 +67,10 @@ fn documents_fields_name(name: &str) -> String { format!("store-{}-documents-fields", name) } +fn documents_fields_counts_name(name: &str) -> String { + format!("store-{}-documents-fields-counts", name) +} + fn synonyms_name(name: &str) -> String { format!("store-{}-synonyms", name) } @@ -58,6 +92,7 @@ pub struct Index { pub main: Main, pub postings_lists: PostingsLists, pub documents_fields: DocumentsFields, + pub documents_fields_counts: DocumentsFieldsCounts, pub synonyms: Synonyms, pub docs_words: DocsWords, @@ -205,6 +240,7 @@ fn open_options( let main_name = main_name(name); let postings_lists_name = postings_lists_name(name); let documents_fields_name = documents_fields_name(name); + let documents_fields_counts_name = documents_fields_counts_name(name); let synonyms_name = synonyms_name(name); let docs_words_name = docs_words_name(name); let updates_name = updates_name(name); @@ -214,6 +250,7 @@ fn open_options( let main = env.open_single(main_name.as_str(), options)?; let postings_lists = env.open_single(postings_lists_name.as_str(), options)?; let documents_fields = env.open_single(documents_fields_name.as_str(), options)?; + let documents_fields_counts = env.open_single(documents_fields_counts_name.as_str(), options)?; let synonyms = env.open_single(synonyms_name.as_str(), options)?; let docs_words = env.open_single(docs_words_name.as_str(), options)?; let updates = env.open_single(updates_name.as_str(), options)?; @@ -223,6 +260,7 @@ fn open_options( main: Main { main }, postings_lists: PostingsLists { postings_lists }, documents_fields: DocumentsFields { documents_fields }, + documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts }, synonyms: Synonyms { synonyms }, docs_words: DocsWords { docs_words }, updates: Updates { updates }, From b377003192fd00925b6ea1b50b78c780c7d4ac70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 14 Oct 2019 14:07:10 +0200 Subject: [PATCH 3/6] Compute and store the number of words in documents fields --- meilidb-core/src/serde/indexer.rs | 26 +++++++++---------- meilidb-core/src/serde/serializer.rs | 15 +++++++++-- meilidb-core/src/update/documents_addition.rs | 11 +++++++- meilidb-core/src/update/documents_deletion.rs | 2 ++ meilidb-core/src/update/mod.rs | 2 ++ 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/meilidb-core/src/serde/indexer.rs b/meilidb-core/src/serde/indexer.rs index 69a7ddecf..514b97951 100644 --- a/meilidb-core/src/serde/indexer.rs +++ b/meilidb-core/src/serde/indexer.rs @@ -13,7 +13,7 @@ pub struct Indexer<'a> { } impl<'a> ser::Serializer for Indexer<'a> { - type Ok = (); + type Ok = Option; type Error = SerializerError; type SerializeSeq = SeqIndexer<'a>; type SerializeTuple = TupleIndexer<'a>; @@ -83,8 +83,8 @@ impl<'a> ser::Serializer for Indexer<'a> { } fn 
serialize_str(self, text: &str) -> Result { - self.indexer.index_text(self.document_id, self.attribute, text); - Ok(()) + let number_of_words = self.indexer.index_text(self.document_id, self.attribute, text); + Ok(Some(number_of_words)) } fn serialize_bytes(self, _v: &[u8]) -> Result { @@ -99,8 +99,8 @@ impl<'a> ser::Serializer for Indexer<'a> { where T: ser::Serialize, { let text = value.serialize(ConvertToString)?; - self.indexer.index_text(self.document_id, self.attribute, &text); - Ok(()) + let number_of_words = self.indexer.index_text(self.document_id, self.attribute, &text); + Ok(Some(number_of_words)) } fn serialize_unit(self) -> Result { @@ -225,7 +225,7 @@ pub struct SeqIndexer<'a> { } impl<'a> ser::SerializeSeq for SeqIndexer<'a> { - type Ok = (); + type Ok = Option; type Error = SerializerError; fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> @@ -239,7 +239,7 @@ impl<'a> ser::SerializeSeq for SeqIndexer<'a> { fn end(self) -> Result { let texts = self.texts.iter().map(String::as_str); self.indexer.index_text_seq(self.document_id, self.attribute, texts); - Ok(()) + Ok(None) } } @@ -251,7 +251,7 @@ pub struct MapIndexer<'a> { } impl<'a> ser::SerializeMap for MapIndexer<'a> { - type Ok = (); + type Ok = Option; type Error = SerializerError; fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> @@ -273,7 +273,7 @@ impl<'a> ser::SerializeMap for MapIndexer<'a> { fn end(self) -> Result { let texts = self.texts.iter().map(String::as_str); self.indexer.index_text_seq(self.document_id, self.attribute, texts); - Ok(()) + Ok(None) } } @@ -285,7 +285,7 @@ pub struct StructSerializer<'a> { } impl<'a> ser::SerializeStruct for StructSerializer<'a> { - type Ok = (); + type Ok = Option; type Error = SerializerError; fn serialize_field( @@ -305,7 +305,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { fn end(self) -> Result { let texts = self.texts.iter().map(String::as_str); self.indexer.index_text_seq(self.document_id, self.attribute, texts); - Ok(()) + Ok(None) } } @@ -317,7 +317,7 @@ pub struct TupleIndexer<'a> { } impl<'a> ser::SerializeTuple for TupleIndexer<'a> { - type Ok = (); + type Ok = Option; type Error = SerializerError; fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> @@ -331,6 +331,6 @@ impl<'a> ser::SerializeTuple for TupleIndexer<'a> { fn end(self) -> Result { let texts = self.texts.iter().map(String::as_str); self.indexer.index_text_seq(self.document_id, self.attribute, texts); - Ok(()) + Ok(None) } } diff --git a/meilidb-core/src/serde/serializer.rs b/meilidb-core/src/serde/serializer.rs index 57e436c5a..de09c57bc 100644 --- a/meilidb-core/src/serde/serializer.rs +++ b/meilidb-core/src/serde/serializer.rs @@ -1,4 +1,5 @@ -use meilidb_schema::Schema; +use std::collections::HashMap; +use meilidb_schema::{Schema, SchemaAttr}; use serde::ser; use crate::{DocumentId, RankedMap}; @@ -10,6 +11,7 @@ use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer}; pub struct Serializer<'a> { pub schema: &'a Schema, pub document_store: &'a mut RamDocumentStore, + pub document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>, pub indexer: &'a mut RawIndexer, pub ranked_map: &'a mut RankedMap, pub document_id: DocumentId, @@ -135,6 +137,7 @@ impl<'a> ser::Serializer for Serializer<'a> { schema: self.schema, document_id: self.document_id, document_store: self.document_store, + document_fields_counts: self.document_fields_counts, indexer: self.indexer, ranked_map: self.ranked_map, current_key_name: None, @@ 
-151,6 +154,7 @@ impl<'a> ser::Serializer for Serializer<'a> { schema: self.schema, document_id: self.document_id, document_store: self.document_store, + document_fields_counts: self.document_fields_counts, indexer: self.indexer, ranked_map: self.ranked_map, }) @@ -172,6 +176,7 @@ pub struct MapSerializer<'a> { schema: &'a Schema, document_id: DocumentId, document_store: &'a mut RamDocumentStore, + document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>, indexer: &'a mut RawIndexer, ranked_map: &'a mut RankedMap, current_key_name: Option, @@ -209,6 +214,7 @@ impl<'a> ser::SerializeMap for MapSerializer<'a> { self.schema, self.document_id, self.document_store, + self.document_fields_counts, self.indexer, self.ranked_map, &key, @@ -225,6 +231,7 @@ pub struct StructSerializer<'a> { schema: &'a Schema, document_id: DocumentId, document_store: &'a mut RamDocumentStore, + document_fields_counts: &'a mut HashMap<(DocumentId, SchemaAttr), u64>, indexer: &'a mut RawIndexer, ranked_map: &'a mut RankedMap, } @@ -244,6 +251,7 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> { self.schema, self.document_id, self.document_store, + self.document_fields_counts, self.indexer, self.ranked_map, key, @@ -260,6 +268,7 @@ fn serialize_value( schema: &Schema, document_id: DocumentId, document_store: &mut RamDocumentStore, + documents_fields_counts: &mut HashMap<(DocumentId, SchemaAttr), u64>, indexer: &mut RawIndexer, ranked_map: &mut RankedMap, key: &str, @@ -275,7 +284,9 @@ where T: ser::Serialize, if props.is_indexed() { let indexer = Indexer { attribute, indexer, document_id }; - value.serialize(indexer)?; + if let Some(number_of_words) = value.serialize(indexer)? { + documents_fields_counts.insert((document_id, attribute), number_of_words as u64); + } } if props.is_ranked() { diff --git a/meilidb-core/src/update/documents_addition.rs b/meilidb-core/src/update/documents_addition.rs index a9420a528..cb662617c 100644 --- a/meilidb-core/src/update/documents_addition.rs +++ b/meilidb-core/src/update/documents_addition.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use fst::{SetBuilder, set::OpBuilder}; use sdset::{SetOperation, duo::Union}; @@ -82,6 +82,7 @@ pub fn apply_documents_addition( writer: &mut rkv::Writer, main_store: store::Main, documents_fields_store: store::DocumentsFields, + documents_fields_counts_store: store::DocumentsFieldsCounts, postings_lists_store: store::PostingsLists, docs_words_store: store::DocsWords, mut ranked_map: RankedMap, @@ -90,6 +91,7 @@ pub fn apply_documents_addition( { let mut document_ids = HashSet::new(); let mut document_store = RamDocumentStore::new(); + let mut document_fields_counts = HashMap::new(); let mut indexer = RawIndexer::new(); let schema = match main_store.schema(writer)? { @@ -112,6 +114,7 @@ pub fn apply_documents_addition( let serializer = Serializer { schema: &schema, document_store: &mut document_store, + document_fields_counts: &mut document_fields_counts, indexer: &mut indexer, ranked_map: &mut ranked_map, document_id, @@ -126,6 +129,7 @@ pub fn apply_documents_addition( writer, main_store, documents_fields_store, + documents_fields_counts_store, postings_lists_store, docs_words_store, ranked_map.clone(), @@ -137,6 +141,11 @@ pub fn apply_documents_addition( documents_fields_store.put_document_field(writer, id, attr, &value)?; } + // 3. 
insert new document attributes counts + for ((id, attr), count) in document_fields_counts { + documents_fields_counts_store.put_document_field_count(writer, id, attr, count)?; + } + let indexed = indexer.build(); let mut delta_words_builder = SetBuilder::memory(); diff --git a/meilidb-core/src/update/documents_deletion.rs b/meilidb-core/src/update/documents_deletion.rs index 765eb52f2..b56cebeb6 100644 --- a/meilidb-core/src/update/documents_deletion.rs +++ b/meilidb-core/src/update/documents_deletion.rs @@ -86,6 +86,7 @@ pub fn apply_documents_deletion( writer: &mut rkv::Writer, main_store: store::Main, documents_fields_store: store::DocumentsFields, + documents_fields_counts_store: store::DocumentsFieldsCounts, postings_lists_store: store::PostingsLists, docs_words_store: store::DocsWords, mut ranked_map: RankedMap, @@ -140,6 +141,7 @@ pub fn apply_documents_deletion( } for id in document_ids { + documents_fields_counts_store.del_all_document_fields_counts(writer, id)?; if documents_fields_store.del_all_document_fields(writer, id)? != 0 { deleted_documents.insert(id); } diff --git a/meilidb-core/src/update/mod.rs b/meilidb-core/src/update/mod.rs index 2493a1c94..7d4e349c0 100644 --- a/meilidb-core/src/update/mod.rs +++ b/meilidb-core/src/update/mod.rs @@ -138,6 +138,7 @@ pub fn update_task(writer: &mut rkv::Writer, index: store::Index) -> MResult MResult Date: Mon, 14 Oct 2019 18:48:32 +0200 Subject: [PATCH 4/6] Use the documents_fileds_count store in the QueryBuilder --- meilidb-core/src/query_builder.rs | 58 ++++++++++++++++++++++++++++--- meilidb-core/src/raw_document.rs | 27 +++++++------- meilidb-core/src/raw_indexer.rs | 3 ++ meilidb-core/src/store/mod.rs | 15 ++++++-- 4 files changed, 85 insertions(+), 18 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index c46093ce9..78557c9a4 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -22,6 +22,7 @@ pub struct QueryBuilder<'c, FI = fn(DocumentId) -> bool> { timeout: Option, main_store: store::Main, postings_lists_store: store::PostingsLists, + documents_fields_counts_store: store::DocumentsFieldsCounts, synonyms_store: store::Synonyms, } @@ -130,6 +131,7 @@ fn fetch_raw_documents( searchables: Option<&ReorderedAttrs>, main_store: &store::Main, postings_lists_store: &store::PostingsLists, + documents_fields_counts_store: &store::DocumentsFieldsCounts, ) -> MResult> { let mut matches = Vec::new(); @@ -187,22 +189,42 @@ fn fetch_raw_documents( SetBuf::new_unchecked(highlights) }; - Ok(raw_documents_from(matches, highlights)) + let fields_counts = { + let mut fields_counts = Vec::new(); + for group in matches.linear_group_by_key(|(id, ..)| *id) { + let id = group[0].0; + for result in documents_fields_counts_store.document_fields_counts(reader, id)? 
{ + let (attr, count) = result?; + fields_counts.push((id, attr, count)); + } + } + SetBuf::new(fields_counts).unwrap() + }; + + Ok(raw_documents_from(matches, highlights, fields_counts)) } impl<'c> QueryBuilder<'c> { pub fn new( main: store::Main, postings_lists: store::PostingsLists, + documents_fields_counts: store::DocumentsFieldsCounts, synonyms: store::Synonyms, ) -> QueryBuilder<'c> { - QueryBuilder::with_criteria(main, postings_lists, synonyms, Criteria::default()) + QueryBuilder::with_criteria( + main, + postings_lists, + documents_fields_counts, + synonyms, + Criteria::default(), + ) } pub fn with_criteria( main: store::Main, postings_lists: store::PostingsLists, + documents_fields_counts: store::DocumentsFieldsCounts, synonyms: store::Synonyms, criteria: Criteria<'c>, ) -> QueryBuilder<'c> @@ -214,6 +236,7 @@ impl<'c> QueryBuilder<'c> { timeout: None, main_store: main, postings_lists_store: postings_lists, + documents_fields_counts_store: documents_fields_counts, synonyms_store: synonyms, } } @@ -230,6 +253,7 @@ impl<'c, FI> QueryBuilder<'c, FI> { timeout: self.timeout, main_store: self.main_store, postings_lists_store: self.postings_lists_store, + documents_fields_counts_store: self.documents_fields_counts_store, synonyms_store: self.synonyms_store, } } @@ -292,6 +316,7 @@ impl QueryBuilder<'_, FI> where FI: Fn(DocumentId) -> bool { self.searchable_attrs.as_ref(), &self.main_store, &self.postings_lists_store, + &self.documents_fields_counts_store, )?; // stop processing when time is running out @@ -420,6 +445,7 @@ where FI: Fn(DocumentId) -> bool, self.inner.searchable_attrs.as_ref(), &self.inner.main_store, &self.inner.postings_lists_store, + &self.inner.documents_fields_counts_store, )?; // stop processing when time is running out @@ -549,6 +575,7 @@ mod tests { use fst::{Set, IntoStreamer}; use sdset::SetBuf; use tempfile::TempDir; + use meilidb_schema::SchemaAttr; use crate::automaton::normalize_str; use crate::database::Database; @@ -653,11 +680,15 @@ mod tests { let mut words_fst = BTreeSet::new(); let mut postings_lists = HashMap::new(); + let mut fields_counts = HashMap::<_, u64>::new(); for (word, indexes) in iter { let word = word.to_lowercase().into_bytes(); words_fst.insert(word.clone()); postings_lists.entry(word).or_insert_with(Vec::new).extend_from_slice(indexes); + for idx in indexes { + fields_counts.insert((idx.document_id, idx.attribute, idx.word_index), 1); + } } let words_fst = Set::from_iter(words_fst).unwrap(); @@ -669,6 +700,25 @@ mod tests { index.postings_lists.put_postings_list(&mut writer, &word, &postings_list).unwrap(); } + for ((docid, attr, _), count) in fields_counts { + let prev = index.documents_fields_counts + .document_attribute_count( + &mut writer, + docid, + SchemaAttr(attr), + ).unwrap(); + + let prev = prev.unwrap_or(0); + + index.documents_fields_counts + .put_document_field_count( + &mut writer, + docid, + SchemaAttr(attr), + prev + count, + ).unwrap(); + } + writer.commit().unwrap(); drop(rkv); @@ -1470,8 +1520,8 @@ mod tests { #[test] fn deunicoded_synonyms() { let mut store = TempDatabase::from_iter(vec![ - ("telephone", &[doc_index(0, 0)][..]), // meilidb-data indexes the unidecoded - ("téléphone", &[doc_index(0, 0)][..]), // and the original words with the same DocIndex + ("telephone", &[doc_index(0, 0)][..]), // meilidb indexes the unidecoded + ("téléphone", &[doc_index(0, 0)][..]), // and the original words on the same DocIndex ("iphone", &[doc_index(1, 0)][..]), ]); diff --git a/meilidb-core/src/raw_document.rs 
b/meilidb-core/src/raw_document.rs index 3567c3fd1..16cc9edda 100644 --- a/meilidb-core/src/raw_document.rs +++ b/meilidb-core/src/raw_document.rs @@ -1,7 +1,10 @@ use std::sync::Arc; use std::fmt; + +use meilidb_schema::SchemaAttr; use sdset::SetBuf; use slice_group_by::GroupBy; + use crate::{TmpMatch, DocumentId, Highlight}; #[derive(Clone)] @@ -9,13 +12,10 @@ pub struct RawDocument { pub id: DocumentId, pub matches: SharedMatches, pub highlights: Vec, + pub fields_counts: SetBuf<(SchemaAttr, u64)>, } impl RawDocument { - fn new(id: DocumentId, matches: SharedMatches, highlights: Vec) -> RawDocument { - RawDocument { id, matches, highlights } - } - pub fn query_index(&self) -> &[u32] { let r = self.matches.range; // it is safe because construction/modifications @@ -60,7 +60,7 @@ impl fmt::Debug for RawDocument { f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "distance", self.distance()))?; f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "attribute", self.attribute()))?; f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "word_index", self.word_index()))?; - f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?; + f.write_fmt(format_args!("{:>15}: {:^5?},\r\n", "is_exact", self.is_exact()))?; f.write_str("}")?; Ok(()) } @@ -69,31 +69,34 @@ impl fmt::Debug for RawDocument { pub fn raw_documents_from( matches: SetBuf<(DocumentId, TmpMatch)>, highlights: SetBuf<(DocumentId, Highlight)>, + fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>, ) -> Vec { - let mut docs_ranges: Vec<(_, Range, _)> = Vec::new(); + let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new(); let mut matches2 = Matches::with_capacity(matches.len()); let matches = matches.linear_group_by_key(|(id, _)| *id); let highlights = highlights.linear_group_by_key(|(id, _)| *id); + let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id); - for (mgroup, hgroup) in matches.zip(highlights) { + for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) { debug_assert_eq!(mgroup[0].0, hgroup[0].0); + debug_assert_eq!(mgroup[0].0, fgroup[0].0); let document_id = mgroup[0].0; - let start = docs_ranges.last().map(|(_, r, _)| r.end).unwrap_or(0); + let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0); let end = start + mgroup.len(); - let highlights = hgroup.iter().map(|(_, h)| *h).collect(); - docs_ranges.push((document_id, Range { start, end }, highlights)); + let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap(); + docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts)); matches2.extend_from_slice(mgroup); } let matches = Arc::new(matches2); - docs_ranges.into_iter().map(|(id, range, highlights)| { + docs_ranges.into_iter().map(|(id, range, highlights, fields_counts)| { let matches = SharedMatches { range, matches: matches.clone() }; - RawDocument::new(id, matches, highlights) + RawDocument { id, matches, highlights, fields_counts } }).collect() } diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs index 06d82ed32..980b622f7 100644 --- a/meilidb-core/src/raw_indexer.rs +++ b/meilidb-core/src/raw_indexer.rs @@ -47,6 +47,9 @@ impl RawIndexer { let iter = Some(lowercase_text).into_iter().chain(next); for text in iter { + // we must not count 2 times the same words + number_of_words = 0; + for token in Tokenizer::new(&text) { let must_continue = index_token( token, diff --git a/meilidb-core/src/store/mod.rs b/meilidb-core/src/store/mod.rs index 136b10bab..3bdd4ca18 100644 --- 
a/meilidb-core/src/store/mod.rs +++ b/meilidb-core/src/store/mod.rs @@ -201,11 +201,22 @@ impl Index { } pub fn query_builder(&self) -> QueryBuilder { - QueryBuilder::new(self.main, self.postings_lists, self.synonyms) + QueryBuilder::new( + self.main, + self.postings_lists, + self.documents_fields_counts, + self.synonyms, + ) } pub fn query_builder_with_criteria<'c>(&self, criteria: Criteria<'c>) -> QueryBuilder<'c> { - QueryBuilder::with_criteria(self.main, self.postings_lists, self.synonyms, criteria) + QueryBuilder::with_criteria( + self.main, + self.postings_lists, + self.documents_fields_counts, + self.synonyms, + criteria, + ) } } From 9ff92c5d15c921d161159cfa2744d960998a331b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 14 Oct 2019 18:48:54 +0200 Subject: [PATCH 5/6] Update the exact criterion to use the documents fields counts --- meilidb-core/src/criterion/exact.rs | 85 +++++++++++++++++++++++++---- 1 file changed, 75 insertions(+), 10 deletions(-) diff --git a/meilidb-core/src/criterion/exact.rs b/meilidb-core/src/criterion/exact.rs index 820c35aa0..94c4d2746 100644 --- a/meilidb-core/src/criterion/exact.rs +++ b/meilidb-core/src/criterion/exact.rs @@ -1,16 +1,38 @@ use std::cmp::Ordering; + +use sdset::Set; use slice_group_by::GroupBy; +use meilidb_schema::SchemaAttr; + use crate::criterion::Criterion; use crate::RawDocument; #[inline] -fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize { +fn number_exact_matches( + query_index: &[u32], + attribute: &[u16], + is_exact: &[bool], + fields_counts: &Set<(SchemaAttr, u64)>, +) -> usize +{ let mut count = 0; let mut index = 0; for group in query_index.linear_group() { let len = group.len(); - count += is_exact[index..index + len].contains(&true) as usize; + + let mut found_exact = false; + for (pos, _) in is_exact[index..index + len].iter().filter(|x| **x).enumerate() { + found_exact = true; + if let Ok(pos) = fields_counts.binary_search_by_key(&attribute[pos], |(a, _)| a.0) { + let (_, count) = fields_counts[pos]; + if count == 1 { + return usize::max_value() + } + } + } + + count += found_exact as usize; index += len; } @@ -25,13 +47,19 @@ impl Criterion for Exact { let lhs = { let query_index = lhs.query_index(); let is_exact = lhs.is_exact(); - number_exact_matches(query_index, is_exact) + let attribute = lhs.attribute(); + let fields_counts = &lhs.fields_counts; + + number_exact_matches(query_index, attribute, is_exact, fields_counts) }; let rhs = { let query_index = rhs.query_index(); let is_exact = rhs.is_exact(); - number_exact_matches(query_index, is_exact) + let attribute = rhs.attribute(); + let fields_counts = &rhs.fields_counts; + + number_exact_matches(query_index, attribute, is_exact, fields_counts) }; lhs.cmp(&rhs).reverse() @@ -52,14 +80,51 @@ mod tests { // doc1: "souliereres rouge" #[test] fn easy_case() { - let query_index0 = &[0]; - let is_exact0 = &[true]; + let doc0 = { + let query_index = &[0]; + let attribute = &[0]; + let is_exact = &[true]; + let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap(); - let query_index1 = &[0]; - let is_exact1 = &[false]; + number_exact_matches(query_index, attribute, is_exact, fields_counts) + }; + + let doc1 = { + let query_index = &[0]; + let attribute = &[0]; + let is_exact = &[false]; + let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap(); + + number_exact_matches(query_index, attribute, is_exact, fields_counts) + }; + + assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); + } + + // typing: "soulier" + // 
+ // doc0: { 0. "soulier" } + // doc1: { 0. "soulier bleu et blanc" } + #[test] + fn basic() { + let doc0 = { + let query_index = &[0]; + let attribute = &[0]; + let is_exact = &[true]; + let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap(); + + number_exact_matches(query_index, attribute, is_exact, fields_counts) + }; + + let doc1 = { + let query_index = &[0]; + let attribute = &[0]; + let is_exact = &[true]; + let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap(); + + number_exact_matches(query_index, attribute, is_exact, fields_counts) + }; - let doc0 = number_exact_matches(query_index0, is_exact0); - let doc1 = number_exact_matches(query_index1, is_exact1); assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less); } } From 0de37819b4e2a185581884c98cd51e9127a3c009 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 15 Oct 2019 11:15:48 +0200 Subject: [PATCH 6/6] Simplify the document fields counts deletion --- meilidb-core/src/query_builder.rs | 2 +- .../src/store/documents_fields_counts.rs | 21 +++++++------------ meilidb-core/src/update/customs_update.rs | 2 +- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index 78557c9a4..ca9b43467 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -702,7 +702,7 @@ mod tests { for ((docid, attr, _), count) in fields_counts { let prev = index.documents_fields_counts - .document_attribute_count( + .document_field_count( &mut writer, docid, SchemaAttr(attr), diff --git a/meilidb-core/src/store/documents_fields_counts.rs b/meilidb-core/src/store/documents_fields_counts.rs index 95aff986a..75b50966e 100644 --- a/meilidb-core/src/store/documents_fields_counts.rs +++ b/meilidb-core/src/store/documents_fields_counts.rs @@ -27,19 +27,14 @@ impl DocumentsFieldsCounts { document_id: DocumentId, ) -> Result { - let document_id_bytes = document_id.0.to_be_bytes(); let mut keys_to_delete = Vec::new(); // WARN we can not delete the keys using the iterator // so we store them and delete them just after - let iter = self.documents_fields_counts.iter_from(writer, document_id_bytes)?; - for result in iter { - let (key, _) = result?; - let array = TryFrom::try_from(key).unwrap(); - let (current_document_id, _) = document_attribute_from_key(array); - if current_document_id != document_id { break } - - keys_to_delete.push(key.to_owned()); + for result in self.document_fields_counts(writer, document_id)? 
{ + let (attribute, _) = result?; + let key = document_attribute_into_key(document_id, attribute); + keys_to_delete.push(key); } let count = keys_to_delete.len(); @@ -50,9 +45,9 @@ impl DocumentsFieldsCounts { Ok(count) } - pub fn document_attribute_count<'a>( + pub fn document_field_count( &self, - reader: &'a impl rkv::Readable, + reader: &impl rkv::Readable, document_id: DocumentId, attribute: SchemaAttr, ) -> Result, rkv::StoreError> @@ -92,7 +87,7 @@ pub struct DocumentFieldsCountsIter<'r> { iter: rkv::store::single::Iter<'r>, } -impl<'r> Iterator for DocumentFieldsCountsIter<'r> { +impl Iterator for DocumentFieldsCountsIter<'_> { type Item = Result<(SchemaAttr, u64), rkv::StoreError>; fn next(&mut self) -> Option { @@ -116,7 +111,7 @@ pub struct DocumentsIdsIter<'r> { iter: rkv::store::single::Iter<'r>, } -impl<'r> Iterator for DocumentsIdsIter<'r> { +impl Iterator for DocumentsIdsIter<'_> { type Item = Result; fn next(&mut self) -> Option { diff --git a/meilidb-core/src/update/customs_update.rs b/meilidb-core/src/update/customs_update.rs index fa65d7c67..5334b6659 100644 --- a/meilidb-core/src/update/customs_update.rs +++ b/meilidb-core/src/update/customs_update.rs @@ -1,5 +1,5 @@ -use crate::{store, error::UnsupportedOperation, MResult}; use crate::update::{Update, next_update_id}; +use crate::{store, MResult}; pub fn apply_customs_update( writer: &mut rkv::Writer,
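
Taken together, these six patches make `RawIndexer::index_text` return the number of words it indexed for an attribute (a `usize`), persist those counts per `(DocumentId, SchemaAttr)` pair in the new `DocumentsFieldsCounts` store, expose them on `RawDocument` as `fields_counts`, and use them in the `Exact` criterion so that a document whose matching field contains exactly one word gets the highest possible exact-match score. The fragment below is a minimal sketch of reading the stored counts back, written as it might appear inside meilidb-core after PATCH 6/6 (which renames `document_attribute_count` to `document_field_count`); the helper name and the `reader` binding are illustrative assumptions, not code from the patches.

    use crate::{store, DocumentId};

    // Sketch only: list the per-attribute word counts recorded for one document.
    // `document_fields_counts` yields the (SchemaAttr, u64) pairs written by
    // `put_document_field_count` during document addition (PATCH 3/6).
    fn print_fields_counts<T: rkv::Readable>(
        index: &store::Index,
        reader: &T,
        document_id: DocumentId,
    ) -> Result<(), rkv::StoreError> {
        for result in index.documents_fields_counts.document_fields_counts(reader, document_id)? {
            let (attr, count) = result?;
            println!("attribute {} holds {} indexed words", attr.0, count);
        }
        Ok(())
    }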