From 649fb6e40145b3415a1222c0b7d4dbc212327956 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 3 Nov 2020 13:42:29 +0100 Subject: [PATCH] Make sure that the indexing Store only index searchable fields --- src/update/index_documents/mod.rs | 7 +++++ src/update/index_documents/store.rs | 48 +++++++++++++++++------------ src/update/settings.rs | 11 +++++-- 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index 7dd3b6611..82582dac8 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::collections::HashSet; use std::fs::File; use std::io::{self, Seek, SeekFrom}; use std::sync::mpsc::sync_channel; @@ -327,6 +328,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { WordsPairsProximitiesDocids, } + let searchable_fields: HashSet<_> = match self.index.searchable_fields(self.wtxn)? { + Some(fields) => fields.iter().copied().collect(), + None => fields_ids_map.iter().map(|(id, _name)| id).collect(), + }; + let linked_hash_map_size = self.linked_hash_map_size; let max_nb_chunks = self.max_nb_chunks; let max_memory = self.max_memory; @@ -354,6 +360,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { .enumerate() .map(|(i, documents)| { let store = Store::new( + searchable_fields.clone(), linked_hash_map_size, max_nb_chunks, max_memory_by_job, diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index 7c1896ee5..43e37d9cd 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::convert::{TryFrom, TryInto}; use std::fs::File; use std::iter::FromIterator; @@ -37,6 +37,9 @@ pub struct Readers { } pub struct Store { + // Indexing parameters + searchable_fields: HashSet, + // Caches word_docids: LinkedHashMap, RoaringBitmap>, word_docids_limit: usize, words_pairs_proximities_docids: LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, @@ -56,6 +59,7 @@ pub struct Store { impl Store { pub fn new( + searchable_fields: HashSet, linked_hash_map_size: Option, max_nb_chunks: Option, max_memory: Option, @@ -101,18 +105,22 @@ impl Store { })?; Ok(Store { + // Indexing parameters. + searchable_fields, + // Caches word_docids: LinkedHashMap::with_capacity(linked_hash_map_size), word_docids_limit: linked_hash_map_size, words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size), words_pairs_proximities_docids_limit: linked_hash_map_size, + // MTBL parameters chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, - + // MTBL sorters main_sorter, word_docids_sorter, words_pairs_proximities_docids_sorter, - + // MTBL writers docid_word_positions_writer, documents_writer, }) @@ -309,23 +317,25 @@ impl Store { } for (attr, content) in document.iter() { - use serde_json::Value; - let content: Cow = match serde_json::from_slice(content) { - Ok(string) => string, - Err(_) => match serde_json::from_slice(content)? { - Value::Null => continue, - Value::Bool(boolean) => Cow::Owned(boolean.to_string()), - Value::Number(number) => Cow::Owned(number.to_string()), - Value::String(string) => Cow::Owned(string), - Value::Array(_array) => continue, - Value::Object(_object) => continue, - } - }; + if self.searchable_fields.contains(&attr) { + use serde_json::Value; + let content: Cow = match serde_json::from_slice(content) { + Ok(string) => string, + Err(_) => match serde_json::from_slice(content)? { + Value::Null => continue, + Value::Bool(boolean) => Cow::Owned(boolean.to_string()), + Value::Number(number) => Cow::Owned(number.to_string()), + Value::String(string) => Cow::Owned(string), + Value::Array(_array) => continue, + Value::Object(_object) => continue, + } + }; - for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) { - let word = token.to_lowercase(); - let position = (attr as usize * MAX_POSITION + pos) as u32; - words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); + for (pos, token) in simple_tokenizer(&content).filter_map(only_token).enumerate().take(MAX_POSITION) { + let word = token.to_lowercase(); + let position = (attr as usize * MAX_POSITION + pos) as u32; + words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); + } } } diff --git a/src/update/settings.rs b/src/update/settings.rs index b9abeb477..f9265d976 100644 --- a/src/update/settings.rs +++ b/src/update/settings.rs @@ -42,6 +42,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { } } + pub fn reset_searchable_fields(&mut self) { + self.searchable_fields = Some(None); + } + + pub fn set_searchable_fields(&mut self, names: Vec) { + self.searchable_fields = Some(Some(names)); + } + pub fn reset_displayed_fields(&mut self) { self.displayed_fields = Some(None); } @@ -56,7 +64,6 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { { // Check that the searchable attributes have been specified. if let Some(value) = self.searchable_fields { - let current_searchable_fields = self.index.searchable_fields(self.wtxn)?; let current_displayed_fields = self.index.displayed_fields(self.wtxn)?; let current_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; @@ -93,7 +100,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { }, None => ( current_fields_ids_map.clone(), - current_searchable_fields.map(ToOwned::to_owned), + None, current_displayed_fields.map(ToOwned::to_owned), ), };