diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 27fc138dd..652a88451 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -131,6 +131,11 @@ pub struct IndexerOpt { /// Number of parallel jobs for indexing, defaults to # of CPUs. #[structopt(long)] pub indexing_jobs: Option, + + /// Maximum relative position in an attribute for a word to be indexed. + /// Any value higher than 65535 will be clamped. + #[structopt(long)] + pub max_positions_per_attributes: Option, } struct Highlighter<'a, A> { @@ -346,6 +351,9 @@ async fn main() -> anyhow::Result<()> { if let Some(chunk_compression_level) = indexer_opt_cloned.chunk_compression_level { update_builder.chunk_compression_level(chunk_compression_level); } + if let Some(max_pos_per_attributes) = indexer_opt_cloned.max_positions_per_attributes { + update_builder.max_positions_per_attributes(max_pos_per_attributes); + } update_builder.thread_pool(GLOBAL_THREAD_POOL.get().unwrap()); update_builder.log_every_n(indexer_opt_cloned.log_every_n); update_builder.max_memory(indexer_opt_cloned.max_memory.get_bytes() as usize); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 838817d98..781cedb2c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -53,9 +53,24 @@ pub type Attribute = u32; pub type DocumentId = u32; pub type FieldId = u16; pub type Position = u32; +pub type RelativePosition = u16; pub type FieldDistribution = BTreeMap; pub type GeoPoint = rstar::primitives::GeomWithData<[f64; 2], DocumentId>; +pub const MAX_POSITION_PER_ATTRIBUTE: u32 = u16::MAX as u32 + 1; + +// Convert an absolute word position into a relative position. +// Return the field id of the attribute related to the absolute position +// and the relative position in the attribute. +pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, RelativePosition) { + ((absolute >> 16) as u16, (absolute & 0xFFFF) as u16) +} + +// Compute the absolute word position with the field id of the attribute and relative position in the attribute. +pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position { + (field_id as u32) << 16 | (relative as u32) +} + /// Transform a raw obkv store into a JSON Object. pub fn obkv_to_json( displayed_fields: &[FieldId], @@ -187,4 +202,26 @@ mod tests { // the distance of hard separators is clamped to 8 anyway. assert_eq!(string, "name: John Doe. . 43. hello. I. am. fine. . "); } + + #[test] + fn test_relative_position_conversion() { + assert_eq!((0x0000, 0x0000), relative_from_absolute_position(0x00000000)); + assert_eq!((0x0000, 0xFFFF), relative_from_absolute_position(0x0000FFFF)); + assert_eq!((0xFFFF, 0x0000), relative_from_absolute_position(0xFFFF0000)); + assert_eq!((0xFF00, 0xFF00), relative_from_absolute_position(0xFF00FF00)); + assert_eq!((0xFF00, 0x00FF), relative_from_absolute_position(0xFF0000FF)); + assert_eq!((0x1234, 0x5678), relative_from_absolute_position(0x12345678)); + assert_eq!((0xFFFF, 0xFFFF), relative_from_absolute_position(0xFFFFFFFF)); + } + + #[test] + fn test_absolute_position_conversion() { + assert_eq!(0x00000000, absolute_from_relative_position(0x0000, 0x0000)); + assert_eq!(0x0000FFFF, absolute_from_relative_position(0x0000, 0xFFFF)); + assert_eq!(0xFFFF0000, absolute_from_relative_position(0xFFFF, 0x0000)); + assert_eq!(0xFF00FF00, absolute_from_relative_position(0xFF00, 0xFF00)); + assert_eq!(0xFF0000FF, absolute_from_relative_position(0xFF00, 0x00FF)); + assert_eq!(0x12345678, absolute_from_relative_position(0x1234, 0x5678)); + assert_eq!(0xFFFFFFFF, absolute_from_relative_position(0xFFFF, 0xFFFF)); + } } diff --git a/milli/src/proximity.rs b/milli/src/proximity.rs index 083e5a977..62f490119 100644 --- a/milli/src/proximity.rs +++ b/milli/src/proximity.rs @@ -1,8 +1,7 @@ use std::cmp; -use crate::{Attribute, Position}; +use crate::{relative_from_absolute_position, Position}; -pub const ONE_ATTRIBUTE: u32 = 1000; pub const MAX_DISTANCE: u32 = 8; pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { @@ -14,19 +13,15 @@ pub fn index_proximity(lhs: u32, rhs: u32) -> u32 { } pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 { - let (lhs_attr, lhs_index) = extract_position(lhs); - let (rhs_attr, rhs_index) = extract_position(rhs); + let (lhs_attr, lhs_index) = relative_from_absolute_position(lhs); + let (rhs_attr, rhs_index) = relative_from_absolute_position(rhs); if lhs_attr != rhs_attr { MAX_DISTANCE } else { - index_proximity(lhs_index, rhs_index) + index_proximity(lhs_index as u32, rhs_index as u32) } } -pub fn extract_position(position: Position) -> (Attribute, Position) { - (position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE) -} - pub fn path_proximity(path: &[Position]) -> u32 { path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::() } diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 8e56b3649..e7775423c 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -10,7 +10,7 @@ use crate::search::criteria::{ resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, }; use crate::search::query_tree::{Operation, PrimitiveQueryPart}; -use crate::Result; +use crate::{absolute_from_relative_position, FieldId, Result}; pub struct Exactness<'t> { ctx: &'t dyn Context<'t>, @@ -181,7 +181,7 @@ fn resolve_state( ctx.field_id_word_count_docids(id, query_len)? { let mut attribute_candidates_array = - attribute_start_with_docids(ctx, id as u32, query)?; + attribute_start_with_docids(ctx, id, query)?; attribute_candidates_array.push(attribute_allowed_docids); candidates |= intersection_of(attribute_candidates_array.iter().collect()); } @@ -199,8 +199,7 @@ fn resolve_state( let mut candidates = RoaringBitmap::new(); let attributes_ids = ctx.searchable_fields_ids()?; for id in attributes_ids { - let attribute_candidates_array = - attribute_start_with_docids(ctx, id as u32, query)?; + let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; candidates |= intersection_of(attribute_candidates_array.iter().collect()); } @@ -290,12 +289,12 @@ fn resolve_state( fn attribute_start_with_docids( ctx: &dyn Context, - attribute_id: u32, + attribute_id: FieldId, query: &[ExactQueryPart], ) -> heed::Result> { let mut attribute_candidates_array = Vec::new(); // start from attribute first position - let mut pos = attribute_id * 1000; + let mut pos = absolute_from_relative_position(attribute_id, 0); for part in query { use ExactQueryPart::*; match part { diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index ca65f0874..fa1381412 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -10,8 +10,7 @@ use serde_json::Value; use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; use crate::error::{InternalError, SerializationError}; -use crate::proximity::ONE_ATTRIBUTE; -use crate::{FieldId, Result}; +use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE}; /// Extracts the word and positions where this word appear and /// prefixes it by the document id. @@ -24,7 +23,10 @@ pub fn extract_docid_word_positions( indexer: GrenadParameters, searchable_fields: &Option>, stop_words: Option<&fst::Set<&[u8]>>, + max_positions_per_attributes: Option, ) -> Result<(RoaringBitmap, grenad::Reader)> { + let max_positions_per_attributes = max_positions_per_attributes + .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); let max_memory = indexer.max_memory_by_thread(); let mut documents_ids = RoaringBitmap::new(); @@ -63,7 +65,7 @@ pub fn extract_docid_word_positions( if let Some(field) = json_to_string(&value, &mut field_buffer) { let analyzed = analyzer.analyze(field); let tokens = process_tokens(analyzed.tokens()) - .take_while(|(p, _)| (*p as u32) < ONE_ATTRIBUTE); + .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); for (index, token) in tokens { let token = token.text().trim(); @@ -71,10 +73,10 @@ pub fn extract_docid_word_positions( key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(token.as_bytes()); - let position: u32 = index + let position: u16 = index .try_into() .map_err(|_| SerializationError::InvalidNumberSerialization)?; - let position = field_id as u32 * ONE_ATTRIBUTE + position; + let position = absolute_from_relative_position(field_id, position); docid_word_positions_sorter .insert(&key_buffer, &position.to_ne_bytes())?; } diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs index 1fbc55714..4e25cb4f6 100644 --- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs +++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs @@ -10,8 +10,7 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::proximity::extract_position; -use crate::{DocumentId, FieldId, Result}; +use crate::{relative_from_absolute_position, DocumentId, FieldId, Result}; /// Extracts the field id word count and the documents ids where /// this field id with this amount of words appear. @@ -53,8 +52,8 @@ pub fn extract_fid_word_count_docids( } for position in read_u32_ne_bytes(value) { - let (field_id, position) = extract_position(position); - let word_count = position + 1; + let (field_id, position) = relative_from_absolute_position(position); + let word_count = position as u32 + 1; let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0); *value = cmp::max(*value, word_count); diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 0406e8ef4..0f04418ed 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -42,6 +42,7 @@ pub(crate) fn data_from_obkv_documents( primary_key_id: FieldId, geo_field_id: Option, stop_words: Option>, + max_positions_per_attributes: Option, ) -> Result<()> { let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = obkv_chunks .par_bridge() @@ -55,6 +56,7 @@ pub(crate) fn data_from_obkv_documents( primary_key_id, geo_field_id, &stop_words, + max_positions_per_attributes, ) }) .collect(); @@ -177,6 +179,7 @@ fn extract_documents_data( primary_key_id: FieldId, geo_field_id: Option, stop_words: &Option>, + max_positions_per_attributes: Option, ) -> Result<( grenad::Reader, (grenad::Reader, grenad::Reader), @@ -206,6 +209,7 @@ fn extract_documents_data( indexer.clone(), searchable_fields, stop_words.as_ref(), + max_positions_per_attributes, )?; // send documents_ids to DB writer diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b0dbd9c3e..92bcab0e9 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -68,6 +68,7 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, + pub(crate) max_positions_per_attributes: Option, facet_level_group_size: Option, facet_min_level_size: Option, words_prefix_threshold: Option, @@ -104,6 +105,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { update_method: IndexDocumentsMethod::ReplaceDocuments, autogenerate_docids: false, update_id, + max_positions_per_attributes: None, } } @@ -262,6 +264,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { primary_key_id, geo_field_id, stop_words, + self.max_positions_per_attributes, ) }); @@ -284,6 +287,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { chunk_compression_type: self.chunk_compression_type, chunk_compression_level: self.chunk_compression_level, thread_pool: self.thread_pool, + max_positions_per_attributes: self.max_positions_per_attributes, update_id: self.update_id, }; let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?; @@ -884,6 +888,44 @@ mod tests { wtxn.commit().unwrap(); } + #[test] + fn index_more_than_1000_positions_in_a_field() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(50 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + + let mut big_object = HashMap::new(); + big_object.insert(S("id"), "wow"); + let content: String = + (0..=u16::MAX).into_iter().map(|p| p.to_string()).reduce(|a, b| a + " " + &b).unwrap(); + big_object.insert("content".to_string(), &content); + + let mut cursor = Cursor::new(Vec::new()); + + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + builder.add_documents(big_object).unwrap(); + builder.finish().unwrap(); + cursor.set_position(0); + let content = DocumentBatchReader::from_reader(cursor).unwrap(); + + let builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.execute(content, |_, _| ()).unwrap(); + + wtxn.commit().unwrap(); + + let mut rtxn = index.read_txn().unwrap(); + + assert!(index.word_docids.get(&mut rtxn, "0").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "64").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "256").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "1024").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "32768").unwrap().is_some()); + assert!(index.word_docids.get(&mut rtxn, "65535").unwrap().is_some()); + } + #[test] fn index_documents_with_zeroes() { let path = tempfile::tempdir().unwrap(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 4aa79f6e3..41c156676 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -69,6 +69,7 @@ pub struct Settings<'a, 't, 'u, 'i> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, + pub(crate) max_positions_per_attributes: Option, update_id: u64, searchable_fields: Setting>, @@ -108,6 +109,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { synonyms: Setting::NotSet, primary_key: Setting::NotSet, update_id, + max_positions_per_attributes: None, } } @@ -237,6 +239,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { indexing_builder.chunk_compression_type = self.chunk_compression_type; indexing_builder.chunk_compression_level = self.chunk_compression_level; indexing_builder.thread_pool = self.thread_pool; + indexing_builder.max_positions_per_attributes = self.max_positions_per_attributes; indexing_builder.execute_raw(output, &cb)?; Ok(()) diff --git a/milli/src/update/update_builder.rs b/milli/src/update/update_builder.rs index 561c4bc50..20ec28e06 100644 --- a/milli/src/update/update_builder.rs +++ b/milli/src/update/update_builder.rs @@ -12,6 +12,7 @@ pub struct UpdateBuilder<'a> { pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) thread_pool: Option<&'a ThreadPool>, + pub(crate) max_positions_per_attributes: Option, pub(crate) update_id: u64, } @@ -25,6 +26,7 @@ impl<'a> UpdateBuilder<'a> { chunk_compression_type: CompressionType::None, chunk_compression_level: None, thread_pool: None, + max_positions_per_attributes: None, update_id, } } @@ -57,6 +59,10 @@ impl<'a> UpdateBuilder<'a> { self.thread_pool = Some(thread_pool); } + pub fn max_positions_per_attributes(&mut self, max_positions_per_attributes: u32) { + self.max_positions_per_attributes = Some(max_positions_per_attributes); + } + pub fn clear_documents<'t, 'u, 'i>( self, wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -87,6 +93,7 @@ impl<'a> UpdateBuilder<'a> { builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.thread_pool = self.thread_pool; + builder.max_positions_per_attributes = self.max_positions_per_attributes; builder } @@ -105,6 +112,7 @@ impl<'a> UpdateBuilder<'a> { builder.chunk_compression_type = self.chunk_compression_type; builder.chunk_compression_level = self.chunk_compression_level; builder.thread_pool = self.thread_pool; + builder.max_positions_per_attributes = self.max_positions_per_attributes; builder }