diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 9a6060805..3cc842b00 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -7,7 +7,9 @@ use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder}; use roaring::RoaringBitmap; use serde_json::Value; -use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; +use super::helpers::{ + concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters, MAX_WORD_LENGTH, +}; use crate::error::{InternalError, SerializationError}; use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE}; @@ -68,7 +70,7 @@ pub fn extract_docid_word_positions( for (index, token) in tokens { let token = token.lemma().trim(); - if !token.is_empty() { + if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { key_buffer.truncate(mem::size_of::<u32>()); key_buffer.extend_from_slice(token.as_bytes()); diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 79d0d0466..6466a636b 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -18,8 +18,11 @@ pub use merge_functions::{ roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, }; +/// The maximum length a word can be +pub const MAX_WORD_LENGTH: usize = 250; + pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { - key.as_ref().len() <= 511 && !key.as_ref().is_empty() + key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty() } /// Divides one slice into two at an index, returns `None` if mid is out of bounds. 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 23618b478..365b0d024 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1741,4 +1741,22 @@ mod tests { index.add_documents(doc3).unwrap_err(); index.add_documents(doc4).unwrap_err(); } + + #[test] + fn long_words_must_be_skipped() { + let index = TempIndex::new(); + + // this is obviously too long + let long_word = "lol".repeat(1000); + let doc1 = documents! {[{ + "id": "1", + "title": long_word.clone(), + }]}; + + index.add_documents(doc1).unwrap(); + + let rtxn = index.read_txn().unwrap(); + let words_fst = index.words_fst(&rtxn).unwrap(); + assert!(!words_fst.contains(&long_word)); + } }