From fe3973a51c53613d24a263a25fb0417be55d7cbe Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 7 Sep 2022 14:12:08 +0200 Subject: [PATCH] Make sure that long words are correctly skipped --- .../index_documents/extract/extract_docid_word_positions.rs | 6 ++++-- milli/src/update/index_documents/helpers/mod.rs | 5 ++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 9a6060805..3cc842b00 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -7,7 +7,9 @@ use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder}; use roaring::RoaringBitmap; use serde_json::Value; -use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; +use super::helpers::{ + concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters, MAX_WORD_LENGTH, +}; use crate::error::{InternalError, SerializationError}; use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE}; @@ -68,7 +70,7 @@ pub fn extract_docid_word_positions( for (index, token) in tokens { let token = token.lemma().trim(); - if !token.is_empty() { + if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { key_buffer.truncate(mem::size_of::<u32>()); key_buffer.extend_from_slice(token.as_bytes()); diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 79d0d0466..6466a636b 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -18,8 +18,11 @@ pub use merge_functions::{ roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, }; +/// The maximum length a word can be +pub const MAX_WORD_LENGTH: usize = 250; + pub fn valid_lmdb_key(key: impl 
AsRef<[u8]>) -> bool { - key.as_ref().len() <= 511 && !key.as_ref().is_empty() + key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty() } /// Divides one slice into two at an index, returns `None` if mid is out of bounds.