Make sure that long words are correctly skipped

This commit is contained in:
Kerollmops 2022-09-07 14:12:08 +02:00
parent c83c3cd796
commit fe3973a51c
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 8 additions and 3 deletions

View File

@@ -7,7 +7,9 @@ use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
-use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
+use super::helpers::{
+    concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters, MAX_WORD_LENGTH,
+};
use crate::error::{InternalError, SerializationError}; use crate::error::{InternalError, SerializationError};
use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE}; use crate::{absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};
@@ -68,7 +70,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
for (index, token) in tokens { for (index, token) in tokens {
let token = token.lemma().trim(); let token = token.lemma().trim();
-if !token.is_empty() {
+if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
key_buffer.truncate(mem::size_of::<u32>()); key_buffer.truncate(mem::size_of::<u32>());
key_buffer.extend_from_slice(token.as_bytes()); key_buffer.extend_from_slice(token.as_bytes());

View File

@@ -18,8 +18,11 @@ pub use merge_functions::{
roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn,
}; };
/// The maximum length of a word, in bytes. Longer words are skipped when extracting word positions.
pub const MAX_WORD_LENGTH: usize = 250;
pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool { pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
-key.as_ref().len() <= 511 && !key.as_ref().is_empty()
+key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty()
} }
/// Divides one slice into two at an index, returns `None` if mid is out of bounds. /// Divides one slice into two at an index, returns `None` if mid is out of bounds.