Fix clippy `too_many_arguments` warning: group `key_buffer` and `field_buffer` into a `Buffers` struct passed to `extract_tokens_from_document`, and allow the lint on that function

This commit is contained in:
ManyTheFish 2023-03-09 11:19:13 +01:00
parent b4b859ec8c
commit 5deea631ea

View File

@ -45,8 +45,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
max_memory, max_memory,
); );
let mut key_buffer = Vec::new(); let mut buffers = Buffers::default();
let mut field_buffer = String::new();
let mut tokenizer_builder = TokenizerBuilder::new(); let mut tokenizer_builder = TokenizerBuilder::new();
if let Some(stop_words) = stop_words { if let Some(stop_words) = stop_words {
tokenizer_builder.stop_words(stop_words); tokenizer_builder.stop_words(stop_words);
@ -62,8 +61,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
let obkv = KvReader::<FieldId>::new(value); let obkv = KvReader::<FieldId>::new(value);
documents_ids.push(document_id); documents_ids.push(document_id);
key_buffer.clear(); buffers.key_buffer.clear();
key_buffer.extend_from_slice(&document_id.to_be_bytes()); buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes());
let mut script_language_word_count = HashMap::new(); let mut script_language_word_count = HashMap::new();
@ -72,8 +71,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
searchable_fields, searchable_fields,
&tokenizer, &tokenizer,
max_positions_per_attributes, max_positions_per_attributes,
&mut key_buffer, &mut buffers,
&mut field_buffer,
&mut script_language_word_count, &mut script_language_word_count,
&mut docid_word_positions_sorter, &mut docid_word_positions_sorter,
)?; )?;
@ -105,8 +103,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
searchable_fields, searchable_fields,
&tokenizer, &tokenizer,
max_positions_per_attributes, max_positions_per_attributes,
&mut key_buffer, &mut buffers,
&mut field_buffer,
&mut script_language_word_count, &mut script_language_word_count,
&mut docid_word_positions_sorter, &mut docid_word_positions_sorter,
)?; )?;
@ -127,22 +124,20 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
.map(|reader| (documents_ids, reader, script_language_docids)) .map(|reader| (documents_ids, reader, script_language_docids))
} }
#[allow(clippy::too_many_arguments)]
fn extract_tokens_from_document<T: AsRef<[u8]>>( fn extract_tokens_from_document<T: AsRef<[u8]>>(
obkv: &KvReader<FieldId>, obkv: &KvReader<FieldId>,
searchable_fields: &Option<HashSet<FieldId>>, searchable_fields: &Option<HashSet<FieldId>>,
tokenizer: &Tokenizer<T>, tokenizer: &Tokenizer<T>,
max_positions_per_attributes: u32, max_positions_per_attributes: u32,
key_buffer: &mut Vec<u8>, buffers: &mut Buffers,
field_buffer: &mut String,
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>, script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>, docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> { ) -> Result<()> {
for (field_id, field_bytes) in obkv.iter() { for (field_id, field_bytes) in obkv.iter() {
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
field_buffer.clear(); buffers.field_buffer.clear();
if let Some(field) = json_to_string(&value, field_buffer) { if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
let tokens = process_tokens(tokenizer.tokenize(field)) let tokens = process_tokens(tokenizer.tokenize(field))
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
@ -159,14 +154,15 @@ fn extract_tokens_from_document<T: AsRef<[u8]>>(
} }
let token = token.lemma().trim(); let token = token.lemma().trim();
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
key_buffer.truncate(mem::size_of::<u32>()); buffers.key_buffer.truncate(mem::size_of::<u32>());
key_buffer.extend_from_slice(token.as_bytes()); buffers.key_buffer.extend_from_slice(token.as_bytes());
let position: u16 = index let position: u16 = index
.try_into() .try_into()
.map_err(|_| SerializationError::InvalidNumberSerialization)?; .map_err(|_| SerializationError::InvalidNumberSerialization)?;
let position = absolute_from_relative_position(field_id, position); let position = absolute_from_relative_position(field_id, position);
docid_word_positions_sorter.insert(&key_buffer, position.to_ne_bytes())?; docid_word_positions_sorter
.insert(&buffers.key_buffer, position.to_ne_bytes())?;
} }
} }
} }
@ -290,3 +286,9 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)
let total: usize = languages_frequency.iter().map(|(_, c)| c).sum(); let total: usize = languages_frequency.iter().map(|(_, c)| c).sum();
total / 10 // 10% is a completely arbitrary value. total / 10 // 10% is a completely arbitrary value.
} }
/// Scratch buffers reused across documents/fields, grouped into one struct so
/// they can be threaded through `extract_tokens_from_document` as a single
/// argument (reduces the parameter count flagged by clippy).
#[derive(Default)]
struct Buffers {
    // Key for the docid-word-positions sorter: the document id as big-endian
    // bytes, followed by a token; truncated back to the id prefix
    // (`mem::size_of::<u32>()`) before each new token is appended.
    key_buffer: Vec<u8>,
    // Holds the textual rendering of one field's JSON value (filled by
    // `json_to_string`); cleared before each field is processed.
    field_buffer: String,
}