From f13e076b8ab0e23013b020e173bf5ffd5c7b628f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 16 Sep 2024 14:40:40 +0200 Subject: [PATCH] Use hashmap instead of Btree in wpp extractor --- .../extract_word_pair_proximity_docids.rs | 88 ++++++++++--------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 7b3706424..82007f9ba 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, VecDeque}; +use std::collections::{HashMap, VecDeque}; use heed::RoTxn; use itertools::merge_join_by; @@ -35,10 +35,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { cached_sorter: &mut CboCachedSorter, document_change: DocumentChange, ) -> Result<()> { - /// TODO: mutualize those buffers let mut key_buffer = Vec::new(); - let mut add_word_pair_proximity = BTreeMap::new(); - let mut del_word_pair_proximity = BTreeMap::new(); + let mut word_pair_proximity = HashMap::new(); let mut word_positions: VecDeque<(String, u16)> = VecDeque::with_capacity(MAX_DISTANCE as usize); @@ -51,7 +49,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { document_tokenizer, fields_ids_map, &mut word_positions, - &mut del_word_pair_proximity, + &mut |(w1, w2), prox| { + word_pair_proximity + .entry((w1, w2)) + .and_modify(|(del_p, _add_p)| { + *del_p = std::cmp::min(*del_p, prox); + }) + .or_insert((prox, 0)); + }, )?; } DocumentChange::Update(inner) => { @@ -61,7 +66,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { document_tokenizer, fields_ids_map, &mut word_positions, - &mut del_word_pair_proximity, + &mut |(w1, w2), prox| { + word_pair_proximity + .entry((w1, w2)) + .and_modify(|(del_p, _add_p)| { + *del_p = std::cmp::min(*del_p, prox); + }) + .or_insert((prox, 0)); + }, )?; let document = inner.new(); process_document_tokens( @@ -69,7 +81,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { document_tokenizer, fields_ids_map, &mut word_positions, - &mut add_word_pair_proximity, + &mut |(w1, w2), prox| { + word_pair_proximity + .entry((w1, w2)) + .and_modify(|(_del_p, add_p)| { + *add_p = std::cmp::min(*add_p, prox); + }) + .or_insert((0, prox)); + }, )?; } DocumentChange::Insertion(inner) => { @@ -79,35 +98,23 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { document_tokenizer, fields_ids_map, &mut word_positions, - &mut add_word_pair_proximity, + &mut |(w1, w2), prox| { + word_pair_proximity + .entry((w1, w2)) + .and_modify(|(_del_p, add_p)| { + *add_p = std::cmp::min(*add_p, prox); + }) + .or_insert((0, prox)); + }, )?; } } - use itertools::EitherOrBoth::*; - for eob in - merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| { - d.cmp(a) - }) - { - match eob { - Left(((w1, w2), prox)) => { - let key = build_key(*prox, w1, w2, &mut key_buffer); - cached_sorter.insert_del_u32(key, docid)?; - } - Right(((w1, w2), prox)) => { - let key = build_key(*prox, w1, w2, &mut key_buffer); - cached_sorter.insert_add_u32(key, docid)?; - } - Both(((w1, w2), del_prox), (_, add_prox)) => { - if del_prox != add_prox { - let key = build_key(*del_prox, w1, w2, &mut key_buffer); - cached_sorter.insert_del_u32(key, docid)?; - let key = build_key(*add_prox, w1, w2, &mut key_buffer); - cached_sorter.insert_add_u32(key, docid)?; - } - } - } + for ((w1, w2), (del_p, add_p)) in word_pair_proximity.iter() { + let key = build_key(*del_p, w1, w2, &mut key_buffer); + cached_sorter.insert_del_u32(key, docid)?; + let key = build_key(*add_p, w1, w2, &mut key_buffer); + cached_sorter.insert_add_u32(key, docid)?; } Ok(()) @@ -125,18 +132,19 @@ fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec) -> & fn word_positions_into_word_pair_proximity( word_positions: &mut VecDeque<(String, u16)>, - word_pair_proximity: &mut BTreeMap<(String, String), u8>, + word_pair_proximity: &mut dyn FnMut((String, String), u8), ) -> Result<()> { let (head_word, head_position) = word_positions.pop_front().unwrap(); for (word, position) in word_positions.iter() { let prox = index_proximity(head_position as u32, *position as u32) as u8; if prox > 0 && prox < MAX_DISTANCE as u8 { - word_pair_proximity - .entry((head_word.clone(), word.clone())) - .and_modify(|p| { - *p = std::cmp::min(*p, prox); - }) - .or_insert(prox); + word_pair_proximity((head_word.clone(), word.clone()), prox); + // word_pair_proximity + // .entry((head_word.clone(), word.clone())) + // .and_modify(|p| { + // *p = std::cmp::min(*p, prox); + // }) + // .or_insert(prox); } } Ok(()) @@ -147,7 +155,7 @@ fn process_document_tokens( document_tokenizer: &DocumentTokenizer, fields_ids_map: &mut GlobalFieldsIdsMap, word_positions: &mut VecDeque<(String, u16)>, - word_pair_proximity: &mut BTreeMap<(String, String), u8>, + word_pair_proximity: &mut dyn FnMut((String, String), u8), ) -> Result<()> { let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| { // drain the proximity window until the head word is considered close to the word we are inserting.