Avoid iterating over big databases when there are no prefixes to process

This commit is contained in:
Kerollmops 2022-03-01 18:02:12 +01:00
parent a8d28e364d
commit 1ae13c1374
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
3 changed files with 111 additions and 98 deletions

View File

@ -50,35 +50,38 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
self.max_memory, self.max_memory,
); );
let mut new_word_docids_iter = new_word_docids.into_cursor()?; if !common_prefix_fst_words.is_empty() {
let mut current_prefixes: Option<&&[String]> = None; let mut new_word_docids_iter = new_word_docids.into_cursor()?;
let mut prefixes_cache = HashMap::new(); let mut current_prefixes: Option<&&[String]> = None;
while let Some((word, data)) = new_word_docids_iter.move_on_next()? { let mut prefixes_cache = HashMap::new();
current_prefixes = match current_prefixes.take() { while let Some((word, data)) = new_word_docids_iter.move_on_next()? {
Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), current_prefixes = match current_prefixes.take() {
_otherwise => { Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes),
write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; _otherwise => {
common_prefix_fst_words write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?;
.iter() common_prefix_fst_words
.find(|prefixes| word.starts_with(&prefixes[0].as_bytes())) .iter()
} .find(|prefixes| word.starts_with(&prefixes[0].as_bytes()))
}; }
};
if let Some(prefixes) = current_prefixes { if let Some(prefixes) = current_prefixes {
for prefix in prefixes.iter() { for prefix in prefixes.iter() {
if word.starts_with(prefix.as_bytes()) { if word.starts_with(prefix.as_bytes()) {
match prefixes_cache.get_mut(prefix.as_bytes()) { match prefixes_cache.get_mut(prefix.as_bytes()) {
Some(value) => value.push(data.to_owned()), Some(value) => value.push(data.to_owned()),
None => { None => {
prefixes_cache.insert(prefix.clone().into(), vec![data.to_owned()]); prefixes_cache
.insert(prefix.clone().into(), vec![data.to_owned()]);
}
} }
} }
} }
} }
} }
}
write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?; write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_docids_sorter)?;
}
// We fetch the docids associated to the newly added word prefix fst only. // We fetch the docids associated to the newly added word prefix fst only.
let db = self.index.word_docids.remap_data_type::<ByteSlice>(); let db = self.index.word_docids.remap_data_type::<ByteSlice>();

View File

@ -83,70 +83,76 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
self.max_memory, self.max_memory,
); );
// We compute the prefix docids associated with the common prefixes between if !common_prefix_fst_words.is_empty() {
// the old and new word prefix fst. // We compute the prefix docids associated with the common prefixes between
let mut buffer = Vec::new(); // the old and new word prefix fst.
let mut current_prefixes: Option<&&[String]> = None; let mut buffer = Vec::new();
let mut prefixes_cache = HashMap::new(); let mut current_prefixes: Option<&&[String]> = None;
while let Some((key, data)) = new_wppd_iter.move_on_next()? { let mut prefixes_cache = HashMap::new();
let (w1, w2, prox) = StrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; while let Some((key, data)) = new_wppd_iter.move_on_next()? {
if prox > self.max_proximity { let (w1, w2, prox) =
continue; StrStrU8Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
if prox > self.max_proximity {
continue;
}
insert_current_prefix_data_in_sorter(
&mut buffer,
&mut current_prefixes,
&mut prefixes_cache,
&mut word_prefix_pair_proximity_docids_sorter,
common_prefix_fst_words,
self.max_prefix_length,
w1,
w2,
prox,
data,
)?;
} }
insert_current_prefix_data_in_sorter( write_prefixes_in_sorter(
&mut buffer,
&mut current_prefixes,
&mut prefixes_cache, &mut prefixes_cache,
&mut word_prefix_pair_proximity_docids_sorter, &mut word_prefix_pair_proximity_docids_sorter,
common_prefix_fst_words,
self.max_prefix_length,
w1,
w2,
prox,
data,
)?; )?;
} }
write_prefixes_in_sorter( if !new_prefix_fst_words.is_empty() {
&mut prefixes_cache, // We compute the prefix docids associated with the newly added prefixes
&mut word_prefix_pair_proximity_docids_sorter, // in the new word prefix fst.
)?; let mut db_iter = self
.index
.word_pair_proximity_docids
.remap_data_type::<ByteSlice>()
.iter(self.wtxn)?;
// We compute the prefix docids associated with the newly added prefixes let mut buffer = Vec::new();
// in the new word prefix fst. let mut current_prefixes: Option<&&[String]> = None;
let mut db_iter = let mut prefixes_cache = HashMap::new();
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(self.wtxn)?; while let Some(((w1, w2, prox), data)) = db_iter.next().transpose()? {
if prox > self.max_proximity {
continue;
}
let mut buffer = Vec::new(); insert_current_prefix_data_in_sorter(
let mut current_prefixes: Option<&&[String]> = None; &mut buffer,
let mut prefixes_cache = HashMap::new(); &mut current_prefixes,
while let Some(((w1, w2, prox), data)) = db_iter.next().transpose()? { &mut prefixes_cache,
if prox > self.max_proximity { &mut word_prefix_pair_proximity_docids_sorter,
continue; &new_prefix_fst_words,
self.max_prefix_length,
w1,
w2,
prox,
data,
)?;
} }
insert_current_prefix_data_in_sorter( write_prefixes_in_sorter(
&mut buffer,
&mut current_prefixes,
&mut prefixes_cache, &mut prefixes_cache,
&mut word_prefix_pair_proximity_docids_sorter, &mut word_prefix_pair_proximity_docids_sorter,
&new_prefix_fst_words,
self.max_prefix_length,
w1,
w2,
prox,
data,
)?; )?;
} }
write_prefixes_in_sorter(
&mut prefixes_cache,
&mut word_prefix_pair_proximity_docids_sorter,
)?;
drop(db_iter);
// All of the word prefix pairs in the database that have a w2 // All of the word prefix pairs in the database that have a w2
// that is contained in the `suppr_pw` set must be removed as well. // that is contained in the `suppr_pw` set must be removed as well.
let mut iter = self let mut iter = self

View File

@ -74,42 +74,46 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
let mut new_word_position_docids_iter = new_word_position_docids.into_cursor()?; let mut new_word_position_docids_iter = new_word_position_docids.into_cursor()?;
// We fetch all the new common prefixes between the previous and new prefix fst. if !common_prefix_fst_words.is_empty() {
let mut buffer = Vec::new(); // We fetch all the new common prefixes between the previous and new prefix fst.
let mut current_prefixes: Option<&&[String]> = None; let mut buffer = Vec::new();
let mut prefixes_cache = HashMap::new(); let mut current_prefixes: Option<&&[String]> = None;
while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? { let mut prefixes_cache = HashMap::new();
let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?; while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? {
let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
current_prefixes = match current_prefixes.take() { current_prefixes = match current_prefixes.take() {
Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes), Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes),
_otherwise => { _otherwise => {
write_prefixes_in_sorter( write_prefixes_in_sorter(
&mut prefixes_cache, &mut prefixes_cache,
&mut prefix_position_docids_sorter, &mut prefix_position_docids_sorter,
)?; )?;
common_prefix_fst_words.iter().find(|prefixes| word.starts_with(&prefixes[0])) common_prefix_fst_words
} .iter()
}; .find(|prefixes| word.starts_with(&prefixes[0]))
}
};
if let Some(prefixes) = current_prefixes { if let Some(prefixes) = current_prefixes {
for prefix in prefixes.iter() { for prefix in prefixes.iter() {
if word.starts_with(prefix) { if word.starts_with(prefix) {
buffer.clear(); buffer.clear();
buffer.extend_from_slice(prefix.as_bytes()); buffer.extend_from_slice(prefix.as_bytes());
buffer.extend_from_slice(&pos.to_be_bytes()); buffer.extend_from_slice(&pos.to_be_bytes());
match prefixes_cache.get_mut(&buffer) { match prefixes_cache.get_mut(&buffer) {
Some(value) => value.push(data.to_owned()), Some(value) => value.push(data.to_owned()),
None => { None => {
prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]); prefixes_cache.insert(buffer.clone(), vec![data.to_owned()]);
}
} }
} }
} }
} }
} }
}
write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_position_docids_sorter)?; write_prefixes_in_sorter(&mut prefixes_cache, &mut prefix_position_docids_sorter)?;
}
// We fetch the docids associated to the newly added word prefix fst only. // We fetch the docids associated to the newly added word prefix fst only.
let db = self.index.word_position_docids.remap_data_type::<ByteSlice>(); let db = self.index.word_position_docids.remap_data_type::<ByteSlice>();