Remove the useless threshold when computing the word prefix pair proximity

This commit is contained in:
Clément Renault 2022-01-12 15:23:46 +01:00
parent e3c34684c6
commit 23ea3ad738
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4

View File

@ -18,7 +18,6 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
pub(crate) chunk_compression_level: Option<u32>, pub(crate) chunk_compression_level: Option<u32>,
pub(crate) max_nb_chunks: Option<usize>, pub(crate) max_nb_chunks: Option<usize>,
pub(crate) max_memory: Option<usize>, pub(crate) max_memory: Option<usize>,
threshold: u32,
} }
impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
@ -33,21 +32,9 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
chunk_compression_level: None, chunk_compression_level: None,
max_nb_chunks: None, max_nb_chunks: None,
max_memory: None, max_memory: None,
threshold: 100,
} }
} }
/// Set the number of words required for a prefix to be part of the words prefixes
/// database. If a word prefix matches more than this number of words in the
/// dictionary, this prefix is added to the words prefixes data structures.
///
/// Default value is 100. Values lower than 50 are clamped up to 50.
///
/// Returns `&mut Self` so that calls can be chained.
pub fn threshold(&mut self, value: u32) -> &mut Self {
// `max(50)` enforces the lower bound: anything below 50 is stored as 50.
self.threshold = value.max(50);
self
}
#[logging_timer::time("WordPrefixPairProximityDocids::{}")] #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
pub fn execute(self) -> Result<()> { pub fn execute(self) -> Result<()> {
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
@ -81,7 +68,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
write_prefixes_in_sorter( write_prefixes_in_sorter(
&mut prefixes_cache, &mut prefixes_cache,
&mut word_prefix_pair_proximity_docids_sorter, &mut word_prefix_pair_proximity_docids_sorter,
self.threshold,
)?; )?;
prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0])) prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0]))
} }
@ -109,7 +95,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
write_prefixes_in_sorter( write_prefixes_in_sorter(
&mut prefixes_cache, &mut prefixes_cache,
&mut word_prefix_pair_proximity_docids_sorter, &mut word_prefix_pair_proximity_docids_sorter,
self.threshold,
)?; )?;
drop(prefix_fst); drop(prefix_fst);
@ -131,15 +116,10 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
fn write_prefixes_in_sorter( fn write_prefixes_in_sorter(
prefixes: &mut HashMap<Vec<u8>, Vec<&[u8]>>, prefixes: &mut HashMap<Vec<u8>, Vec<&[u8]>>,
sorter: &mut grenad::Sorter<MergeFn>, sorter: &mut grenad::Sorter<MergeFn>,
min_word_per_prefix: u32,
) -> Result<()> { ) -> Result<()> {
for (key, data_slices) in prefixes.drain() { for (key, data_slices) in prefixes.drain() {
// if the number of words prefixed by the prefix is higher than the threshold, for data in data_slices {
// we insert it in the sorter. sorter.insert(&key, data)?;
if data_slices.len() > min_word_per_prefix as usize {
for data in data_slices {
sorter.insert(&key, data)?;
}
} }
} }