Remove the useless threshold when computing the word prefix pair proximity

2025-01-19 01:18:31 +08:00 · 2022-01-12 15:23:46 +01:00 · 2022-01-12 15:23:46 +01:00 · 23ea3ad738
commit 23ea3ad738
parent e3c34684c6
1 changed files with 2 additions and 22 deletions
--- a/milli/src/update/word_prefix_pair_proximity_docids.rs
+++ b/milli/src/update/word_prefix_pair_proximity_docids.rs
@ -18,7 +18,6 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
    pub(crate) chunk_compression_level: Option<u32>,
    pub(crate) max_nb_chunks: Option<usize>,
    pub(crate) max_memory: Option<usize>,
    threshold: u32,
 }
 impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
@ -33,21 +32,9 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
            chunk_compression_level: None,
            max_nb_chunks: None,
            max_memory: None,
            threshold: 100,
        }
    }
    /// Sets the threshold used when deciding whether a word prefix is written
    /// to the word prefixes data structures.
    ///
    /// A prefix is kept only when it matches more words of the dictionary than
    /// this threshold.
    ///
    /// The default value is 100. Values below 50 are raised to 50.
    ///
    /// Returns `&mut Self` so that calls can be chained.
    pub fn threshold(&mut self, value: u32) -> &mut Self {
        // Lower bound applied to every requested threshold.
        const MIN_THRESHOLD: u32 = 50;

        self.threshold = std::cmp::max(MIN_THRESHOLD, value);
        self
    }
    #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
    pub fn execute(self) -> Result<()> {
        debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
@ -81,7 +68,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
                    write_prefixes_in_sorter(
                        &mut prefixes_cache,
                        &mut word_prefix_pair_proximity_docids_sorter,
                        self.threshold,
                    )?;
                    prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0]))
                }
@ -109,7 +95,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
        write_prefixes_in_sorter(
            &mut prefixes_cache,
            &mut word_prefix_pair_proximity_docids_sorter,
            self.threshold,
        )?;
        drop(prefix_fst);
@ -131,15 +116,10 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
 fn write_prefixes_in_sorter(
    prefixes: &mut HashMap<Vec<u8>, Vec<&[u8]>>,
    sorter: &mut grenad::Sorter<MergeFn>,
    min_word_per_prefix: u32,
 ) -> Result<()> {
    for (key, data_slices) in prefixes.drain() {
-        // if the number of words prefixed by the prefix is higher than the threshold,
+        for data in data_slices {
-        // we insert it in the sorter.
+            sorter.insert(&key, data)?;
        if data_slices.len() > min_word_per_prefix as usize {
            for data in data_slices {
                sorter.insert(&key, data)?;
            }
        }
    }