mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-27 04:25:06 +08:00
Merge #431
431: Fix and improve word prefix pair proximity r=ManyTheFish a=Kerollmops This PR first fixes the algorithm we used to select and compute the word prefix pair proximity database. The previous version was skipping nearly all of the prefixes. The issue is that this fix made the method take more time, and we were trying to reduce the time spent in it. With `@ManyTheFish` we found out that we could skip some of the work we were doing by: - discarding the prefixes that were shorter than a specific threshold (default: 2). - discarding the word prefix pairs with proximity bigger than a specific threshold (default: 4). - removing the unused threshold that was specifying a minimum amount of word docids to merge. We will take more time to do some more optimization, like stopping clearing and recomputing the database from scratch; instead we will compute the subsets of keys to create, keep and merge. That change is a little bit more complex than what this PR does. I keep this PR as a draft as I want to further test whether the real gain is sufficient and whether the approach is valid. I advise reviewers to review commit by commit to see the changes bit by bit; reviewing the whole PR can be hard. Co-authored-by: Clément Renault <clement@meilisearch.com>
This commit is contained in:
commit
38d23546a5
@ -18,7 +18,8 @@ pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
pub(crate) chunk_compression_level: Option<u32>,
|
||||
pub(crate) max_nb_chunks: Option<usize>,
|
||||
pub(crate) max_memory: Option<usize>,
|
||||
threshold: u32,
|
||||
max_proximity: u8,
|
||||
max_prefix_length: usize,
|
||||
}
|
||||
|
||||
impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
@ -33,18 +34,29 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
chunk_compression_level: None,
|
||||
max_nb_chunks: None,
|
||||
max_memory: None,
|
||||
threshold: 100,
|
||||
max_proximity: 4,
|
||||
max_prefix_length: 2,
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the number of words required to make a prefix be part of the words prefixes
|
||||
/// database. If a word prefix is supposed to match more than this number of words in the
|
||||
/// dictionary, then this prefix is added to the words prefixes data structures.
|
||||
/// Set the maximum proximity required to make a prefix be part of the words prefixes
|
||||
/// database. If two words are too far from the threshold the associated documents will
|
||||
/// not be part of the prefix database.
|
||||
///
|
||||
/// Default value is 100. This value must be higher than 50 and will be clamped
|
||||
/// to this bound otherwise.
|
||||
pub fn threshold(&mut self, value: u32) -> &mut Self {
|
||||
self.threshold = value.max(50);
|
||||
/// Default value is 4. This value must be lower or equal than 7 and will be clamped
|
||||
/// to this bound otherwise.
|
||||
pub fn max_proximity(&mut self, value: u8) -> &mut Self {
|
||||
self.max_proximity = value.max(7);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
|
||||
/// prefixes database. If the prefix length is higher than the threshold, the associated documents
|
||||
/// will not be part of the prefix database.
|
||||
///
|
||||
/// Default value is 2.
|
||||
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
|
||||
self.max_prefix_length = value;
|
||||
self
|
||||
}
|
||||
|
||||
@ -64,28 +76,29 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
);
|
||||
|
||||
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||
let prefix_fst_keys = prefix_fst.into_stream().into_bytes();
|
||||
let prefix_fst_keys: Vec<_> = prefix_fst_keys
|
||||
.as_slice()
|
||||
.linear_group_by_key(|x| std::str::from_utf8(&x).unwrap().chars().nth(0).unwrap())
|
||||
.collect();
|
||||
let prefix_fst_keys = prefix_fst.into_stream().into_strs()?;
|
||||
let prefix_fst_keys: Vec<_> =
|
||||
prefix_fst_keys.as_slice().linear_group_by_key(|x| x.chars().nth(0).unwrap()).collect();
|
||||
|
||||
let mut db =
|
||||
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(self.wtxn)?;
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut current_prefixes: Option<&&[Vec<u8>]> = None;
|
||||
let mut current_prefixes: Option<&&[String]> = None;
|
||||
let mut prefixes_cache = HashMap::new();
|
||||
while let Some(((w1, w2, prox), data)) = db.next().transpose()? {
|
||||
if prox > self.max_proximity {
|
||||
continue;
|
||||
}
|
||||
|
||||
current_prefixes = match current_prefixes.take() {
|
||||
Some(prefixes) if w2.as_bytes().starts_with(&prefixes[0]) => Some(prefixes),
|
||||
Some(prefixes) if w2.starts_with(&prefixes[0]) => Some(prefixes),
|
||||
_otherwise => {
|
||||
write_prefixes_in_sorter(
|
||||
&mut prefixes_cache,
|
||||
&mut word_prefix_pair_proximity_docids_sorter,
|
||||
self.threshold,
|
||||
)?;
|
||||
prefix_fst_keys.iter().find(|prefixes| w2.as_bytes().starts_with(&prefixes[0]))
|
||||
prefix_fst_keys.iter().find(|prefixes| w2.starts_with(&prefixes[0]))
|
||||
}
|
||||
};
|
||||
|
||||
@ -93,15 +106,17 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(w1.as_bytes());
|
||||
buffer.push(0);
|
||||
for prefix in prefixes.iter().filter(|prefix| w2.as_bytes().starts_with(prefix)) {
|
||||
buffer.truncate(w1.len() + 1);
|
||||
buffer.extend_from_slice(prefix);
|
||||
buffer.push(prox);
|
||||
for prefix in prefixes.iter() {
|
||||
if prefix.len() <= self.max_prefix_length && w2.starts_with(prefix) {
|
||||
buffer.truncate(w1.len() + 1);
|
||||
buffer.extend_from_slice(prefix.as_bytes());
|
||||
buffer.push(prox);
|
||||
|
||||
match prefixes_cache.get_mut(&buffer) {
|
||||
Some(value) => value.push(data),
|
||||
None => {
|
||||
prefixes_cache.insert(buffer.clone(), vec![data]);
|
||||
match prefixes_cache.get_mut(&buffer) {
|
||||
Some(value) => value.push(data),
|
||||
None => {
|
||||
prefixes_cache.insert(buffer.clone(), vec![data]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -111,7 +126,6 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
write_prefixes_in_sorter(
|
||||
&mut prefixes_cache,
|
||||
&mut word_prefix_pair_proximity_docids_sorter,
|
||||
self.threshold,
|
||||
)?;
|
||||
|
||||
drop(prefix_fst);
|
||||
@ -133,19 +147,10 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||
fn write_prefixes_in_sorter(
|
||||
prefixes: &mut HashMap<Vec<u8>, Vec<&[u8]>>,
|
||||
sorter: &mut grenad::Sorter<MergeFn>,
|
||||
min_word_per_prefix: u32,
|
||||
) -> Result<()> {
|
||||
for (i, (key, data_slices)) in prefixes.drain().enumerate() {
|
||||
// if the number of words prefixed by the prefix is higher than the threshold,
|
||||
// we insert it in the sorter.
|
||||
if data_slices.len() > min_word_per_prefix as usize {
|
||||
for data in data_slices {
|
||||
sorter.insert(&key, data)?;
|
||||
}
|
||||
// if the first prefix isn't eligible for insertion,
|
||||
// then the other prefixes can't be eligible.
|
||||
} else if i == 0 {
|
||||
break;
|
||||
for (key, data_slices) in prefixes.drain() {
|
||||
for data in data_slices {
|
||||
sorter.insert(&key, data)?;
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user