From ae47bb359498eee4c0c7cf8b464b315e9713235a Mon Sep 17 00:00:00 2001
From: many <maxime@meilisearch.com>
Date: Wed, 3 Mar 2021 15:41:09 +0100
Subject: [PATCH] Introduce plane_sweep function in proximity criterion

---
 milli/src/search/criteria/mod.rs       |  12 +-
 milli/src/search/criteria/proximity.rs | 178 ++++++++++++++++++++++++-
 2 files changed, 188 insertions(+), 2 deletions(-)
diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs
index aadd0b31a..856e9af9d 100644
--- a/milli/src/search/criteria/mod.rs
+++ b/milli/src/search/criteria/mod.rs
@@ -5,7 +5,7 @@ use anyhow::bail;
 use roaring::RoaringBitmap;
 
 use crate::search::word_derivations;
-use crate::Index;
+use crate::{DocumentId, Index};
 
 use super::query_tree::{Operation, Query, QueryKind};
 use self::typo::Typo;
@@ -66,6 +66,7 @@ pub trait Context {
     fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
     fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
     fn in_prefix_cache(&self, word: &str) -> bool;
+    fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result<Option<RoaringBitmap>>;
 }
 pub struct CriteriaBuilder<'t> {
     rtxn: &'t heed::RoTxn<'t>,
@@ -104,6 +105,11 @@ impl<'a> Context for CriteriaBuilder<'a> {
     fn in_prefix_cache(&self, word: &str) -> bool {
         self.words_prefixes_fst.contains(word)
     }
+
+    fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result<Option<RoaringBitmap>> {
+        let key = (docid, word);
+        self.index.docid_word_positions.get(self.rtxn, &key)
+    }
 }
 
 impl<'t> CriteriaBuilder<'t> {
@@ -368,6 +374,10 @@ pub mod test {
         fn in_prefix_cache(&self, word: &str) -> bool {
             self.word_prefix_docids.contains_key(&word.to_string())
         }
+
+        fn docid_word_positions(&self, _docid: DocumentId, _word: &str) -> heed::Result<Option<RoaringBitmap>> {
+            todo!()
+        }
     }
 
     impl<'a> Default for TestContext<'a> {
diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs
index b192902c1..cea50c034 100644
--- a/milli/src/search/criteria/proximity.rs
+++ b/milli/src/search/criteria/proximity.rs
@@ -1,9 +1,10 @@
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::mem::take;
 
 use roaring::RoaringBitmap;
 use log::debug;
 
+use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}};
 use crate::search::query_tree::{maximum_proximity, Operation, Query};
 use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids};
 
@@ -289,3 +290,178 @@ fn resolve_candidates<'t>(
     }
     Ok(candidates)
 }
+
+fn resolve_plane_sweep_candidates<'t>(
+    ctx: &'t dyn Context,
+    query_tree: &Operation,
+    allowed_candidates: &RoaringBitmap,
+) -> anyhow::Result<BTreeMap<u8, RoaringBitmap>>
+{
+    /// FIXME may be buggy with query like "new new york"
+    fn plane_sweep<'t>(
+        ctx: &'t dyn Context,
+        operations: &[Operation],
+        docid: DocumentId,
+        consecutive: bool,
+    ) -> anyhow::Result<Vec<(Position, u8, Position)>> {
+        fn compute_groups_proximity(groups: &Vec<(usize, (Position, u8, Position))>, consecutive: bool) -> Option<(Position, u8, Position)> {
+            // take the inner proximity of the first group as initial
+            let mut proximity = groups.first()?.1.1;
+            let left_most_pos = groups.first()?.1.0;
+            let right_most_pos = groups.last()?.1.2;
+
+            for pair in groups.windows(2) {
+                if let [(i1, (_, _, rpos1)), (i2, (lpos2, prox2, _))] = pair {
+                    // if a pair overlap, meaning that they share at least a word, we return None
+                    if rpos1 >= lpos2 { return None }
+                    // if groups are in the good order (query order) we remove 1 to the proximity
+                    // the proximity is clamped to 7
+                    let pair_proximity = if i1 < i2 {
+                        (*lpos2 - *rpos1 - 1).min(7)
+                    } else {
+                        (*lpos2 - *rpos1).min(7)
+                    };
+
+                    proximity += pair_proximity as u8 + prox2;
+                }
+            }
+
+            // if groups should be consecutives, we will only accept groups with a proximity of 0
+            if !consecutive || proximity == 0 {
+                Some((left_most_pos, proximity, right_most_pos))
+            } else { None }
+        }
+
+        let groups_len = operations.len();
+        let mut groups_positions = Vec::with_capacity(groups_len);
+
+        for operation in operations {
+            let positions = resolve_operation(ctx, operation, docid)?;
+            groups_positions.push(positions.into_iter());
+        }
+
+        // Pop top elements of each list.
+        let mut current = Vec::with_capacity(groups_len);
+        for (i, positions) in groups_positions.iter_mut().enumerate() {
+            match positions.next() {
+                Some(p) => current.push((i, p)),
+                // if a group return None, it means that the document does not contain all the words,
+                // we return an empty result.
+                None => return Ok(Vec::new()),
+            }
+        }
+
+        // Sort k elements by their positions.
+        current.sort_unstable_by_key(|(_, p)| *p);
+
+        // Find leftmost and rightmost group and their positions.
+        let mut leftmost = *current.first().unwrap();
+        let mut rightmost = *current.last().unwrap();
+
+        let mut output = Vec::new();
+        loop {
+            // Find the position p of the next elements of a list of the leftmost group.
+            // If the list is empty, break the loop.
+            let p = groups_positions[leftmost.0].next().map(|p| (leftmost.0, p));
+
+            // let q be the position q of second group of the interval.
+            let q = current[1];
+
+            let mut leftmost_index = 0;
+
+            // If p > r, then the interval [l, r] is minimal and
+            // we insert it into the heap according to its size.
+            if p.map_or(true, |p| p.1 > rightmost.1) {
+                leftmost_index = current[0].0;
+                if let Some(group) = compute_groups_proximity(&current, consecutive) {
+                    output.push(group);
+                }
+            }
+
+            // TODO not sure about breaking here or when the p list is found empty.
+            let p = match p {
+                Some(p) => p,
+                None => break,
+            };
+
+            // Remove the leftmost group P in the interval,
+            // and pop the same group from a list.
+            current[leftmost_index] = p;
+
+            if p.1 > rightmost.1 {
+                // if [l, r] is minimal, let r = p and l = q.
+                rightmost = p;
+                leftmost = q;
+            } else {
+                // Ohterwise, let l = min{p,q}.
+                leftmost = if p.1 < q.1 { p } else { q };
+            }
+
+            // Then update the interval and order of groups_positions in the interval.
+            current.sort_unstable_by_key(|(_, p)| *p);
+        }
+
+        // Sort the list according to the size and the positions.
+        output.sort_unstable();
+
+        Ok(output)
+    }
+
+    fn resolve_operation<'t>(
+        ctx: &'t dyn Context,
+        query_tree: &Operation,
+        docid: DocumentId,
+    ) -> anyhow::Result<Vec<(Position, u8, Position)>> {
+        use Operation::{And, Consecutive, Or};
+
+        match query_tree {
+            And(ops) => plane_sweep(ctx, ops, docid, false),
+            Consecutive(ops) => plane_sweep(ctx, ops, docid, true),
+            Or(_, ops) => {
+                let mut result = Vec::new();
+                for op in ops {
+                    result.extend(resolve_operation(ctx, op, docid)?)
+                }
+
+                result.sort_unstable();
+                Ok(result)
+            },
+            Operation::Query(Query {prefix, kind}) => {
+                let fst = ctx.words_fst();
+                let words = match kind {
+                    QueryKind::Exact { word, .. } => {
+                        if *prefix {
+                            word_derivations(word, true, 0, fst)?
+                        } else {
+                            vec![(word.to_string(), 0)]
+                        }
+                    },
+                    QueryKind::Tolerant { typo, word } => {
+                        word_derivations(word, *prefix, *typo, fst)?
+                    }
+                };
+
+                let mut result = Vec::new();
+                for (word, _) in words {
+                    if let Some(positions) = ctx.docid_word_positions(docid, &word)? {
+                        let iter = positions.iter().map(|p| (p, 0, p));
+                        result.extend(iter);
+                    }
+                }
+
+                result.sort_unstable();
+                Ok(result)
+            }
+        }
+    }
+
+    let mut candidates = BTreeMap::new();
+    for docid in allowed_candidates {
+        let positions =  resolve_operation(ctx, query_tree, docid)?;
+        let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity);
+        let best_proximity = best_proximity.map(|(_, proximity, _)| proximity).unwrap_or(7);
+        candidates.entry(best_proximity).or_insert_with(RoaringBitmap::new).insert(docid);
+    }
+
+    Ok(candidates)
+}