From d6fa9c0414791aaf4676975cd9abc4343126a3c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Tue, 22 Sep 2020 14:04:33 +0200
Subject: [PATCH] Index the intra documents word pair proximities

---
 Cargo.lock         |  1 +
 Cargo.toml         |  3 ++
 src/bin/indexer.rs | 76 +++++++++++++++++++++++++++++++++++++++++++++-
 src/lib.rs         |  7 +++--
 src/search.rs      |  3 +-
 5 files changed, 85 insertions(+), 5 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index a8f5ef606..57a02e8e2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -968,6 +968,7 @@ dependencies = [
  "fst",
  "fxhash",
  "heed",
+ "itertools",
  "jemallocator",
  "levenshtein_automata",
  "log 0.4.11",
diff --git a/Cargo.toml b/Cargo.toml
index ffb407cb9..b24845ada 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,6 +30,9 @@ smallvec = "1.4.0"
 structopt = { version = "0.3.14", default-features = false }
 tempfile = "3.1.0"
 
+# documents words self-join
+itertools = "0.9.0"
+
 # logging
 log = "0.4.11"
 stderrlog = "0.4.3"
diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index 75ce5347d..f6494839e 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -37,6 +37,7 @@ const WORDS_FST_KEY: &[u8] = b"\x06words-fst";
 const HEADERS_BYTE: u8 = 0;
 const WORD_DOCID_POSITIONS_BYTE: u8 = 1;
 const WORD_DOCIDS_BYTE: u8 = 2;
+const WORDS_PROXIMITIES_BYTE: u8 = 5;
 const DOCUMENTS_IDS_BYTE: u8 = 4;
 
 #[cfg(target_os = "linux")]
@@ -128,6 +129,35 @@ fn create_writer(type_: CompressionType, level: Option<u32>, file: File) -> Writ
     builder.build(file)
 }
 
+fn compute_words_pair_proximities(
+    word_positions: &HashMap<String, RoaringBitmap>,
+) -> HashMap<(&str, &str), RoaringBitmap>
+{
+    use itertools::Itertools;
+
+    let mut words_pair_proximities = HashMap::new();
+    for (w1, w2) in word_positions.keys().cartesian_product(word_positions.keys()) {
+        let mut distances = RoaringBitmap::new();
+        let positions1: Vec<_> = word_positions[w1].iter().collect();
+        let positions2: Vec<_> = word_positions[w2].iter().collect();
+        for (ps1, ps2) in positions1.iter().cartesian_product(positions2.iter()) {
+            let prox = milli::proximity::positions_proximity(*ps1, *ps2);
+            // We don't care about a word that appears at the
+            // same position as, or too far from, the other.
+            if prox > 0 && prox < 8 { distances.insert(prox); }
+        }
+        if !distances.is_empty() {
+            // We only store the proximities under one word pair.
+            let (w1, w2) = if w1 > w2 { (w2, w1) } else { (w1, w2) };
+            words_pair_proximities.entry((w1.as_str(), w2.as_str()))
+                .or_insert_with(RoaringBitmap::new)
+                .union_with(&distances);
+        }
+    }
+
+    words_pair_proximities
+}
+
 type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>;
 
 struct Store {
@@ -213,6 +243,43 @@ impl Store {
         Ok(())
     }
 
+    // FIXME We must store those pairs in an ArcCache to reduce the number of I/O operations.
+    //       We must store the document ids associated with the word pairs and proximities.
+    fn write_words_proximities(
+        sorter: &mut Sorter<MergeFn>,
+        document_id: DocumentId,
+        words_pair_proximities: &HashMap<(&str, &str), RoaringBitmap>,
+    ) -> anyhow::Result<()>
+    {
+        // words proximities keys are all prefixed
+        let mut key = vec![WORDS_PROXIMITIES_BYTE];
+        let mut buffer = Vec::new();
+
+        for ((w1, w2), proximities) in words_pair_proximities {
+            assert!(w1 <= w2);
+            key.truncate(1);
+            key.extend_from_slice(w1.as_bytes());
+            key.push(0);
+            key.extend_from_slice(w2.as_bytes());
+            let pair_len = key.len();
+            for prox in proximities {
+                key.truncate(pair_len);
+                key.push(u8::try_from(prox).unwrap());
+                // We serialize the document ids into a buffer
+                buffer.clear();
+                let ids = RoaringBitmap::from_iter(Some(document_id));
+                buffer.reserve(ids.serialized_size());
+                ids.serialize_into(&mut buffer)?;
+                // that we write under the generated key into MTBL
+                if lmdb_key_valid_size(&key) {
+                    sorter.insert(&key, &buffer)?;
+                }
+            }
+        }
+
+        Ok(())
+    }
+
     fn write_docid_word_positions(
         sorter: &mut Sorter<MergeFn>,
         id: DocumentId,
@@ -307,6 +374,9 @@ impl Store {
                     }
                 }
 
+                let words_pair_proximities = compute_words_pair_proximities(&word_positions);
+                Self::write_words_proximities(&mut self.sorter, document_id, &words_pair_proximities)?;
+
                 // We write the document in the documents store.
                 self.write_document(document_id, &word_positions, &document)?;
                 word_positions.clear();
@@ -386,7 +456,7 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
                 assert!(values.windows(2).all(|vs| vs[0] == vs[1]));
                 Ok(values[0].to_vec())
             },
-            DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE => {
+            DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE | WORDS_PROXIMITIES_BYTE => {
                 let (head, tail) = values.split_first().unwrap();
 
                 let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
@@ -428,6 +498,10 @@ fn lmdb_writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) ->
         // Write the postings lists
         index.docid_word_positions.as_polymorph()
             .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
+    } else if key.starts_with(&[WORDS_PROXIMITIES_BYTE]) {
+        // Write the word pair proximity document ids
+        index.word_pair_proximity_docids.as_polymorph()
+            .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?;
     }
 
     Ok(())
diff --git a/src/lib.rs b/src/lib.rs
index 91fc9ae42..0e94fcef7 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -17,8 +17,8 @@ use heed::{PolyDatabase, Database};
 pub use self::search::{Search, SearchResult};
 pub use self::criterion::{Criterion, default_criteria};
 pub use self::heed_codec::{
-    RoaringBitmapCodec, BEU32StrCodec, CsvStringRecordCodec,
-    ByteorderXRoaringBitmapCodec,
+    RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec,
+    CsvStringRecordCodec, ByteorderXRoaringBitmapCodec,
 };
 
 pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
@@ -45,6 +45,8 @@ pub struct Index {
     pub docid_word_positions: Database<BEU32StrCodec, ByteorderXRoaringBitmapCodec>,
     /// Maps the document id to the document as a CSV line.
     pub documents: Database<OwnedType<BEU32>, ByteSlice>,
+    /// Maps the proximity between a pair of words to all the docids where this relation appears.
+    pub word_pair_proximity_docids: Database<StrStrU8Codec, RoaringBitmapCodec>,
 }
 
 impl Index {
@@ -54,6 +56,7 @@ impl Index {
             word_docids: env.create_database(Some("word-docids"))?,
             docid_word_positions: env.create_database(Some("docid-word-positions"))?,
             documents: env.create_database(Some("documents"))?,
+            word_pair_proximity_docids: env.create_database(Some("word-pair-proximity-docids"))?,
         })
     }
 
diff --git a/src/search.rs b/src/search.rs
index 46e5f5280..7a52c25b3 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -1,5 +1,4 @@
 use std::collections::{HashMap, HashSet};
-use std::cmp;
 
 use fst::{IntoStreamer, Streamer};
 use levenshtein_automata::DFA;
@@ -12,7 +11,7 @@ use near_proximity::near_proximity;
 
 use crate::proximity::path_proximity;
 use crate::query_tokens::{QueryTokens, QueryToken};
-use crate::{Index, DocumentId, Position};
+use crate::{Index, DocumentId};
 
 // Building these factories is not free.
 static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));