From d6fa9c0414791aaf4676975cd9abc4343126a3c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 22 Sep 2020 14:04:33 +0200 Subject: [PATCH] Index the intra documents word pair proximities --- Cargo.lock | 1 + Cargo.toml | 3 ++ src/bin/indexer.rs | 76 +++++++++++++++++++++++++++++++++++++++++++++- src/lib.rs | 7 +++-- src/search.rs | 3 +- 5 files changed, 85 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a8f5ef606..57a02e8e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -968,6 +968,7 @@ dependencies = [ "fst", "fxhash", "heed", + "itertools", "jemallocator", "levenshtein_automata", "log 0.4.11", diff --git a/Cargo.toml b/Cargo.toml index ffb407cb9..b24845ada 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,6 +30,9 @@ smallvec = "1.4.0" structopt = { version = "0.3.14", default-features = false } tempfile = "3.1.0" +# documents words self-join +itertools = "0.9.0" + # logging log = "0.4.11" stderrlog = "0.4.3" diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index 75ce5347d..f6494839e 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -37,6 +37,7 @@ const WORDS_FST_KEY: &[u8] = b"\x06words-fst"; const HEADERS_BYTE: u8 = 0; const WORD_DOCID_POSITIONS_BYTE: u8 = 1; const WORD_DOCIDS_BYTE: u8 = 2; +const WORDS_PROXIMITIES_BYTE: u8 = 5; const DOCUMENTS_IDS_BYTE: u8 = 4; #[cfg(target_os = "linux")] @@ -128,6 +129,35 @@ fn create_writer(type_: CompressionType, level: Option, file: File) -> Writ builder.build(file) } +fn compute_words_pair_proximities( + word_positions: &HashMap, +) -> HashMap<(&str, &str), RoaringBitmap> +{ + use itertools::Itertools; + + let mut words_pair_proximities = HashMap::new(); + for (w1, w2) in word_positions.keys().cartesian_product(word_positions.keys()) { + let mut distances = RoaringBitmap::new(); + let positions1: Vec<_> = word_positions[w1].iter().collect(); + let positions2: Vec<_> = word_positions[w2].iter().collect(); + for (ps1, ps2) in positions1.iter().cartesian_product(positions2.iter()) { + let prox = milli::proximity::positions_proximity(*ps1, *ps2); + // We don't care about a word that appear at the + // same position or too far from the other. + if prox > 0 && prox < 8 { distances.insert(prox); } + } + if !distances.is_empty() { + // We only store the proximites under one word pair. + let (w1, w2) = if w1 > w2 { (w2, w1) } else { (w1, w2) }; + words_pair_proximities.entry((w1.as_str(), w2.as_str())) + .or_insert_with(RoaringBitmap::new) + .union_with(&distances); + } + } + + words_pair_proximities +} + type MergeFn = fn(&[u8], &[Vec]) -> Result, ()>; struct Store { @@ -213,6 +243,43 @@ impl Store { Ok(()) } + // FIXME We must store those pairs in an ArcCache to reduce the number of I/O operations, + // We must store the documents ids associated with the words pairs and proximities. + fn write_words_proximities( + sorter: &mut Sorter, + document_id: DocumentId, + words_pair_proximities: &HashMap<(&str, &str), RoaringBitmap>, + ) -> anyhow::Result<()> + { + // words proximities keys are all prefixed + let mut key = vec![WORDS_PROXIMITIES_BYTE]; + let mut buffer = Vec::new(); + + for ((w1, w2), proximities) in words_pair_proximities { + assert!(w1 <= w2); + key.truncate(1); + key.extend_from_slice(w1.as_bytes()); + key.push(0); + key.extend_from_slice(w2.as_bytes()); + let pair_len = key.len(); + for prox in proximities { + key.truncate(pair_len); + key.push(u8::try_from(prox).unwrap()); + // We serialize the document ids into a buffer + buffer.clear(); + let ids = RoaringBitmap::from_iter(Some(document_id)); + buffer.reserve(ids.serialized_size()); + ids.serialize_into(&mut buffer)?; + // that we write under the generated key into MTBL + if lmdb_key_valid_size(&key) { + sorter.insert(&key, &buffer)?; + } + } + } + + Ok(()) + } + fn write_docid_word_positions( sorter: &mut Sorter, id: DocumentId, @@ -307,6 +374,9 @@ impl Store { } } + let words_pair_proximities = compute_words_pair_proximities(&word_positions); + Self::write_words_proximities(&mut self.sorter, document_id, &words_pair_proximities)?; + // We write the document in the documents store. self.write_document(document_id, &word_positions, &document)?; word_positions.clear(); @@ -386,7 +456,7 @@ fn merge(key: &[u8], values: &[Vec]) -> Result, ()> { assert!(values.windows(2).all(|vs| vs[0] == vs[1])); Ok(values[0].to_vec()) }, - DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE => { + DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE | WORDS_PROXIMITIES_BYTE => { let (head, tail) = values.split_first().unwrap(); let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap(); @@ -428,6 +498,10 @@ fn lmdb_writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> // Write the postings lists index.docid_word_positions.as_polymorph() .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; + } else if key.starts_with(&[WORDS_PROXIMITIES_BYTE]) { + // Write the word pair proximity document ids + index.word_pair_proximity_docids.as_polymorph() + .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; } Ok(()) diff --git a/src/lib.rs b/src/lib.rs index 91fc9ae42..0e94fcef7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -17,8 +17,8 @@ use heed::{PolyDatabase, Database}; pub use self::search::{Search, SearchResult}; pub use self::criterion::{Criterion, default_criteria}; pub use self::heed_codec::{ - RoaringBitmapCodec, BEU32StrCodec, CsvStringRecordCodec, - ByteorderXRoaringBitmapCodec, + RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, + CsvStringRecordCodec, ByteorderXRoaringBitmapCodec, }; pub type FastMap4 = HashMap>; @@ -45,6 +45,8 @@ pub struct Index { pub docid_word_positions: Database, /// Maps the document id to the document as a CSV line. pub documents: Database, ByteSlice>, + /// Maps the proximity between a pair of words with all the docids where this relation appears. + pub word_pair_proximity_docids: Database, } impl Index { @@ -54,6 +56,7 @@ impl Index { word_docids: env.create_database(Some("word-docids"))?, docid_word_positions: env.create_database(Some("docid-word-positions"))?, documents: env.create_database(Some("documents"))?, + word_pair_proximity_docids: env.create_database(Some("word-pair-proximity-docids"))?, }) } diff --git a/src/search.rs b/src/search.rs index 46e5f5280..7a52c25b3 100644 --- a/src/search.rs +++ b/src/search.rs @@ -1,5 +1,4 @@ use std::collections::{HashMap, HashSet}; -use std::cmp; use fst::{IntoStreamer, Streamer}; use levenshtein_automata::DFA; @@ -12,7 +11,7 @@ use near_proximity::near_proximity; use crate::proximity::path_proximity; use crate::query_tokens::{QueryTokens, QueryToken}; -use crate::{Index, DocumentId, Position}; +use crate::{Index, DocumentId}; // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true));