From 1c504471d36a804e5487837751abdb74ff6cf9a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sat, 5 Sep 2020 18:25:27 +0200
Subject: [PATCH] Introduce the plane-sweep algorithm

---
 Cargo.lock    | 24 ++++++++++++++++++++++++
 Cargo.toml    |  1 +
 src/search.rs | 51 ++++++++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index dc7fb41bd..aabacb512 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -997,6 +997,7 @@ dependencies = [
  "levenshtein_automata",
  "log 0.4.11",
  "memmap",
+ "near-proximity",
  "once_cell",
  "oxidized-mtbl",
  "rayon",
@@ -1141,6 +1142,14 @@ dependencies = [
  "twoway",
 ]
 
+[[package]]
+name = "near-proximity"
+version = "0.1.0"
+source = "git+https://github.com/Kerollmops/plane-sweep-proximity#f6c9e7e5fc1f7b456d080981e877d0e7943f82bd"
+dependencies = [
+ "tinyvec",
+]
+
 [[package]]
 name = "net2"
 version = "0.2.34"
@@ -1964,6 +1973,21 @@ dependencies = [
  "serde_json",
 ]
 
+[[package]]
+name = "tinyvec"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f331a553cacb14e99d183e5573c86044dd177b5a5277b21e562fd1bd5e1076e1"
+dependencies = [
+ "tinyvec_macros",
+]
+
+[[package]]
+name = "tinyvec_macros"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
+
 [[package]]
 name = "tokio"
 version = "0.2.21"
diff --git a/Cargo.toml b/Cargo.toml
index a9f685ea0..86e5e6d50 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,6 +22,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 memmap = "0.7.0"
 once_cell = "1.4.0"
 oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "5426182" }
+near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity" }
 rayon = "1.3.1"
 ringtail = "0.3.0"
 roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "mem-usage" }
diff --git a/src/search.rs b/src/search.rs
index 7f0831d1c..7aac68f1d 100644
--- a/src/search.rs
+++ b/src/search.rs
@@ -5,7 +5,9 @@ use levenshtein_automata::DFA;
 use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
 use log::debug;
 use once_cell::sync::Lazy;
-use roaring::RoaringBitmap;
+use roaring::bitmap::{IntoIter, RoaringBitmap};
+
+use near_proximity::near_proximity;
 
 use crate::query_tokens::{QueryTokens, QueryToken};
 use crate::{Index, DocumentId, Position, Attribute};
@@ -136,6 +138,31 @@ impl<'a> Search<'a> {
         Ok(candidates)
     }
 
+    fn fetch_keywords(
+        rtxn: &heed::RoTxn,
+        index: &Index,
+        derived_words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
+        candidate: DocumentId,
+    ) -> anyhow::Result<Vec<IntoIter>>
+    {
+        let mut keywords = Vec::with_capacity(derived_words.len());
+
+        for (words, _) in derived_words {
+
+            let mut union_positions = RoaringBitmap::new();
+            for (word, (_distance, docids)) in words {
+
+                if docids.contains(candidate) {
+                    let positions = index.word_docid_positions.get(rtxn, &(word, candidate))?.unwrap();
+                    union_positions.union_with(&positions);
+                }
+            }
+            keywords.push(union_positions.into_iter());
+        }
+
+        Ok(keywords)
+    }
+
     pub fn execute(&self) -> anyhow::Result<SearchResult> {
         let rtxn = self.rtxn;
         let index = self.index;
@@ -162,10 +189,28 @@ impl<'a> Search<'a> {
 
         debug!("candidates: {:?}", candidates);
 
-        let documents = vec![candidates];
+        let mut documents = Vec::new();
+
+        let min_proximity = derived_words.len() as u32;
+        let mut number_min_proximity = 0;
+
+        let mut paths = Vec::new();
+        for candidate in candidates {
+            let keywords = Self::fetch_keywords(rtxn, index, &derived_words, candidate)?;
+            near_proximity(keywords, &mut paths);
+            if let Some((prox, _path)) = paths.first() {
+                documents.push((*prox, candidate));
+                if *prox == min_proximity {
+                    number_min_proximity += 1;
+                    if number_min_proximity >= limit { break }
+                }
+            }
+        }
+
+        documents.sort_unstable_by_key(|(prox, _)| *prox);
 
         let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect();
-        let documents_ids = documents.iter().flatten().take(limit).collect();
+        let documents_ids = documents.into_iter().map(|(_, id)| id).take(limit).collect();
         Ok(SearchResult { found_words, documents_ids })
     }
 }