diff --git a/Cargo.lock b/Cargo.lock index b3b23f02d..a8f5ef606 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -136,15 +136,6 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" -[[package]] -name = "bitpacking" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3744aff20a3437a99ebc0bb7733e9e60c7bf590478c9b897e95b38d57e5acb68" -dependencies = [ - "crunchy", -] - [[package]] name = "block-buffer" version = "0.7.3" @@ -267,12 +258,6 @@ dependencies = [ "bitflags", ] -[[package]] -name = "cow-utils" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" - [[package]] name = "crc32c" version = "0.4.0" @@ -370,12 +355,6 @@ dependencies = [ "lazy_static 1.4.0", ] -[[package]] -name = "crunchy" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" - [[package]] name = "csv" version = "1.1.3" @@ -981,18 +960,14 @@ dependencies = [ "arc-cache", "askama", "askama_warp", - "bitpacking", "bstr", "byteorder", - "cow-utils", "criterion", "csv", "flate2", "fst", "fxhash", "heed", - "indexmap", - "itertools", "jemallocator", "levenshtein_automata", "log 0.4.11", @@ -1649,9 +1624,9 @@ checksum = "21215c1b9d8f7832b433255bd9eea3e2779aa55b21b2f8e13aad62c74749b237" [[package]] name = "roaring" -version = "0.6.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d6c40b0f4a172f43c3dab852f6c05df5a643aebe7395dbeb598a2f5bb318c1e" +checksum = "99a260b0fb7df2095948f4a1d37afe5d1a08a2ccc7380f418cec049dc9560077" dependencies = [ "byteorder", ] diff --git a/Cargo.toml b/Cargo.toml index dbf77d00b..ffb407cb9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,10 +8,8 @@ default-run = "indexer" [dependencies] anyhow = "1.0.28" arc-cache = { git = "https://github.com/Kerollmops/rust-arc-cache.git", rev = "56530f2" } -bitpacking = "0.8.2" bstr = "0.2.13" byteorder = "1.3.4" -cow-utils = "0.1.2" csv = "1.1.3" flate2 = "1.0.17" fst = "0.4.3" @@ -25,7 +23,7 @@ once_cell = "1.4.0" oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "9bf47a7" } rayon = "1.3.1" ringtail = "0.3.0" -roaring = "0.6.0" +roaring = "0.6.1" slice-group-by = "0.2.6" smallstr = "0.2.0" smallvec = "1.4.0" @@ -36,12 +34,6 @@ tempfile = "3.1.0" log = "0.4.11" stderrlog = "0.4.3" -# best proximity -indexmap = "1.5.1" - -# to implement internally -itertools = "0.9.0" - # http server askama = "0.10.1" askama_warp = "0.10.0" diff --git a/src/search.rs b/src/search.rs index e1a6ae4ba..b392ad0b6 100644 --- a/src/search.rs +++ b/src/search.rs @@ -4,7 +4,7 @@ use std::cmp; use fst::{IntoStreamer, Streamer}; use levenshtein_automata::DFA; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; -use log::{debug, error}; +use log::debug; use once_cell::sync::Lazy; use roaring::bitmap::{IntoIter, RoaringBitmap}; @@ -155,11 +155,9 @@ impl<'a> Search<'a> { let mut union_positions = RoaringBitmap::new(); for (word, (_distance, docids)) in words { - if docids.contains(candidate) { - match index.docid_word_positions.get(rtxn, &(candidate, word))? { - Some(positions) => union_positions.union_with(&positions), - None => error!("position missing for candidate {} and word {:?}", candidate, word), - } + if !docids.contains(candidate) { continue; } + if let Some(positions) = index.docid_word_positions.get(rtxn, &(candidate, word))? { + union_positions.union_with(&positions); } } keywords.push(union_positions.into_iter()); @@ -223,6 +221,13 @@ impl<'a> Search<'a> { path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::() } + // If there only is one word, no need to compute the best proximities. + if derived_words.len() == 1 { + let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); + let documents_ids = candidates.iter().take(limit).collect(); + return Ok(SearchResult { found_words, documents_ids }); + } + let mut paths = Vec::new(); for candidate in candidates { let keywords = Self::fecth_keywords(rtxn, index, &derived_words, candidate)?; @@ -236,7 +241,6 @@ impl<'a> Search<'a> { let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); let documents_ids = documents.into_iter().map(|(_, id)| id).take(limit).collect(); - Ok(SearchResult { found_words, documents_ids }) } }