mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-12-02 01:55:03 +08:00
Introduce the plane-sweep algorithm
This commit is contained in:
parent
dc88a86259
commit
1c504471d3
24
Cargo.lock
generated
24
Cargo.lock
generated
@ -997,6 +997,7 @@ dependencies = [
|
|||||||
"levenshtein_automata",
|
"levenshtein_automata",
|
||||||
"log 0.4.11",
|
"log 0.4.11",
|
||||||
"memmap",
|
"memmap",
|
||||||
|
"near-proximity",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"oxidized-mtbl",
|
"oxidized-mtbl",
|
||||||
"rayon",
|
"rayon",
|
||||||
@ -1141,6 +1142,14 @@ dependencies = [
|
|||||||
"twoway",
|
"twoway",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "near-proximity"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "git+https://github.com/Kerollmops/plane-sweep-proximity#f6c9e7e5fc1f7b456d080981e877d0e7943f82bd"
|
||||||
|
dependencies = [
|
||||||
|
"tinyvec",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "net2"
|
name = "net2"
|
||||||
version = "0.2.34"
|
version = "0.2.34"
|
||||||
@ -1964,6 +1973,21 @@ dependencies = [
|
|||||||
"serde_json",
|
"serde_json",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tinyvec"
|
||||||
|
version = "1.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f331a553cacb14e99d183e5573c86044dd177b5a5277b21e562fd1bd5e1076e1"
|
||||||
|
dependencies = [
|
||||||
|
"tinyvec_macros",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tinyvec_macros"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tokio"
|
name = "tokio"
|
||||||
version = "0.2.21"
|
version = "0.2.21"
|
||||||
|
@ -22,6 +22,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
|||||||
memmap = "0.7.0"
|
memmap = "0.7.0"
|
||||||
once_cell = "1.4.0"
|
once_cell = "1.4.0"
|
||||||
oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "5426182" }
|
oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "5426182" }
|
||||||
|
near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity" }
|
||||||
rayon = "1.3.1"
|
rayon = "1.3.1"
|
||||||
ringtail = "0.3.0"
|
ringtail = "0.3.0"
|
||||||
roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "mem-usage" }
|
roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "mem-usage" }
|
||||||
|
@ -5,7 +5,9 @@ use levenshtein_automata::DFA;
|
|||||||
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
|
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::bitmap::{IntoIter, RoaringBitmap};
|
||||||
|
|
||||||
|
use near_proximity::near_proximity;
|
||||||
|
|
||||||
use crate::query_tokens::{QueryTokens, QueryToken};
|
use crate::query_tokens::{QueryTokens, QueryToken};
|
||||||
use crate::{Index, DocumentId, Position, Attribute};
|
use crate::{Index, DocumentId, Position, Attribute};
|
||||||
@ -136,6 +138,31 @@ impl<'a> Search<'a> {
|
|||||||
Ok(candidates)
|
Ok(candidates)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn fecth_keywords(
|
||||||
|
rtxn: &heed::RoTxn,
|
||||||
|
index: &Index,
|
||||||
|
derived_words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
|
||||||
|
candidate: DocumentId,
|
||||||
|
) -> anyhow::Result<Vec<IntoIter>>
|
||||||
|
{
|
||||||
|
let mut keywords = Vec::with_capacity(derived_words.len());
|
||||||
|
|
||||||
|
for (words, _) in derived_words {
|
||||||
|
|
||||||
|
let mut union_positions = RoaringBitmap::new();
|
||||||
|
for (word, (_distance, docids)) in words {
|
||||||
|
|
||||||
|
if docids.contains(candidate) {
|
||||||
|
let positions = index.word_docid_positions.get(rtxn, &(word, candidate))?.unwrap();
|
||||||
|
union_positions.union_with(&positions);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
keywords.push(union_positions.into_iter());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(keywords)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn execute(&self) -> anyhow::Result<SearchResult> {
|
pub fn execute(&self) -> anyhow::Result<SearchResult> {
|
||||||
let rtxn = self.rtxn;
|
let rtxn = self.rtxn;
|
||||||
let index = self.index;
|
let index = self.index;
|
||||||
@ -162,10 +189,28 @@ impl<'a> Search<'a> {
|
|||||||
|
|
||||||
debug!("candidates: {:?}", candidates);
|
debug!("candidates: {:?}", candidates);
|
||||||
|
|
||||||
let documents = vec![candidates];
|
let mut documents = Vec::new();
|
||||||
|
|
||||||
|
let min_proximity = derived_words.len() as u32;
|
||||||
|
let mut number_min_proximity = 0;
|
||||||
|
|
||||||
|
let mut paths = Vec::new();
|
||||||
|
for candidate in candidates {
|
||||||
|
let keywords = Self::fecth_keywords(rtxn, index, &derived_words, candidate)?;
|
||||||
|
near_proximity(keywords, &mut paths);
|
||||||
|
if let Some((prox, _path)) = paths.first() {
|
||||||
|
documents.push((*prox, candidate));
|
||||||
|
if *prox == min_proximity {
|
||||||
|
number_min_proximity += 1;
|
||||||
|
if number_min_proximity >= limit { break }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
documents.sort_unstable_by_key(|(prox, _)| *prox);
|
||||||
|
|
||||||
let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect();
|
let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect();
|
||||||
let documents_ids = documents.iter().flatten().take(limit).collect();
|
let documents_ids = documents.into_iter().map(|(_, id)| id).take(limit).collect();
|
||||||
|
|
||||||
Ok(SearchResult { found_words, documents_ids })
|
Ok(SearchResult { found_words, documents_ids })
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user