mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-26 03:55:07 +08:00
Use another function to define the proximity
This commit is contained in:
parent
f928b91e9d
commit
bb1ab428db
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -1145,7 +1145,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "near-proximity"
|
name = "near-proximity"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = "git+https://github.com/Kerollmops/plane-sweep-proximity?rev=2f5ad5c#2f5ad5cdafde54731cd75d17ec6228ea3ca1f9b4"
|
source = "git+https://github.com/Kerollmops/plane-sweep-proximity?rev=6608205#66082058537f6fe7709adc4690048d62f3c0e9b7"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"tinyvec",
|
"tinyvec",
|
||||||
]
|
]
|
||||||
|
@ -22,7 +22,7 @@ levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
|||||||
memmap = "0.7.0"
|
memmap = "0.7.0"
|
||||||
once_cell = "1.4.0"
|
once_cell = "1.4.0"
|
||||||
oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "5426182" }
|
oxidized-mtbl = { git = "https://github.com/Kerollmops/oxidized-mtbl.git", rev = "5426182" }
|
||||||
near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "2f5ad5c" }
|
near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" }
|
||||||
rayon = "1.3.1"
|
rayon = "1.3.1"
|
||||||
ringtail = "0.3.0"
|
ringtail = "0.3.0"
|
||||||
roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "mem-usage" }
|
roaring = { git = "https://github.com/Kerollmops/roaring-rs.git", branch = "mem-usage" }
|
||||||
|
@ -199,13 +199,14 @@ impl Store {
|
|||||||
let mut key = vec![WORD_DOCID_POSITIONS_BYTE];
|
let mut key = vec![WORD_DOCID_POSITIONS_BYTE];
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
|
|
||||||
// We serialize the document ids into a buffer
|
|
||||||
// We prefix the words by the document id.
|
// We prefix the words by the document id.
|
||||||
key.extend_from_slice(&id.to_be_bytes());
|
key.extend_from_slice(&id.to_be_bytes());
|
||||||
|
let base_size = key.len();
|
||||||
|
|
||||||
for (word, positions) in iter {
|
for (word, positions) in iter {
|
||||||
key.truncate(1 + 4);
|
key.truncate(base_size);
|
||||||
key.extend_from_slice(word.as_bytes());
|
key.extend_from_slice(word.as_bytes());
|
||||||
|
// We serialize the positions into a buffer.
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
buffer.reserve(positions.serialized_size());
|
buffer.reserve(positions.serialized_size());
|
||||||
positions.serialize_into(&mut buffer)?;
|
positions.serialize_into(&mut buffer)?;
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
|
use std::cmp;
|
||||||
|
|
||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
use levenshtein_automata::DFA;
|
use levenshtein_automata::DFA;
|
||||||
@ -10,7 +11,7 @@ use roaring::bitmap::{IntoIter, RoaringBitmap};
|
|||||||
use near_proximity::near_proximity;
|
use near_proximity::near_proximity;
|
||||||
|
|
||||||
use crate::query_tokens::{QueryTokens, QueryToken};
|
use crate::query_tokens::{QueryTokens, QueryToken};
|
||||||
use crate::{Index, DocumentId};
|
use crate::{Index, DocumentId, Position};
|
||||||
|
|
||||||
// Building these factories is not free.
|
// Building these factories is not free.
|
||||||
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
||||||
@ -153,7 +154,7 @@ impl<'a> Search<'a> {
|
|||||||
if docids.contains(candidate) {
|
if docids.contains(candidate) {
|
||||||
match index.docid_word_positions.get(rtxn, &(candidate, word))? {
|
match index.docid_word_positions.get(rtxn, &(candidate, word))? {
|
||||||
Some(positions) => union_positions.union_with(&positions),
|
Some(positions) => union_positions.union_with(&positions),
|
||||||
None => error!("position missing for candidate {} and word {}", candidate, word),
|
None => error!("position missing for candidate {} and word {:?}", candidate, word),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -194,10 +195,37 @@ impl<'a> Search<'a> {
|
|||||||
let min_proximity = derived_words.len() as u32 - 1;
|
let min_proximity = derived_words.len() as u32 - 1;
|
||||||
let mut number_min_proximity = 0;
|
let mut number_min_proximity = 0;
|
||||||
|
|
||||||
|
// TODO move this function elsewhere
|
||||||
|
fn compute_proximity(path: &[Position]) -> u32 {
|
||||||
|
const ONE_ATTRIBUTE: u32 = 1000;
|
||||||
|
const MAX_DISTANCE: u32 = 8;
|
||||||
|
|
||||||
|
fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
||||||
|
if lhs <= rhs {
|
||||||
|
cmp::min(rhs - lhs, MAX_DISTANCE)
|
||||||
|
} else {
|
||||||
|
cmp::min((lhs - rhs) + 1, MAX_DISTANCE)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn positions_proximity(lhs: u32, rhs: u32) -> u32 {
|
||||||
|
let (lhs_attr, lhs_index) = extract_position(lhs);
|
||||||
|
let (rhs_attr, rhs_index) = extract_position(rhs);
|
||||||
|
if lhs_attr != rhs_attr { MAX_DISTANCE }
|
||||||
|
else { index_proximity(lhs_index, rhs_index) }
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_position(position: u32) -> (u32, u32) {
|
||||||
|
(position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE)
|
||||||
|
}
|
||||||
|
|
||||||
|
path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::<u32>()
|
||||||
|
}
|
||||||
|
|
||||||
let mut paths = Vec::new();
|
let mut paths = Vec::new();
|
||||||
for candidate in candidates {
|
for candidate in candidates {
|
||||||
let keywords = Self::fecth_keywords(rtxn, index, &derived_words, candidate)?;
|
let keywords = Self::fecth_keywords(rtxn, index, &derived_words, candidate)?;
|
||||||
near_proximity(keywords, &mut paths);
|
near_proximity(keywords, &mut paths, compute_proximity);
|
||||||
if let Some((prox, _path)) = paths.first() {
|
if let Some((prox, _path)) = paths.first() {
|
||||||
documents.push((*prox, candidate));
|
documents.push((*prox, candidate));
|
||||||
if *prox == min_proximity {
|
if *prox == min_proximity {
|
||||||
|
Loading…
Reference in New Issue
Block a user