mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-27 04:25:06 +08:00
Introduce a first draft of the best_proximity algorithm
This commit is contained in:
parent
dfdaceb410
commit
2a6d6a7f69
195
src/best_proximity.rs
Normal file
195
src/best_proximity.rs
Normal file
@ -0,0 +1,195 @@
|
|||||||
|
use std::cmp;
|
||||||
|
|
||||||
|
/// Number of word indexes reserved for each attribute: a position is encoded
/// as `attribute * ONE_ATTRIBUTE + index`.
const ONE_ATTRIBUTE: u32 = 1000;
/// Largest index that can be stored inside one attribute (`ONE_ATTRIBUTE - 1`).
const MAX_INDEX: u32 = ONE_ATTRIBUTE - 1;
/// Largest proximity reported between two positions; `best_proximity_for`
/// never returns a proximity above this value.
const MAX_DISTANCE: u32 = 8;
|
||||||
|
|
||||||
|
// Returns the attribute and index parts.
|
||||||
|
fn extract_position(position: u32) -> (u32, u32) {
|
||||||
|
(position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns a position from the two parts of it.
|
||||||
|
fn construct_position(attr: u32, index: u32) -> u32 {
|
||||||
|
attr * ONE_ATTRIBUTE + index
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO we should use an sdset::Set for `next_positions`.
|
||||||
|
// Returns the positions to focus that will give the best possible proximity.
|
||||||
|
fn best_proximity_for(current_position: u32, proximity: u32, next_positions: &[u32]) -> Option<(u32, Vec<u32>)> {
|
||||||
|
let (current_attr, _) = extract_position(current_position);
|
||||||
|
|
||||||
|
match proximity {
|
||||||
|
// look at i+0
|
||||||
|
0 => {
|
||||||
|
match next_positions.binary_search(¤t_position) {
|
||||||
|
Ok(_) => Some((0, vec![current_position])),
|
||||||
|
Err(_) => best_proximity_for(current_position, proximity + 1, next_positions),
|
||||||
|
}
|
||||||
|
},
|
||||||
|
// look at i+1
|
||||||
|
1 => {
|
||||||
|
let position = current_position + 1;
|
||||||
|
let (attr, _) = extract_position(position);
|
||||||
|
|
||||||
|
// We must check that we do not overflowed the current attribute. If so,
|
||||||
|
// we must check for a bigger proximity that we will be able to find behind.
|
||||||
|
if current_attr == attr {
|
||||||
|
match next_positions.binary_search(&position) {
|
||||||
|
Ok(_) => Some((1, vec![position])),
|
||||||
|
Err(_) => best_proximity_for(current_position, proximity + 1, next_positions),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
best_proximity_for(current_position, proximity + 1, next_positions)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
// look at i-(p-1), i+p
|
||||||
|
2..=7 => {
|
||||||
|
let mut output = Vec::new();
|
||||||
|
|
||||||
|
// Behind the current_position
|
||||||
|
if let Some(position) = current_position.checked_sub(proximity - 1) {
|
||||||
|
let (attr, _) = extract_position(position);
|
||||||
|
// We must make sure we are not looking at a word at the end of another attribute.
|
||||||
|
if current_attr == attr && next_positions.binary_search(&position).is_ok() {
|
||||||
|
output.push(position);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// In front of the current_position
|
||||||
|
let position = current_position + proximity;
|
||||||
|
let (attr, _) = extract_position(position);
|
||||||
|
// We must make sure we are not looking at a word at the end of another attribute.
|
||||||
|
if current_attr == attr && next_positions.binary_search(&position).is_ok() {
|
||||||
|
output.push(position);
|
||||||
|
}
|
||||||
|
|
||||||
|
if output.is_empty() {
|
||||||
|
best_proximity_for(current_position, proximity + 1, next_positions)
|
||||||
|
} else {
|
||||||
|
Some((proximity, output))
|
||||||
|
}
|
||||||
|
},
|
||||||
|
// look at i+8 and all above and i-(8-1) and all below
|
||||||
|
8 => {
|
||||||
|
let mut output = Vec::new();
|
||||||
|
|
||||||
|
// Make sure we look at the latest index of the previous attr.
|
||||||
|
if let Some(previous_position) = construct_position(current_attr, 0).checked_sub(1) {
|
||||||
|
let position = current_position.saturating_sub(7).max(previous_position);
|
||||||
|
match dbg!(next_positions.binary_search(&position)) {
|
||||||
|
Ok(i) => output.extend_from_slice(&next_positions[..=i]),
|
||||||
|
Err(i) => if let Some(i) = i.checked_sub(1) {
|
||||||
|
if let Some(positions) = next_positions.get(..=i) {
|
||||||
|
output.extend_from_slice(positions)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make sure the position doesn't overflow to the next attribute.
|
||||||
|
let position = (current_position + 8).min(construct_position(current_attr + 1, 0));
|
||||||
|
match next_positions.binary_search(&position) {
|
||||||
|
Ok(i) => output.extend_from_slice(&next_positions[i..]),
|
||||||
|
Err(i) => if let Some(positions) = next_positions.get(i..) {
|
||||||
|
output.extend_from_slice(positions);
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
if output.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some((8, output))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct BestProximity {
|
||||||
|
positions: Vec<Vec<u32>>,
|
||||||
|
current_proximity: Option<(u32, Vec<(u32, usize)>)>, // where we are
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BestProximity {
|
||||||
|
pub fn new(positions: Vec<Vec<u32>>) -> BestProximity {
|
||||||
|
BestProximity { positions, current_proximity: None }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Iterator for BestProximity {
|
||||||
|
type Item = (u32, Vec<u32>);
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
let output = Vec::new();
|
||||||
|
let best_proximity = 0;
|
||||||
|
|
||||||
|
for (i, positions) in self.positions.iter().enumerate() {
|
||||||
|
if let Some(next_positions) = self.positions.get(i + 1) {
|
||||||
|
for x in positions {
|
||||||
|
let p = next_positions.binary_search(&x);
|
||||||
|
let y = next_positions.get(p.unwrap_or_else(|p| p));
|
||||||
|
eprintln!("{:?} gives {:?} ({:?})", x, p, y);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// match &mut self.current_proximity {
|
||||||
|
// Some((_prox, _pos)) => {
|
||||||
|
// // ...
|
||||||
|
// },
|
||||||
|
// None => {
|
||||||
|
// // ...
|
||||||
|
// },
|
||||||
|
// }
|
||||||
|
|
||||||
|
Some((best_proximity, output))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // Checks that the paths are returned by ascending total proximity, the
    // total being the sum written on the left of every expected path.
    #[test]
    fn same_attribute() {
        let positions = vec![
            vec![0, 2, 3, 4],
            vec![1],
            vec![3, 6],
        ];
        let mut iter = BestProximity::new(positions);

        assert_eq!(iter.next(), Some((1+2, vec![0, 1, 3]))); // 3
        assert_eq!(iter.next(), Some((2+2, vec![2, 1, 3]))); // 4
        assert_eq!(iter.next(), Some((3+2, vec![3, 1, 3]))); // 5
        assert_eq!(iter.next(), Some((1+5, vec![0, 1, 6]))); // 6
        assert_eq!(iter.next(), Some((4+2, vec![4, 1, 3]))); // 6
        assert_eq!(iter.next(), Some((2+5, vec![2, 1, 6]))); // 7
        assert_eq!(iter.next(), Some((3+5, vec![3, 1, 6]))); // 8
        assert_eq!(iter.next(), Some((4+5, vec![4, 1, 6]))); // 9
        assert_eq!(iter.next(), None);
    }

    // Checks the positions selected for a single word, given the current
    // position and the proximity to start looking from.
    #[test]
    fn easy_best_proximity_for() {
        // classic
        assert_eq!(best_proximity_for(0, 0, &[0]), Some((0, vec![0])));
        assert_eq!(best_proximity_for(0, 1, &[0]), None);
        assert_eq!(best_proximity_for(1, 1, &[0]), Some((2, vec![0])));
        assert_eq!(best_proximity_for(0, 1, &[0, 1]), Some((1, vec![1])));
        assert_eq!(best_proximity_for(1, 1, &[0, 2]), Some((1, vec![2])));
        assert_eq!(best_proximity_for(1, 2, &[0, 2]), Some((2, vec![0])));
        assert_eq!(best_proximity_for(1, 2, &[0, 3]), Some((2, vec![0, 3])));

        // limits
        assert_eq!(best_proximity_for(2, 7, &[0, 9]), Some((7, vec![9])));
        assert_eq!(best_proximity_for(12, 7, &[6, 19]), Some((7, vec![6, 19])));

        // another attribute
        assert_eq!(best_proximity_for(1000, 7, &[994, 1007]), Some((7, vec![1007])));
        assert_eq!(best_proximity_for(1004, 7, &[994, 1011]), Some((7, vec![1011])));
        assert_eq!(best_proximity_for(1004, 8, &[900, 913, 1000, 1012, 2012]), Some((8, vec![900, 913, 1012, 2012])));
        assert_eq!(best_proximity_for(1009, 8, &[900, 913, 1002, 1012, 2012]), Some((8, vec![900, 913, 1002, 2012])));
    }
}
|
@ -273,17 +273,17 @@ fn index_csv(mut rdr: csv::Reader<File>) -> anyhow::Result<MtblKvStore> {
|
|||||||
.or_insert_with(FastMap4::default).entry(position) // positions
|
.or_insert_with(FastMap4::default).entry(position) // positions
|
||||||
.or_insert_with(RoaringBitmap::new).insert(document_id); // document ids
|
.or_insert_with(RoaringBitmap::new).insert(document_id); // document ids
|
||||||
|
|
||||||
// We save the documents ids under the position and prefix of the word we have seen it.
|
// // We save the documents ids under the position and prefix of the word we have seen it.
|
||||||
if let Some(prefix) = word.as_bytes().get(0..word.len().min(5)) {
|
// if let Some(prefix) = word.as_bytes().get(0..word.len().min(5)) {
|
||||||
for i in 1..=prefix.len() {
|
// for i in 1..=prefix.len() {
|
||||||
prefix_postings_attrs.entry(SmallVec32::from(&prefix[..i]))
|
// prefix_postings_attrs.entry(SmallVec32::from(&prefix[..i]))
|
||||||
.or_insert_with(RoaringBitmap::new).insert(position);
|
// .or_insert_with(RoaringBitmap::new).insert(position);
|
||||||
|
|
||||||
prefix_postings_ids.entry(SmallVec32::from(&prefix[..i]))
|
// prefix_postings_ids.entry(SmallVec32::from(&prefix[..i]))
|
||||||
.or_insert_with(FastMap4::default).entry(position) // positions
|
// .or_insert_with(FastMap4::default).entry(position) // positions
|
||||||
.or_insert_with(RoaringBitmap::new).insert(document_id); // document ids
|
// .or_insert_with(RoaringBitmap::new).insert(document_id); // document ids
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
25
src/lib.rs
25
src/lib.rs
@ -1,9 +1,9 @@
|
|||||||
|
mod best_proximity;
|
||||||
mod query_tokens;
|
mod query_tokens;
|
||||||
|
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::hash::BuildHasherDefault;
|
use std::hash::BuildHasherDefault;
|
||||||
use std::time::Instant;
|
|
||||||
|
|
||||||
use cow_utils::CowUtils;
|
use cow_utils::CowUtils;
|
||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
@ -15,6 +15,7 @@ use once_cell::sync::Lazy;
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use self::query_tokens::{QueryTokens, QueryToken};
|
use self::query_tokens::{QueryTokens, QueryToken};
|
||||||
|
use self::best_proximity::BestProximity;
|
||||||
|
|
||||||
// Building these factories is not free.
|
// Building these factories is not free.
|
||||||
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
|
||||||
@ -88,10 +89,12 @@ impl Index {
|
|||||||
});
|
});
|
||||||
|
|
||||||
let mut words_positions = Vec::new();
|
let mut words_positions = Vec::new();
|
||||||
|
let mut positions = Vec::new();
|
||||||
|
|
||||||
for (word, is_prefix, dfa) in dfas {
|
for (word, is_prefix, dfa) in dfas {
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
let mut union_positions = RoaringBitmap::default();
|
let mut union_positions = RoaringBitmap::default();
|
||||||
if word.len() <= 4 && is_prefix {
|
if false && word.len() <= 4 && is_prefix {
|
||||||
if let Some(ids) = self.prefix_postings_attrs.get(rtxn, word.as_bytes())? {
|
if let Some(ids) = self.prefix_postings_attrs.get(rtxn, word.as_bytes())? {
|
||||||
let right = RoaringBitmap::deserialize_from(ids)?;
|
let right = RoaringBitmap::deserialize_from(ids)?;
|
||||||
union_positions.union_with(&right);
|
union_positions.union_with(&right);
|
||||||
@ -110,23 +113,22 @@ impl Index {
|
|||||||
}
|
}
|
||||||
|
|
||||||
eprintln!("{} words for {:?} we have found positions {:?}", count, word, union_positions);
|
eprintln!("{} words for {:?} we have found positions {:?}", count, word, union_positions);
|
||||||
words_positions.push((word, is_prefix, dfa, union_positions));
|
words_positions.push((word, is_prefix, dfa));
|
||||||
|
positions.push(union_positions.iter().collect());
|
||||||
}
|
}
|
||||||
|
|
||||||
use itertools::EitherOrBoth;
|
// let positions = BestProximity::new(positions).next().unwrap_or_default();
|
||||||
let (a, b) = (&words_positions[0].3, &words_positions[1].3);
|
let _positions: Vec<Vec<u32>> = positions;
|
||||||
let positions: Vec<_> = itertools::merge_join_by(a, b, |a, b| (a + 1).cmp(b)).flat_map(EitherOrBoth::both).collect();
|
let positions = vec![0u32];
|
||||||
|
eprintln!("best proximity {:?}", positions);
|
||||||
if positions.is_empty() { return Ok(Vec::new()); }
|
|
||||||
|
|
||||||
let mut intersect_docids: Option<RoaringBitmap> = None;
|
let mut intersect_docids: Option<RoaringBitmap> = None;
|
||||||
for (i, (word, is_prefix, dfa, _)) in words_positions.into_iter().take(2).enumerate() {
|
for ((word, is_prefix, dfa), pos) in words_positions.into_iter().zip(positions) {
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
let mut union_docids = RoaringBitmap::default();
|
let mut union_docids = RoaringBitmap::default();
|
||||||
|
|
||||||
if word.len() <= 4 && is_prefix {
|
if false && word.len() <= 4 && is_prefix {
|
||||||
let mut key = word.as_bytes()[..word.len().min(5)].to_vec();
|
let mut key = word.as_bytes()[..word.len().min(5)].to_vec();
|
||||||
let pos = if i == 0 { positions[0].0 } else { positions[0].1 };
|
|
||||||
key.extend_from_slice(&pos.to_be_bytes());
|
key.extend_from_slice(&pos.to_be_bytes());
|
||||||
if let Some(ids) = self.prefix_postings_ids.get(rtxn, &key)? {
|
if let Some(ids) = self.prefix_postings_ids.get(rtxn, &key)? {
|
||||||
let right = RoaringBitmap::deserialize_from(ids)?;
|
let right = RoaringBitmap::deserialize_from(ids)?;
|
||||||
@ -138,7 +140,6 @@ impl Index {
|
|||||||
while let Some(word) = stream.next() {
|
while let Some(word) = stream.next() {
|
||||||
let word = std::str::from_utf8(word)?;
|
let word = std::str::from_utf8(word)?;
|
||||||
let mut key = word.as_bytes().to_vec();
|
let mut key = word.as_bytes().to_vec();
|
||||||
let pos = if i == 0 { positions[0].0 } else { positions[0].1 };
|
|
||||||
key.extend_from_slice(&pos.to_be_bytes());
|
key.extend_from_slice(&pos.to_be_bytes());
|
||||||
if let Some(attrs) = self.postings_ids.get(rtxn, &key)? {
|
if let Some(attrs) = self.postings_ids.get(rtxn, &key)? {
|
||||||
let right = RoaringBitmap::deserialize_from(attrs)?;
|
let right = RoaringBitmap::deserialize_from(attrs)?;
|
||||||
|
Loading…
Reference in New Issue
Block a user