mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-18 08:48:32 +08:00
Merge pull request #453 from meilisearch/introduce-query-tree
Introduce a query tree structure
This commit is contained in:
commit
69adb1d771
16
Cargo.lock
generated
16
Cargo.lock
generated
@ -799,6 +799,14 @@ dependencies = [
|
|||||||
"serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "intervaltree"
|
||||||
|
version = "0.2.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"smallvec 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "iovec"
|
name = "iovec"
|
||||||
version = "0.1.4"
|
version = "0.1.4"
|
||||||
@ -952,6 +960,7 @@ dependencies = [
|
|||||||
"hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
"hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"intervaltree 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -1797,6 +1806,11 @@ dependencies = [
|
|||||||
"maybe-uninit 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"maybe-uninit 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "smallvec"
|
||||||
|
version = "1.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "sourcefile"
|
name = "sourcefile"
|
||||||
version = "0.1.4"
|
version = "0.1.4"
|
||||||
@ -2715,6 +2729,7 @@ dependencies = [
|
|||||||
"checksum idna 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "38f09e0f0b1fb55fdee1f17470ad800da77af5186a1a76c026b679358b7e844e"
|
"checksum idna 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "38f09e0f0b1fb55fdee1f17470ad800da77af5186a1a76c026b679358b7e844e"
|
||||||
"checksum idna 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9"
|
"checksum idna 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9"
|
||||||
"checksum indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712d7b3ea5827fcb9d4fda14bf4da5f136f0db2ae9c8f4bd4e2d1c6fde4e6db2"
|
"checksum indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712d7b3ea5827fcb9d4fda14bf4da5f136f0db2ae9c8f4bd4e2d1c6fde4e6db2"
|
||||||
|
"checksum intervaltree 0.2.5 (registry+https://github.com/rust-lang/crates.io-index)" = "8254add2ea664734c9d001f8151cc3d7696b135f7e40e5a2efa814a662cb3a44"
|
||||||
"checksum iovec 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e"
|
"checksum iovec 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e"
|
||||||
"checksum itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484"
|
"checksum itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484"
|
||||||
"checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f"
|
"checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f"
|
||||||
@ -2822,6 +2837,7 @@ dependencies = [
|
|||||||
"checksum slice-group-by 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "1f7474f0b646d228360ab62ed974744617bc869d959eac8403bfa3665931a7fb"
|
"checksum slice-group-by 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "1f7474f0b646d228360ab62ed974744617bc869d959eac8403bfa3665931a7fb"
|
||||||
"checksum slog 2.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1cc9c640a4adbfbcc11ffb95efe5aa7af7309e002adab54b185507dbf2377b99"
|
"checksum slog 2.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1cc9c640a4adbfbcc11ffb95efe5aa7af7309e002adab54b185507dbf2377b99"
|
||||||
"checksum smallvec 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)" = "f7b0758c52e15a8b5e3691eae6cc559f08eee9406e548a4477ba4e67770a82b6"
|
"checksum smallvec 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)" = "f7b0758c52e15a8b5e3691eae6cc559f08eee9406e548a4477ba4e67770a82b6"
|
||||||
|
"checksum smallvec 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "44e59e0c9fa00817912ae6e4e6e3c4fe04455e75699d06eedc7d85917ed8e8f4"
|
||||||
"checksum sourcefile 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4bf77cb82ba8453b42b6ae1d692e4cdc92f9a47beaf89a847c8be83f4e328ad3"
|
"checksum sourcefile 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4bf77cb82ba8453b42b6ae1d692e4cdc92f9a47beaf89a847c8be83f4e328ad3"
|
||||||
"checksum spin 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
|
"checksum spin 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
|
||||||
"checksum stdweb 0.4.20 (registry+https://github.com/rust-lang/crates.io-index)" = "d022496b16281348b52d0e30ae99e01a73d737b2f45d38fed4edf79f9325a1d5"
|
"checksum stdweb 0.4.20 (registry+https://github.com/rust-lang/crates.io-index)" = "d022496b16281348b52d0e30ae99e01a73d737b2f45d38fed4edf79f9325a1d5"
|
||||||
|
@ -17,7 +17,8 @@ env_logger = "0.7.0"
|
|||||||
fst = { version = "0.3.5", default-features = false }
|
fst = { version = "0.3.5", default-features = false }
|
||||||
hashbrown = { version = "0.6.0", features = ["serde"] }
|
hashbrown = { version = "0.6.0", features = ["serde"] }
|
||||||
heed = "0.6.1"
|
heed = "0.6.1"
|
||||||
itertools = "0.8.2" # kill me please
|
intervaltree = "0.2.5"
|
||||||
|
itertools = "0.8.2"
|
||||||
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
|
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
|
||||||
log = "0.4.8"
|
log = "0.4.8"
|
||||||
meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.4" }
|
meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.4" }
|
||||||
|
@ -1,13 +1,8 @@
|
|||||||
mod dfa;
|
mod dfa;
|
||||||
mod query_enhancer;
|
|
||||||
|
|
||||||
use meilisearch_tokenizer::is_cjk;
|
use meilisearch_tokenizer::is_cjk;
|
||||||
|
|
||||||
pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
|
pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
|
||||||
pub use self::query_enhancer::QueryEnhancer;
|
|
||||||
pub use self::query_enhancer::QueryEnhancerBuilder;
|
|
||||||
|
|
||||||
pub const NGRAMS: usize = 3;
|
|
||||||
|
|
||||||
pub fn normalize_str(string: &str) -> String {
|
pub fn normalize_str(string: &str) -> String {
|
||||||
let mut string = string.to_lowercase();
|
let mut string = string.to_lowercase();
|
||||||
|
@ -1,437 +0,0 @@
|
|||||||
use std::cmp::Ordering::{Equal, Greater, Less};
|
|
||||||
use std::ops::Range;
|
|
||||||
|
|
||||||
/// Returns `true` if the specified range can accept the given replacement words.
|
|
||||||
/// Returns `false` if the replacement words are already present in the original query
|
|
||||||
/// or if the replacement is not longer (in words) than the range to replace.
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// ## Ignored because already present in original
|
|
||||||
//
|
|
||||||
// new york city subway
|
|
||||||
// -------- ^^^^
|
|
||||||
// / \
|
|
||||||
// [new york city]
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// ## Ignored because smaller than the original
|
|
||||||
//
|
|
||||||
// new york city subway
|
|
||||||
// -------------
|
|
||||||
// \ /
|
|
||||||
// [new york]
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// ## Accepted because bigger than the original
|
|
||||||
//
|
|
||||||
// NYC subway
|
|
||||||
// ---
|
|
||||||
// / \
|
|
||||||
// / \
|
|
||||||
// / \
|
|
||||||
// / \
|
|
||||||
// / \
|
|
||||||
// [new york city]
|
|
||||||
//
|
|
||||||
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
|
|
||||||
where
|
|
||||||
S: AsRef<str>,
|
|
||||||
T: AsRef<str>,
|
|
||||||
{
|
|
||||||
if words.len() <= range.len() {
|
|
||||||
// the replacement does not contain more words
|
|
||||||
// than there are already in the replaced range
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// retrieve the part to rewrite but with the length
|
|
||||||
// of the replacement part
|
|
||||||
let original = query.iter().skip(range.start).take(words.len());
|
|
||||||
|
|
||||||
// check if the original query doesn't already contain
|
|
||||||
// the replacement words
|
|
||||||
!original
|
|
||||||
.map(AsRef::as_ref)
|
|
||||||
.eq(words.iter().map(AsRef::as_ref))
|
|
||||||
}
|
|
||||||
|
|
||||||
type Origin = usize;
|
|
||||||
type RealLength = usize;
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
struct FakeIntervalTree {
|
|
||||||
intervals: Vec<(Range<usize>, (Origin, RealLength))>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl FakeIntervalTree {
|
|
||||||
fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
|
|
||||||
intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
|
|
||||||
FakeIntervalTree { intervals }
|
|
||||||
}
|
|
||||||
|
|
||||||
fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
|
|
||||||
let element = self.intervals.binary_search_by(|(r, _)| {
|
|
||||||
if point >= r.start {
|
|
||||||
if point < r.end {
|
|
||||||
Equal
|
|
||||||
} else {
|
|
||||||
Less
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
Greater
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
let n = match element {
|
|
||||||
Ok(n) => n,
|
|
||||||
Err(n) => n,
|
|
||||||
};
|
|
||||||
|
|
||||||
match self.intervals.get(n) {
|
|
||||||
Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
|
|
||||||
_otherwise => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct QueryEnhancerBuilder<'a, S> {
|
|
||||||
query: &'a [S],
|
|
||||||
origins: Vec<usize>,
|
|
||||||
real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
|
|
||||||
pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
|
|
||||||
// we initialize origins query indices based on their positions
|
|
||||||
let origins: Vec<_> = (0..=query.len()).collect();
|
|
||||||
let real_to_origin = origins.iter().map(|&o| (o..o + 1, (o, 1))).collect();
|
|
||||||
|
|
||||||
QueryEnhancerBuilder {
|
|
||||||
query,
|
|
||||||
origins,
|
|
||||||
real_to_origin,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Update the final real to origin query indices mapping.
|
|
||||||
///
|
|
||||||
/// `range` is the range of original words that the `replacement` words replace
|
|
||||||
/// and `real` is the first real query index of these replacement words.
|
|
||||||
pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
|
|
||||||
where
|
|
||||||
T: AsRef<str>,
|
|
||||||
{
|
|
||||||
// check if the range of original words
|
|
||||||
// can be rewritten with the replacement words
|
|
||||||
if rewrite_range_with(self.query, range.clone(), replacement) {
|
|
||||||
// this range can be replaced so we need to
|
|
||||||
// modify the origins accordingly
|
|
||||||
let offset = replacement.len() - range.len();
|
|
||||||
|
|
||||||
let previous_padding = self.origins[range.end - 1];
|
|
||||||
let current_offset = (self.origins[range.end] - 1) - previous_padding;
|
|
||||||
let diff = offset.saturating_sub(current_offset);
|
|
||||||
self.origins[range.end] += diff;
|
|
||||||
|
|
||||||
for r in &mut self.origins[range.end + 1..] {
|
|
||||||
*r += diff;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// we need to store the real number and origins relations
|
|
||||||
// this way it will be possible to know by how many
|
|
||||||
// we need to pad real query indices
|
|
||||||
let real_range = real..real + replacement.len().max(range.len());
|
|
||||||
let real_length = replacement.len();
|
|
||||||
self.real_to_origin.push((real_range, (range.start, real_length)));
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn build(self) -> QueryEnhancer {
|
|
||||||
let interval_tree = FakeIntervalTree::new(self.real_to_origin);
|
|
||||||
let mut table = Vec::new();
|
|
||||||
|
|
||||||
for real in 0.. {
|
|
||||||
match replacement(&self.origins, &interval_tree, real) {
|
|
||||||
Some(range) => table.push(range),
|
|
||||||
None => break,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
QueryEnhancer { table }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the query indices that represent this real query index.
|
|
||||||
fn replacement(
|
|
||||||
origins: &[usize],
|
|
||||||
real_to_origin: &FakeIntervalTree,
|
|
||||||
real: u32,
|
|
||||||
) -> Option<Range<u32>>
|
|
||||||
{
|
|
||||||
let real = real as usize;
|
|
||||||
|
|
||||||
// query the fake interval tree with the real query index
|
|
||||||
let (range, (origin, real_length)) = real_to_origin.query(real)?;
|
|
||||||
|
|
||||||
// if `real` is the end bound of the range
|
|
||||||
if (range.start + real_length - 1) == real {
|
|
||||||
let mut count = range.len();
|
|
||||||
let mut new_origin = origin;
|
|
||||||
for (i, slice) in origins[new_origin..].windows(2).enumerate() {
|
|
||||||
let len = slice[1] - slice[0];
|
|
||||||
count = count.saturating_sub(len);
|
|
||||||
if count == 0 {
|
|
||||||
new_origin = origin + i;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let n = real - range.start;
|
|
||||||
let start = origins[origin];
|
|
||||||
let end = origins.get(new_origin + 1)?;
|
|
||||||
let remaining = (end - start) - n;
|
|
||||||
|
|
||||||
Some(Range {
|
|
||||||
start: (start + n) as u32,
|
|
||||||
end: (start + n + remaining) as u32,
|
|
||||||
})
|
|
||||||
} else {
|
|
||||||
// just return the origin along with
|
|
||||||
// the real position of the word
|
|
||||||
let n = real as usize - range.start;
|
|
||||||
let origin = origins[origin];
|
|
||||||
|
|
||||||
Some(Range {
|
|
||||||
start: (origin + n) as u32,
|
|
||||||
end: (origin + n + 1) as u32,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub struct QueryEnhancer {
|
|
||||||
table: Vec<Range<u32>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl QueryEnhancer {
|
|
||||||
/// Returns the query indices that represent this real query index.
|
|
||||||
pub fn replacement(&self, real: u32) -> Range<u32> {
|
|
||||||
self.table[real as usize].clone()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::*;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn original_unmodified() {
|
|
||||||
let query = ["new", "york", "city", "subway"];
|
|
||||||
// 0 1 2 3
|
|
||||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
|
||||||
|
|
||||||
// new york = new york city
|
|
||||||
builder.declare(0..2, 4, &["new", "york", "city"]);
|
|
||||||
// ^ 4 5 6
|
|
||||||
|
|
||||||
let enhancer = builder.build();
|
|
||||||
|
|
||||||
assert_eq!(enhancer.replacement(0), 0..1); // new
|
|
||||||
assert_eq!(enhancer.replacement(1), 1..2); // york
|
|
||||||
assert_eq!(enhancer.replacement(2), 2..3); // city
|
|
||||||
assert_eq!(enhancer.replacement(3), 3..4); // subway
|
|
||||||
assert_eq!(enhancer.replacement(4), 0..1); // new
|
|
||||||
assert_eq!(enhancer.replacement(5), 1..2); // york
|
|
||||||
assert_eq!(enhancer.replacement(6), 2..3); // city
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn simple_growing() {
|
|
||||||
let query = ["new", "york", "subway"];
|
|
||||||
// 0 1 2
|
|
||||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
|
||||||
|
|
||||||
// new york = new york city
|
|
||||||
builder.declare(0..2, 3, &["new", "york", "city"]);
|
|
||||||
// ^ 3 4 5
|
|
||||||
|
|
||||||
let enhancer = builder.build();
|
|
||||||
|
|
||||||
assert_eq!(enhancer.replacement(0), 0..1); // new
|
|
||||||
assert_eq!(enhancer.replacement(1), 1..3); // york
|
|
||||||
assert_eq!(enhancer.replacement(2), 3..4); // subway
|
|
||||||
assert_eq!(enhancer.replacement(3), 0..1); // new
|
|
||||||
assert_eq!(enhancer.replacement(4), 1..2); // york
|
|
||||||
assert_eq!(enhancer.replacement(5), 2..3); // city
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn same_place_growings() {
|
|
||||||
let query = ["NY", "subway"];
|
|
||||||
// 0 1
|
|
||||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
|
||||||
|
|
||||||
// NY = new york
|
|
||||||
builder.declare(0..1, 2, &["new", "york"]);
|
|
||||||
// ^ 2 3
|
|
||||||
|
|
||||||
// NY = new york city
|
|
||||||
builder.declare(0..1, 4, &["new", "york", "city"]);
|
|
||||||
// ^ 4 5 6
|
|
||||||
|
|
||||||
// NY = NYC
|
|
||||||
builder.declare(0..1, 7, &["NYC"]);
|
|
||||||
// ^ 7
|
|
||||||
|
|
||||||
// NY = new york city
|
|
||||||
builder.declare(0..1, 8, &["new", "york", "city"]);
|
|
||||||
// ^ 8 9 10
|
|
||||||
|
|
||||||
// subway = underground train
|
|
||||||
builder.declare(1..2, 11, &["underground", "train"]);
|
|
||||||
// ^ 11 12
|
|
||||||
|
|
||||||
let enhancer = builder.build();
|
|
||||||
|
|
||||||
assert_eq!(enhancer.replacement(0), 0..3); // NY
|
|
||||||
assert_eq!(enhancer.replacement(1), 3..5); // subway
|
|
||||||
assert_eq!(enhancer.replacement(2), 0..1); // new
|
|
||||||
assert_eq!(enhancer.replacement(3), 1..3); // york
|
|
||||||
assert_eq!(enhancer.replacement(4), 0..1); // new
|
|
||||||
assert_eq!(enhancer.replacement(5), 1..2); // york
|
|
||||||
assert_eq!(enhancer.replacement(6), 2..3); // city
|
|
||||||
assert_eq!(enhancer.replacement(7), 0..3); // NYC
|
|
||||||
assert_eq!(enhancer.replacement(8), 0..1); // new
|
|
||||||
assert_eq!(enhancer.replacement(9), 1..2); // york
|
|
||||||
assert_eq!(enhancer.replacement(10), 2..3); // city
|
|
||||||
assert_eq!(enhancer.replacement(11), 3..4); // underground
|
|
||||||
assert_eq!(enhancer.replacement(12), 4..5); // train
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn bigger_growing() {
|
|
||||||
let query = ["NYC", "subway"];
|
|
||||||
// 0 1
|
|
||||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
|
||||||
|
|
||||||
// NYC = new york city
|
|
||||||
builder.declare(0..1, 2, &["new", "york", "city"]);
|
|
||||||
// ^ 2 3 4
|
|
||||||
|
|
||||||
let enhancer = builder.build();
|
|
||||||
|
|
||||||
assert_eq!(enhancer.replacement(0), 0..3); // NYC
|
|
||||||
assert_eq!(enhancer.replacement(1), 3..4); // subway
|
|
||||||
assert_eq!(enhancer.replacement(2), 0..1); // new
|
|
||||||
assert_eq!(enhancer.replacement(3), 1..2); // york
|
|
||||||
assert_eq!(enhancer.replacement(4), 2..3); // city
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn middle_query_growing() {
|
|
||||||
let query = ["great", "awesome", "NYC", "subway"];
|
|
||||||
// 0 1 2 3
|
|
||||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
|
||||||
|
|
||||||
// NYC = new york city
|
|
||||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
|
||||||
// ^ 4 5 6
|
|
||||||
|
|
||||||
let enhancer = builder.build();
|
|
||||||
|
|
||||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
|
||||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
|
||||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
|
||||||
assert_eq!(enhancer.replacement(3), 5..6); // subway
|
|
||||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
|
||||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
|
||||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn end_query_growing() {
|
|
||||||
let query = ["NYC", "subway"];
|
|
||||||
// 0 1
|
|
||||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
|
||||||
|
|
||||||
// subway = underground train
|
|
||||||
builder.declare(1..2, 2, &["underground", "train"]);
|
|
||||||
// ^ 2 3
|
|
||||||
|
|
||||||
let enhancer = builder.build();
|
|
||||||
|
|
||||||
assert_eq!(enhancer.replacement(0), 0..1); // NYC
|
|
||||||
assert_eq!(enhancer.replacement(1), 1..3); // subway
|
|
||||||
assert_eq!(enhancer.replacement(2), 1..2); // underground
|
|
||||||
assert_eq!(enhancer.replacement(3), 2..3); // train
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn multiple_growings() {
|
|
||||||
let query = ["great", "awesome", "NYC", "subway"];
|
|
||||||
// 0 1 2 3
|
|
||||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
|
||||||
|
|
||||||
// NYC = new york city
|
|
||||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
|
||||||
// ^ 4 5 6
|
|
||||||
|
|
||||||
// subway = underground train
|
|
||||||
builder.declare(3..4, 7, &["underground", "train"]);
|
|
||||||
// ^ 7 8
|
|
||||||
|
|
||||||
let enhancer = builder.build();
|
|
||||||
|
|
||||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
|
||||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
|
||||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
|
||||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
|
||||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
|
||||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
|
||||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
|
||||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
|
||||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn multiple_probable_growings() {
|
|
||||||
let query = ["great", "awesome", "NYC", "subway"];
|
|
||||||
// 0 1 2 3
|
|
||||||
let mut builder = QueryEnhancerBuilder::new(&query);
|
|
||||||
|
|
||||||
// NYC = new york city
|
|
||||||
builder.declare(2..3, 4, &["new", "york", "city"]);
|
|
||||||
// ^ 4 5 6
|
|
||||||
|
|
||||||
// subway = underground train
|
|
||||||
builder.declare(3..4, 7, &["underground", "train"]);
|
|
||||||
// ^ 7 8
|
|
||||||
|
|
||||||
// great awesome = good
|
|
||||||
builder.declare(0..2, 9, &["good"]);
|
|
||||||
// ^ 9
|
|
||||||
|
|
||||||
// awesome NYC = NY
|
|
||||||
builder.declare(1..3, 10, &["NY"]);
|
|
||||||
// ^^ 10
|
|
||||||
|
|
||||||
// NYC subway = metro
|
|
||||||
builder.declare(2..4, 11, &["metro"]);
|
|
||||||
// ^^ 11
|
|
||||||
|
|
||||||
let enhancer = builder.build();
|
|
||||||
|
|
||||||
assert_eq!(enhancer.replacement(0), 0..1); // great
|
|
||||||
assert_eq!(enhancer.replacement(1), 1..2); // awesome
|
|
||||||
assert_eq!(enhancer.replacement(2), 2..5); // NYC
|
|
||||||
assert_eq!(enhancer.replacement(3), 5..7); // subway
|
|
||||||
assert_eq!(enhancer.replacement(4), 2..3); // new
|
|
||||||
assert_eq!(enhancer.replacement(5), 3..4); // york
|
|
||||||
assert_eq!(enhancer.replacement(6), 4..5); // city
|
|
||||||
assert_eq!(enhancer.replacement(7), 5..6); // underground
|
|
||||||
assert_eq!(enhancer.replacement(8), 6..7); // train
|
|
||||||
assert_eq!(enhancer.replacement(9), 0..2); // good
|
|
||||||
assert_eq!(enhancer.replacement(10), 1..5); // NY
|
|
||||||
assert_eq!(enhancer.replacement(11), 2..5); // metro
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,31 +1,27 @@
|
|||||||
use std::ops::Deref;
|
|
||||||
use std::{cmp, fmt};
|
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
use std::collections::HashMap;
|
||||||
use std::mem;
|
use std::mem;
|
||||||
|
use std::ops::Deref;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use std::rc::Rc;
|
use std::rc::Rc;
|
||||||
use std::time::{Duration, Instant};
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||||
|
use std::time::Instant;
|
||||||
|
use std::fmt;
|
||||||
|
|
||||||
use compact_arena::{SmallArena, Idx32, mk_arena};
|
use compact_arena::{SmallArena, Idx32, mk_arena};
|
||||||
use fst::{IntoStreamer, Streamer};
|
|
||||||
use hashbrown::HashMap;
|
|
||||||
use levenshtein_automata::DFA;
|
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use meilisearch_tokenizer::{is_cjk, split_query_string};
|
|
||||||
use meilisearch_types::DocIndex;
|
use meilisearch_types::DocIndex;
|
||||||
use sdset::{Set, SetBuf};
|
use sdset::{Set, SetBuf, exponential_search};
|
||||||
use slice_group_by::{GroupBy, GroupByMut};
|
use slice_group_by::{GroupBy, GroupByMut};
|
||||||
|
|
||||||
use crate::automaton::NGRAMS;
|
|
||||||
use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
|
|
||||||
use crate::automaton::normalize_str;
|
|
||||||
use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
|
|
||||||
|
|
||||||
use crate::criterion::{Criteria, Context, ContextMut};
|
use crate::criterion::{Criteria, Context, ContextMut};
|
||||||
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
||||||
use crate::raw_document::RawDocument;
|
use crate::raw_document::RawDocument;
|
||||||
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
|
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
|
||||||
use crate::{store, Document, DocumentId, MResult};
|
use crate::{store, Document, DocumentId, MResult};
|
||||||
|
use crate::query_tree::{create_query_tree, traverse_query_tree};
|
||||||
|
use crate::query_tree::{Operation, QueryResult, QueryKind, QueryId, PostingsKey};
|
||||||
|
use crate::query_tree::Context as QTContext;
|
||||||
|
|
||||||
pub fn bucket_sort<'c, FI>(
|
pub fn bucket_sort<'c, FI>(
|
||||||
reader: &heed::RoTxn<MainT>,
|
reader: &heed::RoTxn<MainT>,
|
||||||
@ -38,6 +34,8 @@ pub fn bucket_sort<'c, FI>(
|
|||||||
postings_lists_store: store::PostingsLists,
|
postings_lists_store: store::PostingsLists,
|
||||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||||
synonyms_store: store::Synonyms,
|
synonyms_store: store::Synonyms,
|
||||||
|
prefix_documents_cache_store: store::PrefixDocumentsCache,
|
||||||
|
prefix_postings_lists_cache_store: store::PrefixPostingsListsCache,
|
||||||
) -> MResult<Vec<Document>>
|
) -> MResult<Vec<Document>>
|
||||||
where
|
where
|
||||||
FI: Fn(DocumentId) -> bool,
|
FI: Fn(DocumentId) -> bool,
|
||||||
@ -60,42 +58,63 @@ where
|
|||||||
postings_lists_store,
|
postings_lists_store,
|
||||||
documents_fields_counts_store,
|
documents_fields_counts_store,
|
||||||
synonyms_store,
|
synonyms_store,
|
||||||
|
prefix_documents_cache_store,
|
||||||
|
prefix_postings_lists_cache_store,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
let (mut automatons, mut query_enhancer) =
|
let words_set = match unsafe { main_store.static_words_fst(reader)? } {
|
||||||
construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
|
Some(words) => words,
|
||||||
|
None => return Ok(Vec::new()),
|
||||||
|
};
|
||||||
|
|
||||||
debug!("{:?}", query_enhancer);
|
let context = QTContext {
|
||||||
|
words_set,
|
||||||
|
synonyms: synonyms_store,
|
||||||
|
postings_lists: postings_lists_store,
|
||||||
|
prefix_postings_lists: prefix_postings_lists_cache_store,
|
||||||
|
};
|
||||||
|
|
||||||
let before_postings_lists_fetching = Instant::now();
|
let (operation, mapping) = create_query_tree(reader, &context, query)?;
|
||||||
mk_arena!(arena);
|
debug!("operation:\n{:?}", operation);
|
||||||
let mut bare_matches =
|
debug!("mapping:\n{:?}", mapping);
|
||||||
fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
|
|
||||||
debug!("bare matches ({}) retrieved in {:.02?}",
|
|
||||||
bare_matches.len(),
|
|
||||||
before_postings_lists_fetching.elapsed(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let before_raw_documents_presort = Instant::now();
|
fn recurs_operation<'o>(map: &mut HashMap<QueryId, &'o QueryKind>, operation: &'o Operation) {
|
||||||
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
|
match operation {
|
||||||
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
|
Operation::And(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
|
||||||
|
Operation::Or(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
|
||||||
let before_raw_documents_building = Instant::now();
|
Operation::Query(query) => { map.insert(query.id, &query.kind); },
|
||||||
let mut prefiltered_documents = 0;
|
|
||||||
let mut raw_documents = Vec::new();
|
|
||||||
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
|
||||||
prefiltered_documents += 1;
|
|
||||||
if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) {
|
|
||||||
raw_documents.push(raw_document);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
debug!("creating {} (original {}) candidates documents took {:.02?}",
|
|
||||||
|
let mut queries_kinds = HashMap::new();
|
||||||
|
recurs_operation(&mut queries_kinds, &operation);
|
||||||
|
|
||||||
|
let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation)?;
|
||||||
|
debug!("found {} documents", docids.len());
|
||||||
|
debug!("number of postings {:?}", queries.len());
|
||||||
|
|
||||||
|
let before = Instant::now();
|
||||||
|
mk_arena!(arena);
|
||||||
|
let mut bare_matches = cleanup_bare_matches(&mut arena, &docids, queries);
|
||||||
|
debug!("matches cleaned in {:.02?}", before.elapsed());
|
||||||
|
|
||||||
|
let before_bucket_sort = Instant::now();
|
||||||
|
|
||||||
|
let before_raw_documents_building = Instant::now();
|
||||||
|
let mut raw_documents = Vec::new();
|
||||||
|
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
||||||
|
let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
|
||||||
|
raw_documents.push(raw_document);
|
||||||
|
}
|
||||||
|
debug!("creating {} candidates documents took {:.02?}",
|
||||||
raw_documents.len(),
|
raw_documents.len(),
|
||||||
prefiltered_documents,
|
|
||||||
before_raw_documents_building.elapsed(),
|
before_raw_documents_building.elapsed(),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let before_criterion_loop = Instant::now();
|
||||||
|
let proximity_count = AtomicUsize::new(0);
|
||||||
|
|
||||||
let mut groups = vec![raw_documents.as_mut_slice()];
|
let mut groups = vec![raw_documents.as_mut_slice()];
|
||||||
|
|
||||||
'criteria: for criterion in criteria.as_ref() {
|
'criteria: for criterion in criteria.as_ref() {
|
||||||
@ -108,8 +127,7 @@ where
|
|||||||
let ctx = ContextMut {
|
let ctx = ContextMut {
|
||||||
reader,
|
reader,
|
||||||
postings_lists: &mut arena,
|
postings_lists: &mut arena,
|
||||||
query_enhancer: &mut query_enhancer,
|
query_mapping: &mapping,
|
||||||
automatons: &mut automatons,
|
|
||||||
documents_fields_counts_store,
|
documents_fields_counts_store,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -118,8 +136,7 @@ where
|
|||||||
|
|
||||||
let ctx = Context {
|
let ctx = Context {
|
||||||
postings_lists: &arena,
|
postings_lists: &arena,
|
||||||
query_enhancer: &query_enhancer,
|
query_mapping: &mapping,
|
||||||
automatons: &automatons,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let before_criterion_sort = Instant::now();
|
let before_criterion_sort = Instant::now();
|
||||||
@ -141,10 +158,16 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
|
debug!("criterion loop took {:.02?}", before_criterion_loop.elapsed());
|
||||||
let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena, searchable_attrs.as_ref()));
|
debug!("proximity evaluation called {} times", proximity_count.load(Ordering::Relaxed));
|
||||||
|
|
||||||
Ok(iter.collect())
|
let iter = raw_documents.into_iter().skip(range.start).take(range.len());
|
||||||
|
let iter = iter.map(|rd| Document::from_raw(rd, &queries_kinds, &arena, searchable_attrs.as_ref()));
|
||||||
|
let documents = iter.collect();
|
||||||
|
|
||||||
|
debug!("bucket sort took {:.02?}", before_bucket_sort.elapsed());
|
||||||
|
|
||||||
|
Ok(documents)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn bucket_sort_with_distinct<'c, FI, FD>(
|
pub fn bucket_sort_with_distinct<'c, FI, FD>(
|
||||||
@ -160,38 +183,57 @@ pub fn bucket_sort_with_distinct<'c, FI, FD>(
|
|||||||
postings_lists_store: store::PostingsLists,
|
postings_lists_store: store::PostingsLists,
|
||||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||||
synonyms_store: store::Synonyms,
|
synonyms_store: store::Synonyms,
|
||||||
|
_prefix_documents_cache_store: store::PrefixDocumentsCache,
|
||||||
|
prefix_postings_lists_cache_store: store::PrefixPostingsListsCache,
|
||||||
) -> MResult<Vec<Document>>
|
) -> MResult<Vec<Document>>
|
||||||
where
|
where
|
||||||
FI: Fn(DocumentId) -> bool,
|
FI: Fn(DocumentId) -> bool,
|
||||||
FD: Fn(DocumentId) -> Option<u64>,
|
FD: Fn(DocumentId) -> Option<u64>,
|
||||||
{
|
{
|
||||||
let (mut automatons, mut query_enhancer) =
|
let words_set = match unsafe { main_store.static_words_fst(reader)? } {
|
||||||
construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;
|
Some(words) => words,
|
||||||
|
None => return Ok(Vec::new()),
|
||||||
|
};
|
||||||
|
|
||||||
let before_postings_lists_fetching = Instant::now();
|
let context = QTContext {
|
||||||
mk_arena!(arena);
|
words_set,
|
||||||
let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
|
synonyms: synonyms_store,
|
||||||
debug!("bare matches ({}) retrieved in {:.02?}",
|
postings_lists: postings_lists_store,
|
||||||
bare_matches.len(),
|
prefix_postings_lists: prefix_postings_lists_cache_store,
|
||||||
before_postings_lists_fetching.elapsed(),
|
};
|
||||||
);
|
|
||||||
|
|
||||||
let before_raw_documents_presort = Instant::now();
|
let (operation, mapping) = create_query_tree(reader, &context, query)?;
|
||||||
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
|
debug!("operation:\n{:?}", operation);
|
||||||
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
|
debug!("mapping:\n{:?}", mapping);
|
||||||
|
|
||||||
let before_raw_documents_building = Instant::now();
|
fn recurs_operation<'o>(map: &mut HashMap<QueryId, &'o QueryKind>, operation: &'o Operation) {
|
||||||
let mut prefiltered_documents = 0;
|
match operation {
|
||||||
let mut raw_documents = Vec::new();
|
Operation::And(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
|
||||||
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
Operation::Or(ops) => ops.iter().for_each(|op| recurs_operation(map, op)),
|
||||||
prefiltered_documents += 1;
|
Operation::Query(query) => { map.insert(query.id, &query.kind); },
|
||||||
if let Some(raw_document) = RawDocument::new(bare_matches, &automatons, &mut arena, searchable_attrs.as_ref()) {
|
|
||||||
raw_documents.push(raw_document);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
debug!("creating {} (original {}) candidates documents took {:.02?}",
|
|
||||||
|
let mut queries_kinds = HashMap::new();
|
||||||
|
recurs_operation(&mut queries_kinds, &operation);
|
||||||
|
|
||||||
|
let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation)?;
|
||||||
|
debug!("found {} documents", docids.len());
|
||||||
|
debug!("number of postings {:?}", queries.len());
|
||||||
|
|
||||||
|
let before = Instant::now();
|
||||||
|
mk_arena!(arena);
|
||||||
|
let mut bare_matches = cleanup_bare_matches(&mut arena, &docids, queries);
|
||||||
|
debug!("matches cleaned in {:.02?}", before.elapsed());
|
||||||
|
|
||||||
|
let before_raw_documents_building = Instant::now();
|
||||||
|
let mut raw_documents = Vec::new();
|
||||||
|
for bare_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
|
||||||
|
let raw_document = RawDocument::new(bare_matches, &mut arena, searchable_attrs.as_ref());
|
||||||
|
raw_documents.push(raw_document);
|
||||||
|
}
|
||||||
|
debug!("creating {} candidates documents took {:.02?}",
|
||||||
raw_documents.len(),
|
raw_documents.len(),
|
||||||
prefiltered_documents,
|
|
||||||
before_raw_documents_building.elapsed(),
|
before_raw_documents_building.elapsed(),
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -222,8 +264,7 @@ where
|
|||||||
let ctx = ContextMut {
|
let ctx = ContextMut {
|
||||||
reader,
|
reader,
|
||||||
postings_lists: &mut arena,
|
postings_lists: &mut arena,
|
||||||
query_enhancer: &mut query_enhancer,
|
query_mapping: &mapping,
|
||||||
automatons: &mut automatons,
|
|
||||||
documents_fields_counts_store,
|
documents_fields_counts_store,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -233,8 +274,7 @@ where
|
|||||||
|
|
||||||
let ctx = Context {
|
let ctx = Context {
|
||||||
postings_lists: &arena,
|
postings_lists: &arena,
|
||||||
query_enhancer: &query_enhancer,
|
query_mapping: &mapping,
|
||||||
automatons: &automatons,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let before_criterion_sort = Instant::now();
|
let before_criterion_sort = Instant::now();
|
||||||
@ -306,7 +346,7 @@ where
|
|||||||
};
|
};
|
||||||
|
|
||||||
if distinct_accepted && seen.len() > range.start {
|
if distinct_accepted && seen.len() > range.start {
|
||||||
documents.push(Document::from_raw(raw_document, &automatons, &arena, searchable_attrs.as_ref()));
|
documents.push(Document::from_raw(raw_document, &queries_kinds, &arena, searchable_attrs.as_ref()));
|
||||||
if documents.len() == range.len() {
|
if documents.len() == range.len() {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -317,9 +357,82 @@ where
|
|||||||
Ok(documents)
|
Ok(documents)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn cleanup_bare_matches<'tag, 'txn>(
|
||||||
|
arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
|
docids: &Set<DocumentId>,
|
||||||
|
queries: HashMap<PostingsKey, Cow<'txn, Set<DocIndex>>>,
|
||||||
|
) -> Vec<BareMatch<'tag>>
|
||||||
|
{
|
||||||
|
let docidslen = docids.len() as f32;
|
||||||
|
let mut bare_matches = Vec::new();
|
||||||
|
|
||||||
|
for (PostingsKey { query, input, distance, is_exact }, matches) in queries {
|
||||||
|
let postings_list_view = PostingsListView::original(Rc::from(input), Rc::new(matches));
|
||||||
|
let pllen = postings_list_view.len() as f32;
|
||||||
|
|
||||||
|
if docidslen / pllen >= 0.8 {
|
||||||
|
let mut offset = 0;
|
||||||
|
for matches in postings_list_view.linear_group_by_key(|m| m.document_id) {
|
||||||
|
let document_id = matches[0].document_id;
|
||||||
|
if docids.contains(&document_id) {
|
||||||
|
let range = postings_list_view.range(offset, matches.len());
|
||||||
|
let posting_list_index = arena.add(range);
|
||||||
|
|
||||||
|
let bare_match = BareMatch {
|
||||||
|
document_id,
|
||||||
|
query_index: query.id,
|
||||||
|
distance,
|
||||||
|
is_exact,
|
||||||
|
postings_list: posting_list_index,
|
||||||
|
};
|
||||||
|
|
||||||
|
bare_matches.push(bare_match);
|
||||||
|
}
|
||||||
|
|
||||||
|
offset += matches.len();
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
let mut offset = 0;
|
||||||
|
for id in docids.as_slice() {
|
||||||
|
let di = DocIndex { document_id: *id, ..DocIndex::default() };
|
||||||
|
let pos = exponential_search(&postings_list_view[offset..], &di).unwrap_or_else(|x| x);
|
||||||
|
|
||||||
|
offset += pos;
|
||||||
|
|
||||||
|
let group = postings_list_view[offset..]
|
||||||
|
.linear_group_by_key(|m| m.document_id)
|
||||||
|
.next()
|
||||||
|
.filter(|matches| matches[0].document_id == *id);
|
||||||
|
|
||||||
|
if let Some(matches) = group {
|
||||||
|
let range = postings_list_view.range(offset, matches.len());
|
||||||
|
let posting_list_index = arena.add(range);
|
||||||
|
|
||||||
|
let bare_match = BareMatch {
|
||||||
|
document_id: *id,
|
||||||
|
query_index: query.id,
|
||||||
|
distance,
|
||||||
|
is_exact,
|
||||||
|
postings_list: posting_list_index,
|
||||||
|
};
|
||||||
|
|
||||||
|
bare_matches.push(bare_match);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let before_raw_documents_presort = Instant::now();
|
||||||
|
bare_matches.sort_unstable_by_key(|sm| sm.document_id);
|
||||||
|
debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
|
||||||
|
|
||||||
|
bare_matches
|
||||||
|
}
|
||||||
|
|
||||||
pub struct BareMatch<'tag> {
|
pub struct BareMatch<'tag> {
|
||||||
pub document_id: DocumentId,
|
pub document_id: DocumentId,
|
||||||
pub query_index: u16,
|
pub query_index: usize,
|
||||||
pub distance: u8,
|
pub distance: u8,
|
||||||
pub is_exact: bool,
|
pub is_exact: bool,
|
||||||
pub postings_list: Idx32<'tag>,
|
pub postings_list: Idx32<'tag>,
|
||||||
@ -338,7 +451,7 @@ impl fmt::Debug for BareMatch<'_> {
|
|||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
||||||
pub struct SimpleMatch {
|
pub struct SimpleMatch {
|
||||||
pub query_index: u16,
|
pub query_index: usize,
|
||||||
pub distance: u8,
|
pub distance: u8,
|
||||||
pub attribute: u16,
|
pub attribute: u16,
|
||||||
pub word_index: u16,
|
pub word_index: u16,
|
||||||
@ -436,285 +549,3 @@ impl Deref for PostingsListView<'_> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn fetch_matches<'txn, 'tag>(
|
|
||||||
reader: &'txn heed::RoTxn<MainT>,
|
|
||||||
automatons: &[QueryWordAutomaton],
|
|
||||||
arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
|
||||||
main_store: store::Main,
|
|
||||||
postings_lists_store: store::PostingsLists,
|
|
||||||
) -> MResult<Vec<BareMatch<'tag>>>
|
|
||||||
{
|
|
||||||
let before_words_fst = Instant::now();
|
|
||||||
let words = match main_store.words_fst(reader)? {
|
|
||||||
Some(words) => words,
|
|
||||||
None => return Ok(Vec::new()),
|
|
||||||
};
|
|
||||||
debug!("words fst took {:.02?}", before_words_fst.elapsed());
|
|
||||||
debug!("words fst len {} and size {}", words.len(), words.as_fst().as_bytes().len());
|
|
||||||
|
|
||||||
let mut total_postings_lists = Vec::new();
|
|
||||||
|
|
||||||
let mut dfa_time = Duration::default();
|
|
||||||
let mut stream_next_time = Duration::default();
|
|
||||||
let mut postings_lists_fetching_time = Duration::default();
|
|
||||||
let automatons_loop = Instant::now();
|
|
||||||
|
|
||||||
for (query_index, automaton) in automatons.iter().enumerate() {
|
|
||||||
let before_dfa = Instant::now();
|
|
||||||
let dfa = automaton.dfa();
|
|
||||||
let QueryWordAutomaton { query, is_exact, .. } = automaton;
|
|
||||||
dfa_time += before_dfa.elapsed();
|
|
||||||
|
|
||||||
let mut number_of_words = 0;
|
|
||||||
let mut stream = words.search(&dfa).into_stream();
|
|
||||||
|
|
||||||
// while let Some(input) = stream.next() {
|
|
||||||
loop {
|
|
||||||
let before_stream_next = Instant::now();
|
|
||||||
let value = stream.next();
|
|
||||||
stream_next_time += before_stream_next.elapsed();
|
|
||||||
|
|
||||||
let input = match value {
|
|
||||||
Some(input) => input,
|
|
||||||
None => break,
|
|
||||||
};
|
|
||||||
|
|
||||||
number_of_words += 1;
|
|
||||||
|
|
||||||
let distance = dfa.eval(input).to_u8();
|
|
||||||
let is_exact = *is_exact && distance == 0 && input.len() == query.len();
|
|
||||||
|
|
||||||
let before_postings_lists_fetching = Instant::now();
|
|
||||||
if let Some(postings_list) = postings_lists_store.postings_list(reader, input)? {
|
|
||||||
let input = Rc::from(input);
|
|
||||||
let postings_list = Rc::new(postings_list);
|
|
||||||
let postings_list_view = PostingsListView::original(input, postings_list);
|
|
||||||
|
|
||||||
let mut offset = 0;
|
|
||||||
for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
|
|
||||||
let posting_list_index = arena.add(postings_list_view.range(offset, group.len()));
|
|
||||||
let document_id = group[0].document_id;
|
|
||||||
let bare_match = BareMatch {
|
|
||||||
document_id,
|
|
||||||
query_index: query_index as u16,
|
|
||||||
distance,
|
|
||||||
is_exact,
|
|
||||||
postings_list: posting_list_index,
|
|
||||||
};
|
|
||||||
|
|
||||||
total_postings_lists.push(bare_match);
|
|
||||||
offset += group.len();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
postings_lists_fetching_time += before_postings_lists_fetching.elapsed();
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!("{:?} gives {} words", query, number_of_words);
|
|
||||||
}
|
|
||||||
|
|
||||||
debug!("automatons loop took {:.02?}", automatons_loop.elapsed());
|
|
||||||
debug!("stream next took {:.02?}", stream_next_time);
|
|
||||||
debug!("postings lists fetching took {:.02?}", postings_lists_fetching_time);
|
|
||||||
debug!("dfa creation took {:.02?}", dfa_time);
|
|
||||||
|
|
||||||
Ok(total_postings_lists)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub struct QueryWordAutomaton {
|
|
||||||
pub query: String,
|
|
||||||
/// Is it a word that must be considered exact
|
|
||||||
/// or is it some derived word (i.e. a synonym)
|
|
||||||
pub is_exact: bool,
|
|
||||||
pub is_prefix: bool,
|
|
||||||
/// If it's a phrase query and what is
|
|
||||||
/// its index an the length of the phrase
|
|
||||||
pub phrase_query: Option<(u16, u16)>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl QueryWordAutomaton {
|
|
||||||
pub fn exact(query: &str) -> QueryWordAutomaton {
|
|
||||||
QueryWordAutomaton {
|
|
||||||
query: query.to_string(),
|
|
||||||
is_exact: true,
|
|
||||||
is_prefix: false,
|
|
||||||
phrase_query: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn exact_prefix(query: &str) -> QueryWordAutomaton {
|
|
||||||
QueryWordAutomaton {
|
|
||||||
query: query.to_string(),
|
|
||||||
is_exact: true,
|
|
||||||
is_prefix: true,
|
|
||||||
phrase_query: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn non_exact(query: &str) -> QueryWordAutomaton {
|
|
||||||
QueryWordAutomaton {
|
|
||||||
query: query.to_string(),
|
|
||||||
is_exact: false,
|
|
||||||
is_prefix: false,
|
|
||||||
phrase_query: None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn dfa(&self) -> DFA {
|
|
||||||
if self.phrase_query.is_some() {
|
|
||||||
build_exact_dfa(&self.query)
|
|
||||||
} else if self.is_prefix {
|
|
||||||
build_prefix_dfa(&self.query)
|
|
||||||
} else {
|
|
||||||
build_dfa(&self.query)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn split_best_frequency<'a>(
|
|
||||||
reader: &heed::RoTxn<MainT>,
|
|
||||||
word: &'a str,
|
|
||||||
postings_lists_store: store::PostingsLists,
|
|
||||||
) -> MResult<Option<(&'a str, &'a str)>> {
|
|
||||||
let chars = word.char_indices().skip(1);
|
|
||||||
let mut best = None;
|
|
||||||
|
|
||||||
for (i, _) in chars {
|
|
||||||
let (left, right) = word.split_at(i);
|
|
||||||
|
|
||||||
let left_freq = postings_lists_store
|
|
||||||
.postings_list(reader, left.as_ref())?
|
|
||||||
.map_or(0, |i| i.len());
|
|
||||||
|
|
||||||
let right_freq = postings_lists_store
|
|
||||||
.postings_list(reader, right.as_ref())?
|
|
||||||
.map_or(0, |i| i.len());
|
|
||||||
|
|
||||||
let min_freq = cmp::min(left_freq, right_freq);
|
|
||||||
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
|
|
||||||
best = Some((min_freq, left, right));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(best.map(|(_, l, r)| (l, r)))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn construct_automatons(
|
|
||||||
reader: &heed::RoTxn<MainT>,
|
|
||||||
query: &str,
|
|
||||||
main_store: store::Main,
|
|
||||||
postings_lists_store: store::PostingsLists,
|
|
||||||
synonym_store: store::Synonyms,
|
|
||||||
) -> MResult<(Vec<QueryWordAutomaton>, QueryEnhancer)> {
|
|
||||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
|
||||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
|
||||||
let synonyms = match main_store.synonyms_fst(reader)? {
|
|
||||||
Some(synonym) => synonym,
|
|
||||||
None => fst::Set::default(),
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut automaton_index = 0;
|
|
||||||
let mut automatons = Vec::new();
|
|
||||||
let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
|
|
||||||
|
|
||||||
// We must not declare the original words to the query enhancer
|
|
||||||
// *but* we need to push them in the automatons list first
|
|
||||||
let mut original_words = query_words.iter().peekable();
|
|
||||||
while let Some(word) = original_words.next() {
|
|
||||||
let has_following_word = original_words.peek().is_some();
|
|
||||||
let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
|
|
||||||
|
|
||||||
let automaton = if not_prefix_dfa {
|
|
||||||
QueryWordAutomaton::exact(word)
|
|
||||||
} else {
|
|
||||||
QueryWordAutomaton::exact_prefix(word)
|
|
||||||
};
|
|
||||||
automaton_index += 1;
|
|
||||||
automatons.push(automaton);
|
|
||||||
}
|
|
||||||
|
|
||||||
for n in 1..=NGRAMS {
|
|
||||||
let mut ngrams = query_words.windows(n).enumerate().peekable();
|
|
||||||
while let Some((query_index, ngram_slice)) = ngrams.next() {
|
|
||||||
let query_range = query_index..query_index + n;
|
|
||||||
let ngram_nb_words = ngram_slice.len();
|
|
||||||
let ngram = ngram_slice.join(" ");
|
|
||||||
|
|
||||||
let has_following_word = ngrams.peek().is_some();
|
|
||||||
let not_prefix_dfa =
|
|
||||||
has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
|
||||||
|
|
||||||
// automaton of synonyms of the ngrams
|
|
||||||
let normalized = normalize_str(&ngram);
|
|
||||||
let lev = if not_prefix_dfa {
|
|
||||||
build_dfa(&normalized)
|
|
||||||
} else {
|
|
||||||
build_prefix_dfa(&normalized)
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut stream = synonyms.search(&lev).into_stream();
|
|
||||||
while let Some(base) = stream.next() {
|
|
||||||
// only trigger alternatives when the last word has been typed
|
|
||||||
// i.e. "new " do not but "new yo" triggers alternatives to "new york"
|
|
||||||
let base = std::str::from_utf8(base).unwrap();
|
|
||||||
let base_nb_words = split_query_string(base).count();
|
|
||||||
if ngram_nb_words != base_nb_words {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
|
|
||||||
let mut stream = synonyms.into_stream();
|
|
||||||
while let Some(synonyms) = stream.next() {
|
|
||||||
let synonyms = std::str::from_utf8(synonyms).unwrap();
|
|
||||||
let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
|
|
||||||
let nb_synonym_words = synonyms_words.len();
|
|
||||||
|
|
||||||
let real_query_index = automaton_index;
|
|
||||||
enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
|
|
||||||
|
|
||||||
for synonym in synonyms_words {
|
|
||||||
let automaton = if nb_synonym_words == 1 {
|
|
||||||
QueryWordAutomaton::exact(synonym)
|
|
||||||
} else {
|
|
||||||
QueryWordAutomaton::non_exact(synonym)
|
|
||||||
};
|
|
||||||
automaton_index += 1;
|
|
||||||
automatons.push(automaton);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if n == 1 {
|
|
||||||
// automatons for splitted words
|
|
||||||
if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
|
|
||||||
let mut left_automaton = QueryWordAutomaton::exact(left);
|
|
||||||
left_automaton.phrase_query = Some((0, 2));
|
|
||||||
enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
|
|
||||||
automaton_index += 1;
|
|
||||||
automatons.push(left_automaton);
|
|
||||||
|
|
||||||
let mut right_automaton = QueryWordAutomaton::exact(right);
|
|
||||||
right_automaton.phrase_query = Some((1, 2));
|
|
||||||
enhancer_builder.declare(query_range.clone(), automaton_index, &[right]);
|
|
||||||
automaton_index += 1;
|
|
||||||
automatons.push(right_automaton);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// automaton of concatenation of query words
|
|
||||||
let concat = ngram_slice.concat();
|
|
||||||
let normalized = normalize_str(&concat);
|
|
||||||
|
|
||||||
let real_query_index = automaton_index;
|
|
||||||
enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
|
|
||||||
|
|
||||||
let automaton = QueryWordAutomaton::exact(&normalized);
|
|
||||||
automaton_index += 1;
|
|
||||||
automatons.push(automaton);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok((automatons, enhancer_builder.build()))
|
|
||||||
}
|
|
||||||
|
@ -9,13 +9,13 @@ pub struct Attribute;
|
|||||||
impl Criterion for Attribute {
|
impl Criterion for Attribute {
|
||||||
fn name(&self) -> &str { "attribute" }
|
fn name(&self) -> &str { "attribute" }
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
documents: &mut [RawDocument<'r, 'tag>],
|
documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer);
|
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -11,9 +11,9 @@ pub struct Exact;
|
|||||||
impl Criterion for Exact {
|
impl Criterion for Exact {
|
||||||
fn name(&self) -> &str { "exact" }
|
fn name(&self) -> &str { "exact" }
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
documents: &mut [RawDocument<'r, 'tag>],
|
documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
|
@ -1,13 +1,15 @@
|
|||||||
use std::cmp::{self, Ordering};
|
use std::cmp::{self, Ordering};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::ops::Range;
|
||||||
|
|
||||||
use compact_arena::SmallArena;
|
use compact_arena::SmallArena;
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
use crate::{store, RawDocument, MResult};
|
use crate::bucket_sort::{SimpleMatch, PostingsListView};
|
||||||
use crate::automaton::QueryEnhancer;
|
|
||||||
use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton};
|
|
||||||
use crate::database::MainT;
|
use crate::database::MainT;
|
||||||
|
use crate::query_tree::QueryId;
|
||||||
|
use crate::{store, RawDocument, MResult};
|
||||||
|
|
||||||
mod typo;
|
mod typo;
|
||||||
mod words;
|
mod words;
|
||||||
@ -30,26 +32,26 @@ pub use self::sort_by_attr::SortByAttr;
|
|||||||
pub trait Criterion {
|
pub trait Criterion {
|
||||||
fn name(&self) -> &str;
|
fn name(&self) -> &str;
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
_ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
_ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
_documents: &mut [RawDocument<'r, 'tag>],
|
_documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn evaluate<'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
|
ctx: &Context<'p, 'tag, 'txn, 'q>,
|
||||||
lhs: &RawDocument<'r, 'tag>,
|
lhs: &RawDocument<'r, 'tag>,
|
||||||
rhs: &RawDocument<'r, 'tag>,
|
rhs: &RawDocument<'r, 'tag>,
|
||||||
) -> Ordering;
|
) -> Ordering;
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn eq<'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn eq<'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
|
ctx: &Context<'p, 'tag, 'txn, 'q>,
|
||||||
lhs: &RawDocument<'r, 'tag>,
|
lhs: &RawDocument<'r, 'tag>,
|
||||||
rhs: &RawDocument<'r, 'tag>,
|
rhs: &RawDocument<'r, 'tag>,
|
||||||
) -> bool
|
) -> bool
|
||||||
@ -58,18 +60,16 @@ pub trait Criterion {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a> {
|
pub struct ContextMut<'h, 'p, 'tag, 'txn, 'q> {
|
||||||
pub reader: &'h heed::RoTxn<MainT>,
|
pub reader: &'h heed::RoTxn<MainT>,
|
||||||
pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>,
|
pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
pub query_enhancer: &'q mut QueryEnhancer,
|
pub query_mapping: &'q HashMap<QueryId, Range<usize>>,
|
||||||
pub automatons: &'a mut [QueryWordAutomaton],
|
|
||||||
pub documents_fields_counts_store: store::DocumentsFieldsCounts,
|
pub documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Context<'p, 'tag, 'txn, 'q, 'a> {
|
pub struct Context<'p, 'tag, 'txn, 'q> {
|
||||||
pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>,
|
pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
pub query_enhancer: &'q QueryEnhancer,
|
pub query_mapping: &'q HashMap<QueryId, Range<usize>>,
|
||||||
pub automatons: &'a [QueryWordAutomaton],
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
@ -138,7 +138,7 @@ impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
|
|||||||
|
|
||||||
fn prepare_query_distances<'a, 'tag, 'txn>(
|
fn prepare_query_distances<'a, 'tag, 'txn>(
|
||||||
documents: &mut [RawDocument<'a, 'tag>],
|
documents: &mut [RawDocument<'a, 'tag>],
|
||||||
query_enhancer: &QueryEnhancer,
|
query_mapping: &HashMap<QueryId, Range<usize>>,
|
||||||
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
) {
|
) {
|
||||||
for document in documents {
|
for document in documents {
|
||||||
@ -148,7 +148,7 @@ fn prepare_query_distances<'a, 'tag, 'txn>(
|
|||||||
for m in document.bare_matches.iter() {
|
for m in document.bare_matches.iter() {
|
||||||
if postings_lists[m.postings_list].is_empty() { continue }
|
if postings_lists[m.postings_list].is_empty() { continue }
|
||||||
|
|
||||||
let range = query_enhancer.replacement(m.query_index as u32);
|
let range = query_mapping[&(m.query_index as usize)].clone();
|
||||||
let new_len = cmp::max(range.end as usize, processed.len());
|
let new_len = cmp::max(range.end as usize, processed.len());
|
||||||
processed.resize(new_len, None);
|
processed.resize(new_len, None);
|
||||||
|
|
||||||
@ -169,7 +169,7 @@ fn prepare_query_distances<'a, 'tag, 'txn>(
|
|||||||
fn prepare_bare_matches<'a, 'tag, 'txn>(
|
fn prepare_bare_matches<'a, 'tag, 'txn>(
|
||||||
documents: &mut [RawDocument<'a, 'tag>],
|
documents: &mut [RawDocument<'a, 'tag>],
|
||||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
query_enhancer: &QueryEnhancer,
|
query_mapping: &HashMap<QueryId, Range<usize>>,
|
||||||
) {
|
) {
|
||||||
for document in documents {
|
for document in documents {
|
||||||
if !document.processed_matches.is_empty() { continue }
|
if !document.processed_matches.is_empty() { continue }
|
||||||
@ -190,14 +190,14 @@ fn prepare_bare_matches<'a, 'tag, 'txn>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let processed = multiword_rewrite_matches(&mut processed, query_enhancer);
|
let processed = multiword_rewrite_matches(&mut processed, query_mapping);
|
||||||
document.processed_matches = processed.into_vec();
|
document.processed_matches = processed.into_vec();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn multiword_rewrite_matches(
|
fn multiword_rewrite_matches(
|
||||||
matches: &mut [SimpleMatch],
|
matches: &mut [SimpleMatch],
|
||||||
query_enhancer: &QueryEnhancer,
|
query_mapping: &HashMap<QueryId, Range<usize>>,
|
||||||
) -> SetBuf<SimpleMatch>
|
) -> SetBuf<SimpleMatch>
|
||||||
{
|
{
|
||||||
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
|
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
|
||||||
@ -218,13 +218,12 @@ fn multiword_rewrite_matches(
|
|||||||
// find the biggest padding
|
// find the biggest padding
|
||||||
let mut biggest = 0;
|
let mut biggest = 0;
|
||||||
for match_ in same_word_index {
|
for match_ in same_word_index {
|
||||||
let mut replacement = query_enhancer.replacement(match_.query_index as u32);
|
let mut replacement = query_mapping[&(match_.query_index as usize)].clone();
|
||||||
let replacement_len = replacement.len();
|
let replacement_len = replacement.len();
|
||||||
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
|
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
|
||||||
|
|
||||||
if let Some(query_index) = replacement.next() {
|
if let Some(query_index) = replacement.next() {
|
||||||
let word_index = match_.word_index + padding as u16;
|
let word_index = match_.word_index + padding as u16;
|
||||||
let query_index = query_index as u16;
|
|
||||||
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
||||||
padded_matches.push(match_);
|
padded_matches.push(match_);
|
||||||
}
|
}
|
||||||
@ -236,20 +235,17 @@ fn multiword_rewrite_matches(
|
|||||||
'padding: for (x, next_group) in nexts.enumerate() {
|
'padding: for (x, next_group) in nexts.enumerate() {
|
||||||
for (i, query_index) in replacement.clone().enumerate().skip(x) {
|
for (i, query_index) in replacement.clone().enumerate().skip(x) {
|
||||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||||
let query_index = query_index as u16;
|
|
||||||
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
|
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
|
||||||
|
|
||||||
for nmatch_ in next_group {
|
for nmatch_ in next_group {
|
||||||
let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
|
let mut rep = query_mapping[&(nmatch_.query_index as usize)].clone();
|
||||||
let query_index = rep.next().unwrap() as u16;
|
let query_index = rep.next().unwrap();
|
||||||
if query_index == padmatch.query_index {
|
if query_index == padmatch.query_index {
|
||||||
if !found {
|
if !found {
|
||||||
// if we find a corresponding padding for the
|
// if we find a corresponding padding for the
|
||||||
// first time we must push preceding paddings
|
// first time we must push preceding paddings
|
||||||
for (i, query_index) in replacement.clone().enumerate().take(i)
|
for (i, query_index) in replacement.clone().enumerate().take(i) {
|
||||||
{
|
|
||||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||||
let query_index = query_index as u16;
|
|
||||||
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
||||||
padded_matches.push(match_);
|
padded_matches.push(match_);
|
||||||
biggest = biggest.max(i + 1);
|
biggest = biggest.max(i + 1);
|
||||||
@ -273,7 +269,6 @@ fn multiword_rewrite_matches(
|
|||||||
// we must insert the entire padding
|
// we must insert the entire padding
|
||||||
for (i, query_index) in replacement.enumerate() {
|
for (i, query_index) in replacement.enumerate() {
|
||||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||||
let query_index = query_index as u16;
|
|
||||||
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
|
||||||
padded_matches.push(match_);
|
padded_matches.push(match_);
|
||||||
}
|
}
|
||||||
|
@ -11,13 +11,13 @@ pub struct Proximity;
|
|||||||
impl Criterion for Proximity {
|
impl Criterion for Proximity {
|
||||||
fn name(&self) -> &str { "proximity" }
|
fn name(&self) -> &str { "proximity" }
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
documents: &mut [RawDocument<'r, 'tag>],
|
documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer);
|
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,13 +7,13 @@ pub struct Typo;
|
|||||||
impl Criterion for Typo {
|
impl Criterion for Typo {
|
||||||
fn name(&self) -> &str { "typo" }
|
fn name(&self) -> &str { "typo" }
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
documents: &mut [RawDocument<'r, 'tag>],
|
documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists);
|
prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,13 +7,13 @@ pub struct Words;
|
|||||||
impl Criterion for Words {
|
impl Criterion for Words {
|
||||||
fn name(&self) -> &str { "words" }
|
fn name(&self) -> &str { "words" }
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
documents: &mut [RawDocument<'r, 'tag>],
|
documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
prepare_query_distances(documents, ctx.query_enhancer, ctx.postings_lists);
|
prepare_query_distances(documents, ctx.query_mapping, ctx.postings_lists);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -9,13 +9,13 @@ pub struct WordsPosition;
|
|||||||
impl Criterion for WordsPosition {
|
impl Criterion for WordsPosition {
|
||||||
fn name(&self) -> &str { "words position" }
|
fn name(&self) -> &str { "words position" }
|
||||||
|
|
||||||
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'a, 'r>(
|
fn prepare<'h, 'p, 'tag, 'txn, 'q, 'r>(
|
||||||
&self,
|
&self,
|
||||||
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q, 'a>,
|
ctx: ContextMut<'h, 'p, 'tag, 'txn, 'q>,
|
||||||
documents: &mut [RawDocument<'r, 'tag>],
|
documents: &mut [RawDocument<'r, 'tag>],
|
||||||
) -> MResult<()>
|
) -> MResult<()>
|
||||||
{
|
{
|
||||||
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_enhancer);
|
prepare_bare_matches(documents, ctx.postings_lists, ctx.query_mapping);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -141,13 +141,13 @@ impl Database {
|
|||||||
|
|
||||||
fs::create_dir_all(&main_path)?;
|
fs::create_dir_all(&main_path)?;
|
||||||
let env = heed::EnvOpenOptions::new()
|
let env = heed::EnvOpenOptions::new()
|
||||||
.map_size(10 * 1024 * 1024 * 1024) // 10GB
|
.map_size(100 * 1024 * 1024 * 1024) // 100GB
|
||||||
.max_dbs(3000)
|
.max_dbs(3000)
|
||||||
.open(main_path)?;
|
.open(main_path)?;
|
||||||
|
|
||||||
fs::create_dir_all(&update_path)?;
|
fs::create_dir_all(&update_path)?;
|
||||||
let update_env = heed::EnvOpenOptions::new()
|
let update_env = heed::EnvOpenOptions::new()
|
||||||
.map_size(10 * 1024 * 1024 * 1024) // 10GB
|
.map_size(100 * 1024 * 1024 * 1024) // 100GB
|
||||||
.max_dbs(3000)
|
.max_dbs(3000)
|
||||||
.open(update_path)?;
|
.open(update_path)?;
|
||||||
|
|
||||||
|
@ -10,6 +10,8 @@ mod error;
|
|||||||
mod levenshtein;
|
mod levenshtein;
|
||||||
mod number;
|
mod number;
|
||||||
mod query_builder;
|
mod query_builder;
|
||||||
|
mod query_tree;
|
||||||
|
mod query_words_mapper;
|
||||||
mod ranked_map;
|
mod ranked_map;
|
||||||
mod raw_document;
|
mod raw_document;
|
||||||
mod reordered_attrs;
|
mod reordered_attrs;
|
||||||
@ -27,10 +29,15 @@ pub use self::raw_document::RawDocument;
|
|||||||
pub use self::store::Index;
|
pub use self::store::Index;
|
||||||
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
|
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
|
||||||
pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
|
pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
|
||||||
|
pub use query_words_mapper::QueryWordsMapper;
|
||||||
|
|
||||||
|
use std::convert::TryFrom;
|
||||||
|
use std::collections::HashMap;
|
||||||
use compact_arena::SmallArena;
|
use compact_arena::SmallArena;
|
||||||
use crate::bucket_sort::{QueryWordAutomaton, PostingsListView};
|
|
||||||
|
use crate::bucket_sort::PostingsListView;
|
||||||
use crate::levenshtein::prefix_damerau_levenshtein;
|
use crate::levenshtein::prefix_damerau_levenshtein;
|
||||||
|
use crate::query_tree::{QueryId, QueryKind};
|
||||||
use crate::reordered_attrs::ReorderedAttrs;
|
use crate::reordered_attrs::ReorderedAttrs;
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
|
||||||
@ -44,7 +51,7 @@ pub struct Document {
|
|||||||
|
|
||||||
fn highlights_from_raw_document<'a, 'tag, 'txn>(
|
fn highlights_from_raw_document<'a, 'tag, 'txn>(
|
||||||
raw_document: &RawDocument<'a, 'tag>,
|
raw_document: &RawDocument<'a, 'tag>,
|
||||||
automatons: &[QueryWordAutomaton],
|
queries_kinds: &HashMap<QueryId, &QueryKind>,
|
||||||
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
searchable_attrs: Option<&ReorderedAttrs>,
|
searchable_attrs: Option<&ReorderedAttrs>,
|
||||||
) -> Vec<Highlight>
|
) -> Vec<Highlight>
|
||||||
@ -54,13 +61,19 @@ fn highlights_from_raw_document<'a, 'tag, 'txn>(
|
|||||||
for bm in raw_document.bare_matches.iter() {
|
for bm in raw_document.bare_matches.iter() {
|
||||||
let postings_list = &arena[bm.postings_list];
|
let postings_list = &arena[bm.postings_list];
|
||||||
let input = postings_list.input();
|
let input = postings_list.input();
|
||||||
let query = &automatons[bm.query_index as usize].query;
|
let kind = &queries_kinds.get(&bm.query_index);
|
||||||
|
|
||||||
for di in postings_list.iter() {
|
for di in postings_list.iter() {
|
||||||
let covered_area = if query.len() > input.len() {
|
let covered_area = match kind {
|
||||||
input.len()
|
Some(QueryKind::NonTolerant(query)) | Some(QueryKind::Tolerant(query)) => {
|
||||||
} else {
|
let len = if query.len() > input.len() {
|
||||||
prefix_damerau_levenshtein(query.as_bytes(), input).1
|
input.len()
|
||||||
|
} else {
|
||||||
|
prefix_damerau_levenshtein(query.as_bytes(), input).1
|
||||||
|
};
|
||||||
|
u16::try_from(len).unwrap_or(u16::max_value())
|
||||||
|
},
|
||||||
|
_ => di.char_length,
|
||||||
};
|
};
|
||||||
|
|
||||||
let attribute = searchable_attrs
|
let attribute = searchable_attrs
|
||||||
@ -70,7 +83,7 @@ fn highlights_from_raw_document<'a, 'tag, 'txn>(
|
|||||||
let highlight = Highlight {
|
let highlight = Highlight {
|
||||||
attribute: attribute,
|
attribute: attribute,
|
||||||
char_index: di.char_index,
|
char_index: di.char_index,
|
||||||
char_length: covered_area as u16,
|
char_length: covered_area,
|
||||||
};
|
};
|
||||||
|
|
||||||
highlights.push(highlight);
|
highlights.push(highlight);
|
||||||
@ -81,17 +94,27 @@ fn highlights_from_raw_document<'a, 'tag, 'txn>(
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Document {
|
impl Document {
|
||||||
|
#[cfg(not(test))]
|
||||||
|
pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
|
||||||
|
Document { id, highlights: highlights.to_owned() }
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
pub fn from_highlights(id: DocumentId, highlights: &[Highlight]) -> Document {
|
||||||
|
Document { id, highlights: highlights.to_owned(), matches: Vec::new() }
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(not(test))]
|
#[cfg(not(test))]
|
||||||
pub fn from_raw<'a, 'tag, 'txn>(
|
pub fn from_raw<'a, 'tag, 'txn>(
|
||||||
raw_document: RawDocument<'a, 'tag>,
|
raw_document: RawDocument<'a, 'tag>,
|
||||||
automatons: &[QueryWordAutomaton],
|
queries_kinds: &HashMap<QueryId, &QueryKind>,
|
||||||
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
searchable_attrs: Option<&ReorderedAttrs>,
|
searchable_attrs: Option<&ReorderedAttrs>,
|
||||||
) -> Document
|
) -> Document
|
||||||
{
|
{
|
||||||
let highlights = highlights_from_raw_document(
|
let highlights = highlights_from_raw_document(
|
||||||
&raw_document,
|
&raw_document,
|
||||||
automatons,
|
queries_kinds,
|
||||||
arena,
|
arena,
|
||||||
searchable_attrs,
|
searchable_attrs,
|
||||||
);
|
);
|
||||||
@ -102,7 +125,7 @@ impl Document {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub fn from_raw<'a, 'tag, 'txn>(
|
pub fn from_raw<'a, 'tag, 'txn>(
|
||||||
raw_document: RawDocument<'a, 'tag>,
|
raw_document: RawDocument<'a, 'tag>,
|
||||||
automatons: &[QueryWordAutomaton],
|
queries_kinds: &HashMap<QueryId, &QueryKind>,
|
||||||
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
arena: &SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
searchable_attrs: Option<&ReorderedAttrs>,
|
searchable_attrs: Option<&ReorderedAttrs>,
|
||||||
) -> Document
|
) -> Document
|
||||||
@ -111,7 +134,7 @@ impl Document {
|
|||||||
|
|
||||||
let highlights = highlights_from_raw_document(
|
let highlights = highlights_from_raw_document(
|
||||||
&raw_document,
|
&raw_document,
|
||||||
automatons,
|
queries_kinds,
|
||||||
arena,
|
arena,
|
||||||
searchable_attrs,
|
searchable_attrs,
|
||||||
);
|
);
|
||||||
|
@ -16,6 +16,8 @@ pub struct QueryBuilder<'c, 'f, 'd> {
|
|||||||
postings_lists_store: store::PostingsLists,
|
postings_lists_store: store::PostingsLists,
|
||||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||||
synonyms_store: store::Synonyms,
|
synonyms_store: store::Synonyms,
|
||||||
|
prefix_documents_cache_store: store::PrefixDocumentsCache,
|
||||||
|
prefix_postings_lists_cache_store: store::PrefixPostingsListsCache,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||||
@ -24,12 +26,16 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
|||||||
postings_lists: store::PostingsLists,
|
postings_lists: store::PostingsLists,
|
||||||
documents_fields_counts: store::DocumentsFieldsCounts,
|
documents_fields_counts: store::DocumentsFieldsCounts,
|
||||||
synonyms: store::Synonyms,
|
synonyms: store::Synonyms,
|
||||||
|
prefix_documents_cache: store::PrefixDocumentsCache,
|
||||||
|
prefix_postings_lists_cache: store::PrefixPostingsListsCache,
|
||||||
) -> QueryBuilder<'c, 'f, 'd> {
|
) -> QueryBuilder<'c, 'f, 'd> {
|
||||||
QueryBuilder::with_criteria(
|
QueryBuilder::with_criteria(
|
||||||
main,
|
main,
|
||||||
postings_lists,
|
postings_lists,
|
||||||
documents_fields_counts,
|
documents_fields_counts,
|
||||||
synonyms,
|
synonyms,
|
||||||
|
prefix_documents_cache,
|
||||||
|
prefix_postings_lists_cache,
|
||||||
Criteria::default(),
|
Criteria::default(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@ -39,6 +45,8 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
|||||||
postings_lists: store::PostingsLists,
|
postings_lists: store::PostingsLists,
|
||||||
documents_fields_counts: store::DocumentsFieldsCounts,
|
documents_fields_counts: store::DocumentsFieldsCounts,
|
||||||
synonyms: store::Synonyms,
|
synonyms: store::Synonyms,
|
||||||
|
prefix_documents_cache: store::PrefixDocumentsCache,
|
||||||
|
prefix_postings_lists_cache: store::PrefixPostingsListsCache,
|
||||||
criteria: Criteria<'c>,
|
criteria: Criteria<'c>,
|
||||||
) -> QueryBuilder<'c, 'f, 'd> {
|
) -> QueryBuilder<'c, 'f, 'd> {
|
||||||
QueryBuilder {
|
QueryBuilder {
|
||||||
@ -51,6 +59,8 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
|||||||
postings_lists_store: postings_lists,
|
postings_lists_store: postings_lists,
|
||||||
documents_fields_counts_store: documents_fields_counts,
|
documents_fields_counts_store: documents_fields_counts,
|
||||||
synonyms_store: synonyms,
|
synonyms_store: synonyms,
|
||||||
|
prefix_documents_cache_store: prefix_documents_cache,
|
||||||
|
prefix_postings_lists_cache_store: prefix_postings_lists_cache,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -97,6 +107,8 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
|||||||
self.postings_lists_store,
|
self.postings_lists_store,
|
||||||
self.documents_fields_counts_store,
|
self.documents_fields_counts_store,
|
||||||
self.synonyms_store,
|
self.synonyms_store,
|
||||||
|
self.prefix_documents_cache_store,
|
||||||
|
self.prefix_postings_lists_cache_store,
|
||||||
),
|
),
|
||||||
None => bucket_sort(
|
None => bucket_sort(
|
||||||
reader,
|
reader,
|
||||||
@ -109,6 +121,8 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
|||||||
self.postings_lists_store,
|
self.postings_lists_store,
|
||||||
self.documents_fields_counts_store,
|
self.documents_fields_counts_store,
|
||||||
self.synonyms_store,
|
self.synonyms_store,
|
||||||
|
self.prefix_documents_cache_store,
|
||||||
|
self.prefix_postings_lists_cache_store,
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -206,7 +220,7 @@ mod tests {
|
|||||||
let db = &self.database;
|
let db = &self.database;
|
||||||
let mut writer = db.main_write_txn().unwrap();
|
let mut writer = db.main_write_txn().unwrap();
|
||||||
|
|
||||||
let word = word.to_lowercase();
|
let word = normalize_str(word);
|
||||||
|
|
||||||
let alternatives = match self
|
let alternatives = match self
|
||||||
.index
|
.index
|
||||||
@ -355,82 +369,82 @@ mod tests {
|
|||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
// #[test]
|
||||||
fn prefix_synonyms() {
|
// fn prefix_synonyms() {
|
||||||
let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
|
// let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
|
||||||
|
|
||||||
store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
|
// store.add_synonym("bonjour", SetBuf::from_dirty(vec!["hello"]));
|
||||||
store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"]));
|
// store.add_synonym("salut", SetBuf::from_dirty(vec!["hello"]));
|
||||||
|
|
||||||
let db = &store.database;
|
// let db = &store.database;
|
||||||
let reader = db.main_read_txn().unwrap();
|
// let reader = db.main_read_txn().unwrap();
|
||||||
|
|
||||||
let builder = store.query_builder();
|
// let builder = store.query_builder();
|
||||||
let results = builder.query(&reader, "sal", 0..20).unwrap();
|
// let results = builder.query(&reader, "sal", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
// let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||||
let mut matches = matches.into_iter();
|
// let mut matches = matches.into_iter();
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
||||||
assert_matches!(matches.next(), None);
|
// assert_matches!(matches.next(), None);
|
||||||
});
|
// });
|
||||||
assert_matches!(iter.next(), None);
|
// assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
let builder = store.query_builder();
|
// let builder = store.query_builder();
|
||||||
let results = builder.query(&reader, "bonj", 0..20).unwrap();
|
// let results = builder.query(&reader, "bonj", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
// let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||||
let mut matches = matches.into_iter();
|
// let mut matches = matches.into_iter();
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
||||||
assert_matches!(matches.next(), None);
|
// assert_matches!(matches.next(), None);
|
||||||
});
|
// });
|
||||||
assert_matches!(iter.next(), None);
|
// assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
let builder = store.query_builder();
|
// let builder = store.query_builder();
|
||||||
let results = builder.query(&reader, "sal blabla", 0..20).unwrap();
|
// let results = builder.query(&reader, "sal blabla", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
// let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), None);
|
// assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
let builder = store.query_builder();
|
// let builder = store.query_builder();
|
||||||
let results = builder.query(&reader, "bonj blabla", 0..20).unwrap();
|
// let results = builder.query(&reader, "bonj blabla", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
// let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), None);
|
// assert_matches!(iter.next(), None);
|
||||||
}
|
// }
|
||||||
|
|
||||||
#[test]
|
// #[test]
|
||||||
fn levenshtein_synonyms() {
|
// fn levenshtein_synonyms() {
|
||||||
let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
|
// let mut store = TempDatabase::from_iter(vec![("hello", &[doc_index(0, 0)][..])]);
|
||||||
|
|
||||||
store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"]));
|
// store.add_synonym("salutation", SetBuf::from_dirty(vec!["hello"]));
|
||||||
|
|
||||||
let db = &store.database;
|
// let db = &store.database;
|
||||||
let reader = db.main_read_txn().unwrap();
|
// let reader = db.main_read_txn().unwrap();
|
||||||
|
|
||||||
let builder = store.query_builder();
|
// let builder = store.query_builder();
|
||||||
let results = builder.query(&reader, "salutution", 0..20).unwrap();
|
// let results = builder.query(&reader, "salutution", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
// let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||||
let mut matches = matches.into_iter();
|
// let mut matches = matches.into_iter();
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
||||||
assert_matches!(matches.next(), None);
|
// assert_matches!(matches.next(), None);
|
||||||
});
|
// });
|
||||||
assert_matches!(iter.next(), None);
|
// assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
let builder = store.query_builder();
|
// let builder = store.query_builder();
|
||||||
let results = builder.query(&reader, "saluttion", 0..20).unwrap();
|
// let results = builder.query(&reader, "saluttion", 0..20).unwrap();
|
||||||
let mut iter = results.into_iter();
|
// let mut iter = results.into_iter();
|
||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
// assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||||
let mut matches = matches.into_iter();
|
// let mut matches = matches.into_iter();
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, .. }));
|
||||||
assert_matches!(matches.next(), None);
|
// assert_matches!(matches.next(), None);
|
||||||
});
|
// });
|
||||||
assert_matches!(iter.next(), None);
|
// assert_matches!(iter.next(), None);
|
||||||
}
|
// }
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn harder_synonyms() {
|
fn harder_synonyms() {
|
||||||
@ -541,19 +555,19 @@ mod tests {
|
|||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY ± new
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY ± york
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY ± city
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None); // position rewritten ^
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // NY ± new
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // NY ± york
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // NY ± city
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
||||||
assert_matches!(iter.next(), None); // position rewritten ^
|
assert_matches!(iter.next(), None);
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
@ -563,19 +577,19 @@ mod tests {
|
|||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC ± new
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC ± york
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC ± city
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None); // position rewritten ^
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // NYC ± new
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // NYC ± york
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // NYC ± city
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway
|
||||||
assert_matches!(iter.next(), None); // position rewritten ^
|
assert_matches!(iter.next(), None);
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
@ -667,11 +681,11 @@ mod tests {
|
|||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // subway
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // subway
|
||||||
assert_matches!(matches.next(), None);
|
assert_matches!(matches.next(), None);
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
// assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||||
let mut matches = matches.into_iter();
|
// let mut matches = matches.into_iter();
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 2, is_exact: true, .. })); // subway
|
// assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 2, is_exact: true, .. })); // subway
|
||||||
assert_matches!(matches.next(), None);
|
// assert_matches!(matches.next(), None);
|
||||||
});
|
// });
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
let builder = store.query_builder();
|
let builder = store.query_builder();
|
||||||
@ -731,7 +745,7 @@ mod tests {
|
|||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway
|
||||||
assert_matches!(iter.next(), None); // position rewritten ^
|
assert_matches!(iter.next(), None); // position rewritten ^
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||||
@ -739,7 +753,7 @@ mod tests {
|
|||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NY
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NY
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NY
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway
|
||||||
assert_matches!(iter.next(), None); // position rewritten ^
|
assert_matches!(iter.next(), None); // position rewritten ^
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
@ -811,15 +825,6 @@ mod tests {
|
|||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken
|
||||||
assert_matches!(iter.next(), None); // position rewritten ^
|
assert_matches!(iter.next(), None); // position rewritten ^
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
|
||||||
let mut iter = matches.into_iter();
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NY
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NY
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NY
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway
|
|
||||||
assert_matches!(iter.next(), None); // position rewritten ^
|
|
||||||
});
|
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
let builder = store.query_builder();
|
let builder = store.query_builder();
|
||||||
@ -831,19 +836,19 @@ mod tests {
|
|||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new = NYC
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york = NYC
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city = NYC
|
||||||
// because one-word to one-word ^^^^
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // underground = subway
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // train = subway
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None); // position rewritten ^
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new = NYC
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york = NYC
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city = NYC
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // underground = subway
|
// because one-word to one-word ^^^^
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // train = subway
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: false, .. })); // subway = underground
|
||||||
assert_matches!(iter.next(), None); // position rewritten ^
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: false, .. })); // subway = train
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
@ -906,15 +911,6 @@ mod tests {
|
|||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
|
||||||
let mut iter = matches.into_iter();
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train
|
|
||||||
assert_matches!(iter.next(), None);
|
|
||||||
});
|
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
let builder = store.query_builder();
|
let builder = store.query_builder();
|
||||||
@ -929,29 +925,18 @@ mod tests {
|
|||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 2, is_exact: true, .. })); // underground
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // underground
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // train
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // train
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 4, is_exact: true, .. })); // broken
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken
|
|
||||||
assert_matches!(matches.next(), None);
|
assert_matches!(matches.next(), None);
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
|
||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NYC = new
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NYC = york
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NYC = city
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 4, is_exact: true, .. })); // subway = underground
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 5, is_exact: true, .. })); // subway = train
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 5, is_exact: true, .. })); // broken
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 5, word_index: 6, is_exact: true, .. })); // broken
|
|
||||||
assert_matches!(iter.next(), None);
|
|
||||||
});
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
|
||||||
let mut iter = matches.into_iter();
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // NY = new
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // NY = york
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // NY = city
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // subway = underground
|
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 4, word_index: 4, is_exact: true, .. })); // subway = train
|
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
@ -978,15 +963,12 @@ mod tests {
|
|||||||
|
|
||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
|
||||||
let mut matches = matches.into_iter();
|
let mut matches = matches.into_iter();
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: false, .. })); // new
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
|
||||||
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
|
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
|
||||||
|
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 4, is_exact: false, .. })); // city
|
||||||
|
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // big
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 3, word_index: 3, is_exact: true, .. })); // big
|
||||||
assert_matches!(matches.next(), None);
|
assert_matches!(matches.next(), None);
|
||||||
});
|
});
|
||||||
@ -1017,7 +999,7 @@ mod tests {
|
|||||||
let mut matches = matches.into_iter();
|
let mut matches = matches.into_iter();
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway
|
||||||
assert_matches!(matches.next(), None);
|
assert_matches!(matches.next(), None);
|
||||||
@ -1025,9 +1007,9 @@ mod tests {
|
|||||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(2), matches, .. }) => {
|
||||||
let mut matches = matches.into_iter();
|
let mut matches = matches.into_iter();
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 0, word_index: 0, is_exact: true, .. })); // new
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: false, .. })); // york
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 1, word_index: 1, is_exact: true, .. })); // york
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: false, .. })); // city
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 2, word_index: 2, is_exact: true, .. })); // city
|
||||||
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway
|
assert_matches!(matches.next(), Some(SimpleMatch { query_index: 4, word_index: 3, is_exact: true, .. })); // subway
|
||||||
assert_matches!(matches.next(), None);
|
assert_matches!(matches.next(), None);
|
||||||
@ -1161,7 +1143,8 @@ mod tests {
|
|||||||
let mut iter = matches.into_iter();
|
let mut iter = matches.into_iter();
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // iphone
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // iphone
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // iphone
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // iphone
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 0, distance: 1, .. })); // phone
|
// assert_matches!(iter.next(), Some(SimpleMatch { query_index: 1, word_index: 0, distance: 1, .. })); "phone"
|
||||||
|
// but no typo on first letter ^^^^^^^
|
||||||
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, distance: 0, .. })); // case
|
assert_matches!(iter.next(), Some(SimpleMatch { query_index: 2, word_index: 2, distance: 0, .. })); // case
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
});
|
});
|
||||||
@ -1271,73 +1254,4 @@ mod tests {
|
|||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Checks how the searchable attributes declared on the query builder
/// influence which documents come back for "search engine", and in which order.
#[test]
fn searchable_attributes() {
    let store = TempDatabase::from_iter(vec![
        ("search", &[doc_attr_index(0, 0, 0)][..]),
        ("engine", &[doc_attr_index(0, 0, 1)][..]),

        ("search", &[doc_attr_index(1, 1, 0)][..]),
        ("engine", &[doc_attr_index(1, 1, 1)][..]),
    ]);

    let reader = store.database.main_read_txn().unwrap();

    // no searchable attribute declared: document 0 comes first, then document 1
    let builder = store.query_builder();
    let results = builder.query(&reader, "search engine", 0..20).unwrap();
    let mut documents = results.into_iter();

    assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
        let mut hits = matches.into_iter();
        assert_matches!(hits.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
        assert_matches!(hits.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
        let mut hits = matches.into_iter();
        assert_matches!(hits.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
        assert_matches!(hits.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), None);

    // reorder the searchable attributes: document 1 is now expected first
    let mut builder = store.query_builder();
    builder.add_searchable_attribute(1);
    builder.add_searchable_attribute(0);

    let results = builder.query(&reader, "search engine", 0..20).unwrap();
    let mut documents = results.into_iter();

    assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
        let mut hits = matches.into_iter();
        assert_matches!(hits.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
        assert_matches!(hits.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), Some(Document { id: DocumentId(0), matches, .. }) => {
        let mut hits = matches.into_iter();
        assert_matches!(hits.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
        assert_matches!(hits.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), None);

    // remove a searchable attribute: only document 1 is expected
    let mut builder = store.query_builder();
    builder.add_searchable_attribute(1);

    let results = builder.query(&reader, "search engine", 0..20).unwrap();
    let mut documents = results.into_iter();

    assert_matches!(documents.next(), Some(Document { id: DocumentId(1), matches, .. }) => {
        let mut hits = matches.into_iter();
        assert_matches!(hits.next(), Some(SimpleMatch { query_index: 0, word_index: 0, distance: 0, .. })); // search
        assert_matches!(hits.next(), Some(SimpleMatch { query_index: 1, word_index: 1, distance: 0, .. })); // engine
        assert_matches!(hits.next(), None);
    });
    assert_matches!(documents.next(), None);
}
|
|
||||||
}
|
}
|
||||||
|
558
meilisearch-core/src/query_tree.rs
Normal file
558
meilisearch-core/src/query_tree.rs
Normal file
@ -0,0 +1,558 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::hash::{Hash, Hasher};
|
||||||
|
use std::ops::Range;
|
||||||
|
use std::time::Instant;
|
||||||
|
use std::{cmp, fmt, iter::once};
|
||||||
|
|
||||||
|
use fst::{IntoStreamer, Streamer};
|
||||||
|
use itertools::{EitherOrBoth, merge_join_by};
|
||||||
|
use meilisearch_tokenizer::split_query_string;
|
||||||
|
use sdset::{Set, SetBuf, SetOperation};
|
||||||
|
use log::debug;
|
||||||
|
|
||||||
|
use crate::database::MainT;
|
||||||
|
use crate::{store, DocumentId, DocIndex, MResult};
|
||||||
|
use crate::automaton::{normalize_str, build_dfa, build_prefix_dfa, build_exact_dfa};
|
||||||
|
use crate::QueryWordsMapper;
|
||||||
|
|
||||||
|
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub enum Operation {
|
||||||
|
And(Vec<Operation>),
|
||||||
|
Or(Vec<Operation>),
|
||||||
|
Query(Query),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl fmt::Debug for Operation {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
fn pprint_tree(f: &mut fmt::Formatter<'_>, op: &Operation, depth: usize) -> fmt::Result {
|
||||||
|
match op {
|
||||||
|
Operation::And(children) => {
|
||||||
|
writeln!(f, "{:1$}AND", "", depth * 2)?;
|
||||||
|
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
|
||||||
|
},
|
||||||
|
Operation::Or(children) => {
|
||||||
|
writeln!(f, "{:1$}OR", "", depth * 2)?;
|
||||||
|
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
|
||||||
|
},
|
||||||
|
Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pprint_tree(f, self, 0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Operation {
|
||||||
|
fn tolerant(id: QueryId, prefix: bool, s: &str) -> Operation {
|
||||||
|
Operation::Query(Query { id, prefix, exact: true, kind: QueryKind::Tolerant(s.to_string()) })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn non_tolerant(id: QueryId, prefix: bool, s: &str) -> Operation {
|
||||||
|
Operation::Query(Query { id, prefix, exact: true, kind: QueryKind::NonTolerant(s.to_string()) })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn phrase2(id: QueryId, prefix: bool, (left, right): (&str, &str)) -> Operation {
|
||||||
|
let kind = QueryKind::Phrase(vec![left.to_owned(), right.to_owned()]);
|
||||||
|
Operation::Query(Query { id, prefix, exact: true, kind })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub type QueryId = usize;
|
||||||
|
|
||||||
|
#[derive(Clone, Eq)]
|
||||||
|
pub struct Query {
|
||||||
|
pub id: QueryId,
|
||||||
|
pub prefix: bool,
|
||||||
|
pub exact: bool,
|
||||||
|
pub kind: QueryKind,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialEq for Query {
|
||||||
|
fn eq(&self, other: &Self) -> bool {
|
||||||
|
self.prefix == other.prefix && self.kind == other.kind
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Hash for Query {
|
||||||
|
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||||
|
self.prefix.hash(state);
|
||||||
|
self.kind.hash(state);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// What a [`Query`] leaf must match.
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum QueryKind {
    /// A single word; the tree builder uses it for words taken directly from
    /// the user query.
    /// NOTE(review): presumably matched with typo tolerance, confirm against the traversal code.
    Tolerant(String),
    /// A single word; the tree builder uses it for synonym words and for
    /// the concatenation of several consecutive query words.
    NonTolerant(String),
    /// A sequence of words; the tree builder uses it for the two halves of
    /// a split query word.
    Phrase(Vec<String>),
}
|
||||||
|
|
||||||
|
impl fmt::Debug for Query {
|
||||||
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
|
let Query { id, prefix, kind, .. } = self;
|
||||||
|
let prefix = if *prefix { String::from("Prefix") } else { String::default() };
|
||||||
|
match kind {
|
||||||
|
QueryKind::NonTolerant(word) => {
|
||||||
|
f.debug_struct(&(prefix + "NonTolerant")).field("id", &id).field("word", &word).finish()
|
||||||
|
},
|
||||||
|
QueryKind::Tolerant(word) => {
|
||||||
|
f.debug_struct(&(prefix + "Tolerant")).field("id", &id).field("word", &word).finish()
|
||||||
|
},
|
||||||
|
QueryKind::Phrase(words) => {
|
||||||
|
f.debug_struct(&(prefix + "Phrase")).field("id", &id).field("words", &words).finish()
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Default)]
|
||||||
|
pub struct PostingsList {
|
||||||
|
docids: SetBuf<DocumentId>,
|
||||||
|
matches: SetBuf<DocIndex>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct Context {
|
||||||
|
pub words_set: fst::Set,
|
||||||
|
pub synonyms: store::Synonyms,
|
||||||
|
pub postings_lists: store::PostingsLists,
|
||||||
|
pub prefix_postings_lists: store::PrefixPostingsListsCache,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &'a str) -> MResult<Option<(&'a str, &'a str)>> {
|
||||||
|
let chars = word.char_indices().skip(1);
|
||||||
|
let mut best = None;
|
||||||
|
|
||||||
|
for (i, _) in chars {
|
||||||
|
let (left, right) = word.split_at(i);
|
||||||
|
|
||||||
|
let left_freq = ctx.postings_lists
|
||||||
|
.postings_list(reader, left.as_bytes())?
|
||||||
|
.map(|p| p.docids.len())
|
||||||
|
.unwrap_or(0);
|
||||||
|
let right_freq = ctx.postings_lists
|
||||||
|
.postings_list(reader, right.as_bytes())?
|
||||||
|
.map(|p| p.docids.len())
|
||||||
|
.unwrap_or(0);
|
||||||
|
|
||||||
|
let min_freq = cmp::min(left_freq, right_freq);
|
||||||
|
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
|
||||||
|
best = Some((min_freq, left, right));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(best.map(|(_, l, r)| (l, r)))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
|
||||||
|
let words = normalize_str(&words.join(" "));
|
||||||
|
let set = ctx.synonyms.synonyms(reader, words.as_bytes())?.unwrap_or_default();
|
||||||
|
|
||||||
|
let mut strings = Vec::new();
|
||||||
|
let mut stream = set.stream();
|
||||||
|
while let Some(input) = stream.next() {
|
||||||
|
if let Ok(input) = std::str::from_utf8(input) {
|
||||||
|
let alts = input.split_ascii_whitespace().map(ToOwned::to_owned).collect();
|
||||||
|
strings.push(alts);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(strings)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn create_operation<I, F>(iter: I, f: F) -> Operation
|
||||||
|
where I: IntoIterator<Item=Operation>,
|
||||||
|
F: Fn(Vec<Operation>) -> Operation,
|
||||||
|
{
|
||||||
|
let mut iter = iter.into_iter();
|
||||||
|
match (iter.next(), iter.next()) {
|
||||||
|
(Some(first), None) => first,
|
||||||
|
(first, second) => f(first.into_iter().chain(second).chain(iter).collect()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const MAX_NGRAM: usize = 3;
|
||||||
|
|
||||||
|
pub fn create_query_tree(
|
||||||
|
reader: &heed::RoTxn<MainT>,
|
||||||
|
ctx: &Context,
|
||||||
|
query: &str,
|
||||||
|
) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
|
||||||
|
{
|
||||||
|
let words = split_query_string(query).map(str::to_lowercase);
|
||||||
|
let words: Vec<_> = words.into_iter().enumerate().collect();
|
||||||
|
|
||||||
|
let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
|
||||||
|
|
||||||
|
fn create_inner(
|
||||||
|
reader: &heed::RoTxn<MainT>,
|
||||||
|
ctx: &Context,
|
||||||
|
mapper: &mut QueryWordsMapper,
|
||||||
|
words: &[(usize, String)],
|
||||||
|
) -> MResult<Vec<Operation>>
|
||||||
|
{
|
||||||
|
let mut alts = Vec::new();
|
||||||
|
|
||||||
|
for ngram in 1..=MAX_NGRAM {
|
||||||
|
if let Some(group) = words.get(..ngram) {
|
||||||
|
let mut group_ops = Vec::new();
|
||||||
|
|
||||||
|
let tail = &words[ngram..];
|
||||||
|
let is_last = tail.is_empty();
|
||||||
|
|
||||||
|
let mut group_alts = Vec::new();
|
||||||
|
match group {
|
||||||
|
[(id, word)] => {
|
||||||
|
let mut idgen = ((id + 1) * 100)..;
|
||||||
|
let range = (*id)..id+1;
|
||||||
|
|
||||||
|
let phrase = split_best_frequency(reader, ctx, word)?
|
||||||
|
.map(|ws| {
|
||||||
|
let id = idgen.next().unwrap();
|
||||||
|
idgen.next().unwrap();
|
||||||
|
mapper.declare(range.clone(), id, &[ws.0, ws.1]);
|
||||||
|
Operation::phrase2(id, is_last, ws)
|
||||||
|
});
|
||||||
|
|
||||||
|
let synonyms = fetch_synonyms(reader, ctx, &[word])?
|
||||||
|
.into_iter()
|
||||||
|
.map(|alts| {
|
||||||
|
let exact = alts.len() == 1;
|
||||||
|
let id = idgen.next().unwrap();
|
||||||
|
mapper.declare(range.clone(), id, &alts);
|
||||||
|
|
||||||
|
let mut idgen = once(id).chain(&mut idgen);
|
||||||
|
let iter = alts.into_iter().map(|w| {
|
||||||
|
let id = idgen.next().unwrap();
|
||||||
|
let kind = QueryKind::NonTolerant(w);
|
||||||
|
Operation::Query(Query { id, prefix: false, exact, kind })
|
||||||
|
});
|
||||||
|
|
||||||
|
create_operation(iter, Operation::And)
|
||||||
|
});
|
||||||
|
|
||||||
|
let original = Operation::tolerant(*id, is_last, word);
|
||||||
|
|
||||||
|
group_alts.push(original);
|
||||||
|
group_alts.extend(synonyms.chain(phrase));
|
||||||
|
},
|
||||||
|
words => {
|
||||||
|
let id = words[0].0;
|
||||||
|
let mut idgen = ((id + 1) * 100_usize.pow(ngram as u32))..;
|
||||||
|
let range = id..id+ngram;
|
||||||
|
|
||||||
|
let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect();
|
||||||
|
|
||||||
|
for synonym in fetch_synonyms(reader, ctx, &words)? {
|
||||||
|
let exact = synonym.len() == 1;
|
||||||
|
let id = idgen.next().unwrap();
|
||||||
|
mapper.declare(range.clone(), id, &synonym);
|
||||||
|
|
||||||
|
let mut idgen = once(id).chain(&mut idgen);
|
||||||
|
let synonym = synonym.into_iter().map(|s| {
|
||||||
|
let id = idgen.next().unwrap();
|
||||||
|
let kind = QueryKind::NonTolerant(s);
|
||||||
|
Operation::Query(Query { id, prefix: false, exact, kind })
|
||||||
|
});
|
||||||
|
group_alts.push(create_operation(synonym, Operation::And));
|
||||||
|
}
|
||||||
|
|
||||||
|
let id = idgen.next().unwrap();
|
||||||
|
let concat = words.concat();
|
||||||
|
mapper.declare(range.clone(), id, &[&concat]);
|
||||||
|
group_alts.push(Operation::non_tolerant(id, is_last, &concat));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
group_ops.push(create_operation(group_alts, Operation::Or));
|
||||||
|
|
||||||
|
if !tail.is_empty() {
|
||||||
|
let tail_ops = create_inner(reader, ctx, mapper, tail)?;
|
||||||
|
group_ops.push(create_operation(tail_ops, Operation::Or));
|
||||||
|
}
|
||||||
|
|
||||||
|
alts.push(create_operation(group_ops, Operation::And));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(alts)
|
||||||
|
}
|
||||||
|
|
||||||
|
let alternatives = create_inner(reader, ctx, &mut mapper, &words)?;
|
||||||
|
let operation = Operation::Or(alternatives);
|
||||||
|
let mapping = mapper.mapping();
|
||||||
|
|
||||||
|
Ok((operation, mapping))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Key identifying one postings list collected while a query tree is traversed.
///
/// `traverse_query_tree` inserts one entry per fetched postings list into the
/// `Postings` map, keyed by the query leaf and the input that was matched.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
|
pub struct PostingsKey<'o> {
|
||||||
|
/// The query leaf of the tree this postings list was fetched for.
pub query: &'o Query,
|
||||||
|
/// Bytes of the word whose postings list was fetched
/// (left empty for phrase queries).
pub input: Vec<u8>,
|
||||||
|
/// Distance reported by the DFA for `input`; `0` for short-prefix
/// and phrase lookups.
pub distance: u8,
|
||||||
|
/// Whether the matches of this postings list are considered exact.
pub is_exact: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Matches collected during a traversal, keyed by the query and the
/// matched input that produced them.
pub type Postings<'o, 'txn> = HashMap<PostingsKey<'o>, Cow<'txn, Set<DocIndex>>>;
|
||||||
|
/// Document ids already computed for an operation of the tree, keyed by a
/// reference to that operation so a repeated operation is executed only once.
pub type Cache<'o, 'txn> = HashMap<&'o Operation, Cow<'txn, Set<DocumentId>>>;
|
||||||
|
|
||||||
|
/// Result of `traverse_query_tree`.
pub struct QueryResult<'o, 'txn> {
|
||||||
|
/// Ids of the documents selected by the root operation of the tree.
pub docids: Cow<'txn, Set<DocumentId>>,
|
||||||
|
/// Every postings list fetched while executing the query leaves.
pub queries: Postings<'o, 'txn>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn traverse_query_tree<'o, 'txn>(
|
||||||
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
|
ctx: &Context,
|
||||||
|
tree: &'o Operation,
|
||||||
|
) -> MResult<QueryResult<'o, 'txn>>
|
||||||
|
{
|
||||||
|
fn execute_and<'o, 'txn>(
|
||||||
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
|
ctx: &Context,
|
||||||
|
cache: &mut Cache<'o, 'txn>,
|
||||||
|
postings: &mut Postings<'o, 'txn>,
|
||||||
|
depth: usize,
|
||||||
|
operations: &'o [Operation],
|
||||||
|
) -> MResult<Cow<'txn, Set<DocumentId>>>
|
||||||
|
{
|
||||||
|
debug!("{:1$}AND", "", depth * 2);
|
||||||
|
|
||||||
|
let before = Instant::now();
|
||||||
|
let mut results = Vec::new();
|
||||||
|
|
||||||
|
for op in operations {
|
||||||
|
if cache.get(op).is_none() {
|
||||||
|
let docids = match op {
|
||||||
|
Operation::And(ops) => execute_and(reader, ctx, cache, postings, depth + 1, &ops)?,
|
||||||
|
Operation::Or(ops) => execute_or(reader, ctx, cache, postings, depth + 1, &ops)?,
|
||||||
|
Operation::Query(query) => execute_query(reader, ctx, postings, depth + 1, &query)?,
|
||||||
|
};
|
||||||
|
cache.insert(op, docids);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for op in operations {
|
||||||
|
if let Some(docids) = cache.get(op) {
|
||||||
|
results.push(docids.as_ref());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let op = sdset::multi::Intersection::new(results);
|
||||||
|
let docids = op.into_set_buf();
|
||||||
|
|
||||||
|
debug!("{:3$}--- AND fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
|
||||||
|
|
||||||
|
Ok(Cow::Owned(docids))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn execute_or<'o, 'txn>(
|
||||||
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
|
ctx: &Context,
|
||||||
|
cache: &mut Cache<'o, 'txn>,
|
||||||
|
postings: &mut Postings<'o, 'txn>,
|
||||||
|
depth: usize,
|
||||||
|
operations: &'o [Operation],
|
||||||
|
) -> MResult<Cow<'txn, Set<DocumentId>>>
|
||||||
|
{
|
||||||
|
debug!("{:1$}OR", "", depth * 2);
|
||||||
|
|
||||||
|
let before = Instant::now();
|
||||||
|
let mut results = Vec::new();
|
||||||
|
|
||||||
|
for op in operations {
|
||||||
|
if cache.get(op).is_none() {
|
||||||
|
let docids = match op {
|
||||||
|
Operation::And(ops) => execute_and(reader, ctx, cache, postings, depth + 1, &ops)?,
|
||||||
|
Operation::Or(ops) => execute_or(reader, ctx, cache, postings, depth + 1, &ops)?,
|
||||||
|
Operation::Query(query) => execute_query(reader, ctx, postings, depth + 1, &query)?,
|
||||||
|
};
|
||||||
|
cache.insert(op, docids);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for op in operations {
|
||||||
|
if let Some(docids) = cache.get(op) {
|
||||||
|
results.push(docids.as_ref());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let op = sdset::multi::Union::new(results);
|
||||||
|
let docids = op.into_set_buf();
|
||||||
|
|
||||||
|
debug!("{:3$}--- OR fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
|
||||||
|
|
||||||
|
Ok(Cow::Owned(docids))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn execute_query<'o, 'txn>(
|
||||||
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
|
ctx: &Context,
|
||||||
|
postings: &mut Postings<'o, 'txn>,
|
||||||
|
depth: usize,
|
||||||
|
query: &'o Query,
|
||||||
|
) -> MResult<Cow<'txn, Set<DocumentId>>>
|
||||||
|
{
|
||||||
|
let before = Instant::now();
|
||||||
|
|
||||||
|
let Query { prefix, kind, exact, .. } = query;
|
||||||
|
let docids: Cow<Set<_>> = match kind {
|
||||||
|
QueryKind::Tolerant(word) => {
|
||||||
|
if *prefix && word.len() <= 2 {
|
||||||
|
let prefix = {
|
||||||
|
let mut array = [0; 4];
|
||||||
|
let bytes = word.as_bytes();
|
||||||
|
array[..bytes.len()].copy_from_slice(bytes);
|
||||||
|
array
|
||||||
|
};
|
||||||
|
|
||||||
|
// We retrieve the cached postings lists for all
|
||||||
|
// the words that starts with this short prefix.
|
||||||
|
let result = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default();
|
||||||
|
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: false };
|
||||||
|
postings.insert(key, result.matches);
|
||||||
|
let prefix_docids = &result.docids;
|
||||||
|
|
||||||
|
// We retrieve the exact postings list for the prefix,
|
||||||
|
// because we must consider these matches as exact.
|
||||||
|
let result = ctx.postings_lists.postings_list(reader, word.as_bytes())?.unwrap_or_default();
|
||||||
|
let key = PostingsKey { query, input: word.clone().into_bytes(), distance: 0, is_exact: true };
|
||||||
|
postings.insert(key, result.matches);
|
||||||
|
let exact_docids = &result.docids;
|
||||||
|
|
||||||
|
let before = Instant::now();
|
||||||
|
let docids = sdset::duo::Union::new(prefix_docids, exact_docids).into_set_buf();
|
||||||
|
debug!("{:4$}prefix docids ({} and {}) construction took {:.02?}",
|
||||||
|
"", prefix_docids.len(), exact_docids.len(), before.elapsed(), depth * 2);
|
||||||
|
|
||||||
|
Cow::Owned(docids)
|
||||||
|
|
||||||
|
} else {
|
||||||
|
let dfa = if *prefix { build_prefix_dfa(word) } else { build_dfa(word) };
|
||||||
|
|
||||||
|
let byte = word.as_bytes()[0];
|
||||||
|
let mut stream = if byte == u8::max_value() {
|
||||||
|
ctx.words_set.search(&dfa).ge(&[byte]).into_stream()
|
||||||
|
} else {
|
||||||
|
ctx.words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
|
||||||
|
};
|
||||||
|
|
||||||
|
let before = Instant::now();
|
||||||
|
let mut results = Vec::new();
|
||||||
|
while let Some(input) = stream.next() {
|
||||||
|
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
|
||||||
|
let distance = dfa.eval(input).to_u8();
|
||||||
|
let is_exact = *exact && distance == 0 && input.len() == word.len();
|
||||||
|
results.push(result.docids);
|
||||||
|
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact };
|
||||||
|
postings.insert(key, result.matches);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
debug!("{:3$}docids retrieval ({:?}) took {:.02?}", "", results.len(), before.elapsed(), depth * 2);
|
||||||
|
|
||||||
|
let before = Instant::now();
|
||||||
|
let docids = if results.len() > 10 {
|
||||||
|
let cap = results.iter().map(|dis| dis.len()).sum();
|
||||||
|
let mut docids = Vec::with_capacity(cap);
|
||||||
|
for dis in results {
|
||||||
|
docids.extend_from_slice(&dis);
|
||||||
|
}
|
||||||
|
SetBuf::from_dirty(docids)
|
||||||
|
} else {
|
||||||
|
let sets = results.iter().map(AsRef::as_ref).collect();
|
||||||
|
sdset::multi::Union::new(sets).into_set_buf()
|
||||||
|
};
|
||||||
|
debug!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
|
||||||
|
|
||||||
|
Cow::Owned(docids)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
QueryKind::NonTolerant(word) => {
|
||||||
|
// TODO support prefix and non-prefix exact DFA
|
||||||
|
let dfa = build_exact_dfa(word);
|
||||||
|
|
||||||
|
let byte = word.as_bytes()[0];
|
||||||
|
let mut stream = if byte == u8::max_value() {
|
||||||
|
ctx.words_set.search(&dfa).ge(&[byte]).into_stream()
|
||||||
|
} else {
|
||||||
|
ctx.words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream()
|
||||||
|
};
|
||||||
|
|
||||||
|
let before = Instant::now();
|
||||||
|
let mut results = Vec::new();
|
||||||
|
while let Some(input) = stream.next() {
|
||||||
|
if let Some(result) = ctx.postings_lists.postings_list(reader, input)? {
|
||||||
|
let distance = dfa.eval(input).to_u8();
|
||||||
|
results.push(result.docids);
|
||||||
|
let key = PostingsKey { query, input: input.to_owned(), distance, is_exact: *exact };
|
||||||
|
postings.insert(key, result.matches);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
debug!("{:3$}docids retrieval ({:?}) took {:.02?}", "", results.len(), before.elapsed(), depth * 2);
|
||||||
|
|
||||||
|
let before = Instant::now();
|
||||||
|
let docids = if results.len() > 10 {
|
||||||
|
let cap = results.iter().map(|dis| dis.len()).sum();
|
||||||
|
let mut docids = Vec::with_capacity(cap);
|
||||||
|
for dis in results {
|
||||||
|
docids.extend_from_slice(&dis);
|
||||||
|
}
|
||||||
|
SetBuf::from_dirty(docids)
|
||||||
|
} else {
|
||||||
|
let sets = results.iter().map(AsRef::as_ref).collect();
|
||||||
|
sdset::multi::Union::new(sets).into_set_buf()
|
||||||
|
};
|
||||||
|
debug!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
|
||||||
|
|
||||||
|
Cow::Owned(docids)
|
||||||
|
},
|
||||||
|
QueryKind::Phrase(words) => {
|
||||||
|
// TODO support prefix and non-prefix exact DFA
|
||||||
|
if let [first, second] = words.as_slice() {
|
||||||
|
let first = ctx.postings_lists.postings_list(reader, first.as_bytes())?.unwrap_or_default();
|
||||||
|
let second = ctx.postings_lists.postings_list(reader, second.as_bytes())?.unwrap_or_default();
|
||||||
|
|
||||||
|
let iter = merge_join_by(first.matches.as_slice(), second.matches.as_slice(), |a, b| {
|
||||||
|
let x = (a.document_id, a.attribute, (a.word_index as u32) + 1);
|
||||||
|
let y = (b.document_id, b.attribute, b.word_index as u32);
|
||||||
|
x.cmp(&y)
|
||||||
|
});
|
||||||
|
|
||||||
|
let matches: Vec<_> = iter
|
||||||
|
.filter_map(EitherOrBoth::both)
|
||||||
|
.flat_map(|(a, b)| once(*a).chain(Some(*b)))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let before = Instant::now();
|
||||||
|
let mut docids: Vec<_> = matches.iter().map(|m| m.document_id).collect();
|
||||||
|
docids.dedup();
|
||||||
|
let docids = SetBuf::new(docids).unwrap();
|
||||||
|
debug!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
|
||||||
|
|
||||||
|
let matches = Cow::Owned(SetBuf::new(matches).unwrap());
|
||||||
|
let key = PostingsKey { query, input: vec![], distance: 0, is_exact: true };
|
||||||
|
postings.insert(key, matches);
|
||||||
|
|
||||||
|
Cow::Owned(docids)
|
||||||
|
} else {
|
||||||
|
debug!("{:2$}{:?} skipped", "", words, depth * 2);
|
||||||
|
Cow::default()
|
||||||
|
}
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
debug!("{:4$}{:?} fetched {:?} documents in {:.02?}", "", query, docids.len(), before.elapsed(), depth * 2);
|
||||||
|
Ok(docids)
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut cache = Cache::new();
|
||||||
|
let mut postings = Postings::new();
|
||||||
|
|
||||||
|
let docids = match tree {
|
||||||
|
Operation::And(ops) => execute_and(reader, ctx, &mut cache, &mut postings, 0, &ops)?,
|
||||||
|
Operation::Or(ops) => execute_or(reader, ctx, &mut cache, &mut postings, 0, &ops)?,
|
||||||
|
Operation::Query(query) => execute_query(reader, ctx, &mut postings, 0, &query)?,
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(QueryResult { docids, queries: postings })
|
||||||
|
}
|
415
meilisearch-core/src/query_words_mapper.rs
Normal file
415
meilisearch-core/src/query_words_mapper.rs
Normal file
@ -0,0 +1,415 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
use std::iter::FromIterator;
|
||||||
|
use std::ops::Range;
|
||||||
|
use intervaltree::{Element, IntervalTree};
|
||||||
|
|
||||||
|
/// Identifier of a query word. Original words are keyed by their index in the
/// query; `QueryWordsMapper::declare` requires the ids of replacement words to
/// be at least the number of original words.
pub type QueryId = usize;
|
||||||
|
|
||||||
|
/// Records which original query words each declared replacement stands for,
/// and computes where every query id lands once all replacements are laid out.
pub struct QueryWordsMapper {
|
||||||
|
/// The original query words, in query order.
originals: Vec<String>,
|
||||||
|
/// For each declared query id: the range of `originals` it is associated
/// with and the word(s) recorded for it.
mappings: HashMap<QueryId, (Range<usize>, Vec<String>)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl QueryWordsMapper {
|
||||||
|
pub fn new<I, A>(originals: I) -> QueryWordsMapper
|
||||||
|
where I: IntoIterator<Item = A>,
|
||||||
|
A: ToString,
|
||||||
|
{
|
||||||
|
let originals = originals.into_iter().map(|s| s.to_string()).collect();
|
||||||
|
QueryWordsMapper { originals, mappings: HashMap::new() }
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn declare<I, A>(&mut self, range: Range<usize>, id: QueryId, replacement: I)
|
||||||
|
where I: IntoIterator<Item = A>,
|
||||||
|
A: ToString,
|
||||||
|
{
|
||||||
|
assert!(range.len() != 0);
|
||||||
|
assert!(self.originals.get(range.clone()).is_some());
|
||||||
|
assert!(id >= self.originals.len());
|
||||||
|
|
||||||
|
let replacement: Vec<_> = replacement.into_iter().map(|s| s.to_string()).collect();
|
||||||
|
|
||||||
|
assert!(!replacement.is_empty());
|
||||||
|
|
||||||
|
// We detect words at the end and at the front of the
|
||||||
|
// replacement that are common with the originals:
|
||||||
|
//
|
||||||
|
// x a b c d e f g
|
||||||
|
// ^^^/ \^^^
|
||||||
|
// a b x c d k j e f
|
||||||
|
// ^^^ ^^^
|
||||||
|
//
|
||||||
|
|
||||||
|
let left = &self.originals[..range.start];
|
||||||
|
let right = &self.originals[range.end..];
|
||||||
|
|
||||||
|
let common_left = longest_common_prefix(left, &replacement);
|
||||||
|
let common_right = longest_common_prefix(&replacement, right);
|
||||||
|
|
||||||
|
for i in 0..common_left {
|
||||||
|
let range = range.start - common_left + i..range.start - common_left + i + 1;
|
||||||
|
let replacement = vec![replacement[i].clone()];
|
||||||
|
self.mappings.insert(id + i, (range, replacement));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
let replacement = replacement[common_left..replacement.len() - common_right].iter().cloned().collect();
|
||||||
|
self.mappings.insert(id + common_left, (range.clone(), replacement));
|
||||||
|
}
|
||||||
|
|
||||||
|
for i in 0..common_right {
|
||||||
|
let id = id + replacement.len() - common_right + i;
|
||||||
|
let range = range.end + i..range.end + i + 1;
|
||||||
|
let replacement = vec![replacement[replacement.len() - common_right + i].clone()];
|
||||||
|
self.mappings.insert(id, (range, replacement));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn mapping(self) -> HashMap<QueryId, Range<usize>> {
|
||||||
|
let mappings = self.mappings.into_iter().map(|(i, (r, v))| (r, (i, v)));
|
||||||
|
let intervals = IntervalTree::from_iter(mappings);
|
||||||
|
|
||||||
|
let mut output = HashMap::new();
|
||||||
|
let mut offset = 0;
|
||||||
|
|
||||||
|
// We map each original word to the biggest number of
|
||||||
|
// associated words.
|
||||||
|
for i in 0..self.originals.len() {
|
||||||
|
let max = intervals.query_point(i)
|
||||||
|
.filter_map(|e| {
|
||||||
|
if e.range.end - 1 == i {
|
||||||
|
let len = e.value.1.iter().skip(i - e.range.start).count();
|
||||||
|
if len != 0 { Some(len) } else { None }
|
||||||
|
} else { None }
|
||||||
|
})
|
||||||
|
.max()
|
||||||
|
.unwrap_or(1);
|
||||||
|
|
||||||
|
let range = i + offset..i + offset + max;
|
||||||
|
output.insert(i, range);
|
||||||
|
offset += max - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We retrieve the range that each original word
|
||||||
|
// is mapped to and apply it to each of the words.
|
||||||
|
for i in 0..self.originals.len() {
|
||||||
|
|
||||||
|
let iter = intervals.query_point(i).filter(|e| e.range.end - 1 == i);
|
||||||
|
for Element { range, value: (id, words) } in iter {
|
||||||
|
|
||||||
|
// We ask for the complete range mapped to the area we map.
|
||||||
|
let start = output.get(&range.start).map(|r| r.start).unwrap_or(range.start);
|
||||||
|
let end = output.get(&(range.end - 1)).map(|r| r.end).unwrap_or(range.end);
|
||||||
|
let range = start..end;
|
||||||
|
|
||||||
|
// We map each query id to one word until the last,
|
||||||
|
// we map it to the remainings words.
|
||||||
|
let add = range.len() - words.len();
|
||||||
|
for (j, x) in range.take(words.len()).enumerate() {
|
||||||
|
let add = if j == words.len() - 1 { add } else { 0 }; // is last?
|
||||||
|
let range = x..x + 1 + add;
|
||||||
|
output.insert(id + j, range);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
output
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Counts how many leading items of `b` match a trailing window of `a`.
///
/// Windows `a[i..]` are tried from the shortest (last item only) to the
/// longest. For each window the number of leading positions where it agrees
/// with `b` is counted; the match is not required to reach the end of `a`.
/// The scan stops at the first window that does not strictly improve on the
/// previous count, and the best count seen so far is returned.
///
/// Returns `0` when `a` is empty.
fn longest_common_prefix<T: Eq + std::fmt::Debug>(a: &[T], b: &[T]) -> usize {
    let mut best: Option<usize> = None;
    let mut start = a.len();

    while start > 0 {
        start -= 1;

        let matched = a[start..]
            .iter()
            .zip(b.iter())
            .take_while(|(x, y)| x == y)
            .count();

        match best {
            // A longer window that does not match more items ends the scan.
            Some(previous) if matched <= previous => break,
            _ => best = Some(matched),
        }
    }

    best.unwrap_or(0)
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `mapping` associates every listed query id with the expected range.
    fn assert_mapping(mapping: &HashMap<QueryId, Range<usize>>, expected: &[(QueryId, Range<usize>)]) {
        for (id, range) in expected {
            assert_eq!(&mapping[id], range);
        }
    }

    #[test]
    fn original_unmodified() {
        // ids:       0      1       2       3
        let query = ["new", "york", "city", "subway"];
        let mut builder = QueryWordsMapper::new(&query);

        // new york = new york city (ids 4, 5, 6)
        builder.declare(0..2, 4, &["new", "york", "city"]);

        // new = new york city (ids 7, 8, 9)
        builder.declare(0..1, 7, &["new", "york", "city"]);

        let mapping = builder.mapping();

        assert_mapping(&mapping, &[
            (0, 0..1), // new
            (1, 1..2), // york
            (2, 2..3), // city
            (3, 3..4), // subway
            (4, 0..1), // new
            (5, 1..2), // york
            (6, 2..3), // city
            (7, 0..1), // new
            (8, 1..2), // york
            (9, 2..3), // city
        ]);
    }

    #[test]
    fn original_unmodified2() {
        // ids:       0      1       2       3
        let query = ["new", "york", "city", "subway"];
        let mut builder = QueryWordsMapper::new(&query);

        // city subway = new york city underground train (ids 4, 5, 6, 7, 8)
        builder.declare(2..4, 4, &["new", "york", "city", "underground", "train"]);

        let mapping = builder.mapping();

        assert_mapping(&mapping, &[
            (0, 0..1), // new
            (1, 1..2), // york
            (2, 2..3), // city
            (3, 3..5), // subway
            (4, 0..1), // new
            (5, 1..2), // york
            (6, 2..3), // city
            (7, 3..4), // underground
            (8, 4..5), // train
        ]);
    }

    #[test]
    fn original_unmodified3() {
        // ids:      0    1    2    3    4    5    6    7    8    9    10
        let query = ["a", "b", "x", "x", "a", "b", "c", "d", "e", "f", "g"];
        let mut builder = QueryWordsMapper::new(&query);

        // c d = a b x c d k j e f (ids 11 to 19)
        builder.declare(6..8, 11, &["a", "b", "x", "c", "d", "k", "j", "e", "f"]);

        let mapping = builder.mapping();

        assert_mapping(&mapping, &[
            (0, 0..1),    // a
            (1, 1..2),    // b
            (2, 2..3),    // x
            (3, 3..4),    // x
            (4, 4..5),    // a
            (5, 5..6),    // b
            (6, 6..7),    // c
            (7, 7..11),   // d
            (8, 11..12),  // e
            (9, 12..13),  // f
            (10, 13..14), // g
            (11, 4..5),   // a
            (12, 5..6),   // b
            (13, 6..7),   // x
            (14, 7..8),   // c
            (15, 8..9),   // d
            (16, 9..10),  // k
            (17, 10..11), // j
            (18, 11..12), // e
            (19, 12..13), // f
        ]);
    }

    #[test]
    fn simple_growing() {
        // ids:       0      1       2
        let query = ["new", "york", "subway"];
        let mut builder = QueryWordsMapper::new(&query);

        // new york = new york city (ids 3, 4, 5)
        builder.declare(0..2, 3, &["new", "york", "city"]);

        let mapping = builder.mapping();

        assert_mapping(&mapping, &[
            (0, 0..1), // new
            (1, 1..3), // york
            (2, 3..4), // subway
            (3, 0..1), // new
            (4, 1..2), // york
            (5, 2..3), // city
        ]);
    }

    #[test]
    fn same_place_growings() {
        // ids:       0     1
        let query = ["NY", "subway"];
        let mut builder = QueryWordsMapper::new(&query);

        // NY = new york (ids 2, 3)
        builder.declare(0..1, 2, &["new", "york"]);

        // NY = new york city (ids 4, 5, 6)
        builder.declare(0..1, 4, &["new", "york", "city"]);

        // NY = NYC (id 7)
        builder.declare(0..1, 7, &["NYC"]);

        // NY = new york city (ids 8, 9, 10)
        builder.declare(0..1, 8, &["new", "york", "city"]);

        // subway = underground train (ids 11, 12)
        builder.declare(1..2, 11, &["underground", "train"]);

        let mapping = builder.mapping();

        assert_mapping(&mapping, &[
            (0, 0..3),  // NY
            (1, 3..5),  // subway
            (2, 0..1),  // new
            (3, 1..3),  // york
            (4, 0..1),  // new
            (5, 1..2),  // york
            (6, 2..3),  // city
            (7, 0..3),  // NYC
            (8, 0..1),  // new
            (9, 1..2),  // york
            (10, 2..3), // city
            (11, 3..4), // underground
            (12, 4..5), // train
        ]);
    }

    #[test]
    fn bigger_growing() {
        // ids:       0      1
        let query = ["NYC", "subway"];
        let mut builder = QueryWordsMapper::new(&query);

        // NYC = new york city (ids 2, 3, 4)
        builder.declare(0..1, 2, &["new", "york", "city"]);

        let mapping = builder.mapping();

        assert_mapping(&mapping, &[
            (0, 0..3), // NYC
            (1, 3..4), // subway
            (2, 0..1), // new
            (3, 1..2), // york
            (4, 2..3), // city
        ]);
    }

    #[test]
    fn middle_query_growing() {
        // ids:       0        1          2      3
        let query = ["great", "awesome", "NYC", "subway"];
        let mut builder = QueryWordsMapper::new(&query);

        // NYC = new york city (ids 4, 5, 6)
        builder.declare(2..3, 4, &["new", "york", "city"]);

        let mapping = builder.mapping();

        assert_mapping(&mapping, &[
            (0, 0..1), // great
            (1, 1..2), // awesome
            (2, 2..5), // NYC
            (3, 5..6), // subway
            (4, 2..3), // new
            (5, 3..4), // york
            (6, 4..5), // city
        ]);
    }

    #[test]
    fn end_query_growing() {
        // ids:       0      1
        let query = ["NYC", "subway"];
        let mut builder = QueryWordsMapper::new(&query);

        // subway = underground train (ids 2, 3)
        builder.declare(1..2, 2, &["underground", "train"]);

        let mapping = builder.mapping();

        assert_mapping(&mapping, &[
            (0, 0..1), // NYC
            (1, 1..3), // subway
            (2, 1..2), // underground
            (3, 2..3), // train
        ]);
    }

    #[test]
    fn multiple_growings() {
        // ids:       0        1          2      3
        let query = ["great", "awesome", "NYC", "subway"];
        let mut builder = QueryWordsMapper::new(&query);

        // NYC = new york city (ids 4, 5, 6)
        builder.declare(2..3, 4, &["new", "york", "city"]);

        // subway = underground train (ids 7, 8)
        builder.declare(3..4, 7, &["underground", "train"]);

        let mapping = builder.mapping();

        assert_mapping(&mapping, &[
            (0, 0..1), // great
            (1, 1..2), // awesome
            (2, 2..5), // NYC
            (3, 5..7), // subway
            (4, 2..3), // new
            (5, 3..4), // york
            (6, 4..5), // city
            (7, 5..6), // underground
            (8, 6..7), // train
        ]);
    }

    #[test]
    fn multiple_probable_growings() {
        // ids:       0        1          2      3
        let query = ["great", "awesome", "NYC", "subway"];
        let mut builder = QueryWordsMapper::new(&query);

        // NYC = new york city (ids 4, 5, 6)
        builder.declare(2..3, 4, &["new", "york", "city"]);

        // subway = underground train (ids 7, 8)
        builder.declare(3..4, 7, &["underground", "train"]);

        // great awesome = good (id 9)
        builder.declare(0..2, 9, &["good"]);

        // awesome NYC = NY (id 10)
        builder.declare(1..3, 10, &["NY"]);

        // NYC subway = metro (id 11)
        builder.declare(2..4, 11, &["metro"]);

        let mapping = builder.mapping();

        assert_mapping(&mapping, &[
            (0, 0..1),  // great
            (1, 1..2),  // awesome
            (2, 2..5),  // NYC
            (3, 5..7),  // subway
            (4, 2..3),  // new
            (5, 3..4),  // york
            (6, 4..5),  // city
            (7, 5..6),  // underground
            (8, 6..7),  // train
            (9, 0..2),  // good
            (10, 1..5), // NY
            (11, 2..7), // metro
        ]);
    }
}
|
@ -1,8 +1,7 @@
|
|||||||
use compact_arena::SmallArena;
|
use compact_arena::SmallArena;
|
||||||
use itertools::EitherOrBoth;
|
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
use crate::DocIndex;
|
use crate::DocIndex;
|
||||||
use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};
|
use crate::bucket_sort::{SimpleMatch, BareMatch, PostingsListView};
|
||||||
use crate::reordered_attrs::ReorderedAttrs;
|
use crate::reordered_attrs::ReorderedAttrs;
|
||||||
|
|
||||||
pub struct RawDocument<'a, 'tag> {
|
pub struct RawDocument<'a, 'tag> {
|
||||||
@ -19,10 +18,9 @@ pub struct RawDocument<'a, 'tag> {
|
|||||||
impl<'a, 'tag> RawDocument<'a, 'tag> {
|
impl<'a, 'tag> RawDocument<'a, 'tag> {
|
||||||
pub fn new<'txn>(
|
pub fn new<'txn>(
|
||||||
bare_matches: &'a mut [BareMatch<'tag>],
|
bare_matches: &'a mut [BareMatch<'tag>],
|
||||||
automatons: &[QueryWordAutomaton],
|
|
||||||
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
|
||||||
searchable_attrs: Option<&ReorderedAttrs>,
|
searchable_attrs: Option<&ReorderedAttrs>,
|
||||||
) -> Option<RawDocument<'a, 'tag>>
|
) -> RawDocument<'a, 'tag>
|
||||||
{
|
{
|
||||||
if let Some(reordered_attrs) = searchable_attrs {
|
if let Some(reordered_attrs) = searchable_attrs {
|
||||||
for bm in bare_matches.iter() {
|
for bm in bare_matches.iter() {
|
||||||
@ -42,70 +40,12 @@ impl<'a, 'tag> RawDocument<'a, 'tag> {
|
|||||||
|
|
||||||
bare_matches.sort_unstable_by_key(|m| m.query_index);
|
bare_matches.sort_unstable_by_key(|m| m.query_index);
|
||||||
|
|
||||||
let mut previous_word = None;
|
RawDocument {
|
||||||
for i in 0..bare_matches.len() {
|
|
||||||
let a = &bare_matches[i];
|
|
||||||
let auta = &automatons[a.query_index as usize];
|
|
||||||
|
|
||||||
match auta.phrase_query {
|
|
||||||
Some((0, _)) => {
|
|
||||||
let b = match bare_matches.get(i + 1) {
|
|
||||||
Some(b) => b,
|
|
||||||
None => {
|
|
||||||
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
if a.query_index + 1 != b.query_index {
|
|
||||||
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
let pla = &postings_lists[a.postings_list];
|
|
||||||
let plb = &postings_lists[b.postings_list];
|
|
||||||
|
|
||||||
let iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
|
|
||||||
a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
|
|
||||||
});
|
|
||||||
|
|
||||||
let mut newa = Vec::new();
|
|
||||||
let mut newb = Vec::new();
|
|
||||||
|
|
||||||
for eb in iter {
|
|
||||||
if let EitherOrBoth::Both(a, b) = eb {
|
|
||||||
newa.push(*a);
|
|
||||||
newb.push(*b);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if !newa.is_empty() {
|
|
||||||
previous_word = Some(a.query_index);
|
|
||||||
}
|
|
||||||
|
|
||||||
postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
|
|
||||||
postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
|
|
||||||
},
|
|
||||||
Some((1, _)) => {
|
|
||||||
if previous_word.take() != Some(a.query_index - 1) {
|
|
||||||
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
|
|
||||||
}
|
|
||||||
},
|
|
||||||
Some((_, _)) => unreachable!(),
|
|
||||||
None => (),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if bare_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
|
|
||||||
return None
|
|
||||||
}
|
|
||||||
|
|
||||||
Some(RawDocument {
|
|
||||||
id: bare_matches[0].document_id,
|
id: bare_matches[0].document_id,
|
||||||
bare_matches,
|
bare_matches,
|
||||||
processed_matches: Vec::new(),
|
processed_matches: Vec::new(),
|
||||||
processed_distances: Vec::new(),
|
processed_distances: Vec::new(),
|
||||||
contains_one_word_field: false,
|
contains_one_word_field: false,
|
||||||
})
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -67,6 +67,17 @@ impl Main {
|
|||||||
self.main.put::<_, Str, ByteSlice>(writer, WORDS_KEY, bytes)
|
self.main.put::<_, Str, ByteSlice>(writer, WORDS_KEY, bytes)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the words set stored under `WORDS_KEY`, built directly on top of
/// the bytes read through `reader` (no copy), or `None` when nothing is stored.
///
/// # Safety
///
/// The lifetime of the bytes borrowed from `reader` is extended to `'static`
/// with a transmute. The caller must make sure the returned set is not used
/// after those bytes stop being valid.
/// NOTE(review): presumably that means the set must not outlive `reader`
/// (the read transaction) — confirm against heed's guarantees.
///
/// # Panics
///
/// Panics if the stored bytes are not accepted by `fst::Set::from_static_slice`.
pub unsafe fn static_words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
|
||||||
|
match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? {
|
||||||
|
Some(bytes) => {
|
||||||
|
// Lifetime extension only: the pointer and length are unchanged.
let bytes: &'static [u8] = std::mem::transmute(bytes);
|
||||||
|
let set = fst::Set::from_static_slice(bytes).unwrap();
|
||||||
|
Ok(Some(set))
|
||||||
|
}
|
||||||
|
None => Ok(None),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
|
pub fn words_fst(self, reader: &heed::RoTxn<MainT>) -> ZResult<Option<fst::Set>> {
|
||||||
match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? {
|
match self.main.get::<_, Str, ByteSlice>(reader, WORDS_KEY)? {
|
||||||
Some(bytes) => {
|
Some(bytes) => {
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
mod docs_words;
|
mod docs_words;
|
||||||
|
mod prefix_documents_cache;
|
||||||
|
mod prefix_postings_lists_cache;
|
||||||
mod documents_fields;
|
mod documents_fields;
|
||||||
mod documents_fields_counts;
|
mod documents_fields_counts;
|
||||||
mod main;
|
mod main;
|
||||||
@ -8,6 +10,8 @@ mod updates;
|
|||||||
mod updates_results;
|
mod updates_results;
|
||||||
|
|
||||||
pub use self::docs_words::DocsWords;
|
pub use self::docs_words::DocsWords;
|
||||||
|
pub use self::prefix_documents_cache::PrefixDocumentsCache;
|
||||||
|
pub use self::prefix_postings_lists_cache::PrefixPostingsListsCache;
|
||||||
pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
|
pub use self::documents_fields::{DocumentFieldsIter, DocumentsFields};
|
||||||
pub use self::documents_fields_counts::{
|
pub use self::documents_fields_counts::{
|
||||||
DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
|
DocumentFieldsCountsIter, DocumentsFieldsCounts, DocumentsIdsIter,
|
||||||
@ -18,10 +22,15 @@ pub use self::synonyms::Synonyms;
|
|||||||
pub use self::updates::Updates;
|
pub use self::updates::Updates;
|
||||||
pub use self::updates_results::UpdatesResults;
|
pub use self::updates_results::UpdatesResults;
|
||||||
|
|
||||||
|
use std::borrow::Cow;
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
use std::convert::TryInto;
|
||||||
|
use std::{mem, ptr};
|
||||||
|
|
||||||
use heed::Result as ZResult;
|
use heed::Result as ZResult;
|
||||||
|
use heed::{BytesEncode, BytesDecode};
|
||||||
use meilisearch_schema::{Schema, SchemaAttr};
|
use meilisearch_schema::{Schema, SchemaAttr};
|
||||||
|
use sdset::{Set, SetBuf};
|
||||||
use serde::de::{self, Deserialize};
|
use serde::de::{self, Deserialize};
|
||||||
use zerocopy::{AsBytes, FromBytes};
|
use zerocopy::{AsBytes, FromBytes};
|
||||||
|
|
||||||
@ -29,7 +38,7 @@ use crate::criterion::Criteria;
|
|||||||
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
||||||
use crate::database::{MainT, UpdateT};
|
use crate::database::{MainT, UpdateT};
|
||||||
use crate::serde::Deserializer;
|
use crate::serde::Deserializer;
|
||||||
use crate::{query_builder::QueryBuilder, update, DocumentId, Error, MResult};
|
use crate::{query_builder::QueryBuilder, update, DocIndex, DocumentId, Error, MResult};
|
||||||
|
|
||||||
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
|
type BEU64 = zerocopy::U64<byteorder::BigEndian>;
|
||||||
type BEU16 = zerocopy::U16<byteorder::BigEndian>;
|
type BEU16 = zerocopy::U16<byteorder::BigEndian>;
|
||||||
@ -50,6 +59,87 @@ impl DocumentAttrKey {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Default, Debug)]
|
||||||
|
pub struct Postings<'a> {
|
||||||
|
pub docids: Cow<'a, Set<DocumentId>>,
|
||||||
|
pub matches: Cow<'a, Set<DocIndex>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Marker type carrying the heed `BytesEncode` / `BytesDecode`
/// implementations used to store [`Postings`] values.
pub struct PostingsCodec;
|
||||||
|
|
||||||
|
impl<'a> BytesEncode<'a> for PostingsCodec {
|
||||||
|
type EItem = Postings<'a>;
|
||||||
|
|
||||||
|
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
|
||||||
|
let u64_size = mem::size_of::<u64>();
|
||||||
|
let docids_size = item.docids.len() * mem::size_of::<DocumentId>();
|
||||||
|
let matches_size = item.matches.len() * mem::size_of::<DocIndex>();
|
||||||
|
|
||||||
|
let mut buffer = Vec::with_capacity(u64_size + docids_size + matches_size);
|
||||||
|
|
||||||
|
let docids_len = item.docids.len();
|
||||||
|
buffer.extend_from_slice(&docids_len.to_be_bytes());
|
||||||
|
buffer.extend_from_slice(item.docids.as_bytes());
|
||||||
|
buffer.extend_from_slice(item.matches.as_bytes());
|
||||||
|
|
||||||
|
Some(Cow::Owned(buffer))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns `true` when the address of the first byte of `bytes` is a
/// multiple of `align`.
///
/// Panics if `align` is zero (remainder by zero).
fn aligned_to(bytes: &[u8], align: usize) -> bool {
    let address = bytes.as_ptr() as usize;
    (address % align) == 0
}
|
||||||
|
|
||||||
|
fn from_bytes_to_set<'a, T: 'a>(bytes: &'a [u8]) -> Option<Cow<'a, Set<T>>>
|
||||||
|
where T: Clone + FromBytes
|
||||||
|
{
|
||||||
|
match zerocopy::LayoutVerified::<_, [T]>::new_slice(bytes) {
|
||||||
|
Some(layout) => Some(Cow::Borrowed(Set::new_unchecked(layout.into_slice()))),
|
||||||
|
None => {
|
||||||
|
let len = bytes.len();
|
||||||
|
let elem_size = mem::size_of::<T>();
|
||||||
|
|
||||||
|
// ensure that it is the alignment that is wrong
|
||||||
|
// and the length is valid
|
||||||
|
if len % elem_size == 0 && !aligned_to(bytes, mem::align_of::<T>()) {
|
||||||
|
let elems = len / elem_size;
|
||||||
|
let mut vec = Vec::<T>::with_capacity(elems);
|
||||||
|
|
||||||
|
unsafe {
|
||||||
|
let dst = vec.as_mut_ptr() as *mut u8;
|
||||||
|
ptr::copy_nonoverlapping(bytes.as_ptr(), dst, len);
|
||||||
|
vec.set_len(elems);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Some(Cow::Owned(SetBuf::new_unchecked(vec)));
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> BytesDecode<'a> for PostingsCodec {
|
||||||
|
type DItem = Postings<'a>;
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
let u64_size = mem::size_of::<u64>();
|
||||||
|
let docid_size = mem::size_of::<DocumentId>();
|
||||||
|
|
||||||
|
let (len_bytes, bytes) = bytes.split_at(u64_size);
|
||||||
|
let docids_len = len_bytes.try_into().ok().map(u64::from_be_bytes)? as usize;
|
||||||
|
let docids_size = docids_len * docid_size;
|
||||||
|
|
||||||
|
let docids_bytes = &bytes[..docids_size];
|
||||||
|
let matches_bytes = &bytes[docids_size..];
|
||||||
|
|
||||||
|
let docids = from_bytes_to_set(docids_bytes)?;
|
||||||
|
let matches = from_bytes_to_set(matches_bytes)?;
|
||||||
|
|
||||||
|
Some(Postings { docids, matches })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn main_name(name: &str) -> String {
|
fn main_name(name: &str) -> String {
|
||||||
format!("store-{}", name)
|
format!("store-{}", name)
|
||||||
}
|
}
|
||||||
@ -74,6 +164,14 @@ fn docs_words_name(name: &str) -> String {
|
|||||||
format!("store-{}-docs-words", name)
|
format!("store-{}-docs-words", name)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Builds the name of the prefix documents cache database for the index
/// called `name`.
fn prefix_documents_cache_name(name: &str) -> String {
    let database_name = format!("store-{}-prefix-documents-cache", name);
    database_name
}
|
||||||
|
|
||||||
|
/// Builds the name of the prefix postings lists cache database for the index
/// called `name`.
fn prefix_postings_lists_cache_name(name: &str) -> String {
    let database_name = format!("store-{}-prefix-postings-lists-cache", name);
    database_name
}
|
||||||
|
|
||||||
fn updates_name(name: &str) -> String {
|
fn updates_name(name: &str) -> String {
|
||||||
format!("store-{}-updates", name)
|
format!("store-{}-updates", name)
|
||||||
}
|
}
|
||||||
@ -90,6 +188,8 @@ pub struct Index {
|
|||||||
pub documents_fields_counts: DocumentsFieldsCounts,
|
pub documents_fields_counts: DocumentsFieldsCounts,
|
||||||
pub synonyms: Synonyms,
|
pub synonyms: Synonyms,
|
||||||
pub docs_words: DocsWords,
|
pub docs_words: DocsWords,
|
||||||
|
pub prefix_documents_cache: PrefixDocumentsCache,
|
||||||
|
pub prefix_postings_lists_cache: PrefixPostingsListsCache,
|
||||||
|
|
||||||
pub updates: Updates,
|
pub updates: Updates,
|
||||||
pub updates_results: UpdatesResults,
|
pub updates_results: UpdatesResults,
|
||||||
@ -142,7 +242,7 @@ impl Index {
|
|||||||
|
|
||||||
pub fn schema_update(&self, writer: &mut heed::RwTxn<UpdateT>, schema: Schema) -> MResult<u64> {
|
pub fn schema_update(&self, writer: &mut heed::RwTxn<UpdateT>, schema: Schema) -> MResult<u64> {
|
||||||
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
|
let _ = self.updates_notifier.send(UpdateEvent::NewUpdate);
|
||||||
update::push_schema_update(writer, self.updates, self.updates_results, schema)
|
update::push_schema_update(writer, self, schema)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn customs_update(&self, writer: &mut heed::RwTxn<UpdateT>, customs: Vec<u8>) -> ZResult<u64> {
|
pub fn customs_update(&self, writer: &mut heed::RwTxn<UpdateT>, customs: Vec<u8>) -> ZResult<u64> {
|
||||||
@ -252,6 +352,8 @@ impl Index {
|
|||||||
self.postings_lists,
|
self.postings_lists,
|
||||||
self.documents_fields_counts,
|
self.documents_fields_counts,
|
||||||
self.synonyms,
|
self.synonyms,
|
||||||
|
self.prefix_documents_cache,
|
||||||
|
self.prefix_postings_lists_cache,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -264,6 +366,8 @@ impl Index {
|
|||||||
self.postings_lists,
|
self.postings_lists,
|
||||||
self.documents_fields_counts,
|
self.documents_fields_counts,
|
||||||
self.synonyms,
|
self.synonyms,
|
||||||
|
self.prefix_documents_cache,
|
||||||
|
self.prefix_postings_lists_cache,
|
||||||
criteria,
|
criteria,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@ -282,6 +386,8 @@ pub fn create(
|
|||||||
let documents_fields_counts_name = documents_fields_counts_name(name);
|
let documents_fields_counts_name = documents_fields_counts_name(name);
|
||||||
let synonyms_name = synonyms_name(name);
|
let synonyms_name = synonyms_name(name);
|
||||||
let docs_words_name = docs_words_name(name);
|
let docs_words_name = docs_words_name(name);
|
||||||
|
let prefix_documents_cache_name = prefix_documents_cache_name(name);
|
||||||
|
let prefix_postings_lists_cache_name = prefix_postings_lists_cache_name(name);
|
||||||
let updates_name = updates_name(name);
|
let updates_name = updates_name(name);
|
||||||
let updates_results_name = updates_results_name(name);
|
let updates_results_name = updates_results_name(name);
|
||||||
|
|
||||||
@ -292,6 +398,8 @@ pub fn create(
|
|||||||
let documents_fields_counts = env.create_database(Some(&documents_fields_counts_name))?;
|
let documents_fields_counts = env.create_database(Some(&documents_fields_counts_name))?;
|
||||||
let synonyms = env.create_database(Some(&synonyms_name))?;
|
let synonyms = env.create_database(Some(&synonyms_name))?;
|
||||||
let docs_words = env.create_database(Some(&docs_words_name))?;
|
let docs_words = env.create_database(Some(&docs_words_name))?;
|
||||||
|
let prefix_documents_cache = env.create_database(Some(&prefix_documents_cache_name))?;
|
||||||
|
let prefix_postings_lists_cache = env.create_database(Some(&prefix_postings_lists_cache_name))?;
|
||||||
let updates = update_env.create_database(Some(&updates_name))?;
|
let updates = update_env.create_database(Some(&updates_name))?;
|
||||||
let updates_results = update_env.create_database(Some(&updates_results_name))?;
|
let updates_results = update_env.create_database(Some(&updates_results_name))?;
|
||||||
|
|
||||||
@ -299,11 +407,11 @@ pub fn create(
|
|||||||
main: Main { main },
|
main: Main { main },
|
||||||
postings_lists: PostingsLists { postings_lists },
|
postings_lists: PostingsLists { postings_lists },
|
||||||
documents_fields: DocumentsFields { documents_fields },
|
documents_fields: DocumentsFields { documents_fields },
|
||||||
documents_fields_counts: DocumentsFieldsCounts {
|
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
|
||||||
documents_fields_counts,
|
|
||||||
},
|
|
||||||
synonyms: Synonyms { synonyms },
|
synonyms: Synonyms { synonyms },
|
||||||
docs_words: DocsWords { docs_words },
|
docs_words: DocsWords { docs_words },
|
||||||
|
prefix_postings_lists_cache: PrefixPostingsListsCache { prefix_postings_lists_cache },
|
||||||
|
prefix_documents_cache: PrefixDocumentsCache { prefix_documents_cache },
|
||||||
updates: Updates { updates },
|
updates: Updates { updates },
|
||||||
updates_results: UpdatesResults { updates_results },
|
updates_results: UpdatesResults { updates_results },
|
||||||
updates_notifier,
|
updates_notifier,
|
||||||
@ -323,6 +431,8 @@ pub fn open(
|
|||||||
let documents_fields_counts_name = documents_fields_counts_name(name);
|
let documents_fields_counts_name = documents_fields_counts_name(name);
|
||||||
let synonyms_name = synonyms_name(name);
|
let synonyms_name = synonyms_name(name);
|
||||||
let docs_words_name = docs_words_name(name);
|
let docs_words_name = docs_words_name(name);
|
||||||
|
let prefix_documents_cache_name = prefix_documents_cache_name(name);
|
||||||
|
let prefix_postings_lists_cache_name = prefix_postings_lists_cache_name(name);
|
||||||
let updates_name = updates_name(name);
|
let updates_name = updates_name(name);
|
||||||
let updates_results_name = updates_results_name(name);
|
let updates_results_name = updates_results_name(name);
|
||||||
|
|
||||||
@ -351,6 +461,14 @@ pub fn open(
|
|||||||
Some(docs_words) => docs_words,
|
Some(docs_words) => docs_words,
|
||||||
None => return Ok(None),
|
None => return Ok(None),
|
||||||
};
|
};
|
||||||
|
let prefix_documents_cache = match env.open_database(Some(&prefix_documents_cache_name))? {
|
||||||
|
Some(prefix_documents_cache) => prefix_documents_cache,
|
||||||
|
None => return Ok(None),
|
||||||
|
};
|
||||||
|
let prefix_postings_lists_cache = match env.open_database(Some(&prefix_postings_lists_cache_name))? {
|
||||||
|
Some(prefix_postings_lists_cache) => prefix_postings_lists_cache,
|
||||||
|
None => return Ok(None),
|
||||||
|
};
|
||||||
let updates = match update_env.open_database(Some(&updates_name))? {
|
let updates = match update_env.open_database(Some(&updates_name))? {
|
||||||
Some(updates) => updates,
|
Some(updates) => updates,
|
||||||
None => return Ok(None),
|
None => return Ok(None),
|
||||||
@ -364,11 +482,11 @@ pub fn open(
|
|||||||
main: Main { main },
|
main: Main { main },
|
||||||
postings_lists: PostingsLists { postings_lists },
|
postings_lists: PostingsLists { postings_lists },
|
||||||
documents_fields: DocumentsFields { documents_fields },
|
documents_fields: DocumentsFields { documents_fields },
|
||||||
documents_fields_counts: DocumentsFieldsCounts {
|
documents_fields_counts: DocumentsFieldsCounts { documents_fields_counts },
|
||||||
documents_fields_counts,
|
|
||||||
},
|
|
||||||
synonyms: Synonyms { synonyms },
|
synonyms: Synonyms { synonyms },
|
||||||
docs_words: DocsWords { docs_words },
|
docs_words: DocsWords { docs_words },
|
||||||
|
prefix_documents_cache: PrefixDocumentsCache { prefix_documents_cache },
|
||||||
|
prefix_postings_lists_cache: PrefixPostingsListsCache { prefix_postings_lists_cache },
|
||||||
updates: Updates { updates },
|
updates: Updates { updates },
|
||||||
updates_results: UpdatesResults { updates_results },
|
updates_results: UpdatesResults { updates_results },
|
||||||
updates_notifier,
|
updates_notifier,
|
||||||
@ -387,6 +505,8 @@ pub fn clear(
|
|||||||
index.documents_fields_counts.clear(writer)?;
|
index.documents_fields_counts.clear(writer)?;
|
||||||
index.synonyms.clear(writer)?;
|
index.synonyms.clear(writer)?;
|
||||||
index.docs_words.clear(writer)?;
|
index.docs_words.clear(writer)?;
|
||||||
|
index.prefix_documents_cache.clear(writer)?;
|
||||||
|
index.prefix_postings_lists_cache.clear(writer)?;
|
||||||
index.updates.clear(update_writer)?;
|
index.updates.clear(update_writer)?;
|
||||||
index.updates_results.clear(update_writer)?;
|
index.updates_results.clear(update_writer)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -1,13 +1,17 @@
|
|||||||
use crate::DocIndex;
|
|
||||||
use crate::database::MainT;
|
|
||||||
use heed::types::{ByteSlice, CowSlice};
|
|
||||||
use heed::Result as ZResult;
|
|
||||||
use sdset::{Set, SetBuf};
|
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
|
||||||
|
use heed::Result as ZResult;
|
||||||
|
use heed::types::ByteSlice;
|
||||||
|
use sdset::{Set, SetBuf};
|
||||||
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
|
use crate::database::MainT;
|
||||||
|
use crate::DocIndex;
|
||||||
|
use crate::store::{Postings, PostingsCodec};
|
||||||
|
|
||||||
#[derive(Copy, Clone)]
|
#[derive(Copy, Clone)]
|
||||||
pub struct PostingsLists {
|
pub struct PostingsLists {
|
||||||
pub(crate) postings_lists: heed::Database<ByteSlice, CowSlice<DocIndex>>,
|
pub(crate) postings_lists: heed::Database<ByteSlice, PostingsCodec>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PostingsLists {
|
impl PostingsLists {
|
||||||
@ -15,9 +19,14 @@ impl PostingsLists {
|
|||||||
self,
|
self,
|
||||||
writer: &mut heed::RwTxn<MainT>,
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
word: &[u8],
|
word: &[u8],
|
||||||
words_indexes: &Set<DocIndex>,
|
matches: &Set<DocIndex>,
|
||||||
) -> ZResult<()> {
|
) -> ZResult<()> {
|
||||||
self.postings_lists.put(writer, word, words_indexes)
|
let docids = matches.linear_group_by_key(|m| m.document_id).map(|g| g[0].document_id).collect();
|
||||||
|
let docids = Cow::Owned(SetBuf::new_unchecked(docids));
|
||||||
|
let matches = Cow::Borrowed(matches);
|
||||||
|
let postings = Postings { docids, matches };
|
||||||
|
|
||||||
|
self.postings_lists.put(writer, word, &postings)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn del_postings_list(self, writer: &mut heed::RwTxn<MainT>, word: &[u8]) -> ZResult<bool> {
|
pub fn del_postings_list(self, writer: &mut heed::RwTxn<MainT>, word: &[u8]) -> ZResult<bool> {
|
||||||
@ -32,11 +41,7 @@ impl PostingsLists {
|
|||||||
self,
|
self,
|
||||||
reader: &'txn heed::RoTxn<MainT>,
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
word: &[u8],
|
word: &[u8],
|
||||||
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> {
|
) -> ZResult<Option<Postings<'txn>>> {
|
||||||
match self.postings_lists.get(reader, word)? {
|
self.postings_lists.get(reader, word)
|
||||||
Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))),
|
|
||||||
Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))),
|
|
||||||
None => Ok(None),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
80
meilisearch-core/src/store/prefix_documents_cache.rs
Normal file
80
meilisearch-core/src/store/prefix_documents_cache.rs
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
|
||||||
|
use heed::types::{OwnedType, CowSlice};
|
||||||
|
use heed::Result as ZResult;
|
||||||
|
use zerocopy::{AsBytes, FromBytes};
|
||||||
|
|
||||||
|
use super::BEU64;
|
||||||
|
use crate::{DocumentId, Highlight};
|
||||||
|
use crate::database::MainT;
|
||||||
|
|
||||||
|
#[derive(Debug, Copy, Clone, AsBytes, FromBytes)]
|
||||||
|
#[repr(C)]
|
||||||
|
pub struct PrefixKey {
|
||||||
|
prefix: [u8; 4],
|
||||||
|
index: BEU64,
|
||||||
|
docid: BEU64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PrefixKey {
|
||||||
|
pub fn new(prefix: [u8; 4], index: u64, docid: u64) -> PrefixKey {
|
||||||
|
PrefixKey {
|
||||||
|
prefix: prefix,
|
||||||
|
index: BEU64::new(index),
|
||||||
|
docid: BEU64::new(docid),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Copy, Clone)]
|
||||||
|
pub struct PrefixDocumentsCache {
|
||||||
|
pub(crate) prefix_documents_cache: heed::Database<OwnedType<PrefixKey>, CowSlice<Highlight>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PrefixDocumentsCache {
|
||||||
|
pub fn put_prefix_document(
|
||||||
|
self,
|
||||||
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
|
prefix: [u8; 4],
|
||||||
|
index: usize,
|
||||||
|
docid: DocumentId,
|
||||||
|
highlights: &[Highlight],
|
||||||
|
) -> ZResult<()> {
|
||||||
|
let key = PrefixKey::new(prefix, index as u64, docid.0);
|
||||||
|
self.prefix_documents_cache.put(writer, &key, highlights)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
|
||||||
|
self.prefix_documents_cache.clear(writer)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn prefix_documents<'txn>(
|
||||||
|
self,
|
||||||
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
|
prefix: [u8; 4],
|
||||||
|
) -> ZResult<PrefixDocumentsIter<'txn>> {
|
||||||
|
let start = PrefixKey::new(prefix, 0, 0);
|
||||||
|
let end = PrefixKey::new(prefix, u64::max_value(), u64::max_value());
|
||||||
|
let iter = self.prefix_documents_cache.range(reader, &(start..=end))?;
|
||||||
|
Ok(PrefixDocumentsIter { iter })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct PrefixDocumentsIter<'txn> {
|
||||||
|
iter: heed::RoRange<'txn, OwnedType<PrefixKey>, CowSlice<Highlight>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'txn> Iterator for PrefixDocumentsIter<'txn> {
|
||||||
|
type Item = ZResult<(DocumentId, Cow<'txn, [Highlight]>)>;
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
match self.iter.next() {
|
||||||
|
Some(Ok((key, highlights))) => {
|
||||||
|
let docid = DocumentId(key.docid.get());
|
||||||
|
Some(Ok((docid, highlights)))
|
||||||
|
}
|
||||||
|
Some(Err(e)) => Some(Err(e)),
|
||||||
|
None => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
45
meilisearch-core/src/store/prefix_postings_lists_cache.rs
Normal file
45
meilisearch-core/src/store/prefix_postings_lists_cache.rs
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
|
||||||
|
use heed::Result as ZResult;
|
||||||
|
use heed::types::OwnedType;
|
||||||
|
use sdset::{Set, SetBuf};
|
||||||
|
use slice_group_by::GroupBy;
|
||||||
|
|
||||||
|
use crate::database::MainT;
|
||||||
|
use crate::DocIndex;
|
||||||
|
use crate::store::{PostingsCodec, Postings};
|
||||||
|
|
||||||
|
#[derive(Copy, Clone)]
|
||||||
|
pub struct PrefixPostingsListsCache {
|
||||||
|
pub(crate) prefix_postings_lists_cache: heed::Database<OwnedType<[u8; 4]>, PostingsCodec>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PrefixPostingsListsCache {
|
||||||
|
pub fn put_prefix_postings_list(
|
||||||
|
self,
|
||||||
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
|
prefix: [u8; 4],
|
||||||
|
matches: &Set<DocIndex>,
|
||||||
|
) -> ZResult<()>
|
||||||
|
{
|
||||||
|
let docids = matches.linear_group_by_key(|m| m.document_id).map(|g| g[0].document_id).collect();
|
||||||
|
let docids = Cow::Owned(SetBuf::new_unchecked(docids));
|
||||||
|
let matches = Cow::Borrowed(matches);
|
||||||
|
let postings = Postings { docids, matches };
|
||||||
|
|
||||||
|
self.prefix_postings_lists_cache.put(writer, &prefix, &postings)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
|
||||||
|
self.prefix_postings_lists_cache.clear(writer)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn prefix_postings_list<'txn>(
|
||||||
|
self,
|
||||||
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
|
prefix: [u8; 4],
|
||||||
|
) -> ZResult<Option<Postings<'txn>>>
|
||||||
|
{
|
||||||
|
self.prefix_postings_lists_cache.get(reader, &prefix)
|
||||||
|
}
|
||||||
|
}
|
@ -4,19 +4,17 @@ use crate::{store, MResult, RankedMap};
|
|||||||
|
|
||||||
pub fn apply_clear_all(
|
pub fn apply_clear_all(
|
||||||
writer: &mut heed::RwTxn<MainT>,
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
main_store: store::Main,
|
index: &store::Index,
|
||||||
documents_fields_store: store::DocumentsFields,
|
|
||||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
|
||||||
postings_lists_store: store::PostingsLists,
|
|
||||||
docs_words_store: store::DocsWords,
|
|
||||||
) -> MResult<()> {
|
) -> MResult<()> {
|
||||||
main_store.put_words_fst(writer, &fst::Set::default())?;
|
index.main.put_words_fst(writer, &fst::Set::default())?;
|
||||||
main_store.put_ranked_map(writer, &RankedMap::default())?;
|
index.main.put_ranked_map(writer, &RankedMap::default())?;
|
||||||
main_store.put_number_of_documents(writer, |_| 0)?;
|
index.main.put_number_of_documents(writer, |_| 0)?;
|
||||||
documents_fields_store.clear(writer)?;
|
index.documents_fields.clear(writer)?;
|
||||||
documents_fields_counts_store.clear(writer)?;
|
index.documents_fields_counts.clear(writer)?;
|
||||||
postings_lists_store.clear(writer)?;
|
index.postings_lists.clear(writer)?;
|
||||||
docs_words_store.clear(writer)?;
|
index.docs_words.clear(writer)?;
|
||||||
|
index.prefix_documents_cache.clear(writer)?;
|
||||||
|
index.prefix_postings_lists_cache.clear(writer)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -9,7 +9,7 @@ use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
|||||||
use crate::raw_indexer::RawIndexer;
|
use crate::raw_indexer::RawIndexer;
|
||||||
use crate::serde::{extract_document_id, serialize_value, Deserializer, Serializer};
|
use crate::serde::{extract_document_id, serialize_value, Deserializer, Serializer};
|
||||||
use crate::store;
|
use crate::store;
|
||||||
use crate::update::{apply_documents_deletion, next_update_id, Update};
|
use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update};
|
||||||
use crate::{Error, MResult, RankedMap};
|
use crate::{Error, MResult, RankedMap};
|
||||||
|
|
||||||
pub struct DocumentsAddition<D> {
|
pub struct DocumentsAddition<D> {
|
||||||
@ -104,16 +104,12 @@ pub fn push_documents_addition<D: serde::Serialize>(
|
|||||||
|
|
||||||
pub fn apply_documents_addition<'a, 'b>(
|
pub fn apply_documents_addition<'a, 'b>(
|
||||||
writer: &'a mut heed::RwTxn<'b, MainT>,
|
writer: &'a mut heed::RwTxn<'b, MainT>,
|
||||||
main_store: store::Main,
|
index: &store::Index,
|
||||||
documents_fields_store: store::DocumentsFields,
|
|
||||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
|
||||||
postings_lists_store: store::PostingsLists,
|
|
||||||
docs_words_store: store::DocsWords,
|
|
||||||
addition: Vec<HashMap<String, serde_json::Value>>,
|
addition: Vec<HashMap<String, serde_json::Value>>,
|
||||||
) -> MResult<()> {
|
) -> MResult<()> {
|
||||||
let mut documents_additions = HashMap::new();
|
let mut documents_additions = HashMap::new();
|
||||||
|
|
||||||
let schema = match main_store.schema(writer)? {
|
let schema = match index.main.schema(writer)? {
|
||||||
Some(schema) => schema,
|
Some(schema) => schema,
|
||||||
None => return Err(Error::SchemaMissing),
|
None => return Err(Error::SchemaMissing),
|
||||||
};
|
};
|
||||||
@ -133,22 +129,14 @@ pub fn apply_documents_addition<'a, 'b>(
|
|||||||
// 2. remove the documents posting lists
|
// 2. remove the documents posting lists
|
||||||
let number_of_inserted_documents = documents_additions.len();
|
let number_of_inserted_documents = documents_additions.len();
|
||||||
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
|
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
|
||||||
apply_documents_deletion(
|
apply_documents_deletion(writer, index, documents_ids)?;
|
||||||
writer,
|
|
||||||
main_store,
|
|
||||||
documents_fields_store,
|
|
||||||
documents_fields_counts_store,
|
|
||||||
postings_lists_store,
|
|
||||||
docs_words_store,
|
|
||||||
documents_ids,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let mut ranked_map = match main_store.ranked_map(writer)? {
|
let mut ranked_map = match index.main.ranked_map(writer)? {
|
||||||
Some(ranked_map) => ranked_map,
|
Some(ranked_map) => ranked_map,
|
||||||
None => RankedMap::default(),
|
None => RankedMap::default(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let stop_words = match main_store.stop_words_fst(writer)? {
|
let stop_words = match index.main.stop_words_fst(writer)? {
|
||||||
Some(stop_words) => stop_words,
|
Some(stop_words) => stop_words,
|
||||||
None => fst::Set::default(),
|
None => fst::Set::default(),
|
||||||
};
|
};
|
||||||
@ -160,8 +148,8 @@ pub fn apply_documents_addition<'a, 'b>(
|
|||||||
let serializer = Serializer {
|
let serializer = Serializer {
|
||||||
txn: writer,
|
txn: writer,
|
||||||
schema: &schema,
|
schema: &schema,
|
||||||
document_store: documents_fields_store,
|
document_store: index.documents_fields,
|
||||||
document_fields_counts: documents_fields_counts_store,
|
document_fields_counts: index.documents_fields_counts,
|
||||||
indexer: &mut indexer,
|
indexer: &mut indexer,
|
||||||
ranked_map: &mut ranked_map,
|
ranked_map: &mut ranked_map,
|
||||||
document_id,
|
document_id,
|
||||||
@ -172,27 +160,25 @@ pub fn apply_documents_addition<'a, 'b>(
|
|||||||
|
|
||||||
write_documents_addition_index(
|
write_documents_addition_index(
|
||||||
writer,
|
writer,
|
||||||
main_store,
|
index,
|
||||||
postings_lists_store,
|
|
||||||
docs_words_store,
|
|
||||||
&ranked_map,
|
&ranked_map,
|
||||||
number_of_inserted_documents,
|
number_of_inserted_documents,
|
||||||
indexer,
|
indexer,
|
||||||
)
|
)?;
|
||||||
|
|
||||||
|
compute_short_prefixes(writer, index)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn apply_documents_partial_addition<'a, 'b>(
|
pub fn apply_documents_partial_addition<'a, 'b>(
|
||||||
writer: &'a mut heed::RwTxn<'b, MainT>,
|
writer: &'a mut heed::RwTxn<'b, MainT>,
|
||||||
main_store: store::Main,
|
index: &store::Index,
|
||||||
documents_fields_store: store::DocumentsFields,
|
|
||||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
|
||||||
postings_lists_store: store::PostingsLists,
|
|
||||||
docs_words_store: store::DocsWords,
|
|
||||||
addition: Vec<HashMap<String, serde_json::Value>>,
|
addition: Vec<HashMap<String, serde_json::Value>>,
|
||||||
) -> MResult<()> {
|
) -> MResult<()> {
|
||||||
let mut documents_additions = HashMap::new();
|
let mut documents_additions = HashMap::new();
|
||||||
|
|
||||||
let schema = match main_store.schema(writer)? {
|
let schema = match index.main.schema(writer)? {
|
||||||
Some(schema) => schema,
|
Some(schema) => schema,
|
||||||
None => return Err(Error::SchemaMissing),
|
None => return Err(Error::SchemaMissing),
|
||||||
};
|
};
|
||||||
@ -209,7 +195,7 @@ pub fn apply_documents_partial_addition<'a, 'b>(
|
|||||||
let mut deserializer = Deserializer {
|
let mut deserializer = Deserializer {
|
||||||
document_id,
|
document_id,
|
||||||
reader: writer,
|
reader: writer,
|
||||||
documents_fields: documents_fields_store,
|
documents_fields: index.documents_fields,
|
||||||
schema: &schema,
|
schema: &schema,
|
||||||
attributes: None,
|
attributes: None,
|
||||||
};
|
};
|
||||||
@ -229,22 +215,14 @@ pub fn apply_documents_partial_addition<'a, 'b>(
|
|||||||
// 2. remove the documents posting lists
|
// 2. remove the documents posting lists
|
||||||
let number_of_inserted_documents = documents_additions.len();
|
let number_of_inserted_documents = documents_additions.len();
|
||||||
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
|
let documents_ids = documents_additions.iter().map(|(id, _)| *id).collect();
|
||||||
apply_documents_deletion(
|
apply_documents_deletion(writer, index, documents_ids)?;
|
||||||
writer,
|
|
||||||
main_store,
|
|
||||||
documents_fields_store,
|
|
||||||
documents_fields_counts_store,
|
|
||||||
postings_lists_store,
|
|
||||||
docs_words_store,
|
|
||||||
documents_ids,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let mut ranked_map = match main_store.ranked_map(writer)? {
|
let mut ranked_map = match index.main.ranked_map(writer)? {
|
||||||
Some(ranked_map) => ranked_map,
|
Some(ranked_map) => ranked_map,
|
||||||
None => RankedMap::default(),
|
None => RankedMap::default(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let stop_words = match main_store.stop_words_fst(writer)? {
|
let stop_words = match index.main.stop_words_fst(writer)? {
|
||||||
Some(stop_words) => stop_words,
|
Some(stop_words) => stop_words,
|
||||||
None => fst::Set::default(),
|
None => fst::Set::default(),
|
||||||
};
|
};
|
||||||
@ -256,8 +234,8 @@ pub fn apply_documents_partial_addition<'a, 'b>(
|
|||||||
let serializer = Serializer {
|
let serializer = Serializer {
|
||||||
txn: writer,
|
txn: writer,
|
||||||
schema: &schema,
|
schema: &schema,
|
||||||
document_store: documents_fields_store,
|
document_store: index.documents_fields,
|
||||||
document_fields_counts: documents_fields_counts_store,
|
document_fields_counts: index.documents_fields_counts,
|
||||||
indexer: &mut indexer,
|
indexer: &mut indexer,
|
||||||
ranked_map: &mut ranked_map,
|
ranked_map: &mut ranked_map,
|
||||||
document_id,
|
document_id,
|
||||||
@ -268,24 +246,19 @@ pub fn apply_documents_partial_addition<'a, 'b>(
|
|||||||
|
|
||||||
write_documents_addition_index(
|
write_documents_addition_index(
|
||||||
writer,
|
writer,
|
||||||
main_store,
|
index,
|
||||||
postings_lists_store,
|
|
||||||
docs_words_store,
|
|
||||||
&ranked_map,
|
&ranked_map,
|
||||||
number_of_inserted_documents,
|
number_of_inserted_documents,
|
||||||
indexer,
|
indexer,
|
||||||
)
|
)?;
|
||||||
|
|
||||||
|
compute_short_prefixes(writer, index)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn reindex_all_documents(
|
pub fn reindex_all_documents(writer: &mut heed::RwTxn<MainT>, index: &store::Index) -> MResult<()> {
|
||||||
writer: &mut heed::RwTxn<MainT>,
|
let schema = match index.main.schema(writer)? {
|
||||||
main_store: store::Main,
|
|
||||||
documents_fields_store: store::DocumentsFields,
|
|
||||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
|
||||||
postings_lists_store: store::PostingsLists,
|
|
||||||
docs_words_store: store::DocsWords,
|
|
||||||
) -> MResult<()> {
|
|
||||||
let schema = match main_store.schema(writer)? {
|
|
||||||
Some(schema) => schema,
|
Some(schema) => schema,
|
||||||
None => return Err(Error::SchemaMissing),
|
None => return Err(Error::SchemaMissing),
|
||||||
};
|
};
|
||||||
@ -294,21 +267,21 @@ pub fn reindex_all_documents(
|
|||||||
|
|
||||||
// 1. retrieve all documents ids
|
// 1. retrieve all documents ids
|
||||||
let mut documents_ids_to_reindex = Vec::new();
|
let mut documents_ids_to_reindex = Vec::new();
|
||||||
for result in documents_fields_counts_store.documents_ids(writer)? {
|
for result in index.documents_fields_counts.documents_ids(writer)? {
|
||||||
let document_id = result?;
|
let document_id = result?;
|
||||||
documents_ids_to_reindex.push(document_id);
|
documents_ids_to_reindex.push(document_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. remove the documents posting lists
|
// 2. remove the documents posting lists
|
||||||
main_store.put_words_fst(writer, &fst::Set::default())?;
|
index.main.put_words_fst(writer, &fst::Set::default())?;
|
||||||
main_store.put_ranked_map(writer, &ranked_map)?;
|
index.main.put_ranked_map(writer, &ranked_map)?;
|
||||||
main_store.put_number_of_documents(writer, |_| 0)?;
|
index.main.put_number_of_documents(writer, |_| 0)?;
|
||||||
postings_lists_store.clear(writer)?;
|
index.postings_lists.clear(writer)?;
|
||||||
docs_words_store.clear(writer)?;
|
index.docs_words.clear(writer)?;
|
||||||
|
|
||||||
// 3. re-index chunks of documents (otherwise we make the borrow checker unhappy)
|
// 3. re-index chunks of documents (otherwise we make the borrow checker unhappy)
|
||||||
for documents_ids in documents_ids_to_reindex.chunks(100) {
|
for documents_ids in documents_ids_to_reindex.chunks(100) {
|
||||||
let stop_words = match main_store.stop_words_fst(writer)? {
|
let stop_words = match index.main.stop_words_fst(writer)? {
|
||||||
Some(stop_words) => stop_words,
|
Some(stop_words) => stop_words,
|
||||||
None => fst::Set::default(),
|
None => fst::Set::default(),
|
||||||
};
|
};
|
||||||
@ -318,7 +291,7 @@ pub fn reindex_all_documents(
|
|||||||
let mut ram_store = HashMap::new();
|
let mut ram_store = HashMap::new();
|
||||||
|
|
||||||
for document_id in documents_ids {
|
for document_id in documents_ids {
|
||||||
for result in documents_fields_store.document_fields(writer, *document_id)? {
|
for result in index.documents_fields.document_fields(writer, *document_id)? {
|
||||||
let (attr, bytes) = result?;
|
let (attr, bytes) = result?;
|
||||||
let value: serde_json::Value = serde_json::from_slice(bytes)?;
|
let value: serde_json::Value = serde_json::from_slice(bytes)?;
|
||||||
ram_store.insert((document_id, attr), value);
|
ram_store.insert((document_id, attr), value);
|
||||||
@ -330,8 +303,8 @@ pub fn reindex_all_documents(
|
|||||||
attr,
|
attr,
|
||||||
schema.props(attr),
|
schema.props(attr),
|
||||||
*docid,
|
*docid,
|
||||||
documents_fields_store,
|
index.documents_fields,
|
||||||
documents_fields_counts_store,
|
index.documents_fields_counts,
|
||||||
&mut indexer,
|
&mut indexer,
|
||||||
&mut ranked_map,
|
&mut ranked_map,
|
||||||
&value,
|
&value,
|
||||||
@ -342,23 +315,21 @@ pub fn reindex_all_documents(
|
|||||||
// 4. write the new index in the main store
|
// 4. write the new index in the main store
|
||||||
write_documents_addition_index(
|
write_documents_addition_index(
|
||||||
writer,
|
writer,
|
||||||
main_store,
|
index,
|
||||||
postings_lists_store,
|
|
||||||
docs_words_store,
|
|
||||||
&ranked_map,
|
&ranked_map,
|
||||||
number_of_inserted_documents,
|
number_of_inserted_documents,
|
||||||
indexer,
|
indexer,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
compute_short_prefixes(writer, index)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn write_documents_addition_index(
|
pub fn write_documents_addition_index(
|
||||||
writer: &mut heed::RwTxn<MainT>,
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
main_store: store::Main,
|
index: &store::Index,
|
||||||
postings_lists_store: store::PostingsLists,
|
|
||||||
docs_words_store: store::DocsWords,
|
|
||||||
ranked_map: &RankedMap,
|
ranked_map: &RankedMap,
|
||||||
number_of_inserted_documents: usize,
|
number_of_inserted_documents: usize,
|
||||||
indexer: RawIndexer,
|
indexer: RawIndexer,
|
||||||
@ -369,16 +340,16 @@ pub fn write_documents_addition_index(
|
|||||||
for (word, delta_set) in indexed.words_doc_indexes {
|
for (word, delta_set) in indexed.words_doc_indexes {
|
||||||
delta_words_builder.insert(&word).unwrap();
|
delta_words_builder.insert(&word).unwrap();
|
||||||
|
|
||||||
let set = match postings_lists_store.postings_list(writer, &word)? {
|
let set = match index.postings_lists.postings_list(writer, &word)? {
|
||||||
Some(set) => Union::new(&set, &delta_set).into_set_buf(),
|
Some(postings) => Union::new(&postings.matches, &delta_set).into_set_buf(),
|
||||||
None => delta_set,
|
None => delta_set,
|
||||||
};
|
};
|
||||||
|
|
||||||
postings_lists_store.put_postings_list(writer, &word, &set)?;
|
index.postings_lists.put_postings_list(writer, &word, &set)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (id, words) in indexed.docs_words {
|
for (id, words) in indexed.docs_words {
|
||||||
docs_words_store.put_doc_words(writer, id, &words)?;
|
index.docs_words.put_doc_words(writer, id, &words)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let delta_words = delta_words_builder
|
let delta_words = delta_words_builder
|
||||||
@ -386,7 +357,7 @@ pub fn write_documents_addition_index(
|
|||||||
.and_then(fst::Set::from_bytes)
|
.and_then(fst::Set::from_bytes)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let words = match main_store.words_fst(writer)? {
|
let words = match index.main.words_fst(writer)? {
|
||||||
Some(words) => {
|
Some(words) => {
|
||||||
let op = OpBuilder::new()
|
let op = OpBuilder::new()
|
||||||
.add(words.stream())
|
.add(words.stream())
|
||||||
@ -403,9 +374,11 @@ pub fn write_documents_addition_index(
|
|||||||
None => delta_words,
|
None => delta_words,
|
||||||
};
|
};
|
||||||
|
|
||||||
main_store.put_words_fst(writer, &words)?;
|
index.main.put_words_fst(writer, &words)?;
|
||||||
main_store.put_ranked_map(writer, ranked_map)?;
|
index.main.put_ranked_map(writer, ranked_map)?;
|
||||||
main_store.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?;
|
index.main.put_number_of_documents(writer, |old| old + number_of_inserted_documents as u64)?;
|
||||||
|
|
||||||
|
compute_short_prefixes(writer, index)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,7 @@ use crate::database::{MainT, UpdateT};
|
|||||||
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
use crate::database::{UpdateEvent, UpdateEventsEmitter};
|
||||||
use crate::serde::extract_document_id;
|
use crate::serde::extract_document_id;
|
||||||
use crate::store;
|
use crate::store;
|
||||||
use crate::update::{next_update_id, Update};
|
use crate::update::{next_update_id, compute_short_prefixes, Update};
|
||||||
use crate::{DocumentId, Error, MResult, RankedMap};
|
use crate::{DocumentId, Error, MResult, RankedMap};
|
||||||
|
|
||||||
pub struct DocumentsDeletion {
|
pub struct DocumentsDeletion {
|
||||||
@ -85,21 +85,17 @@ pub fn push_documents_deletion(
|
|||||||
|
|
||||||
pub fn apply_documents_deletion(
|
pub fn apply_documents_deletion(
|
||||||
writer: &mut heed::RwTxn<MainT>,
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
main_store: store::Main,
|
index: &store::Index,
|
||||||
documents_fields_store: store::DocumentsFields,
|
|
||||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
|
||||||
postings_lists_store: store::PostingsLists,
|
|
||||||
docs_words_store: store::DocsWords,
|
|
||||||
deletion: Vec<DocumentId>,
|
deletion: Vec<DocumentId>,
|
||||||
) -> MResult<()> {
|
) -> MResult<()> {
|
||||||
let idset = SetBuf::from_dirty(deletion);
|
let idset = SetBuf::from_dirty(deletion);
|
||||||
|
|
||||||
let schema = match main_store.schema(writer)? {
|
let schema = match index.main.schema(writer)? {
|
||||||
Some(schema) => schema,
|
Some(schema) => schema,
|
||||||
None => return Err(Error::SchemaMissing),
|
None => return Err(Error::SchemaMissing),
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut ranked_map = match main_store.ranked_map(writer)? {
|
let mut ranked_map = match index.main.ranked_map(writer)? {
|
||||||
Some(ranked_map) => ranked_map,
|
Some(ranked_map) => ranked_map,
|
||||||
None => RankedMap::default(),
|
None => RankedMap::default(),
|
||||||
};
|
};
|
||||||
@ -125,7 +121,7 @@ pub fn apply_documents_deletion(
|
|||||||
ranked_map.remove(id, *ranked_attr);
|
ranked_map.remove(id, *ranked_attr);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(words) = docs_words_store.doc_words(writer, id)? {
|
if let Some(words) = index.docs_words.doc_words(writer, id)? {
|
||||||
let mut stream = words.stream();
|
let mut stream = words.stream();
|
||||||
while let Some(word) = stream.next() {
|
while let Some(word) = stream.next() {
|
||||||
let word = word.to_vec();
|
let word = word.to_vec();
|
||||||
@ -142,21 +138,21 @@ pub fn apply_documents_deletion(
|
|||||||
for (word, document_ids) in words_document_ids {
|
for (word, document_ids) in words_document_ids {
|
||||||
let document_ids = SetBuf::from_dirty(document_ids);
|
let document_ids = SetBuf::from_dirty(document_ids);
|
||||||
|
|
||||||
if let Some(doc_indexes) = postings_lists_store.postings_list(writer, &word)? {
|
if let Some(postings) = index.postings_lists.postings_list(writer, &word)? {
|
||||||
let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id);
|
let op = DifferenceByKey::new(&postings.matches, &document_ids, |d| d.document_id, |id| *id);
|
||||||
let doc_indexes = op.into_set_buf();
|
let doc_indexes = op.into_set_buf();
|
||||||
|
|
||||||
if !doc_indexes.is_empty() {
|
if !doc_indexes.is_empty() {
|
||||||
postings_lists_store.put_postings_list(writer, &word, &doc_indexes)?;
|
index.postings_lists.put_postings_list(writer, &word, &doc_indexes)?;
|
||||||
} else {
|
} else {
|
||||||
postings_lists_store.del_postings_list(writer, &word)?;
|
index.postings_lists.del_postings_list(writer, &word)?;
|
||||||
removed_words.insert(word);
|
removed_words.insert(word);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for id in document_ids {
|
for id in document_ids {
|
||||||
documents_fields_counts_store.del_all_document_fields_counts(writer, id)?;
|
index.documents_fields_counts.del_all_document_fields_counts(writer, id)?;
|
||||||
if documents_fields_store.del_all_document_fields(writer, id)? != 0 {
|
if index.documents_fields.del_all_document_fields(writer, id)? != 0 {
|
||||||
deleted_documents.insert(id);
|
deleted_documents.insert(id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -164,11 +160,11 @@ pub fn apply_documents_deletion(
|
|||||||
|
|
||||||
let deleted_documents_len = deleted_documents.len() as u64;
|
let deleted_documents_len = deleted_documents.len() as u64;
|
||||||
for id in deleted_documents {
|
for id in deleted_documents {
|
||||||
docs_words_store.del_doc_words(writer, id)?;
|
index.docs_words.del_doc_words(writer, id)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
let removed_words = fst::Set::from_iter(removed_words).unwrap();
|
let removed_words = fst::Set::from_iter(removed_words).unwrap();
|
||||||
let words = match main_store.words_fst(writer)? {
|
let words = match index.main.words_fst(writer)? {
|
||||||
Some(words_set) => {
|
Some(words_set) => {
|
||||||
let op = fst::set::OpBuilder::new()
|
let op = fst::set::OpBuilder::new()
|
||||||
.add(words_set.stream())
|
.add(words_set.stream())
|
||||||
@ -185,9 +181,11 @@ pub fn apply_documents_deletion(
|
|||||||
None => fst::Set::default(),
|
None => fst::Set::default(),
|
||||||
};
|
};
|
||||||
|
|
||||||
main_store.put_words_fst(writer, &words)?;
|
index.main.put_words_fst(writer, &words)?;
|
||||||
main_store.put_ranked_map(writer, &ranked_map)?;
|
index.main.put_ranked_map(writer, &ranked_map)?;
|
||||||
main_store.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
|
index.main.put_number_of_documents(writer, |old| old - deleted_documents_len)?;
|
||||||
|
|
||||||
|
compute_short_prefixes(writer, index)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -26,6 +26,8 @@ use chrono::{DateTime, Utc};
|
|||||||
use heed::Result as ZResult;
|
use heed::Result as ZResult;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use fst::{IntoStreamer, Streamer};
|
||||||
|
use sdset::Set;
|
||||||
|
|
||||||
use crate::{store, DocumentId, MResult};
|
use crate::{store, DocumentId, MResult};
|
||||||
use crate::database::{MainT, UpdateT};
|
use crate::database::{MainT, UpdateT};
|
||||||
@ -255,14 +257,7 @@ pub fn update_task<'a, 'b>(
|
|||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|
||||||
let update_type = UpdateType::ClearAll;
|
let update_type = UpdateType::ClearAll;
|
||||||
let result = apply_clear_all(
|
let result = apply_clear_all(writer, index);
|
||||||
writer,
|
|
||||||
index.main,
|
|
||||||
index.documents_fields,
|
|
||||||
index.documents_fields_counts,
|
|
||||||
index.postings_lists,
|
|
||||||
index.docs_words,
|
|
||||||
);
|
|
||||||
|
|
||||||
(update_type, result, start.elapsed())
|
(update_type, result, start.elapsed())
|
||||||
}
|
}
|
||||||
@ -270,15 +265,7 @@ pub fn update_task<'a, 'b>(
|
|||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|
||||||
let update_type = UpdateType::Schema;
|
let update_type = UpdateType::Schema;
|
||||||
let result = apply_schema_update(
|
let result = apply_schema_update(writer, &schema, index);
|
||||||
writer,
|
|
||||||
&schema,
|
|
||||||
index.main,
|
|
||||||
index.documents_fields,
|
|
||||||
index.documents_fields_counts,
|
|
||||||
index.postings_lists,
|
|
||||||
index.docs_words,
|
|
||||||
);
|
|
||||||
|
|
||||||
(update_type, result, start.elapsed())
|
(update_type, result, start.elapsed())
|
||||||
}
|
}
|
||||||
@ -297,15 +284,7 @@ pub fn update_task<'a, 'b>(
|
|||||||
number: documents.len(),
|
number: documents.len(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let result = apply_documents_addition(
|
let result = apply_documents_addition(writer, index, documents);
|
||||||
writer,
|
|
||||||
index.main,
|
|
||||||
index.documents_fields,
|
|
||||||
index.documents_fields_counts,
|
|
||||||
index.postings_lists,
|
|
||||||
index.docs_words,
|
|
||||||
documents,
|
|
||||||
);
|
|
||||||
|
|
||||||
(update_type, result, start.elapsed())
|
(update_type, result, start.elapsed())
|
||||||
}
|
}
|
||||||
@ -316,15 +295,7 @@ pub fn update_task<'a, 'b>(
|
|||||||
number: documents.len(),
|
number: documents.len(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let result = apply_documents_partial_addition(
|
let result = apply_documents_partial_addition(writer, index, documents);
|
||||||
writer,
|
|
||||||
index.main,
|
|
||||||
index.documents_fields,
|
|
||||||
index.documents_fields_counts,
|
|
||||||
index.postings_lists,
|
|
||||||
index.docs_words,
|
|
||||||
documents,
|
|
||||||
);
|
|
||||||
|
|
||||||
(update_type, result, start.elapsed())
|
(update_type, result, start.elapsed())
|
||||||
}
|
}
|
||||||
@ -335,15 +306,7 @@ pub fn update_task<'a, 'b>(
|
|||||||
number: documents.len(),
|
number: documents.len(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let result = apply_documents_deletion(
|
let result = apply_documents_deletion(writer, index, documents);
|
||||||
writer,
|
|
||||||
index.main,
|
|
||||||
index.documents_fields,
|
|
||||||
index.documents_fields_counts,
|
|
||||||
index.postings_lists,
|
|
||||||
index.docs_words,
|
|
||||||
documents,
|
|
||||||
);
|
|
||||||
|
|
||||||
(update_type, result, start.elapsed())
|
(update_type, result, start.elapsed())
|
||||||
}
|
}
|
||||||
@ -377,15 +340,7 @@ pub fn update_task<'a, 'b>(
|
|||||||
number: stop_words.len(),
|
number: stop_words.len(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let result = apply_stop_words_deletion(
|
let result = apply_stop_words_deletion(writer, index, stop_words);
|
||||||
writer,
|
|
||||||
index.main,
|
|
||||||
index.documents_fields,
|
|
||||||
index.documents_fields_counts,
|
|
||||||
index.postings_lists,
|
|
||||||
index.docs_words,
|
|
||||||
stop_words,
|
|
||||||
);
|
|
||||||
|
|
||||||
(update_type, result, start.elapsed())
|
(update_type, result, start.elapsed())
|
||||||
}
|
}
|
||||||
@ -407,3 +362,67 @@ pub fn update_task<'a, 'b>(
|
|||||||
|
|
||||||
Ok(status)
|
Ok(status)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn compute_short_prefixes(writer: &mut heed::RwTxn<MainT>, index: &store::Index) -> MResult<()> {
|
||||||
|
// retrieve the words fst to compute all those prefixes
|
||||||
|
let words_fst = match index.main.words_fst(writer)? {
|
||||||
|
Some(fst) => fst,
|
||||||
|
None => return Ok(()),
|
||||||
|
};
|
||||||
|
|
||||||
|
// clear the prefixes
|
||||||
|
let pplc_store = index.prefix_postings_lists_cache;
|
||||||
|
pplc_store.clear(writer)?;
|
||||||
|
|
||||||
|
for prefix_len in 1..=2 {
|
||||||
|
// compute prefixes and store those in the PrefixPostingsListsCache store.
|
||||||
|
let mut previous_prefix: Option<([u8; 4], Vec<_>)> = None;
|
||||||
|
let mut stream = words_fst.into_stream();
|
||||||
|
while let Some(input) = stream.next() {
|
||||||
|
|
||||||
|
// We skip the prefixes that are shorter than the current length
|
||||||
|
// we want to cache (<). We must ignore the input when it is exactly the
|
||||||
|
// same word as the prefix because if we match exactly on it we need
|
||||||
|
// to consider it as an exact match and not as a prefix (=).
|
||||||
|
if input.len() <= prefix_len { continue }
|
||||||
|
|
||||||
|
if let Some(postings_list) = index.postings_lists.postings_list(writer, input)?.map(|p| p.matches.into_owned()) {
|
||||||
|
let prefix = &input[..prefix_len];
|
||||||
|
|
||||||
|
let mut arr_prefix = [0; 4];
|
||||||
|
arr_prefix[..prefix_len].copy_from_slice(prefix);
|
||||||
|
|
||||||
|
match previous_prefix {
|
||||||
|
Some((ref mut prev_prefix, ref mut prev_pl)) if *prev_prefix != arr_prefix => {
|
||||||
|
prev_pl.sort_unstable();
|
||||||
|
prev_pl.dedup();
|
||||||
|
|
||||||
|
if let Ok(prefix) = std::str::from_utf8(&prev_prefix[..prefix_len]) {
|
||||||
|
debug!("writing the prefix of {:?} of length {}", prefix, prev_pl.len());
|
||||||
|
}
|
||||||
|
|
||||||
|
let pls = Set::new_unchecked(&prev_pl);
|
||||||
|
pplc_store.put_prefix_postings_list(writer, *prev_prefix, &pls)?;
|
||||||
|
|
||||||
|
*prev_prefix = arr_prefix;
|
||||||
|
prev_pl.clear();
|
||||||
|
prev_pl.extend_from_slice(&postings_list);
|
||||||
|
},
|
||||||
|
Some((_, ref mut prev_pl)) => prev_pl.extend_from_slice(&postings_list),
|
||||||
|
None => previous_prefix = Some((arr_prefix, postings_list.to_vec())),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// write the last prefix postings lists
|
||||||
|
if let Some((prev_prefix, mut prev_pl)) = previous_prefix.take() {
|
||||||
|
prev_pl.sort_unstable();
|
||||||
|
prev_pl.dedup();
|
||||||
|
|
||||||
|
let pls = Set::new_unchecked(&prev_pl);
|
||||||
|
pplc_store.put_prefix_postings_list(writer, prev_prefix, &pls)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
@ -8,11 +8,7 @@ use crate::{error::UnsupportedOperation, store, MResult};
|
|||||||
pub fn apply_schema_update(
|
pub fn apply_schema_update(
|
||||||
writer: &mut heed::RwTxn<MainT>,
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
new_schema: &Schema,
|
new_schema: &Schema,
|
||||||
main_store: store::Main,
|
index: &store::Index,
|
||||||
documents_fields_store: store::DocumentsFields,
|
|
||||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
|
||||||
postings_lists_store: store::PostingsLists,
|
|
||||||
docs_words_store: store::DocsWords,
|
|
||||||
) -> MResult<()> {
|
) -> MResult<()> {
|
||||||
use UnsupportedOperation::{
|
use UnsupportedOperation::{
|
||||||
CanOnlyIntroduceNewSchemaAttributesAtEnd, CannotRemoveSchemaAttribute,
|
CanOnlyIntroduceNewSchemaAttributesAtEnd, CannotRemoveSchemaAttribute,
|
||||||
@ -21,7 +17,7 @@ pub fn apply_schema_update(
|
|||||||
|
|
||||||
let mut need_full_reindexing = false;
|
let mut need_full_reindexing = false;
|
||||||
|
|
||||||
if let Some(old_schema) = main_store.schema(writer)? {
|
if let Some(old_schema) = index.main.schema(writer)? {
|
||||||
for diff in meilisearch_schema::diff(&old_schema, new_schema) {
|
for diff in meilisearch_schema::diff(&old_schema, new_schema) {
|
||||||
match diff {
|
match diff {
|
||||||
Diff::IdentChange { .. } => return Err(CannotUpdateSchemaIdentifier.into()),
|
Diff::IdentChange { .. } => return Err(CannotUpdateSchemaIdentifier.into()),
|
||||||
@ -45,17 +41,10 @@ pub fn apply_schema_update(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
main_store.put_schema(writer, new_schema)?;
|
index.main.put_schema(writer, new_schema)?;
|
||||||
|
|
||||||
if need_full_reindexing {
|
if need_full_reindexing {
|
||||||
reindex_all_documents(
|
reindex_all_documents(writer, index)?
|
||||||
writer,
|
|
||||||
main_store,
|
|
||||||
documents_fields_store,
|
|
||||||
documents_fields_counts_store,
|
|
||||||
postings_lists_store,
|
|
||||||
docs_words_store,
|
|
||||||
)?
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -63,14 +52,13 @@ pub fn apply_schema_update(
|
|||||||
|
|
||||||
pub fn push_schema_update(
|
pub fn push_schema_update(
|
||||||
writer: &mut heed::RwTxn<UpdateT>,
|
writer: &mut heed::RwTxn<UpdateT>,
|
||||||
updates_store: store::Updates,
|
index: &store::Index,
|
||||||
updates_results_store: store::UpdatesResults,
|
|
||||||
schema: Schema,
|
schema: Schema,
|
||||||
) -> MResult<u64> {
|
) -> MResult<u64> {
|
||||||
let last_update_id = next_update_id(writer, updates_store, updates_results_store)?;
|
let last_update_id = next_update_id(writer, index.updates, index.updates_results)?;
|
||||||
|
|
||||||
let update = Update::schema(schema);
|
let update = Update::schema(schema);
|
||||||
updates_store.put_update(writer, last_update_id, &update)?;
|
index.updates.put_update(writer, last_update_id, &update)?;
|
||||||
|
|
||||||
Ok(last_update_id)
|
Ok(last_update_id)
|
||||||
}
|
}
|
||||||
|
@ -63,11 +63,7 @@ pub fn push_stop_words_deletion(
|
|||||||
|
|
||||||
pub fn apply_stop_words_deletion(
|
pub fn apply_stop_words_deletion(
|
||||||
writer: &mut heed::RwTxn<MainT>,
|
writer: &mut heed::RwTxn<MainT>,
|
||||||
main_store: store::Main,
|
index: &store::Index,
|
||||||
documents_fields_store: store::DocumentsFields,
|
|
||||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
|
||||||
postings_lists_store: store::PostingsLists,
|
|
||||||
docs_words_store: store::DocsWords,
|
|
||||||
deletion: BTreeSet<String>,
|
deletion: BTreeSet<String>,
|
||||||
) -> MResult<()> {
|
) -> MResult<()> {
|
||||||
let mut stop_words_builder = SetBuilder::memory();
|
let mut stop_words_builder = SetBuilder::memory();
|
||||||
@ -83,7 +79,7 @@ pub fn apply_stop_words_deletion(
|
|||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
// now we delete all of these stop words from the main store
|
// now we delete all of these stop words from the main store
|
||||||
let stop_words_fst = main_store.stop_words_fst(writer)?.unwrap_or_default();
|
let stop_words_fst = index.main.stop_words_fst(writer)?.unwrap_or_default();
|
||||||
|
|
||||||
let op = OpBuilder::new()
|
let op = OpBuilder::new()
|
||||||
.add(&stop_words_fst)
|
.add(&stop_words_fst)
|
||||||
@ -97,20 +93,13 @@ pub fn apply_stop_words_deletion(
|
|||||||
.and_then(fst::Set::from_bytes)
|
.and_then(fst::Set::from_bytes)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
main_store.put_stop_words_fst(writer, &stop_words_fst)?;
|
index.main.put_stop_words_fst(writer, &stop_words_fst)?;
|
||||||
|
|
||||||
// now that we have setup the stop words
|
// now that we have setup the stop words
|
||||||
// lets reindex everything...
|
// lets reindex everything...
|
||||||
if let Ok(number) = main_store.number_of_documents(writer) {
|
if let Ok(number) = index.main.number_of_documents(writer) {
|
||||||
if number > 0 {
|
if number > 0 {
|
||||||
reindex_all_documents(
|
reindex_all_documents(writer, index)?;
|
||||||
writer,
|
|
||||||
main_store,
|
|
||||||
documents_fields_store,
|
|
||||||
documents_fields_counts_store,
|
|
||||||
postings_lists_store,
|
|
||||||
docs_words_store,
|
|
||||||
)?;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -170,8 +170,6 @@ impl<'a> SearchBuilder<'a> {
|
|||||||
let ranked_map = ranked_map.map_err(|e| Error::Internal(e.to_string()))?;
|
let ranked_map = ranked_map.map_err(|e| Error::Internal(e.to_string()))?;
|
||||||
let ranked_map = ranked_map.unwrap_or_default();
|
let ranked_map = ranked_map.unwrap_or_default();
|
||||||
|
|
||||||
let start = Instant::now();
|
|
||||||
|
|
||||||
// Change criteria
|
// Change criteria
|
||||||
let mut query_builder = match self.get_criteria(reader, &ranked_map, &schema)? {
|
let mut query_builder = match self.get_criteria(reader, &ranked_map, &schema)? {
|
||||||
Some(criteria) => self.index.query_builder_with_criteria(criteria),
|
Some(criteria) => self.index.query_builder_with_criteria(criteria),
|
||||||
@ -222,8 +220,9 @@ impl<'a> SearchBuilder<'a> {
|
|||||||
|
|
||||||
query_builder.with_fetch_timeout(self.timeout);
|
query_builder.with_fetch_timeout(self.timeout);
|
||||||
|
|
||||||
let docs =
|
let start = Instant::now();
|
||||||
query_builder.query(reader, &self.query, self.offset..(self.offset + self.limit));
|
let docs = query_builder.query(reader, &self.query, self.offset..(self.offset + self.limit));
|
||||||
|
let time_ms = start.elapsed().as_millis() as usize;
|
||||||
|
|
||||||
let mut hits = Vec::with_capacity(self.limit);
|
let mut hits = Vec::with_capacity(self.limit);
|
||||||
for doc in docs.map_err(|e| Error::SearchDocuments(e.to_string()))? {
|
for doc in docs.map_err(|e| Error::SearchDocuments(e.to_string()))? {
|
||||||
@ -278,8 +277,6 @@ impl<'a> SearchBuilder<'a> {
|
|||||||
hits.push(hit);
|
hits.push(hit);
|
||||||
}
|
}
|
||||||
|
|
||||||
let time_ms = start.elapsed().as_millis() as usize;
|
|
||||||
|
|
||||||
let results = SearchResult {
|
let results = SearchResult {
|
||||||
hits,
|
hits,
|
||||||
offset: self.offset,
|
offset: self.offset,
|
||||||
|
@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize};
|
|||||||
///
|
///
|
||||||
/// It is used to inform the database the document you want to deserialize.
|
/// It is used to inform the database the document you want to deserialize.
|
||||||
/// Helpful for custom ranking.
|
/// Helpful for custom ranking.
|
||||||
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
#[derive(Debug, Default, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
|
||||||
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
|
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
|
||||||
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
|
||||||
#[repr(C)]
|
#[repr(C)]
|
||||||
@ -19,7 +19,7 @@ pub struct DocumentId(pub u64);
|
|||||||
///
|
///
|
||||||
/// This is stored in the map, generated at index time,
|
/// This is stored in the map, generated at index time,
|
||||||
/// extracted and interpreted at search time.
|
/// extracted and interpreted at search time.
|
||||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
|
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
|
||||||
#[repr(C)]
|
#[repr(C)]
|
||||||
pub struct DocIndex {
|
pub struct DocIndex {
|
||||||
@ -46,6 +46,8 @@ pub struct DocIndex {
|
|||||||
/// The order of the field is important because it defines
|
/// The order of the field is important because it defines
|
||||||
/// the way these structures are ordered between themselves.
|
/// the way these structures are ordered between themselves.
|
||||||
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
|
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
|
||||||
|
#[repr(C)]
|
||||||
pub struct Highlight {
|
pub struct Highlight {
|
||||||
/// The attribute in the document where the word was found
|
/// The attribute in the document where the word was found
|
||||||
/// along with the index in it.
|
/// along with the index in it.
|
||||||
|
Loading…
Reference in New Issue
Block a user