mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-18 08:48:32 +08:00
wip: Impl a basic tree traversing
This commit is contained in:
parent
6e1f4af833
commit
fbcec2975d
6
Cargo.lock
generated
6
Cargo.lock
generated
@ -962,7 +962,7 @@ dependencies = [
|
|||||||
"once_cell 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"once_cell 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"ordered-float 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
"ordered-float 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"rustyline 5.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
"rustyline 5.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
"sdset 0.3.6 (git+https://github.com/Kerollmops/sdset?branch=intersection-by-key)",
|
||||||
"serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"siphasher 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"siphasher 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -1693,7 +1693,7 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "sdset"
|
name = "sdset"
|
||||||
version = "0.3.6"
|
version = "0.3.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/Kerollmops/sdset?branch=intersection-by-key#03c5008a4b23e11ba89c5579b023473b555d3864"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "semver"
|
name = "semver"
|
||||||
@ -2807,7 +2807,7 @@ dependencies = [
|
|||||||
"checksum same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "585e8ddcedc187886a30fa705c47985c3fa88d06624095856b36ca0b82ff4421"
|
"checksum same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "585e8ddcedc187886a30fa705c47985c3fa88d06624095856b36ca0b82ff4421"
|
||||||
"checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d"
|
"checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d"
|
||||||
"checksum sct 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e3042af939fca8c3453b7af0f1c66e533a15a86169e39de2657310ade8f98d3c"
|
"checksum sct 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e3042af939fca8c3453b7af0f1c66e533a15a86169e39de2657310ade8f98d3c"
|
||||||
"checksum sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "5bfd7aab2bcae693c563b40fbbaf87d60c9b6f2a60d55ed69a9c761e3d4c63c9"
|
"checksum sdset 0.3.6 (git+https://github.com/Kerollmops/sdset?branch=intersection-by-key)" = "<none>"
|
||||||
"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
|
"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
|
||||||
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
|
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
|
||||||
"checksum serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)" = "0c4b39bd9b0b087684013a792c59e3e07a46a01d2322518d8a1104641a0b1be0"
|
"checksum serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)" = "0c4b39bd9b0b087684013a792c59e3e07a46a01d2322518d8a1104641a0b1be0"
|
||||||
|
@ -25,13 +25,17 @@ meilisearch-tokenizer = { path = "../meilisearch-tokenizer", version = "0.8.4" }
|
|||||||
meilisearch-types = { path = "../meilisearch-types", version = "0.8.4" }
|
meilisearch-types = { path = "../meilisearch-types", version = "0.8.4" }
|
||||||
once_cell = "1.2.0"
|
once_cell = "1.2.0"
|
||||||
ordered-float = { version = "1.0.2", features = ["serde"] }
|
ordered-float = { version = "1.0.2", features = ["serde"] }
|
||||||
sdset = "0.3.6"
|
|
||||||
serde = { version = "1.0.101", features = ["derive"] }
|
serde = { version = "1.0.101", features = ["derive"] }
|
||||||
serde_json = "1.0.41"
|
serde_json = "1.0.41"
|
||||||
siphasher = "0.3.1"
|
siphasher = "0.3.1"
|
||||||
slice-group-by = "0.2.6"
|
slice-group-by = "0.2.6"
|
||||||
zerocopy = "0.2.8"
|
zerocopy = "0.2.8"
|
||||||
|
|
||||||
|
[dependencies.sdset]
|
||||||
|
# version = "0.3.6"
|
||||||
|
git = "https://github.com/Kerollmops/sdset"
|
||||||
|
branch = "intersection-by-key"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
assert_matches = "1.3"
|
assert_matches = "1.3"
|
||||||
criterion = "0.3"
|
criterion = "0.3"
|
||||||
|
@ -15,7 +15,7 @@ use levenshtein_automata::DFA;
|
|||||||
use log::debug;
|
use log::debug;
|
||||||
use meilisearch_tokenizer::{is_cjk, split_query_string};
|
use meilisearch_tokenizer::{is_cjk, split_query_string};
|
||||||
use meilisearch_types::DocIndex;
|
use meilisearch_types::DocIndex;
|
||||||
use sdset::{Set, SetBuf};
|
use sdset::{Set, SetBuf, SetOperation};
|
||||||
use slice_group_by::{GroupBy, GroupByMut};
|
use slice_group_by::{GroupBy, GroupByMut};
|
||||||
|
|
||||||
use crate::automaton::NGRAMS;
|
use crate::automaton::NGRAMS;
|
||||||
@ -28,7 +28,7 @@ use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
|||||||
use crate::raw_document::RawDocument;
|
use crate::raw_document::RawDocument;
|
||||||
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
|
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
|
||||||
use crate::{store, Document, DocumentId, MResult};
|
use crate::{store, Document, DocumentId, MResult};
|
||||||
use crate::query_tree::create_query_tree;
|
use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult};
|
||||||
|
|
||||||
pub fn bucket_sort<'c, FI>(
|
pub fn bucket_sort<'c, FI>(
|
||||||
reader: &heed::RoTxn<MainT>,
|
reader: &heed::RoTxn<MainT>,
|
||||||
@ -50,6 +50,21 @@ where
|
|||||||
let operation = create_query_tree(reader, postings_lists_store, synonyms_store, query).unwrap();
|
let operation = create_query_tree(reader, postings_lists_store, synonyms_store, query).unwrap();
|
||||||
println!("{:?}", operation);
|
println!("{:?}", operation);
|
||||||
|
|
||||||
|
let QueryResult { docids, queries } = traverse_query_tree(reader, postings_lists_store, &operation).unwrap();
|
||||||
|
println!("found {} documents", docids.len());
|
||||||
|
println!("number of postings {:?}", queries.len());
|
||||||
|
|
||||||
|
let before = Instant::now();
|
||||||
|
for (query, matches) in queries {
|
||||||
|
let op = sdset::duo::IntersectionByKey::new(&matches, &docids, |d| d.document_id, Clone::clone);
|
||||||
|
let buf: SetBuf<DocIndex> = op.into_set_buf();
|
||||||
|
if !buf.is_empty() {
|
||||||
|
println!("{:?} gives {} matches", query, buf.len());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("matches cleaned in {:.02?}", before.elapsed());
|
||||||
|
|
||||||
// We delegate the filter work to the distinct query builder,
|
// We delegate the filter work to the distinct query builder,
|
||||||
// specifying a distinct rule that has no effect.
|
// specifying a distinct rule that has no effect.
|
||||||
if filter.is_some() {
|
if filter.is_some() {
|
||||||
|
@ -204,22 +204,28 @@ pub fn create_query_tree(
|
|||||||
Ok(create_operation(ngrams, Operation::Or))
|
Ok(create_operation(ngrams, Operation::Or))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct QueryResult<'q, 'c> {
|
pub struct QueryResult<'o, 'txn> {
|
||||||
pub docids: Cow<'c, Set<DocumentId>>,
|
pub docids: SetBuf<DocumentId>,
|
||||||
pub queries: HashMap<&'q Query, Cow<'c, Set<DocIndex>>>,
|
pub queries: HashMap<&'o Query, Cow<'txn, Set<DocIndex>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub type Postings<'q, 'c> = HashMap<&'q Query, Cow<'c, Set<DocIndex>>>;
|
pub type Postings<'o, 'txn> = HashMap<&'o Query, Cow<'txn, Set<DocIndex>>>;
|
||||||
pub type Cache<'o, 'c> = HashMap<&'o Operation, Cow<'c, Set<DocumentId>>>;
|
pub type Cache<'o, 'c> = HashMap<&'o Operation, SetBuf<DocumentId>>;
|
||||||
|
|
||||||
pub fn traverse_query_tree<'a, 'c>(ctx: &'c Context, tree: &'a Operation) -> QueryResult<'a, 'c> {
|
pub fn traverse_query_tree<'o, 'txn>(
|
||||||
fn execute_and<'o, 'c>(
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
ctx: &'c Context,
|
postings_lists: store::PostingsLists,
|
||||||
cache: &mut Cache<'o, 'c>,
|
tree: &'o Operation,
|
||||||
postings: &mut Postings<'o, 'c>,
|
) -> MResult<QueryResult<'o, 'txn>>
|
||||||
|
{
|
||||||
|
fn execute_and<'o, 'txn>(
|
||||||
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
|
pls: store::PostingsLists,
|
||||||
|
cache: &mut Cache<'o, 'txn>,
|
||||||
|
postings: &mut Postings<'o, 'txn>,
|
||||||
depth: usize,
|
depth: usize,
|
||||||
operations: &'o [Operation],
|
operations: &'o [Operation],
|
||||||
) -> Cow<'c, Set<DocumentId>>
|
) -> MResult<SetBuf<DocumentId>>
|
||||||
{
|
{
|
||||||
println!("{:1$}AND", "", depth * 2);
|
println!("{:1$}AND", "", depth * 2);
|
||||||
|
|
||||||
@ -229,9 +235,9 @@ pub fn traverse_query_tree<'a, 'c>(ctx: &'c Context, tree: &'a Operation) -> Que
|
|||||||
for op in operations {
|
for op in operations {
|
||||||
if cache.get(op).is_none() {
|
if cache.get(op).is_none() {
|
||||||
let docids = match op {
|
let docids = match op {
|
||||||
Operation::And(ops) => execute_and(ctx, cache, postings, depth + 1, &ops),
|
Operation::And(ops) => execute_and(reader, pls, cache, postings, depth + 1, &ops)?,
|
||||||
Operation::Or(ops) => execute_or(ctx, cache, postings, depth + 1, &ops),
|
Operation::Or(ops) => execute_or(reader, pls, cache, postings, depth + 1, &ops)?,
|
||||||
Operation::Query(query) => execute_query(ctx, postings, depth + 1, &query),
|
Operation::Query(query) => execute_query(reader, pls, postings, depth + 1, &query)?,
|
||||||
};
|
};
|
||||||
cache.insert(op, docids);
|
cache.insert(op, docids);
|
||||||
}
|
}
|
||||||
@ -245,20 +251,20 @@ pub fn traverse_query_tree<'a, 'c>(ctx: &'c Context, tree: &'a Operation) -> Que
|
|||||||
|
|
||||||
let op = sdset::multi::Intersection::new(results);
|
let op = sdset::multi::Intersection::new(results);
|
||||||
let docids = op.into_set_buf();
|
let docids = op.into_set_buf();
|
||||||
let docids: Cow<Set<_>> = Cow::Owned(docids);
|
|
||||||
|
|
||||||
println!("{:3$}--- AND fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
|
println!("{:3$}--- AND fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
|
||||||
|
|
||||||
docids
|
Ok(docids)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn execute_or<'o, 'c>(
|
fn execute_or<'o, 'txn>(
|
||||||
ctx: &'c Context,
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
cache: &mut Cache<'o, 'c>,
|
pls: store::PostingsLists,
|
||||||
postings: &mut Postings<'o, 'c>,
|
cache: &mut Cache<'o, 'txn>,
|
||||||
|
postings: &mut Postings<'o, 'txn>,
|
||||||
depth: usize,
|
depth: usize,
|
||||||
operations: &'o [Operation],
|
operations: &'o [Operation],
|
||||||
) -> Cow<'c, Set<DocumentId>>
|
) -> MResult<SetBuf<DocumentId>>
|
||||||
{
|
{
|
||||||
println!("{:1$}OR", "", depth * 2);
|
println!("{:1$}OR", "", depth * 2);
|
||||||
|
|
||||||
@ -270,46 +276,47 @@ pub fn traverse_query_tree<'a, 'c>(ctx: &'c Context, tree: &'a Operation) -> Que
|
|||||||
Some(docids) => docids,
|
Some(docids) => docids,
|
||||||
None => {
|
None => {
|
||||||
let docids = match op {
|
let docids = match op {
|
||||||
Operation::And(ops) => execute_and(ctx, cache, postings, depth + 1, &ops),
|
Operation::And(ops) => execute_and(reader, pls, cache, postings, depth + 1, &ops)?,
|
||||||
Operation::Or(ops) => execute_or(ctx, cache, postings, depth + 1, &ops),
|
Operation::Or(ops) => execute_or(reader, pls, cache, postings, depth + 1, &ops)?,
|
||||||
Operation::Query(query) => execute_query(ctx, postings, depth + 1, &query),
|
Operation::Query(query) => execute_query(reader, pls, postings, depth + 1, &query)?,
|
||||||
};
|
};
|
||||||
cache.entry(op).or_insert(docids)
|
cache.entry(op).or_insert(docids)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
ids.extend(docids.as_ref());
|
ids.extend_from_slice(docids.as_ref());
|
||||||
}
|
}
|
||||||
|
|
||||||
let docids = SetBuf::from_dirty(ids);
|
let docids = SetBuf::from_dirty(ids);
|
||||||
let docids: Cow<Set<_>> = Cow::Owned(docids);
|
|
||||||
|
|
||||||
println!("{:3$}--- OR fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
|
println!("{:3$}--- OR fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
|
||||||
|
|
||||||
docids
|
Ok(docids)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn execute_query<'o, 'c>(
|
fn execute_query<'o, 'txn>(
|
||||||
ctx: &'c Context,
|
reader: &'txn heed::RoTxn<MainT>,
|
||||||
postings: &mut Postings<'o, 'c>,
|
pls: store::PostingsLists,
|
||||||
|
postings: &mut Postings<'o, 'txn>,
|
||||||
depth: usize,
|
depth: usize,
|
||||||
query: &'o Query,
|
query: &'o Query,
|
||||||
) -> Cow<'c, Set<DocumentId>>
|
) -> MResult<SetBuf<DocumentId>>
|
||||||
{
|
{
|
||||||
let before = Instant::now();
|
let before = Instant::now();
|
||||||
let (docids, matches) = match query {
|
let (docids, matches) = match query {
|
||||||
Query::Tolerant(_, word) | Query::Exact(_, word) | Query::Prefix(_, word) => {
|
Query::Tolerant(_, word) | Query::Exact(_, word) | Query::Prefix(_, word) => {
|
||||||
if let Some(PostingsList { docids, matches }) = ctx.postings.get(word) {
|
if let Some(docindexes) = pls.postings_list(reader, word.as_bytes())? {
|
||||||
(Cow::Borrowed(docids.as_set()), Cow::Borrowed(matches.as_set()))
|
let mut docids: Vec<_> = docindexes.iter().map(|d| d.document_id).collect();
|
||||||
|
docids.dedup();
|
||||||
|
(SetBuf::new(docids).unwrap(), docindexes)
|
||||||
} else {
|
} else {
|
||||||
(Cow::default(), Cow::default())
|
(SetBuf::default(), Cow::default())
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
Query::Phrase(_, words) => {
|
Query::Phrase(_, words) => {
|
||||||
if let [first, second] = words.as_slice() {
|
if let [first, second] = words.as_slice() {
|
||||||
let default = SetBuf::default();
|
let first = pls.postings_list(reader, first.as_bytes())?.unwrap_or_default();
|
||||||
let first = ctx.postings.get(first).map(|pl| &pl.matches).unwrap_or(&default);
|
let second = pls.postings_list(reader, second.as_bytes())?.unwrap_or_default();
|
||||||
let second = ctx.postings.get(second).map(|pl| &pl.matches).unwrap_or(&default);
|
|
||||||
|
|
||||||
let iter = merge_join_by(first.as_slice(), second.as_slice(), |a, b| {
|
let iter = merge_join_by(first.as_slice(), second.as_slice(), |a, b| {
|
||||||
let x = (a.document_id, a.attribute, (a.word_index as u32) + 1);
|
let x = (a.document_id, a.attribute, (a.word_index as u32) + 1);
|
||||||
@ -327,10 +334,10 @@ pub fn traverse_query_tree<'a, 'c>(ctx: &'c Context, tree: &'a Operation) -> Que
|
|||||||
|
|
||||||
println!("{:2$}matches {:?}", "", matches, depth * 2);
|
println!("{:2$}matches {:?}", "", matches, depth * 2);
|
||||||
|
|
||||||
(Cow::Owned(SetBuf::new(docids).unwrap()), Cow::Owned(SetBuf::new(matches).unwrap()))
|
(SetBuf::new(docids).unwrap(), Cow::Owned(SetBuf::new(matches).unwrap()))
|
||||||
} else {
|
} else {
|
||||||
println!("{:2$}{:?} skipped", "", words, depth * 2);
|
println!("{:2$}{:?} skipped", "", words, depth * 2);
|
||||||
(Cow::default(), Cow::default())
|
(SetBuf::default(), Cow::default())
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
@ -338,17 +345,17 @@ pub fn traverse_query_tree<'a, 'c>(ctx: &'c Context, tree: &'a Operation) -> Que
|
|||||||
println!("{:4$}{:?} fetched {:?} documents in {:.02?}", "", query, docids.len(), before.elapsed(), depth * 2);
|
println!("{:4$}{:?} fetched {:?} documents in {:.02?}", "", query, docids.len(), before.elapsed(), depth * 2);
|
||||||
|
|
||||||
postings.insert(query, matches);
|
postings.insert(query, matches);
|
||||||
docids
|
Ok(docids)
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut cache = Cache::new();
|
let mut cache = Cache::new();
|
||||||
let mut postings = Postings::new();
|
let mut postings = Postings::new();
|
||||||
|
|
||||||
let docids = match tree {
|
let docids = match tree {
|
||||||
Operation::And(operations) => execute_and(ctx, &mut cache, &mut postings, 0, &operations),
|
Operation::And(ops) => execute_and(reader, postings_lists, &mut cache, &mut postings, 0, &ops)?,
|
||||||
Operation::Or(operations) => execute_or(ctx, &mut cache, &mut postings, 0, &operations),
|
Operation::Or(ops) => execute_or(reader, postings_lists, &mut cache, &mut postings, 0, &ops)?,
|
||||||
Operation::Query(query) => execute_query(ctx, &mut postings, 0, &query),
|
Operation::Query(query) => execute_query(reader, postings_lists, &mut postings, 0, &query)?,
|
||||||
};
|
};
|
||||||
|
|
||||||
QueryResult { docids, queries: postings }
|
Ok(QueryResult { docids, queries: postings })
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user