Add documentation

This commit is contained in:
Loïc Lecrenier 2023-03-08 13:26:29 +01:00
parent 4e266211bf
commit c232cdabf5
10 changed files with 358 additions and 548 deletions

View File

@ -1,15 +1,21 @@
use std::collections::hash_map::Entry;
use std::hash::Hash;
use fxhash::FxHashMap;
use heed::types::ByteSlice;
use heed::{BytesEncode, Database, RoTxn};
use super::interner::Interned;
use super::SearchContext;
use crate::Result;
/// A cache storing pointers to values in the LMDB databases.
///
/// Used for performance reasons only. By using this cache, we avoid performing a
/// database lookup and instead get a direct reference to the value using a fast
/// local HashMap lookup.
#[derive(Default)]
pub struct DatabaseCache<'search> {
// TODO: interner for all database cache keys?
pub word_pair_proximity_docids:
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
pub word_prefix_pair_proximity_docids:
@ -21,36 +27,50 @@ pub struct DatabaseCache<'search> {
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
}
impl<'search> SearchContext<'search> {
fn get_value<'v, K1, KC>(
txn: &'search RoTxn,
cache_key: K1,
db_key: &'v KC::EItem,
cache: &mut FxHashMap<K1, Option<&'search [u8]>>,
db: Database<KC, ByteSlice>,
) -> Result<Option<&'search [u8]>>
where
K1: Copy + Eq + Hash,
KC: BytesEncode<'v>,
{
let bitmap_ptr = match cache.entry(cache_key) {
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
Entry::Vacant(entry) => {
let bitmap_ptr = db.get(txn, db_key)?;
entry.insert(bitmap_ptr);
bitmap_ptr
}
};
Ok(bitmap_ptr)
}
/// Retrieve or insert the given value in the `word_docids` database.
pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
let bitmap_ptr = match self.db_cache.word_docids.entry(word) {
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
Entry::Vacant(entry) => {
let bitmap_ptr = self
.index
.word_docids
.remap_data_type::<ByteSlice>()
.get(self.txn, self.word_interner.get(word))?;
entry.insert(bitmap_ptr);
bitmap_ptr
Self::get_value(
self.txn,
word,
self.word_interner.get(word).as_str(),
&mut self.db_cache.word_docids,
self.index.word_docids.remap_data_type::<ByteSlice>(),
)
}
};
Ok(bitmap_ptr)
}
pub fn get_prefix_docids(&mut self, prefix: Interned<String>) -> Result<Option<&'search [u8]>> {
// In the future, this will be a frozen roaring bitmap
let bitmap_ptr = match self.db_cache.word_prefix_docids.entry(prefix) {
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
Entry::Vacant(entry) => {
let bitmap_ptr = self
.index
.word_prefix_docids
.remap_data_type::<ByteSlice>()
.get(self.txn, self.word_interner.get(prefix))?;
entry.insert(bitmap_ptr);
bitmap_ptr
}
};
Ok(bitmap_ptr)
/// Retrieve or insert the given value in the `word_prefix_docids` database.
pub fn get_word_prefix_docids(
&mut self,
prefix: Interned<String>,
) -> Result<Option<&'search [u8]>> {
Self::get_value(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),
&mut self.db_cache.word_prefix_docids,
self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
)
}
pub fn get_word_pair_proximity_docids(
@ -59,40 +79,17 @@ impl<'search> SearchContext<'search> {
word2: Interned<String>,
proximity: u8,
) -> Result<Option<&'search [u8]>> {
let key = (proximity, word1, word2);
match self.db_cache.word_pair_proximity_docids.entry(key) {
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
Entry::Vacant(entry) => {
// We shouldn't greedily access this DB at all
// a DB (w1, w2) -> [proximities] would be much better
// We could even have a DB that is (w1) -> set of words such that (w1, w2) are in proximity
// And if we worked with words encoded as integers, the set of words could be a roaring bitmap
// Then, to find all the proximities between two list of words, we'd do:
// inputs:
// - words1 (roaring bitmap)
// - words2 (roaring bitmap)
// output:
// - [(word1, word2, [proximities])]
// algo:
// let mut output = vec![];
// for word1 in words1 {
// let all_words_in_proximity_of_w1 = pair_words_db.get(word1);
// let words_in_proximity_of_w1 = all_words_in_proximity_of_w1 & words2;
// for word2 in words_in_proximity_of_w1 {
// let proximities = prox_db.get(word1, word2);
// output.push(word1, word2, proximities);
// }
// }
let bitmap_ptr =
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().get(
Self::get_value(
self.txn,
&(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
)?;
entry.insert(bitmap_ptr);
Ok(bitmap_ptr)
}
}
(proximity, word1, word2),
&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(word2).as_str(),
),
&mut self.db_cache.word_pair_proximity_docids,
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
)
}
pub fn get_word_prefix_pair_proximity_docids(
@ -101,22 +98,17 @@ impl<'search> SearchContext<'search> {
prefix2: Interned<String>,
proximity: u8,
) -> Result<Option<&'search [u8]>> {
let key = (proximity, word1, prefix2);
match self.db_cache.word_prefix_pair_proximity_docids.entry(key) {
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
Entry::Vacant(entry) => {
let bitmap_ptr = self
.index
.word_prefix_pair_proximity_docids
.remap_data_type::<ByteSlice>()
.get(
Self::get_value(
self.txn,
&(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
)?;
entry.insert(bitmap_ptr);
Ok(bitmap_ptr)
}
}
(proximity, word1, prefix2),
&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(prefix2).as_str(),
),
&mut self.db_cache.word_prefix_pair_proximity_docids,
self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
)
}
pub fn get_prefix_word_pair_proximity_docids(
&mut self,
@ -124,25 +116,16 @@ impl<'search> SearchContext<'search> {
right: Interned<String>,
proximity: u8,
) -> Result<Option<&'search [u8]>> {
let key = (proximity, left_prefix, right);
match self.db_cache.prefix_word_pair_proximity_docids.entry(key) {
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
Entry::Vacant(entry) => {
let bitmap_ptr = self
.index
.prefix_word_pair_proximity_docids
.remap_data_type::<ByteSlice>()
.get(
Self::get_value(
self.txn,
(proximity, left_prefix, right),
&(
proximity,
self.word_interner.get(left_prefix),
self.word_interner.get(right),
self.word_interner.get(left_prefix).as_str(),
self.word_interner.get(right).as_str(),
),
)?;
entry.insert(bitmap_ptr);
Ok(bitmap_ptr)
}
}
&mut self.db_cache.prefix_word_pair_proximity_docids,
self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
)
}
}

View File

@ -1,3 +1,41 @@
/*! Implementation of a generic graph-based ranking rule.
A graph-based ranking rule is a ranking rule that works by representing
its possible operations and their relevancy cost as a directed acyclic multi-graph
built on top of the query graph. It then computes its buckets by finding the
cheapest paths from the start node to the end node and computing the document ids
that satisfy those paths.
For example, the proximity ranking rule builds a graph where the edges between two
nodes represent a condition that the term of the source node is in a certain proximity
to the term of the destination node. With the query "pretty house by" where the term
"pretty" has three possible proximities to the term "house" and "house" has two
proximities to "by", the graph will look like this:
```txt
11
START 0pretty 2 house by 0 END
32-
```
The proximity ranking rule's first bucket will be determined by the union of all
the shortest paths from START to END, which in this case is:
```txt
START --0-> pretty --1--> house --1--> by --0--> end
```
The path's corresponding document ids are found by taking the intersection of the
document ids of each edge. That is, we find the documents where both `pretty` is
1-close to `house` AND `house` is 1-close to `by`.
For the second bucket, we get the union of the second-cheapest paths, which are:
```txt
START --0-> pretty --1--> house --2--> by --0--> end
START --0-> pretty --2--> house --1--> by --0--> end
```
That is, we find the documents where either:
- `pretty` is 1-close to `house` AND `house` is 2-close to `by`
- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by`
*/
use roaring::RoaringBitmap;
use super::logger::SearchLogger;
@ -8,24 +46,38 @@ use super::small_bitmap::SmallBitmap;
use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
use crate::Result;
/// A generic graph-based ranking rule
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
id: String,
// When the ranking rule is not iterating over its buckets,
// its state is `None`.
state: Option<GraphBasedRankingRuleState<G>>,
}
impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
/// Creates the ranking rule with the given identifier
pub fn new(id: String) -> Self {
Self { id, state: None }
}
}
/// The internal state of a graph-based ranking rule during iteration
pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
/// The current graph
graph: RankingRuleGraph<G>,
/// Cache to retrieve the docids associated with each edge
edge_docids_cache: EdgeDocidsCache<G>,
/// Cache used to optimistically discard paths that resolve to no documents.
empty_paths_cache: EmptyPathsCache,
/// A structure giving the list of possible costs from each node to the end node,
/// along with a set of unavoidable edges that must be traversed to achieve that distance.
all_distances: Vec<Vec<(u16, SmallBitmap)>>,
/// An index in the first element of `all_distances`, giving the cost of the next bucket
cur_distance_idx: usize,
}
/// Traverse each edge of the graph, computes its associated document ids,
/// and remove this edge from the graph if its docids are disjoint with the
/// given universe.
fn remove_empty_edges<'search, G: RankingRuleGraphTrait>(
ctx: &mut SearchContext<'search>,
graph: &mut RankingRuleGraph<G>,
@ -70,6 +122,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
let mut edge_docids_cache = EdgeDocidsCache::default();
let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len() as u16);
// First simplify the graph as much as possible, by computing the docids of the edges
// within the rule's universe and removing the edges that have no associated docids.
remove_empty_edges(
ctx,
&mut graph,
@ -77,6 +131,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
universe,
&mut empty_paths_cache,
)?;
// Then pre-compute the cost of all paths from each node to the end node
let all_distances = graph.initialize_distances_with_necessary_edges();
let state = GraphBasedRankingRuleState {
@ -98,9 +154,14 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
logger: &mut dyn SearchLogger<QueryGraph>,
universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
// If universe.len() <= 1, the bucket sort algorithm
// should not have called this function.
assert!(universe.len() > 1);
// Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`,
// should never happen
let mut state = self.state.take().unwrap();
// TODO: does this have a real positive performance cost?
remove_empty_edges(
ctx,
&mut state.graph,
@ -109,12 +170,16 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
&mut state.empty_paths_cache,
)?;
// If the cur_distance_idx does not point to a valid cost in the `all_distances`
// structure, then we have computed all the buckets and can return.
if state.cur_distance_idx
>= state.all_distances[state.graph.query_graph.root_node as usize].len()
{
self.state = None;
return Ok(None);
}
// Retrieve the cost of the paths to compute
let (cost, _) =
state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx];
state.cur_distance_idx += 1;
@ -129,22 +194,38 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
cur_distance_idx: _,
} = &mut state;
let mut paths = vec![];
let original_universe = universe;
let mut universe = universe.clone();
// TODO: remove this unnecessary clone
let original_graph = graph.clone();
// and this vector as well
let mut paths = vec![];
// For each path of the given cost, we will compute its associated
// document ids.
// In case the path does not resolve to any document id, we try to figure out why
// and update the `empty_paths_cache` accordingly.
// For example, it may be that the path is empty because one of its edges is disjoint
// with the universe, or because a prefix of the path is disjoint with the universe, or because
// the path contains two edges that are disjoint from each other within the universe.
// Updating the empty_paths_cache helps speed up the execution of `visit_paths_of_cost` and reduces
// the number of future candidate paths given by that same function.
graph.visit_paths_of_cost(
graph.query_graph.root_node as usize,
cost,
all_distances,
empty_paths_cache,
|path, graph, empty_paths_cache| {
// Accumulate the path for logging purposes only
paths.push(path.to_vec());
let mut path_docids = universe.clone();
// We store the edges and their docids in vectors in case the path turns out to be
// empty and we need to figure out why it was empty.
let mut visited_edges = vec![];
let mut cached_edge_docids = vec![];
for &edge_index in path {
visited_edges.push(edge_index);
let edge_docids =
@ -154,21 +235,29 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
BitmapOrAllRef::All => continue,
};
cached_edge_docids.push((edge_index, edge_docids.clone()));
// If the edge is empty, then the path will be empty as well, we update the graph
// and caches accordingly and skip to the next candidate path.
if edge_docids.is_disjoint(&universe) {
// 1. Store in the cache that this edge is empty for this universe
empty_paths_cache.forbid_edge(edge_index);
// 2. remove this edge from the ranking rule graph
graph.remove_edge(edge_index);
// 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore
edge_docids_cache.cache.remove(&edge_index);
return Ok(());
}
path_docids &= edge_docids;
// If the (sub)path is empty, we try to figure out why and update the caches accordingly.
if path_docids.is_disjoint(&universe) {
// empty_paths_cache.forbid_prefix(&visited_edges);
// if the intersection between this edge and any
// First, we know that this path is empty, and thus any path
// that is a superset of it will also be empty.
empty_paths_cache.forbid_prefix(&visited_edges);
// Second, if the intersection between this edge and any
// previous one is disjoint with the universe,
// then we add these two edges to the empty_path_cache
// then we also know that any path containing the same couple of
// edges will also be empty.
for (edge_index2, edge_docids2) in
cached_edge_docids[..cached_edge_docids.len() - 1].iter()
{
@ -181,6 +270,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
}
}
bucket |= &path_docids;
// Reduce the size of the universe so that we can more optimistically discard candidate paths
universe -= path_docids;
Ok(())
},
@ -196,6 +286,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
logger,
);
// TODO: Graph-based ranking rules do not (yet) modify the query graph. We could, however,
// remove nodes and/or terms within nodes that weren't present in any of the paths.
let next_query_graph = state.graph.query_graph.clone();
self.state = Some(state);

View File

@ -3,6 +3,7 @@ use std::marker::PhantomData;
use fxhash::FxHashMap;
/// An index within a [`Interner<T>`] structure.
pub struct Interned<T> {
idx: u32,
_phantom: PhantomData<T>,
@ -13,7 +14,10 @@ impl<T> Interned<T> {
Self { idx, _phantom: PhantomData }
}
}
/// An [`Interner`] is used to store a unique copy of a value of type `T`. This value
/// is then identified by a lightweight index of type [`Interned<T>`], which can
/// be copied, compared, and hashed efficiently. An immutable reference to the original value
/// can be retrieved using `self.get(interned)`.
pub struct Interner<T> {
stable_store: Vec<T>,
lookup: FxHashMap<T, Interned<T>>,

View File

@ -7,7 +7,82 @@ use super::ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGrap
use super::small_bitmap::SmallBitmap;
use super::{RankingRule, RankingRuleQueryTrait};
/// Trait for structure logging the execution of a search query.
pub trait SearchLogger<Q: RankingRuleQueryTrait> {
/// Logs the initial query
fn initial_query(&mut self, query: &Q);
/// Logs the query that was used to compute the set of all candidates
fn query_for_universe(&mut self, query: &Q);
/// Logs the value of the initial set of all candidates
fn initial_universe(&mut self, universe: &RoaringBitmap);
/// Logs the ranking rules used to perform the search query
fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule<Q>]);
/// Logs the start of a ranking rule's iteration.
fn start_iteration_ranking_rule<'transaction>(
&mut self,
ranking_rule_idx: usize,
ranking_rule: &dyn RankingRule<'transaction, Q>,
query: &Q,
universe: &RoaringBitmap,
);
/// Logs the end of the computation of a ranking rule bucket
fn next_bucket_ranking_rule<'transaction>(
&mut self,
ranking_rule_idx: usize,
ranking_rule: &dyn RankingRule<'transaction, Q>,
universe: &RoaringBitmap,
candidates: &RoaringBitmap,
);
/// Logs the skipping of a ranking rule bucket
fn skip_bucket_ranking_rule<'transaction>(
&mut self,
ranking_rule_idx: usize,
ranking_rule: &dyn RankingRule<'transaction, Q>,
candidates: &RoaringBitmap,
);
/// Logs the end of a ranking rule's iteration.
fn end_iteration_ranking_rule<'transaction>(
&mut self,
ranking_rule_idx: usize,
ranking_rule: &dyn RankingRule<'transaction, Q>,
universe: &RoaringBitmap,
);
/// Logs the addition of document ids to the final results
fn add_to_results(&mut self, docids: &[u32]);
/// Logs the internal state of the words ranking rule
fn log_words_state(&mut self, query_graph: &Q);
/// Logs the internal state of the proximity ranking rule
fn log_proximity_state(
&mut self,
query_graph: &RankingRuleGraph<ProximityGraph>,
paths: &[Vec<u16>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
distances: Vec<Vec<(u16, SmallBitmap)>>,
cost: u16,
);
/// Logs the internal state of the typo ranking rule
fn log_typo_state(
&mut self,
query_graph: &RankingRuleGraph<TypoGraph>,
paths: &[Vec<u16>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
distances: Vec<Vec<(u16, SmallBitmap)>>,
cost: u16,
);
}
/// A dummy [`SearchLogger`] which does nothing.
pub struct DefaultSearchLogger;
impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
fn initial_query(&mut self, _query: &Q) {}
@ -76,63 +151,3 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
) {
}
}
pub trait SearchLogger<Q: RankingRuleQueryTrait> {
fn initial_query(&mut self, query: &Q);
fn query_for_universe(&mut self, query: &Q);
fn initial_universe(&mut self, universe: &RoaringBitmap);
fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule<Q>]);
fn start_iteration_ranking_rule<'transaction>(
&mut self,
ranking_rule_idx: usize,
ranking_rule: &dyn RankingRule<'transaction, Q>,
query: &Q,
universe: &RoaringBitmap,
);
fn next_bucket_ranking_rule<'transaction>(
&mut self,
ranking_rule_idx: usize,
ranking_rule: &dyn RankingRule<'transaction, Q>,
universe: &RoaringBitmap,
candidates: &RoaringBitmap,
);
fn skip_bucket_ranking_rule<'transaction>(
&mut self,
ranking_rule_idx: usize,
ranking_rule: &dyn RankingRule<'transaction, Q>,
candidates: &RoaringBitmap,
);
fn end_iteration_ranking_rule<'transaction>(
&mut self,
ranking_rule_idx: usize,
ranking_rule: &dyn RankingRule<'transaction, Q>,
universe: &RoaringBitmap,
);
fn add_to_results(&mut self, docids: &[u32]);
fn log_words_state(&mut self, query_graph: &Q);
fn log_proximity_state(
&mut self,
query_graph: &RankingRuleGraph<ProximityGraph>,
paths: &[Vec<u16>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
distances: Vec<Vec<(u16, SmallBitmap)>>,
cost: u16,
);
fn log_typo_state(
&mut self,
query_graph: &RankingRuleGraph<TypoGraph>,
paths: &[Vec<u16>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
distances: Vec<Vec<(u16, SmallBitmap)>>,
cost: u16,
);
}

View File

@ -88,7 +88,7 @@ fn resolve_maximally_reduced_query_graph<'search>(
break;
} else {
let position_to_remove = positions_to_remove.pop().unwrap();
let _ = graph.remove_words_at_position(position_to_remove);
let _ = graph.remove_words_starting_at_position(position_to_remove);
}
}
logger.query_for_universe(&graph);

View File

@ -3,6 +3,17 @@ use super::small_bitmap::SmallBitmap;
use super::SearchContext;
use crate::Result;
const QUERY_GRAPH_NODE_LENGTH_LIMIT: u16 = 64;
/// A node of the [`QueryGraph`].
///
/// There are four types of nodes:
/// 1. `Start` : unique, represents the start of the query
/// 2. `End` : unique, represents the end of a query
/// 3. `Deleted` : represents a node that was deleted.
/// All deleted nodes are unreachable from the start node.
/// 4. `Term` is a regular node representing a word or combination of words
/// from the user query.
#[derive(Clone)]
pub enum QueryNode {
Term(LocatedQueryTerm),
@ -11,34 +22,84 @@ pub enum QueryNode {
End,
}
/// The edges associated with a node in the query graph.
#[derive(Clone)]
pub struct Edges {
// TODO: use a tiny bitset instead, something like a simple Vec<u8> where most queries will see a vector of one element
/// Set of nodes which have an edge going to the current node
pub predecessors: SmallBitmap,
/// Set of nodes which are reached by an edge from the current node
pub successors: SmallBitmap,
}
/**
A graph representing all the ways to interpret the user's search query.
## Important
At the moment, a query graph has a hardcoded limit of [`QUERY_GRAPH_NODE_LENGTH_LIMIT`] nodes.
## Example 1
For the search query `sunflower`, we need to register the following things:
- we need to look for the exact word `sunflower`
- but also any word which is 1 or 2 typos apart from `sunflower`
- and every word that contains the prefix `sunflower`
- and also the couple of adjacent words `sun flower`
- as well as all the user-defined synonyms of `sunflower`
All these derivations of a word will be stored in [`WordDerivations`].
## Example 2:
For the search query `summer house by`.
We also look for all word derivations of each term. And we also need to consider
the potential n-grams `summerhouse`, `summerhouseby`, and `houseby`.
Furthermore, we need to know which words these ngrams replace. This is done by creating the
following graph, where each node also contains a list of derivations:
```txt
houseby
START summer house by END
summerhouse
summerhouseby
```
Note also that each node has a range of positions associated with it,
such that `summer` is known to be a word at the positions `0..=0` and `houseby`
is registered with the positions `1..=2`. When two nodes are connected by an edge,
it means that they are potentially next to each other in the user's search query
(depending on the [`TermsMatchingStrategy`](crate::search::TermsMatchingStrategy)
and the transformations that were done on the query graph).
*/
#[derive(Clone)]
pub struct QueryGraph {
/// The index of the start node within `self.nodes`
pub root_node: u16,
/// The index of the end node within `self.nodes`
pub end_node: u16,
/// The list of all query nodes
pub nodes: Vec<QueryNode>,
/// The list of all node edges
pub edges: Vec<Edges>,
}
fn _assert_sizes() {
// TODO: QueryNodes are too big now, 88B is a bit too big
let _: [u8; 88] = [0; std::mem::size_of::<QueryNode>()];
let _: [u8; 32] = [0; std::mem::size_of::<Edges>()];
}
impl Default for QueryGraph {
/// Create a new QueryGraph with two disconnected nodes: the root and end nodes.
fn default() -> Self {
let nodes = vec![QueryNode::Start, QueryNode::End];
let edges = vec![
Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) },
Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) },
Edges {
predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
},
Edges {
predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
},
];
Self { root_node: 0, end_node: 1, nodes, edges }
@ -46,33 +107,31 @@ impl Default for QueryGraph {
}
impl QueryGraph {
/// Connect all the given predecessor nodes to the given successor node
fn connect_to_node(&mut self, from_nodes: &[u16], to_node: u16) {
for &from_node in from_nodes {
self.edges[from_node as usize].successors.insert(to_node);
self.edges[to_node as usize].predecessors.insert(from_node);
}
}
/// Add the given node to the graph and connect it to all the given predecessor nodes
fn add_node(&mut self, from_nodes: &[u16], node: QueryNode) -> u16 {
let new_node_idx = self.nodes.len() as u16;
assert!(new_node_idx <= QUERY_GRAPH_NODE_LENGTH_LIMIT);
self.nodes.push(node);
self.edges.push(Edges {
predecessors: SmallBitmap::from_array(from_nodes, 64),
successors: SmallBitmap::new(64),
predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
});
for from_node in from_nodes {
self.edges[*from_node as usize].successors.insert(new_node_idx);
}
self.connect_to_node(from_nodes, new_node_idx);
new_node_idx
}
}
impl QueryGraph {
// TODO: return the list of all matching words here as well
/// Build the query graph from the parsed user search query.
pub fn from_query(ctx: &mut SearchContext, terms: Vec<LocatedQueryTerm>) -> Result<QueryGraph> {
// TODO: maybe empty nodes should not be removed here, to compute
// the score of the `words` ranking rule correctly
// it is very easy to traverse the graph and remove afterwards anyway
// Still, I'm keeping this here as a demo
let mut empty_nodes = vec![];
let word_set = ctx.index.words_fst(ctx.txn)?;
@ -81,7 +140,6 @@ impl QueryGraph {
let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
(vec![], vec![], vec![graph.root_node]);
// TODO: split words / synonyms
for length in 1..=terms.len() {
let query = &terms[..length];
@ -156,6 +214,8 @@ impl QueryGraph {
Ok(graph)
}
/// Remove the given nodes and all their edges from the query graph.
pub fn remove_nodes(&mut self, nodes: &[u16]) {
for &node in nodes {
self.nodes[node as usize] = QueryNode::Deleted;
@ -166,10 +226,13 @@ impl QueryGraph {
for succ in edges.successors.iter() {
self.edges[succ as usize].predecessors.remove(node);
}
self.edges[node as usize] =
Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) };
self.edges[node as usize] = Edges {
predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
};
}
}
/// Remove the given nodes, connecting all their predecessors to all their successors.
pub fn remove_nodes_keep_edges(&mut self, nodes: &[u16]) {
for &node in nodes {
self.nodes[node as usize] = QueryNode::Deleted;
@ -182,11 +245,17 @@ impl QueryGraph {
self.edges[succ as usize].predecessors.remove(node);
self.edges[succ as usize].predecessors.union(&edges.predecessors);
}
self.edges[node as usize] =
Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) };
self.edges[node as usize] = Edges {
predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
};
}
}
pub fn remove_words_at_position(&mut self, position: i8) -> bool {
/// Remove all the nodes that correspond to a word starting at the given position, and connect
/// the predecessors of these nodes to their successors.
/// Return `true` if any node was removed.
pub fn remove_words_starting_at_position(&mut self, position: i8) -> bool {
let mut nodes_to_remove_keeping_edges = vec![];
for (node_idx, node) in self.nodes.iter().enumerate() {
let node_idx = node_idx as u16;
@ -202,14 +271,15 @@ impl QueryGraph {
!nodes_to_remove_keeping_edges.is_empty()
}
/// Simplify the query graph by removing all nodes that are disconnected from
/// the start or end nodes.
fn simplify(&mut self) {
loop {
let mut nodes_to_remove = vec![];
for (node_idx, node) in self.nodes.iter().enumerate() {
if (!matches!(node, QueryNode::End | QueryNode::Deleted)
&& self.edges[node_idx].successors.is_empty())
|| (!matches!(node, QueryNode::Start | QueryNode::Deleted)
&& self.edges[node_idx].predecessors.is_empty())
if !matches!(node, QueryNode::End | QueryNode::Deleted)
&& (self.edges[node_idx].successors.is_empty()
|| self.edges[node_idx].predecessors.is_empty())
{
nodes_to_remove.push(node_idx as u16);
}

View File

@ -53,7 +53,7 @@ impl RankingRuleGraphTrait for TypoGraph {
docids |= bitmap;
}
if *nbr_typos == 0 {
if let Some(bytes) = ctx.get_prefix_docids(derivations.original)? {
if let Some(bytes) = ctx.get_word_prefix_docids(derivations.original)? {
// TODO: deserialize bitmap within a universe
let bitmap = universe
& RoaringBitmapCodec::bytes_decode(bytes)

View File

@ -114,7 +114,7 @@ pub fn apply_ranking_rules<'search>(
logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe);
ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?;
let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
let mut candidates: Vec<RoaringBitmap> = vec![RoaringBitmap::default(); ranking_rules_len];
candidates[0] = universe.clone();
let mut cur_ranking_rule_index = 0;
@ -174,7 +174,7 @@ pub fn apply_ranking_rules<'search>(
}
} else {
let candidates =
candidates.iter().take(length - results.len()).collect::<Vec<_>>();
candidates.iter().take(length - results.len()).collect::<Vec<u32>>();
logger.add_to_results(&candidates);
results.extend(&candidates);
}
@ -234,358 +234,3 @@ pub fn apply_ranking_rules<'search>(
Ok(results)
}
#[cfg(test)]
mod tests {
// use crate::allocator::ALLOC;
use std::fs::File;
use std::io::{BufRead, BufReader, Cursor, Seek};
use std::time::Instant;
use big_s::S;
use heed::EnvOpenOptions;
use maplit::hashset;
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
// use crate::search::new::logger::detailed::DetailedSearchLogger;
use crate::search::new::logger::DefaultSearchLogger;
use crate::search::new::{execute_search, SearchContext};
use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy};
#[test]
fn search_wiki_new() {
let mut options = EnvOpenOptions::new();
options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
let index = Index::new(options, "data_wiki").unwrap();
let txn = index.read_txn().unwrap();
println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len());
// loop {
let start = Instant::now();
// let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
let mut ctx = SearchContext::new(&index, &txn);
let results = execute_search(
&mut ctx,
"which a the releases from poison by the government",
None,
0,
20,
&mut DefaultSearchLogger,
// &mut logger,
)
.unwrap();
// logger.write_d2_description(&mut ctx);
let elapsed = start.elapsed();
println!("{}us", elapsed.as_micros());
let _documents = index
.documents(&txn, results.iter().copied())
.unwrap()
.into_iter()
.map(|(id, obkv)| {
let mut object = serde_json::Map::default();
for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
let value = obkv.get(fid).unwrap();
let value: serde_json::Value = serde_json::from_slice(value).unwrap();
object.insert(fid_name.to_owned(), value);
}
(id, serde_json::to_string_pretty(&object).unwrap())
})
.collect::<Vec<_>>();
println!("{}us: {:?}", elapsed.as_micros(), results);
// }
// for (id, _document) in documents {
// println!("{id}:");
// // println!("{document}");
// }
}
// Manual benchmark/debug harness: runs the old (criterion-based) search
// implementation against the pre-built `data_wiki` index and prints the
// elapsed time plus the matched docids.
#[test]
fn search_wiki_old() {
    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
    let index = Index::new(options, "data_wiki").unwrap();
    let txn = index.read_txn().unwrap();

    // Show which ranking criteria the index is configured with.
    let criteria = index.criteria(&txn).unwrap();
    println!("{:?}", criteria);

    let start = Instant::now();
    let mut search = Search::new(&txn, &index);
    search.query("which a the releases from poison by the government");
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
    let docs = search.execute().unwrap();
    let elapsed = start.elapsed();

    // Materialize every matched document as a (docid, pretty JSON) pair so
    // that any retrieval or deserialization error panics right here.
    let mut documents = Vec::new();
    for (id, obkv) in index.documents(&txn, docs.documents_ids.iter().copied()).unwrap() {
        let mut object = serde_json::Map::default();
        for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
            let raw = obkv.get(fid).unwrap();
            let value: serde_json::Value = serde_json::from_slice(raw).unwrap();
            object.insert(fid_name.to_owned(), value);
        }
        documents.push((id, serde_json::to_string_pretty(&object).unwrap()));
    }

    println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
    for (id, _document) in documents {
        println!("{id}:");
        // println!("{_document}");
    }
}
// Manual benchmark/debug harness for the new search implementation, run
// against a pre-built index in the `data_movies` directory (must exist on
// disk). Unlike `search_wiki_new`, this one uses the `DetailedSearchLogger`
// and dumps a d2 description of the search execution into the `log` directory.
#[test]
fn search_movies_new() {
    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
    let index = Index::new(options, "data_movies").unwrap();
    let txn = index.read_txn().unwrap();
    // Toggled-off scaffolding to resolve the primary key, used by the
    // commented-out external-id extraction below.
    // let primary_key = index.primary_key(&txn).unwrap().unwrap();
    // let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
    // loop {
    let start = Instant::now();
    let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
    let mut ctx = SearchContext::new(&index, &txn);
    // The `None, 0, 20` arguments are presumably filter/offset/limit —
    // TODO(review): confirm against the `execute_search` signature.
    let results = execute_search(
        &mut ctx,
        "releases from poison by the government",
        None,
        0,
        20,
        // &mut DefaultSearchLogger,
        &mut logger,
    )
    .unwrap();
    // Write the recorded execution trace as a d2 diagram description.
    logger.write_d2_description(&mut ctx);
    let elapsed = start.elapsed();
    // Toggled-off: map each matched document back to its external
    // (primary-key) id for easier inspection.
    // let ids = index
    //     .documents(&txn, results.iter().copied())
    //     .unwrap()
    //     .into_iter()
    //     .map(|x| {
    //         let obkv = &x.1;
    //         let id = obkv.get(primary_key).unwrap();
    //         let id: serde_json::Value = serde_json::from_slice(id).unwrap();
    //         id.as_str().unwrap().to_owned()
    //     })
    //     .collect::<Vec<_>>();
    println!("{}us: {results:?}", elapsed.as_micros());
    // println!("external ids: {ids:?}");
    // }
}
// Manual benchmark/debug harness for the old (criterion-based) search on the
// pre-built `data_movies` index; prints the matched internal docids and their
// external (primary-key) ids.
// NOTE(review): the query here differs from `search_movies_new`
// ("which a the releases..." vs "releases...") — confirm whether the old/new
// harnesses are meant to be directly comparable.
#[test]
fn search_movies_old() {
    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
    let index = Index::new(options, "data_movies").unwrap();
    let txn = index.read_txn().unwrap();
    // Show which ranking criteria the index is configured with.
    let rr = index.criteria(&txn).unwrap();
    println!("{rr:?}");
    // Resolve the primary key name to its field id so that external ids can
    // be read back out of the stored documents below.
    let primary_key = index.primary_key(&txn).unwrap().unwrap();
    let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
    let start = Instant::now();
    let mut s = Search::new(&txn, &index);
    s.query("which a the releases from poison by the government");
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
    let docs = s.execute().unwrap();
    let elapsed = start.elapsed();
    // Extract the external (primary-key) id of every matched document;
    // panics if an id is missing or is not a JSON string.
    let ids = index
        .documents(&txn, docs.documents_ids.iter().copied())
        .unwrap()
        .into_iter()
        .map(|x| {
            let obkv = &x.1;
            let id = obkv.get(primary_key).unwrap();
            let id: serde_json::Value = serde_json::from_slice(id).unwrap();
            id.as_str().unwrap().to_owned()
        })
        .collect::<Vec<_>>();
    println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
    println!("external ids: {ids:?}");
}
// One-shot helper that rewrites the settings of the local `data_movies`
// index. The leading underscore signals it is not a real test, but the
// `#[test]` attribute still makes the harness discover and run it.
#[test]
fn _settings_movies() {
    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
    let index = Index::new(options, "data_movies").unwrap();
    let mut wtxn = index.write_txn().unwrap();
    let config = IndexerConfig::default();
    let mut builder = Settings::new(&mut wtxn, &index, &config);
    // Raise the typo thresholds: one typo from 5 chars, two typos from
    // 100 chars — presumably to disable two-typo matches for realistic
    // word lengths (TODO confirm the semantics of these settings).
    builder.set_min_word_len_one_typo(5);
    builder.set_min_word_len_two_typos(100);
    builder.set_sortable_fields(hashset! { S("release_date") });
    builder.set_criteria(vec![
        Criterion::Words,
        Criterion::Typo,
        Criterion::Proximity,
        Criterion::Asc("release_date".to_owned()),
    ]);
    // No progress callback, never abort.
    builder.execute(|_| (), || false).unwrap();
    wtxn.commit().unwrap();
}
// One-shot helper that builds the local `data_movies` index from the movies
// benchmark dataset: applies the settings, then indexes the documents.
// Underscore-prefixed like `_settings_movies`, but still runnable via the
// test harness.
// NOTE(review): the dataset path below is machine-specific
// (/Users/meilisearch/...) — this only works on that machine.
#[test]
fn _index_movies() {
    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
    let index = Index::new(options, "data_movies").unwrap();
    let mut wtxn = index.write_txn().unwrap();
    let primary_key = "id";
    let searchable_fields = vec!["title", "overview"];
    let filterable_fields = vec!["release_date", "genres"];
    // First transaction step: configure the index settings.
    let config = IndexerConfig::default();
    let mut builder = Settings::new(&mut wtxn, &index, &config);
    builder.set_primary_key(primary_key.to_owned());
    let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);
    let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
    builder.set_filterable_fields(filterable_fields);
    builder.set_min_word_len_one_typo(5);
    builder.set_min_word_len_two_typos(100);
    builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
    builder.execute(|_| (), || false).unwrap();
    // Second step: index the dataset documents in the same transaction.
    let config = IndexerConfig::default();
    let indexing_config = IndexDocumentsConfig::default();
    let builder =
        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
            .unwrap();
    let documents = documents_from(
        "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json",
        "json",
    );
    let (builder, user_error) = builder.add_documents(documents).unwrap();
    user_error.unwrap();
    builder.execute().unwrap();
    wtxn.commit().unwrap();
    index.prepare_for_closing().wait();
}
// One-shot helper that builds the local `data_wiki` index from the wiki
// benchmark dataset (CSV). Underscore-prefixed like `_index_movies`, but
// still runnable via the test harness.
// NOTE(review): the dataset path below is machine-specific
// (/Users/meilisearch/...) — this only works on that machine.
#[test]
fn _index_wiki() {
    let mut options = EnvOpenOptions::new();
    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
    let index = Index::new(options, "data_wiki").unwrap();
    let mut wtxn = index.write_txn().unwrap();
    // No primary key is set here; docids are autogenerated below instead.
    // let primary_key = "id";
    let searchable_fields = vec!["body", "title", "url"];
    // let filterable_fields = vec![];
    let config = IndexerConfig::default();
    let mut builder = Settings::new(&mut wtxn, &index, &config);
    // builder.set_primary_key(primary_key.to_owned());
    let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
    builder.set_searchable_fields(searchable_fields);
    // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
    // builder.set_filterable_fields(filterable_fields);
    // builder.set_min_word_len_one_typo(5);
    // builder.set_min_word_len_two_typos(100);
    builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
    builder.execute(|_| (), || false).unwrap();
    let config = IndexerConfig::default();
    // `autogenerate_docids` because the CSV rows carry no primary key.
    let indexing_config =
        IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
    let builder =
        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
            .unwrap();
    let documents = documents_from(
        "/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv",
        "csv",
    );
    let (builder, user_error) = builder.add_documents(documents).unwrap();
    user_error.unwrap();
    builder.execute().unwrap();
    wtxn.commit().unwrap();
    index.prepare_for_closing().wait();
}
/// Read the dataset file at `filename` and convert it into a
/// `DocumentsBatchReader`.
///
/// `filetype` selects the parser and must be `"csv"`, `"json"`, or `"jsonl"`.
/// Panics on an unknown `filetype`, an unopenable file, or malformed content
/// (acceptable here: this is test-only tooling).
fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
    let reader = File::open(filename)
        // Include the underlying I/O error: "not found" is only one of the
        // possible causes (permission denied, it's a directory, ...).
        .unwrap_or_else(|e| panic!("could not open the dataset at {}: {}", filename, e));
    let reader = BufReader::new(reader);
    let documents = match filetype {
        "csv" => documents_from_csv(reader).unwrap(),
        "json" => documents_from_json(reader).unwrap(),
        "jsonl" => documents_from_jsonl(reader).unwrap(),
        otherwise => panic!("invalid update format {:?}", otherwise),
    };
    DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
}
/// Convert an NDJSON (one JSON object per line) stream into the serialized
/// bytes of a documents batch. Panics on malformed JSON (test-only tooling).
fn documents_from_jsonl(reader: impl BufRead) -> crate::Result<Vec<u8>> {
    let mut builder = DocumentsBatchBuilder::new(Vec::new());
    let stream = serde_json::Deserializer::from_reader(reader).into_iter::<Object>();
    for entry in stream {
        let object = entry.unwrap();
        builder.append_json_object(&object)?;
    }
    builder.into_inner().map_err(Into::into)
}
/// Convert a stream containing one JSON array of objects into the serialized
/// bytes of a documents batch.
fn documents_from_json(reader: impl BufRead) -> crate::Result<Vec<u8>> {
    let mut builder = DocumentsBatchBuilder::new(Vec::new());
    builder.append_json_array(reader)?;
    builder.into_inner().map_err(Into::into)
}
/// Convert a CSV stream (header row = field names) into the serialized bytes
/// of a documents batch.
fn documents_from_csv(reader: impl BufRead) -> crate::Result<Vec<u8>> {
    let mut builder = DocumentsBatchBuilder::new(Vec::new());
    builder.append_csv(csv::Reader::from_reader(reader))?;
    builder.into_inner().map_err(Into::into)
}
}

View File

@ -46,7 +46,7 @@ impl<'search> SearchContext<'search> {
}
}
if *use_prefix_db {
if let Some(prefix_docids) = self.get_prefix_docids(*original)? {
if let Some(prefix_docids) = self.get_word_prefix_docids(*original)? {
or_docids.push(prefix_docids);
}
}

View File

@ -88,7 +88,8 @@ impl<'search> RankingRule<'search, QueryGraph> for Words {
break;
} else {
let position_to_remove = self.positions_to_remove.pop().unwrap();
let did_delete_any_node = query_graph.remove_words_at_position(position_to_remove);
let did_delete_any_node =
query_graph.remove_words_starting_at_position(position_to_remove);
if did_delete_any_node {
break;
}