diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs
index 100dae90a..7e68ec5e5 100644
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@@ -1,15 +1,21 @@
 use std::collections::hash_map::Entry;
+use std::hash::Hash;
 use fxhash::FxHashMap;
 use heed::types::ByteSlice;
+use heed::{BytesEncode, Database, RoTxn};
 use super::interner::Interned;
 use super::SearchContext;
 use crate::Result;
+/// A cache storing pointers to values in the LMDB databases.
+///
+/// Used for performance reasons only. By using this cache, we avoid performing a
+/// database lookup and instead get a direct reference to the value using a fast
+/// local HashMap lookup.
 #[derive(Default)]
 pub struct DatabaseCache<'search> {
-    // TODO: interner for all database cache keys?
     pub word_pair_proximity_docids:
         FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
     pub word_prefix_pair_proximity_docids:
@@ -21,36 +27,50 @@ pub struct DatabaseCache<'search> {
     pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
 }
 impl<'search> SearchContext<'search> {
-    pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
-        let bitmap_ptr = match self.db_cache.word_docids.entry(word) {
+    fn get_value<'v, K1, KC>(
+        txn: &'search RoTxn,
+        cache_key: K1,
+        db_key: &'v KC::EItem,
+        cache: &mut FxHashMap<K1, Option<&'search [u8]>>,
+        db: Database<KC, ByteSlice>,
+    ) -> Result<Option<&'search [u8]>>
+    where
+        K1: Copy + Eq + Hash,
+        KC: BytesEncode<'v>,
+    {
+        let bitmap_ptr = match cache.entry(cache_key) {
             Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
             Entry::Vacant(entry) => {
-                let bitmap_ptr = self
-                    .index
-                    .word_docids
-                    .remap_data_type::<ByteSlice>()
-                    .get(self.txn, self.word_interner.get(word))?;
+                let bitmap_ptr = db.get(txn, db_key)?;
                 entry.insert(bitmap_ptr);
                 bitmap_ptr
             }
         };
         Ok(bitmap_ptr)
     }
-    pub fn get_prefix_docids(&mut self, prefix: Interned<String>) -> Result<Option<&'search [u8]>> {
-        // In the future, this will be a frozen roaring bitmap
-        let bitmap_ptr = match self.db_cache.word_prefix_docids.entry(prefix) {
-            Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
-            Entry::Vacant(entry) => {
-                let bitmap_ptr = self
-                    .index
-                    .word_prefix_docids
-                    .remap_data_type::<ByteSlice>()
-                    .get(self.txn, self.word_interner.get(prefix))?;
-                entry.insert(bitmap_ptr);
-                bitmap_ptr
-            }
-        };
-        Ok(bitmap_ptr)
+
+    /// Retrieve or insert the given value in the `word_docids` database.
+    pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
+        Self::get_value(
+            self.txn,
+            word,
+            self.word_interner.get(word).as_str(),
+            &mut self.db_cache.word_docids,
+            self.index.word_docids.remap_data_type::<ByteSlice>(),
+        )
+    }
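+
+    // All of the getters below go through `get_value`. As an illustrative
+    // sketch (assuming `w: Interned<String>` was obtained from
+    // `self.word_interner`):
+    //
+    //     let docids: Option<&[u8]> = ctx.get_word_docids(w)?;
+    //
+    // The first call for a given `w` performs the LMDB lookup; later calls
+    // are answered directly from `db_cache.word_docids`.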
+    /// Retrieve or insert the given value in the `word_prefix_docids` database.
+    pub fn get_word_prefix_docids(
+        &mut self,
+        prefix: Interned<String>,
+    ) -> Result<Option<&'search [u8]>> {
+        Self::get_value(
+            self.txn,
+            prefix,
+            self.word_interner.get(prefix).as_str(),
+            &mut self.db_cache.word_prefix_docids,
+            self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
+        )
     }
     pub fn get_word_pair_proximity_docids(
@@ -59,40 +79,17 @@ impl<'search> SearchContext<'search> {
         word2: Interned<String>,
         proximity: u8,
     ) -> Result<Option<&'search [u8]>> {
-        let key = (proximity, word1, word2);
-        match self.db_cache.word_pair_proximity_docids.entry(key) {
-            Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
-            Entry::Vacant(entry) => {
-                // We shouldn't greedily access this DB at all
-                // a DB (w1, w2) -> [proximities] would be much better
-                // We could even have a DB that is (w1) -> set of words such that (w1, w2) are in proximity
-                // And if we worked with words encoded as integers, the set of words could be a roaring bitmap
-                // Then, to find all the proximities between two list of words, we'd do:
-
-                // inputs:
-                //    - words1 (roaring bitmap)
-                //    - words2 (roaring bitmap)
-                // output:
-                //    - [(word1, word2, [proximities])]
-                // algo:
-                //  let mut ouput = vec![];
-                //  for word1 in words1 {
-                //      let all_words_in_proximity_of_w1 = pair_words_db.get(word1);
-                //      let words_in_proximity_of_w1 = all_words_in_proximity_of_w1 & words2;
-                //      for word2 in words_in_proximity_of_w1 {
-                //          let proximties = prox_db.get(word1, word2);
-                //          output.push(word1, word2, proximities);
-                //      }
-                //  }
-                let bitmap_ptr =
-                    self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().get(
-                        self.txn,
-                        &(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
-                    )?;
-                entry.insert(bitmap_ptr);
-                Ok(bitmap_ptr)
-            }
-        }
+        Self::get_value(
+            self.txn,
+            (proximity, word1, word2),
+            &(
+                proximity,
+                self.word_interner.get(word1).as_str(),
+                self.word_interner.get(word2).as_str(),
+            ),
+            &mut self.db_cache.word_pair_proximity_docids,
+            self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
+        )
     }
     pub fn get_word_prefix_pair_proximity_docids(
@@ -101,22 +98,17 @@ impl<'search> SearchContext<'search> {
         prefix2: Interned<String>,
         proximity: u8,
     ) -> Result<Option<&'search [u8]>> {
-        let key = (proximity, word1, prefix2);
-        match self.db_cache.word_prefix_pair_proximity_docids.entry(key) {
-            Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
-            Entry::Vacant(entry) => {
-                let bitmap_ptr = self
-                    .index
-                    .word_prefix_pair_proximity_docids
-                    .remap_data_type::<ByteSlice>()
-                    .get(
-                        self.txn,
-                        &(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
-                    )?;
-                entry.insert(bitmap_ptr);
-                Ok(bitmap_ptr)
-            }
-        }
+        Self::get_value(
+            self.txn,
+            (proximity, word1, prefix2),
+            &(
+                proximity,
+                self.word_interner.get(word1).as_str(),
+                self.word_interner.get(prefix2).as_str(),
+            ),
+            &mut self.db_cache.word_prefix_pair_proximity_docids,
+            self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
+        )
     }
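+
+    // The cache keys mirror the LMDB key layout: interned ids on the cache
+    // side, `(u8, &str, &str)` once encoded for the database. E.g. (sketch,
+    // assuming interned words `w1` and `w2`), the documents where `w1` is at
+    // proximity 1 from `w2`:
+    //
+    //     let docids = ctx.get_word_pair_proximity_docids(w1, w2, 1)?;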
     pub fn get_prefix_word_pair_proximity_docids(
         &mut self,
@@ -124,25 +116,16 @@ impl<'search> SearchContext<'search> {
         right: Interned<String>,
         proximity: u8,
     ) -> Result<Option<&'search [u8]>> {
-        let key = (proximity, left_prefix, right);
-        match self.db_cache.prefix_word_pair_proximity_docids.entry(key) {
-            Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
-            Entry::Vacant(entry) => {
-                let bitmap_ptr = self
-                    .index
-                    .prefix_word_pair_proximity_docids
-                    .remap_data_type::<ByteSlice>()
-                    .get(
-                        self.txn,
-                        &(
-                            proximity,
-                            self.word_interner.get(left_prefix),
-                            self.word_interner.get(right),
-                        ),
-                    )?;
-                entry.insert(bitmap_ptr);
-                Ok(bitmap_ptr)
-            }
-        }
+        Self::get_value(
+            self.txn,
+            (proximity, left_prefix, right),
+            &(
+                proximity,
+                self.word_interner.get(left_prefix).as_str(),
+                self.word_interner.get(right).as_str(),
+            ),
+            &mut self.db_cache.prefix_word_pair_proximity_docids,
+            self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
+        )
     }
 }
diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs
index 1d17c32a8..2cedbffa5 100644
--- a/milli/src/search/new/graph_based_ranking_rule.rs
+++ b/milli/src/search/new/graph_based_ranking_rule.rs
@@ -1,3 +1,41 @@
+/*! Implementation of a generic graph-based ranking rule.
+
+A graph-based ranking rule is a ranking rule that works by representing
+its possible operations and their relevancy cost as a directed acyclic multi-graph
+built on top of the query graph. It then computes its buckets by finding the
+cheapest paths from the start node to the end node and computing the document ids
+that satisfy those paths.
+
+For example, the proximity ranking rule builds a graph where the edges between two
+nodes represent a condition that the term of the source node is in a certain proximity
+to the term of the destination node. With the query "pretty house by" where the term
+"pretty" has three possible proximities to the term "house" and "house" has two
+proximities to "by", the graph will look like this:
+
+```txt
+┌───────┐     ┌───────┐─────1────▶┌───────┐──1──▶┌─────┐    ┌───────┐
+│ START │──0─▶│pretty │─────2────▶│ house │      │ by  │─0─▶│  END  │
+└───────┘     └───────┘─────3────▶└───────┘──2──▶└─────┘    └───────┘
+```
+The proximity ranking rule's first bucket will be determined by the union of all
+the shortest paths from START to END, which in this case is:
+```txt
+START --0-> pretty --1--> house --1--> by --0--> END
+```
+The path's corresponding document ids are found by taking the intersection of the
+document ids of each edge. That is, we find the documents where both `pretty` is
+1-close to `house` AND `house` is 1-close to `by`.
+
+For the second bucket, we get the union of the second-cheapest paths, which are:
+```txt
+START --0-> pretty --1--> house --2--> by --0--> END
+START --0-> pretty --2--> house --1--> by --0--> END
+```
+That is, we find the documents where either:
+- `pretty` is 1-close to `house` AND `house` is 2-close to `by`
+- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by`
+*/
+
 use roaring::RoaringBitmap;
 use super::logger::SearchLogger;
@@ -8,24 +46,38 @@ use super::small_bitmap::SmallBitmap;
 use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
 use crate::Result;
+/// A generic graph-based ranking rule
 pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
     id: String,
+    // When the ranking rule is not iterating over its buckets,
+    // its state is `None`.
     state: Option<GraphBasedRankingRuleState<G>>,
 }
 impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
+    /// Creates the ranking rule with the given identifier
     pub fn new(id: String) -> Self {
         Self { id, state: None }
     }
 }
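+
+// For example (sketch): the proximity and typo rules are both instances of
+// this generic rule, parameterized by their graph type:
+//
+//     let proximity = GraphBasedRankingRule::<ProximityGraph>::new("proximity".to_owned());
+//     let typo = GraphBasedRankingRule::<TypoGraph>::new("typo".to_owned());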
+/// The internal state of a graph-based ranking rule during iteration
 pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
+    /// The current graph
     graph: RankingRuleGraph<G>,
+    /// Cache to retrieve the docids associated with each edge
     edge_docids_cache: EdgeDocidsCache<G>,
+    /// Cache used to optimistically discard paths that resolve to no documents.
     empty_paths_cache: EmptyPathsCache,
+    /// A structure giving the list of possible costs from each node to the end node,
+    /// along with a set of unavoidable edges that must be traversed to achieve that distance.
     all_distances: Vec<Vec<(u16, SmallBitmap)>>,
+    /// An index into `all_distances[root_node]`, giving the cost of the next bucket
    cur_distance_idx: usize,
 }
+/// Traverse each edge of the graph, compute its associated document ids,
+/// and remove the edge from the graph if its docids are disjoint with the
+/// given universe.
 fn remove_empty_edges<'search, G: RankingRuleGraphTrait>(
     ctx: &mut SearchContext<'search>,
     graph: &mut RankingRuleGraph<G>,
@@ -70,6 +122,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
         let mut edge_docids_cache = EdgeDocidsCache::default();
         let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len() as u16);
+        // First simplify the graph as much as possible, by computing the docids of the edges
+        // within the rule's universe and removing the edges that have no associated docids.
         remove_empty_edges(
             ctx,
             &mut graph,
@@ -77,6 +131,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
             universe,
             &mut empty_paths_cache,
         )?;
+
+        // Then pre-compute the cost of all paths from each node to the end node
         let all_distances = graph.initialize_distances_with_necessary_edges();
         let state = GraphBasedRankingRuleState {
@@ -98,9 +154,14 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
         logger: &mut dyn SearchLogger<QueryGraph>,
         universe: &RoaringBitmap,
     ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
+        // If universe.len() <= 1, the bucket sort algorithm
+        // should not have called this function.
         assert!(universe.len() > 1);
+        // Will crash if `next_bucket` is called before `start_iteration` or after
+        // `end_iteration`, which should never happen.
         let mut state = self.state.take().unwrap();
+        // TODO: does this have a real positive performance benefit?
         remove_empty_edges(
             ctx,
             &mut state.graph,
@@ -109,12 +170,16 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
             &mut state.empty_paths_cache,
         )?;
+        // If `cur_distance_idx` does not point to a valid cost in the `all_distances`
+        // structure, then we have computed all the buckets and can return.
         if state.cur_distance_idx
             >= state.all_distances[state.graph.query_graph.root_node as usize].len()
         {
             self.state = None;
             return Ok(None);
         }
+
+        // Retrieve the cost of the paths to compute
         let (cost, _) =
             state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx];
         state.cur_distance_idx += 1;
@@ -129,22 +194,38 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
             cur_distance_idx: _,
         } = &mut state;
-        let mut paths = vec![];
         let original_universe = universe;
         let mut universe = universe.clone();
         // TODO: remove this unnecessary clone
         let original_graph = graph.clone();
+        // and this vector as well
+        let mut paths = vec![];
+
+        // For each path of the given cost, we will compute its associated
+        // document ids.
+        // In case the path does not resolve to any document id, we try to figure out why
+        // and update the `empty_paths_cache` accordingly.
+        // For example, it may be that the path is empty because one of its edges is disjoint
+        // with the universe, or because a prefix of the path is disjoint with the universe, or because
+        // the path contains two edges that are disjoint from each other within the universe.
+        // Updating the `empty_paths_cache` helps speed up the execution of `visit_paths_of_cost` and reduces
+        // the number of future candidate paths given by that same function.
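+        // In pseudo-Rust, the visit below amounts to the following sketch,
+        // where `paths_of_cost` and `edge_docids` stand in for the actual calls:
+        //
+        //     for path in paths_of_cost(cost) {
+        //         let mut path_docids = universe.clone();
+        //         for edge in path { path_docids &= edge_docids(edge); }
+        //         bucket |= &path_docids;  // union over all paths of this cost
+        //         universe -= path_docids; // shrink the universe as we go
+        //     }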
         graph.visit_paths_of_cost(
             graph.query_graph.root_node as usize,
             cost,
             all_distances,
             empty_paths_cache,
             |path, graph, empty_paths_cache| {
+                // Accumulate the path for logging purposes only
                 paths.push(path.to_vec());
                 let mut path_docids = universe.clone();
+
+                // We store the edges and their docids in vectors in case the path turns out to be
+                // empty and we need to figure out why it was empty.
                 let mut visited_edges = vec![];
                 let mut cached_edge_docids = vec![];
+
                 for &edge_index in path {
                     visited_edges.push(edge_index);
                     let edge_docids =
@@ -154,21 +235,29 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
                         BitmapOrAllRef::All => continue,
                     };
                     cached_edge_docids.push((edge_index, edge_docids.clone()));
+
+                    // If the edge is empty, then the path will be empty as well; we update the graph
+                    // and caches accordingly and skip to the next candidate path.
                     if edge_docids.is_disjoint(&universe) {
                         // 1. Store in the cache that this edge is empty for this universe
                         empty_paths_cache.forbid_edge(edge_index);
                         // 2. remove this edge from the ranking rule graph
                         graph.remove_edge(edge_index);
+                        // 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore
                         edge_docids_cache.cache.remove(&edge_index);
                         return Ok(());
                     }
                     path_docids &= edge_docids;
+                    // If the (sub)path is empty, we try to figure out why and update the caches accordingly.
                     if path_docids.is_disjoint(&universe) {
-                        // empty_paths_cache.forbid_prefix(&visited_edges);
-                        // if the intersection between this edge and any
+                        // First, we know that this path is empty, and thus any path
+                        // that is a superset of it will also be empty.
+                        empty_paths_cache.forbid_prefix(&visited_edges);
+                        // Second, if the intersection between this edge and any
                         // previous one is disjoint with the universe,
-                        // then we add these two edges to the empty_path_cache
+                        // then we also know that any path containing the same couple of
+                        // edges will also be empty.
                         for (edge_index2, edge_docids2) in
                             cached_edge_docids[..cached_edge_docids.len() - 1].iter()
                         {
@@ -181,6 +270,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
                     }
                 }
                 bucket |= &path_docids;
+                // Reduce the size of the universe so that we can more optimistically discard candidate paths
                 universe -= path_docids;
                 Ok(())
             },
@@ -196,6 +286,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
             logger,
         );
+        // TODO: Graph-based ranking rules do not (yet) modify the query graph. We could, however,
+        // remove nodes and/or terms within nodes that weren't present in any of the paths.
         let next_query_graph = state.graph.query_graph.clone();
         self.state = Some(state);
diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs
index 8a8fad1e1..e68f3b949 100644
--- a/milli/src/search/new/interner.rs
+++ b/milli/src/search/new/interner.rs
@@ -3,6 +3,7 @@ use std::marker::PhantomData;
 use fxhash::FxHashMap;
+/// An index within an [`Interner<T>`] structure.
 pub struct Interned<T> {
     idx: u32,
     _phantom: PhantomData<T>,
 }
@@ -13,7 +14,10 @@ impl<T> Interned<T> {
         Self { idx, _phantom: PhantomData }
     }
 }
-
+/// An [`Interner<T>`] is used to store a unique copy of a value of type `T`. This value
+/// is then identified by a lightweight index of type [`Interned<T>`], which can
+/// be copied, compared, and hashed efficiently. An immutable reference to the original value
+/// can be retrieved using `self.get(interned)`.
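+///
+/// For example (an illustrative sketch: it assumes a `Default` impl and an
+/// `insert` method, which this excerpt does not show):
+///
+/// ```txt
+/// let mut words: Interner<String> = Interner::default();
+/// let a: Interned<String> = words.insert("house".to_owned());
+/// let b: Interned<String> = words.insert("house".to_owned());
+/// assert!(a == b);                  // same value => same lightweight index
+/// assert!(words.get(a) == "house"); // retrieve the original value
+/// ```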
 pub struct Interner<T> {
     stable_store: Vec<T>,
     lookup: FxHashMap<T, Interned<T>>,
diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs
index 11e1389d0..d4d64f844 100644
--- a/milli/src/search/new/logger/mod.rs
+++ b/milli/src/search/new/logger/mod.rs
@@ -7,7 +7,82 @@ use super::ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGraph, TypoGraph};
 use super::small_bitmap::SmallBitmap;
 use super::{RankingRule, RankingRuleQueryTrait};
+/// Trait for a structure that logs the execution of a search query.
+pub trait SearchLogger<Q: RankingRuleQueryTrait> {
+    /// Logs the initial query
+    fn initial_query(&mut self, query: &Q);
+
+    /// Logs the query that was used to compute the set of all candidates
+    fn query_for_universe(&mut self, query: &Q);
+
+    /// Logs the value of the initial set of all candidates
+    fn initial_universe(&mut self, universe: &RoaringBitmap);
+
+    /// Logs the ranking rules used to perform the search query
+    fn ranking_rules<'transaction>(&mut self, rr: &[&mut dyn RankingRule<'transaction, Q>]);
+
+    /// Logs the start of a ranking rule's iteration.
+    fn start_iteration_ranking_rule<'transaction>(
+        &mut self,
+        ranking_rule_idx: usize,
+        ranking_rule: &dyn RankingRule<'transaction, Q>,
+        query: &Q,
+        universe: &RoaringBitmap,
+    );
+    /// Logs the end of the computation of a ranking rule bucket
+    fn next_bucket_ranking_rule<'transaction>(
+        &mut self,
+        ranking_rule_idx: usize,
+        ranking_rule: &dyn RankingRule<'transaction, Q>,
+        universe: &RoaringBitmap,
+        candidates: &RoaringBitmap,
+    );
+    /// Logs the skipping of a ranking rule bucket
+    fn skip_bucket_ranking_rule<'transaction>(
+        &mut self,
+        ranking_rule_idx: usize,
+        ranking_rule: &dyn RankingRule<'transaction, Q>,
+        candidates: &RoaringBitmap,
+    );
+    /// Logs the end of a ranking rule's iteration.
+    fn end_iteration_ranking_rule<'transaction>(
+        &mut self,
+        ranking_rule_idx: usize,
+        ranking_rule: &dyn RankingRule<'transaction, Q>,
+        universe: &RoaringBitmap,
+    );
+    /// Logs the addition of document ids to the final results
+    fn add_to_results(&mut self, docids: &[u32]);
+
+    /// Logs the internal state of the words ranking rule
+    fn log_words_state(&mut self, query_graph: &Q);
+
+    /// Logs the internal state of the proximity ranking rule
+    fn log_proximity_state(
+        &mut self,
+        query_graph: &RankingRuleGraph<ProximityGraph>,
+        paths: &[Vec<u16>],
+        empty_paths_cache: &EmptyPathsCache,
+        universe: &RoaringBitmap,
+        distances: Vec<Vec<(u16, SmallBitmap)>>,
+        cost: u16,
+    );
+
+    /// Logs the internal state of the typo ranking rule
+    fn log_typo_state(
+        &mut self,
+        query_graph: &RankingRuleGraph<TypoGraph>,
+        paths: &[Vec<u16>],
+        empty_paths_cache: &EmptyPathsCache,
+        universe: &RoaringBitmap,
+        distances: Vec<Vec<(u16, SmallBitmap)>>,
+        cost: u16,
+    );
+}
+
+/// A dummy [`SearchLogger`] which does nothing.
 pub struct DefaultSearchLogger;
+
 impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
     fn initial_query(&mut self, _query: &Q) {}
@@ -76,63 +151,3 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
     ) {
     }
 }
-
-pub trait SearchLogger<Q: RankingRuleQueryTrait> {
-    fn initial_query(&mut self, query: &Q);
-
-    fn query_for_universe(&mut self, query: &Q);
-
-    fn initial_universe(&mut self, universe: &RoaringBitmap);
-
-    fn ranking_rules<'transaction>(&mut self, rr: &[&mut dyn RankingRule<'transaction, Q>]);
-
-    fn start_iteration_ranking_rule<'transaction>(
-        &mut self,
-        ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, Q>,
-        query: &Q,
-        universe: &RoaringBitmap,
-    );
-    fn next_bucket_ranking_rule<'transaction>(
-        &mut self,
-        ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, Q>,
-        universe: &RoaringBitmap,
-        candidates: &RoaringBitmap,
-    );
-    fn skip_bucket_ranking_rule<'transaction>(
-        &mut self,
-        ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, Q>,
-        candidates: &RoaringBitmap,
-    );
-    fn end_iteration_ranking_rule<'transaction>(
-        &mut self,
-        ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, Q>,
-        universe: &RoaringBitmap,
-    );
-    fn add_to_results(&mut self, docids: &[u32]);
-
-    fn log_words_state(&mut self, query_graph: &Q);
-
-    fn log_proximity_state(
-        &mut self,
-        query_graph: &RankingRuleGraph<ProximityGraph>,
-        paths: &[Vec<u16>],
-        empty_paths_cache: &EmptyPathsCache,
-        universe: &RoaringBitmap,
-        distances: Vec<Vec<(u16, SmallBitmap)>>,
-        cost: u16,
-    );
-
-    fn log_typo_state(
-        &mut self,
-        query_graph: &RankingRuleGraph<TypoGraph>,
-        paths: &[Vec<u16>],
-        empty_paths_cache: &EmptyPathsCache,
-        universe: &RoaringBitmap,
-        distances: Vec<Vec<(u16, SmallBitmap)>>,
-        cost: u16,
-    );
-}
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 235075580..f2cc7d5f4 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -88,7 +88,7 @@ fn resolve_maximally_reduced_query_graph<'search>(
             break;
         } else {
             let position_to_remove = positions_to_remove.pop().unwrap();
-            let _ = graph.remove_words_at_position(position_to_remove);
+            let _ = graph.remove_words_starting_at_position(position_to_remove);
         }
     }
     logger.query_for_universe(&graph);
diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs
index b879b2c15..88d1849e3 100644
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@@ -3,6 +3,17 @@ use super::small_bitmap::SmallBitmap;
 use super::SearchContext;
 use crate::Result;
+const QUERY_GRAPH_NODE_LENGTH_LIMIT: u16 = 64;
+
+/// A node of the [`QueryGraph`].
+///
+/// There are four types of nodes:
+/// 1. `Start` : unique, represents the start of the query
+/// 2. `End` : unique, represents the end of the query
+/// 3. `Deleted` : represents a node that was deleted.
+///    All deleted nodes are unreachable from the start node.
+/// 4. `Term` : a regular node representing a word or combination of words
+///    from the user query.
 #[derive(Clone)]
 pub enum QueryNode {
     Term(LocatedQueryTerm),
@@ -11,34 +22,84 @@ pub enum QueryNode {
     End,
 }
+/// The edges associated with a node in the query graph.
 #[derive(Clone)]
 pub struct Edges {
-    // TODO: use a tiny bitset instead, something like a simple Vec<u8> where most queries will see a vector of one element
+    /// Set of nodes which have an edge going to the current node
     pub predecessors: SmallBitmap,
+    /// Set of nodes which are reached by an edge from the current node
     pub successors: SmallBitmap,
 }
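+
+// Invariant maintained throughout (sketch): for any two nodes `n` and `m`,
+// `edges[n].successors.contains(m)` if and only if
+// `edges[m].predecessors.contains(n)`; `connect_to_node` below updates both
+// sides at once.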
+/**
+A graph representing all the ways to interpret the user's search query.
+
+## Important
+At the moment, a query graph has a hardcoded limit of [`QUERY_GRAPH_NODE_LENGTH_LIMIT`] nodes.
+
+## Example 1
+For the search query `sunflower`, we need to register the following things:
+- we need to look for the exact word `sunflower`
+- but also any word which is 1 or 2 typos apart from `sunflower`
+- and every word that contains the prefix `sunflower`
+- and also the couple of adjacent words `sun flower`
+- as well as all the user-defined synonyms of `sunflower`
+
+All these derivations of a word will be stored in [`WordDerivations`].
+
+## Example 2
+For the search query `summer house by`.
+
+We also look for all word derivations of each term, and we need to consider
+the potential n-grams `summerhouse`, `summerhouseby`, and `houseby`.
+Furthermore, we need to know which words these n-grams replace. This is done by creating the
+following graph, where each node also contains a list of derivations:
+```txt
+                        ┌───────┐
+                      ┌─│houseby│──────────┐
+                      │ └───────┘          │
+┌───────┐   ┌───────┐ │ ┌───────┐   ┌────┐ │ ┌───────┐
+│ START │─┬─│summer │─┴─│ house │─┬─│ by │─┼─│  END  │
+└───────┘ │ └───────┘   └───────┘ │ └────┘ │ └───────┘
+          │ ┌────────────┐        │        │
+          ├─│summerhouse │────────┘        │
+          │ └────────────┘                 │
+          │ ┌─────────────┐                │
+          └─│summerhouseby│────────────────┘
+            └─────────────┘
+```
+Note also that each node has a range of positions associated with it,
+such that `summer` is known to be a word at the positions `0..=0` and `houseby`
+is registered with the positions `1..=2`. When two nodes are connected by an edge,
+it means that they are potentially next to each other in the user's search query
+(depending on the [`TermsMatchingStrategy`](crate::search::TermsMatchingStrategy)
+and the transformations that were done on the query graph).
+*/
 #[derive(Clone)]
 pub struct QueryGraph {
+    /// The index of the start node within `self.nodes`
     pub root_node: u16,
+    /// The index of the end node within `self.nodes`
     pub end_node: u16,
+    /// The list of all query nodes
     pub nodes: Vec<QueryNode>,
+    /// The list of all node edges
     pub edges: Vec<Edges>,
 }
-fn _assert_sizes() {
-    // TODO: QueryNodes are too big now, 88B is a bit too big
-    let _: [u8; 88] = [0; std::mem::size_of::<QueryNode>()];
-    let _: [u8; 32] = [0; std::mem::size_of::<Edges>()];
-}
 impl Default for QueryGraph {
     /// Create a new QueryGraph with two disconnected nodes: the root and end nodes.
     fn default() -> Self {
         let nodes = vec![QueryNode::Start, QueryNode::End];
         let edges = vec![
-            Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) },
-            Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) },
+            Edges {
+                predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+                successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+            },
+            Edges {
+                predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+                successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+            },
         ];
         Self { root_node: 0, end_node: 1, nodes, edges }
@@ -46,33 +107,31 @@ impl Default for QueryGraph {
 }
 impl QueryGraph {
+    /// Connect all the given predecessor nodes to the given successor node
     fn connect_to_node(&mut self, from_nodes: &[u16], to_node: u16) {
         for &from_node in from_nodes {
             self.edges[from_node as usize].successors.insert(to_node);
             self.edges[to_node as usize].predecessors.insert(from_node);
         }
     }
+    /// Add the given node to the graph and connect it to all the given predecessor nodes
     fn add_node(&mut self, from_nodes: &[u16], node: QueryNode) -> u16 {
         let new_node_idx = self.nodes.len() as u16;
+        assert!(new_node_idx < QUERY_GRAPH_NODE_LENGTH_LIMIT);
         self.nodes.push(node);
         self.edges.push(Edges {
-            predecessors: SmallBitmap::from_array(from_nodes, 64),
-            successors: SmallBitmap::new(64),
+            predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+            successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
        });
-        for from_node in from_nodes {
-            self.edges[*from_node as usize].successors.insert(new_node_idx);
-        }
+        self.connect_to_node(from_nodes, new_node_idx);
+
         new_node_idx
     }
 }
 impl QueryGraph {
-    // TODO: return the list of all matching words here as well
+    /// Build the query graph from the parsed user search query.
     pub fn from_query(ctx: &mut SearchContext, terms: Vec<LocatedQueryTerm>) -> Result<QueryGraph> {
-        // TODO: maybe empty nodes should not be removed here, to compute
-        // the score of the `words` ranking rule correctly
-        // it is very easy to traverse the graph and remove afterwards anyway
-        // Still, I'm keeping this here as a demo
         let mut empty_nodes = vec![];
         let word_set = ctx.index.words_fst(ctx.txn)?;
@@ -81,7 +140,6 @@ impl QueryGraph {
         let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
             (vec![], vec![], vec![graph.root_node]);
-        // TODO: split words / synonyms
         for length in 1..=terms.len() {
             let query = &terms[..length];
@@ -156,6 +214,8 @@ impl QueryGraph {
         Ok(graph)
     }
+
+    /// Remove the given nodes and all their edges from the query graph.
     pub fn remove_nodes(&mut self, nodes: &[u16]) {
         for &node in nodes {
             self.nodes[node as usize] = QueryNode::Deleted;
@@ -166,10 +226,13 @@ impl QueryGraph {
             for succ in edges.successors.iter() {
                 self.edges[succ as usize].predecessors.remove(node);
             }
-            self.edges[node as usize] =
-                Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) };
+            self.edges[node as usize] = Edges {
+                predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+                successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+            };
         }
     }
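+
+    // Sketch of the difference with `remove_nodes_keep_edges` below, on a
+    // chain `a -> b -> c`:
+    //
+    //     remove_nodes(&[b]);            // a and c are left disconnected
+    //     remove_nodes_keep_edges(&[b]); // a -> c: b's predecessors are
+    //                                    // connected to b's successors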
+    /// Remove the given nodes, connecting all their predecessors to all their successors.
     pub fn remove_nodes_keep_edges(&mut self, nodes: &[u16]) {
         for &node in nodes {
             self.nodes[node as usize] = QueryNode::Deleted;
@@ -182,11 +245,17 @@ impl QueryGraph {
                 self.edges[succ as usize].predecessors.remove(node);
                 self.edges[succ as usize].predecessors.union(&edges.predecessors);
             }
-            self.edges[node as usize] =
-                Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) };
+            self.edges[node as usize] = Edges {
+                predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+                successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+            };
         }
     }
-    pub fn remove_words_at_position(&mut self, position: i8) -> bool {
+
+    /// Remove all the nodes that correspond to a word starting at the given position, and connect
+    /// the predecessors of these nodes to their successors.
+    /// Return `true` if any node was removed.
+    pub fn remove_words_starting_at_position(&mut self, position: i8) -> bool {
         let mut nodes_to_remove_keeping_edges = vec![];
         for (node_idx, node) in self.nodes.iter().enumerate() {
             let node_idx = node_idx as u16;
@@ -202,14 +271,15 @@ impl QueryGraph {
         !nodes_to_remove_keeping_edges.is_empty()
     }
+    /// Simplify the query graph by removing all nodes that are disconnected from
+    /// the start or end nodes.
     fn simplify(&mut self) {
         loop {
             let mut nodes_to_remove = vec![];
             for (node_idx, node) in self.nodes.iter().enumerate() {
                 if (!matches!(node, QueryNode::End | QueryNode::Deleted)
                     && self.edges[node_idx].successors.is_empty())
                     || (!matches!(node, QueryNode::Start | QueryNode::Deleted)
                         && self.edges[node_idx].predecessors.is_empty())
                 {
                     nodes_to_remove.push(node_idx as u16);
                 }
diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
index ce569fbb0..bf2c6572e 100644
--- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@@ -53,7 +53,7 @@ impl RankingRuleGraphTrait for TypoGraph {
                     docids |= bitmap;
                 }
                 if *nbr_typos == 0 {
-                    if let Some(bytes) = ctx.get_prefix_docids(derivations.original)? {
+                    if let Some(bytes) = ctx.get_word_prefix_docids(derivations.original)? {
                         // TODO: deserialize bitmap within a universe
                         let bitmap = universe
                             & RoaringBitmapCodec::bytes_decode(bytes)
diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs
index 788f8a496..3ccb54032 100644
--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@@ -114,7 +114,7 @@ pub fn apply_ranking_rules<'search>(
     logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe);
     ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?;
-    let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
+    let mut candidates: Vec<RoaringBitmap> = vec![RoaringBitmap::default(); ranking_rules_len];
     candidates[0] = universe.clone();
     let mut cur_ranking_rule_index = 0;
@@ -174,7 +174,7 @@ pub fn apply_ranking_rules<'search>(
             }
         } else {
             let candidates =
-                candidates.iter().take(length - results.len()).collect::<Vec<_>>();
+                candidates.iter().take(length - results.len()).collect::<Vec<u32>>();
             logger.add_to_results(&candidates);
             results.extend(&candidates);
         }
@@ -234,358 +234,3 @@ pub fn apply_ranking_rules<'search>(
     Ok(results)
 }
-
-#[cfg(test)]
-mod tests {
-    // use crate::allocator::ALLOC;
-    use std::fs::File;
-    use std::io::{BufRead, BufReader, Cursor, Seek};
-    use std::time::Instant;
-
-    use big_s::S;
-    use heed::EnvOpenOptions;
-    use maplit::hashset;
-
-    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-    // use crate::search::new::logger::detailed::DetailedSearchLogger;
-    use crate::search::new::logger::DefaultSearchLogger;
-    use crate::search::new::{execute_search, SearchContext};
-    use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
-    use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy};
-
-    #[test]
-    fn search_wiki_new() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_wiki").unwrap();
-        let txn = index.read_txn().unwrap();
-
-        println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len());
-
-        // loop {
-        let start = Instant::now();
-
-        // let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
-        let mut ctx = SearchContext::new(&index, &txn);
-        let results = execute_search(
-            &mut ctx,
-            "which a the releases from poison by the government",
-            None,
-            0,
-            20,
-            &mut DefaultSearchLogger,
-            // &mut logger,
-        )
-        .unwrap();
-
-        // logger.write_d2_description(&mut ctx);
-
-        let elapsed = start.elapsed();
-        println!("{}us", elapsed.as_micros());
-
-        let _documents = index
-            .documents(&txn, results.iter().copied())
-            .unwrap()
-            .into_iter()
-            .map(|(id, obkv)| {
-                let mut object = serde_json::Map::default();
-                for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
-                    let value = obkv.get(fid).unwrap();
-                    let value: serde_json::Value = serde_json::from_slice(value).unwrap();
-                    object.insert(fid_name.to_owned(), value);
-                }
-                (id, serde_json::to_string_pretty(&object).unwrap())
-            })
-            .collect::<Vec<_>>();
-
-        println!("{}us: {:?}", elapsed.as_micros(), results);
-        // }
-        // for (id, _document) in documents {
-        //     println!("{id}:");
-        //     // println!("{document}");
-        // }
-    }
-
-    #[test]
-    fn search_wiki_old() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_wiki").unwrap();
-
-        let txn = index.read_txn().unwrap();
-
-        let rr = index.criteria(&txn).unwrap();
-        println!("{rr:?}");
-
-        let start = Instant::now();
-
-        let mut s = Search::new(&txn, &index);
-        s.query("which a the releases from poison by the government");
-        s.terms_matching_strategy(TermsMatchingStrategy::Last);
-        s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
-        let docs = s.execute().unwrap();
-
-        let elapsed = start.elapsed();
-
-        let documents = index
-            .documents(&txn, docs.documents_ids.iter().copied())
-            .unwrap()
-            .into_iter()
-            .map(|(id, obkv)| {
-                let mut object = serde_json::Map::default();
-                for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
-                    let value = obkv.get(fid).unwrap();
-                    let value: serde_json::Value = serde_json::from_slice(value).unwrap();
-                    object.insert(fid_name.to_owned(), value);
-                }
-                (id, serde_json::to_string_pretty(&object).unwrap())
-            })
-            .collect::<Vec<_>>();
-
-        println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
-        for (id, _document) in documents {
-            println!("{id}:");
-            // println!("{document}");
-        }
-    }
-    #[test]
-    fn search_movies_new() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-        let txn = index.read_txn().unwrap();
-
-        // let primary_key = index.primary_key(&txn).unwrap().unwrap();
-        // let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
-        // loop {
-        let start = Instant::now();
-
-        let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
-        let mut ctx = SearchContext::new(&index, &txn);
-        let results = execute_search(
-            &mut ctx,
-            "releases from poison by the government",
-            None,
-            0,
-            20,
-            // &mut DefaultSearchLogger,
-            &mut logger,
-        )
-        .unwrap();
-
-        logger.write_d2_description(&mut ctx);
-
-        let elapsed = start.elapsed();
-
-        // let ids = index
-        //     .documents(&txn, results.iter().copied())
-        //     .unwrap()
-        //     .into_iter()
-        //     .map(|x| {
-        //         let obkv = &x.1;
-        //         let id = obkv.get(primary_key).unwrap();
-        //         let id: serde_json::Value = serde_json::from_slice(id).unwrap();
-        //         id.as_str().unwrap().to_owned()
-        //     })
-        //     .collect::<Vec<_>>();
-
-        println!("{}us: {results:?}", elapsed.as_micros());
-        // println!("external ids: {ids:?}");
-        // }
-    }
-
-    #[test]
-    fn search_movies_old() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-
-        let txn = index.read_txn().unwrap();
-
-        let rr = index.criteria(&txn).unwrap();
-        println!("{rr:?}");
-
-        let primary_key = index.primary_key(&txn).unwrap().unwrap();
-        let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
-
-        let start = Instant::now();
-
-        let mut s = Search::new(&txn, &index);
-        s.query("which a the releases from poison by the government");
-        s.terms_matching_strategy(TermsMatchingStrategy::Last);
-        s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
-        let docs = s.execute().unwrap();
-
-        let elapsed = start.elapsed();
-
-        let ids = index
-            .documents(&txn, docs.documents_ids.iter().copied())
-            .unwrap()
-            .into_iter()
-            .map(|x| {
-                let obkv = &x.1;
-                let id = obkv.get(primary_key).unwrap();
-                let id: serde_json::Value = serde_json::from_slice(id).unwrap();
-                id.as_str().unwrap().to_owned()
-            })
-            .collect::<Vec<_>>();
-
-        println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
-        println!("external ids: {ids:?}");
-    }
-
-    #[test]
-    fn _settings_movies() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024
* 1024); // 100 GB - - let index = Index::new(options, "data_movies").unwrap(); - let mut wtxn = index.write_txn().unwrap(); - - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - - builder.set_min_word_len_one_typo(5); - builder.set_min_word_len_two_typos(100); - builder.set_sortable_fields(hashset! { S("release_date") }); - builder.set_criteria(vec![ - Criterion::Words, - Criterion::Typo, - Criterion::Proximity, - Criterion::Asc("release_date".to_owned()), - ]); - - builder.execute(|_| (), || false).unwrap(); - wtxn.commit().unwrap(); - } - - #[test] - fn _index_movies() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_movies").unwrap(); - let mut wtxn = index.write_txn().unwrap(); - - let primary_key = "id"; - let searchable_fields = vec!["title", "overview"]; - let filterable_fields = vec!["release_date", "genres"]; - - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - builder.set_primary_key(primary_key.to_owned()); - let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); - builder.set_filterable_fields(filterable_fields); - - builder.set_min_word_len_one_typo(5); - builder.set_min_word_len_two_typos(100); - builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]); - builder.execute(|_| (), || false).unwrap(); - - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig::default(); - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false) - .unwrap(); - - let documents = documents_from( - "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json", - "json", - ); - let (builder, user_error) = builder.add_documents(documents).unwrap(); - user_error.unwrap(); - builder.execute().unwrap(); - wtxn.commit().unwrap(); - - index.prepare_for_closing().wait(); - } - #[test] - fn _index_wiki() { - let mut options = EnvOpenOptions::new(); - options.map_size(100 * 1024 * 1024 * 1024); // 100 GB - - let index = Index::new(options, "data_wiki").unwrap(); - let mut wtxn = index.write_txn().unwrap(); - - // let primary_key = "id"; - let searchable_fields = vec!["body", "title", "url"]; - // let filterable_fields = vec![]; - let config = IndexerConfig::default(); - let mut builder = Settings::new(&mut wtxn, &index, &config); - // builder.set_primary_key(primary_key.to_owned()); - let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); - builder.set_searchable_fields(searchable_fields); - // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); - // builder.set_filterable_fields(filterable_fields); - - // builder.set_min_word_len_one_typo(5); - // builder.set_min_word_len_two_typos(100); - builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]); - builder.execute(|_| (), || false).unwrap(); - - let config = IndexerConfig::default(); - let indexing_config = - IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false) - .unwrap(); - - let documents = documents_from( - "/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv", - "csv", - 
        );
-        let (builder, user_error) = builder.add_documents(documents).unwrap();
-        user_error.unwrap();
-        builder.execute().unwrap();
-        wtxn.commit().unwrap();
-
-        index.prepare_for_closing().wait();
-    }
-
-    fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
-        let reader = File::open(filename)
-            .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
-        let reader = BufReader::new(reader);
-        let documents = match filetype {
-            "csv" => documents_from_csv(reader).unwrap(),
-            "json" => documents_from_json(reader).unwrap(),
-            "jsonl" => documents_from_jsonl(reader).unwrap(),
-            otherwise => panic!("invalid update format {:?}", otherwise),
-        };
-        DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
-    }
-
-    fn documents_from_jsonl(reader: impl BufRead) -> crate::Result<Vec<u8>> {
-        let mut documents = DocumentsBatchBuilder::new(Vec::new());
-
-        for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
-            let object = result.unwrap();
-            documents.append_json_object(&object)?;
-        }
-
-        documents.into_inner().map_err(Into::into)
-    }
-
-    fn documents_from_json(reader: impl BufRead) -> crate::Result<Vec<u8>> {
-        let mut documents = DocumentsBatchBuilder::new(Vec::new());
-
-        documents.append_json_array(reader)?;
-
-        documents.into_inner().map_err(Into::into)
-    }
-
-    fn documents_from_csv(reader: impl BufRead) -> crate::Result<Vec<u8>> {
-        let csv = csv::Reader::from_reader(reader);
-
-        let mut documents = DocumentsBatchBuilder::new(Vec::new());
-        documents.append_csv(csv)?;
-
-        documents.into_inner().map_err(Into::into)
-    }
-}
diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs
index 0581341d1..ca64e4342 100644
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@@ -46,7 +46,7 @@ impl<'search> SearchContext<'search> {
             }
         }
         if *use_prefix_db {
-            if let Some(prefix_docids) = self.get_prefix_docids(*original)? {
+            if let Some(prefix_docids) = self.get_word_prefix_docids(*original)? {
                 or_docids.push(prefix_docids);
             }
         }
diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs
index 2858e1569..2015367da 100644
--- a/milli/src/search/new/words.rs
+++ b/milli/src/search/new/words.rs
@@ -88,7 +88,8 @@ impl<'search> RankingRule<'search, QueryGraph> for Words {
                 break;
             } else {
                 let position_to_remove = self.positions_to_remove.pop().unwrap();
-                let did_delete_any_node = query_graph.remove_words_at_position(position_to_remove);
+                let did_delete_any_node =
+                    query_graph.remove_words_starting_at_position(position_to_remove);
                 if did_delete_any_node {
                     break;
                 }