From c8e251bf24e66a67d9e0483ce79045f393263575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 13:57:34 +0100 Subject: [PATCH] Remove noise in codebase --- milli/src/search/new/db_cache.rs | 8 +- .../search/new/graph_based_ranking_rule.rs | 71 +------ milli/src/search/new/mod.rs | 6 +- milli/src/search/new/query_graph.rs | 111 +---------- milli/src/search/new/query_term.rs | 4 - .../search/new/ranking_rule_graph/build.rs | 2 - .../new/ranking_rule_graph/cheapest_paths.rs | 7 +- .../ranking_rule_graph/edge_docids_cache.rs | 6 - .../src/search/new/ranking_rule_graph/mod.rs | 59 +----- .../new/ranking_rule_graph/paths_map.rs | 188 ++---------------- .../new/ranking_rule_graph/proximity/build.rs | 5 +- .../proximity/compute_docids.rs | 11 +- .../new/ranking_rule_graph/proximity/mod.rs | 3 +- .../new/ranking_rule_graph/resolve_paths.rs | 10 +- milli/src/search/new/ranking_rules.rs | 101 +--------- milli/src/search/new/resolve_query_graph.rs | 101 +--------- milli/src/search/new/sort.rs | 17 +- milli/src/search/new/words.rs | 45 +---- 18 files changed, 63 insertions(+), 692 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 0a058d339..ae7cb9b91 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -1,7 +1,8 @@ use std::collections::hash_map::Entry; use fxhash::FxHashMap; -use heed::{types::ByteSlice, RoTxn}; +use heed::types::ByteSlice; +use heed::RoTxn; use crate::{Index, Result}; @@ -62,10 +63,7 @@ impl<'transaction> DatabaseCache<'transaction> { match self.word_pair_proximity_docids.entry(key.clone()) { Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()), Entry::Vacant(entry) => { - // Note that now, we really want to do a prefix iter over (w1, w2) to get all the possible proximities - // but oh well - // - // Actually, we shouldn'transaction greedily access this DB at all + // We shouldn't greedily access this DB at all // a DB (w1, w2) -> [proximities] would be much better // We could even have a DB that is (w1) -> set of words such that (w1, w2) are in proximity // And if we worked with words encoded as integers, the set of words could be a roaring bitmap diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index 0f72b9d5d..43de6a531 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -1,20 +1,15 @@ use heed::RoTxn; use roaring::RoaringBitmap; -use crate::{ - new::ranking_rule_graph::cheapest_paths::{self, Path}, - Index, Result, -}; - -use super::{ - db_cache::DatabaseCache, - ranking_rule_graph::{ - cheapest_paths::KCheapestPathsState, edge_docids_cache::EdgeDocidsCache, - empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, RankingRuleGraph, - RankingRuleGraphTrait, - }, - QueryGraph, RankingRule, RankingRuleOutput, -}; +use super::db_cache::DatabaseCache; +use super::ranking_rule_graph::cheapest_paths::KCheapestPathsState; +use super::ranking_rule_graph::edge_docids_cache::EdgeDocidsCache; +use super::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; +use super::ranking_rule_graph::paths_map::PathsMap; +use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait}; +use super::{QueryGraph, RankingRule, RankingRuleOutput}; +use crate::new::ranking_rule_graph::cheapest_paths::{self, Path}; +use crate::{Index, Result}; pub struct GraphBasedRankingRule { state: Option>, @@ -43,16 +38,8 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result<()> { - // if let Some(state) = &mut self.state { - // // TODO: update the previous state - // // TODO: update the existing graph incrementally, based on a diff - - // } else { + // TODO: update old state instead of starting from scratch let graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?; - // println!("Initialized Proximity Ranking Rule."); - // println!("GRAPH:"); - // let graphviz = graph.graphviz(); - // println!("{graphviz}"); let cheapest_paths_state = KCheapestPathsState::new(&graph); let state = GraphBasedRankingRuleState { @@ -62,13 +49,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap empty_paths_cache: <_>::default(), }; - // let desc = state.graph.graphviz_with_path( - // &state.cheapest_paths_state.as_ref().unwrap().kth_cheapest_path.clone(), - // ); - // println!("Cheapest path: {desc}"); - self.state = Some(state); - // } Ok(()) } @@ -86,17 +67,9 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else { return Ok(None); }; - // println!("Proximity: Next Bucket"); let mut paths = PathsMap::default(); - // let desc = state.graph.dot_description_with_path(&cheapest_paths_state.kth_cheapest_path); - // println!("CHeapest Path: {desc}"); - // TODO: when does it return None? -> when there is no cheapest path - // How to handle it? -> ... return all document ids from the universe? - // - // TODO: Give an empty_edge and empty_prefix argument to the - // compute_paths_of_next_lowest_cost function if let Some(next_cheapest_paths_state) = cheapest_paths_state .compute_paths_of_next_lowest_cost( &mut state.graph, @@ -107,31 +80,12 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap state.cheapest_paths_state = Some(next_cheapest_paths_state); } else { state.cheapest_paths_state = None; - // If returns None if there are no longer any paths to compute - // BUT! paths_map may not be empty, and we need to compute the current bucket still } - // println!("PATHS: {}", paths.graphviz(&state.graph)); - - // paths.iterate(|path, cost| { - // let desc = state.graph.graphviz_with_path(&Path { edges: path.clone(), cost: *cost }); - // println!("Path to resolve of cost {cost}: {desc}"); - // }); - - // let desc = state.graph.dot_description_with_path( - // &state.cheapest_paths_state.as_ref().unwrap().kth_cheapest_path.clone(), - // ); - // println!("Cheapest path: {desc}"); - - // TODO: verify that this is correct - // If the paths are empty, we should probably return the universe? - // BUT! Is there a case where the paths are empty AND the universe is - // not empty? if paths.is_empty() { self.state = None; return Ok(None); } - // Here, log all the paths? let bucket = state.graph.resolve_paths( index, @@ -142,10 +96,6 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap universe, paths, )?; - // The call above also updated the graph such that it doesn't contain the empty edges anymore. - // println!("Resolved all the paths: {bucket:?} from universe {:?}", state.universe); - // let graphviz = state.graph.graphviz(); - // println!("{graphviz}"); let next_query_graph = state.graph.query_graph.clone(); @@ -160,7 +110,6 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap _txn: &'transaction RoTxn, _db_cache: &mut DatabaseCache<'transaction>, ) { - // println!("PROXIMITY: end iteration"); self.state = None; } } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 17e74e70e..01b466d20 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -14,10 +14,8 @@ pub use query_graph::*; pub use ranking_rules::*; use roaring::RoaringBitmap; -use self::{ - db_cache::DatabaseCache, - query_term::{word_derivations, LocatedQueryTerm}, -}; +use self::db_cache::DatabaseCache; +use self::query_term::{word_derivations, LocatedQueryTerm}; use crate::{Index, Result}; pub enum BitmapOrAllRef<'s> { diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index cbe319af7..5ed746c27 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,13 +1,12 @@ +use std::collections::HashSet; +use std::fmt; use std::fmt::Debug; -use std::{collections::HashSet, fmt}; use heed::RoTxn; use roaring::RoaringBitmap; -use super::{ - db_cache::DatabaseCache, - query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}, -}; +use super::db_cache::DatabaseCache; +use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; use crate::{Index, Result}; #[derive(Clone)] @@ -20,8 +19,7 @@ pub enum QueryNode { #[derive(Debug, Clone)] pub struct Edges { - // TODO: use a tiny bitset instead - // something like a simple Vec where most queries will see a vector of one element + // TODO: use a tiny bitset instead, something like a simple Vec where most queries will see a vector of one element pub predecessors: RoaringBitmap, pub successors: RoaringBitmap, } @@ -75,7 +73,6 @@ impl QueryGraph { impl QueryGraph { // TODO: return the list of all matching words here as well - pub fn from_query<'transaction>( index: &Index, txn: &RoTxn, @@ -94,9 +91,7 @@ impl QueryGraph { let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = (vec![], vec![], vec![graph.root_node]); - // TODO: add all the word derivations found in the fst - // and add split words / support phrases - + // TODO: split words / synonyms for length in 1..=query.len() { let query = &query[..length]; @@ -279,18 +274,6 @@ impl Debug for QueryNode { } } -/* -TODO: - -1. Find the minimum number of words to check to resolve the 10 query trees at once. - (e.g. just 0 | 01 | 012 ) -2. Simplify the query tree after removal of a node ✅ -3. Create the proximity graph ✅ -4. Assign different proximities for the ngrams ✅ -5. Walk the proximity graph, finding all the potential paths of weight N from START to END ✅ -(without checking the bitmaps) - -*/ impl QueryGraph { pub fn graphviz(&self) -> String { let mut desc = String::new(); @@ -317,91 +300,9 @@ node [shape = "record"] for edge in self.edges[node].successors.iter() { desc.push_str(&format!("{node} -> {edge};\n")); } - // for edge in self.edges[node].incoming.iter() { - // desc.push_str(&format!("{node} -> {edge} [color = grey];\n")); - // } } desc.push('}'); desc } } - -#[cfg(test)] -mod tests { - use charabia::Tokenize; - - use super::{LocatedQueryTerm, QueryGraph, QueryNode}; - use crate::index::tests::TempIndex; - use crate::new::db_cache::DatabaseCache; - use crate::search::new::query_term::word_derivations; - - #[test] - fn build_graph() { - let mut index = TempIndex::new(); - index.index_documents_config.autogenerate_docids = true; - index - .update_settings(|s| { - s.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - index - .add_documents(documents!({ - "text": "0 1 2 3 4 5 6 7 01 23 234 56 79 709 7356", - })) - .unwrap(); - - // let fst = fst::Set::from_iter(["01", "23", "234", "56"]).unwrap(); - let txn = index.read_txn().unwrap(); - let mut db_cache = DatabaseCache::default(); - - let fst = index.words_fst(&txn).unwrap(); - let query = LocatedQueryTerm::from_query( - "0 no 1 2 3 4 5 6 7".tokenize(), - None, - |word, is_prefix| { - word_derivations( - &index, - &txn, - word, - if word.len() < 3 { - 0 - } else if word.len() < 6 { - 1 - } else { - 2 - }, - is_prefix, - &fst, - ) - }, - ) - .unwrap(); - - let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap(); - println!("{}", graph.graphviz()); - - // let positions_to_remove = vec![3, 6, 0, 4]; - // for p in positions_to_remove { - // graph.remove_words_at_position(p); - // println!("{}", graph.graphviz()); - // } - - // let proximities = |w1: &str, w2: &str| -> Vec { - // if matches!((w1, w2), ("56", "7")) { - // vec![] - // } else { - // vec![1, 2] - // } - // }; - - // let prox_graph = ProximityGraph::from_query_graph(graph, proximities); - - // println!("{}", prox_graph.graphviz()); - } -} - -// fn remove_element_from_vector(v: &mut Vec, el: usize) { -// let position = v.iter().position(|&x| x == el).unwrap(); -// v.swap_remove(position); -// } diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 4d2b22264..52943755a 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -17,10 +17,6 @@ use crate::{Index, Result}; #[derive(Debug, Clone)] pub struct WordDerivations { - // TODO: should have a list for the words corresponding to the prefix as well! - // This is to implement the `exactness` ranking rule. - // However, we could also consider every term in `zero_typo` (except first one) to - // be words of that the original word is a prefix of pub original: String, pub zero_typo: Vec, pub one_typo: Vec, diff --git a/milli/src/search/new/ranking_rule_graph/build.rs b/milli/src/search/new/ranking_rule_graph/build.rs index 6978491cd..8e7dd7a04 100644 --- a/milli/src/search/new/ranking_rule_graph/build.rs +++ b/milli/src/search/new/ranking_rule_graph/build.rs @@ -46,8 +46,6 @@ impl RankingRuleGraph { } } } - // ranking_rule_graph.simplify(); - Ok(ranking_rule_graph) } } diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 00babb560..fdce85159 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -3,10 +3,9 @@ use std::collections::{BTreeMap, HashSet}; use itertools::Itertools; use roaring::RoaringBitmap; -use super::{ - empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, Edge, RankingRuleGraph, - RankingRuleGraphTrait, -}; +use super::empty_paths_cache::EmptyPathsCache; +use super::paths_map::PathsMap; +use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait}; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct Path { diff --git a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs index 263b78d6a..dddbda6af 100644 --- a/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/edge_docids_cache.rs @@ -18,18 +18,12 @@ use crate::{Index, Result}; pub struct EdgeDocidsCache { pub cache: FxHashMap, - - // TODO: There is a big difference between `cache`, which is always valid, and - // `empty_path_prefixes`, which is only accurate for a particular universe - // ALSO, we should have a universe-specific `empty_edge` to use - // pub empty_path_prefixes: HashSet>, _phantom: PhantomData, } impl Default for EdgeDocidsCache { fn default() -> Self { Self { cache: Default::default(), - // empty_path_prefixes: Default::default(), _phantom: Default::default(), } } diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index b5c928ffa..52b685d08 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -5,7 +5,6 @@ pub mod empty_paths_cache; pub mod paths_map; pub mod proximity; pub mod resolve_paths; - use std::collections::{BTreeSet, HashSet}; use std::ops::ControlFlow; @@ -86,22 +85,10 @@ pub struct RankingRuleGraph { pub node_edges: Vec, pub successors: Vec, - // to get the edges between two nodes: + // TODO: to get the edges between two nodes: // 1. get node_outgoing_edges[from] // 2. get node_incoming_edges[to] // 3. take intersection betweem the two - - // TODO: node edges could be different I guess - // something like: - // pub node_edges: Vec - // where each index is the result of: - // the successor index in the top 16 bits, the edge index in the bottom 16 bits - - // TODO: - // node_successors? - - // pub removed_edges: HashSet, - // pub tmp_removed_edges: HashSet, } impl RankingRuleGraph { // Visit all edges between the two given nodes in order of increasing cost. @@ -142,50 +129,6 @@ impl RankingRuleGraph { } self.successors[from_node as usize] = new_successors_from_node; } - // pub fn remove_nodes(&mut self, nodes: &[usize]) { - // for &node in nodes { - // let edge_indices = &mut self.node_edges[node]; - // for edge_index in edge_indices.iter() { - // self.all_edges[*edge_index] = None; - // } - // edge_indices.clear(); - - // let preds = &self.query_graph.edges[node].incoming; - // for pred in preds { - // let edge_indices = &mut self.node_edges[*pred]; - // for edge_index in edge_indices.iter() { - // let edge_opt = &mut self.all_edges[*edge_index]; - // let Some(edge) = edge_opt else { continue; }; - // if edge.to_node == node { - // *edge_opt = None; - // } - // } - // panic!("remove nodes is incorrect at the moment"); - // edge_indices.clear(); - // } - // } - // self.query_graph.remove_nodes(nodes); - // } - // pub fn simplify(&mut self) { - // loop { - // let mut nodes_to_remove = vec![]; - // for (node_idx, node) in self.query_graph.nodes.iter().enumerate() { - // if !matches!(node, QueryNode::End | QueryNode::Deleted) - // && self.node_edges[node_idx].is_empty() - // { - // nodes_to_remove.push(node_idx); - // } - // } - // if nodes_to_remove.is_empty() { - // break; - // } else { - // self.remove_nodes(&nodes_to_remove); - // } - // } - // } - // fn is_removed_edge(&self, edge: u32) -> bool { - // self.removed_edges.contains(&edge) || self.tmp_removed_edges.contains(&edge) - // } pub fn graphviz(&self) -> String { let mut desc = String::new(); diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index 572cb975f..6f6512ae4 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -6,14 +6,13 @@ use std::hash::{Hash, Hasher}; use roaring::RoaringBitmap; use super::cheapest_paths::Path; -use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait, Edge}; +use super::{Edge, EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait}; use crate::new::QueryNode; - #[derive(Debug)] pub struct PathsMap { nodes: Vec<(u32, PathsMap)>, - value: Option + value: Option, } impl Default for PathsMap { fn default() -> Self { @@ -73,11 +72,11 @@ impl PathsMap { } pub fn remove_first(&mut self) -> Option<(Vec, V)> { if self.is_empty() { - return None + return None; } let mut result = vec![]; - let (_, value) = self.remove_first_rec(&mut result); + let (_, value) = self.remove_first_rec(&mut result); Some((result, value)) } pub fn iterate_rec(&self, cur: &mut Vec, visit: &mut impl FnMut(&Vec, &V)) { @@ -85,7 +84,7 @@ impl PathsMap { visit(cur, value); } for (first_edge, rest) in self.nodes.iter() { - cur.push(*first_edge); + cur.push(*first_edge); rest.iterate_rec(cur, visit); cur.pop(); } @@ -163,7 +162,7 @@ impl PathsMap { let [first_edge, remaining_prefix @ ..] = prefix else { return self.nodes.iter().map(|n| n.0).collect(); }; - for (edge, rest) in self.nodes.iter(){ + for (edge, rest) in self.nodes.iter() { if edge == first_edge { return rest.edge_indices_after_prefix(remaining_prefix); } @@ -173,14 +172,12 @@ impl PathsMap { pub fn contains_prefix_of_path(&self, path: &[u32]) -> bool { if self.value.is_some() { - return true + return true; } match path { - [] => { - false - } + [] => false, [first_edge, remaining_path @ ..] => { - for (edge, rest) in self.nodes.iter(){ + for (edge, rest) in self.nodes.iter() { if edge == first_edge { return rest.contains_prefix_of_path(remaining_path); } @@ -197,7 +194,12 @@ impl PathsMap { desc.push_str("\n}\n"); desc } - fn graphviz_rec(&self, desc: &mut String, path_from: Vec, graph: &RankingRuleGraph) { + fn graphviz_rec( + &self, + desc: &mut String, + path_from: Vec, + graph: &RankingRuleGraph, + ) { let id_from = { let mut h = DefaultHasher::new(); path_from.hash(&mut h); @@ -227,7 +229,6 @@ impl PathsMap { } impl RankingRuleGraph { - pub fn graphviz_with_path(&self, path: &Path) -> String { let mut desc = String::new(); desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n"); @@ -248,11 +249,7 @@ impl RankingRuleGraph { for (edge_idx, edge) in self.all_edges.iter().enumerate() { let Some(edge) = edge else { continue }; let Edge { from_node, to_node, cost, details } = edge; - let color = if path.edges.contains(&(edge_idx as u32)) { - "red" - } else { - "green" - }; + let color = if path.edges.contains(&(edge_idx as u32)) { "red" } else { "green" }; match &edge.details { EdgeDetails::Unconditional => { desc.push_str(&format!( @@ -273,157 +270,4 @@ impl RankingRuleGraph { desc.push('}'); desc } - -} - -#[cfg(test)] -mod tests { - use super::PathsMap; - use crate::db_snap; - use crate::index::tests::TempIndex; - use crate::new::db_cache::DatabaseCache; - use crate::new::ranking_rule_graph::cheapest_paths::KCheapestPathsState; - use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; - use crate::new::ranking_rule_graph::proximity::ProximityGraph; - use crate::new::ranking_rule_graph::RankingRuleGraph; - use crate::search::new::query_term::{word_derivations, LocatedQueryTerm}; - use crate::search::new::QueryGraph; - use charabia::Tokenize; - - #[test] - fn paths_tree() { - let mut index = TempIndex::new(); - index.index_documents_config.autogenerate_docids = true; - index - .update_settings(|s| { - s.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - index - .add_documents(documents!([ - { - "text": "0 1 2 3 4 5" - }, - { - "text": "0 a 1 b 2 3 4 5" - }, - { - "text": "0 a 1 b 3 a 4 b 5" - }, - { - "text": "0 a a 1 b 2 3 4 5" - }, - { - "text": "0 a a a a 1 b 3 45" - }, - ])) - .unwrap(); - - db_snap!(index, word_pair_proximity_docids, @"679d1126b569b3e8b10dd937c3faedf9"); - - let txn = index.read_txn().unwrap(); - let mut db_cache = DatabaseCache::default(); - let fst = index.words_fst(&txn).unwrap(); - let query = - LocatedQueryTerm::from_query("0 1 2 3 4 5".tokenize(), None, |word, is_prefix| { - word_derivations(&index, &txn, word, if word.len() < 3 { - 0 - } else if word.len() < 6 { - 1 - } else { - 2 - },is_prefix, &fst) - }) - .unwrap(); - let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap(); - let empty_paths_cache = EmptyPathsCache::default(); - let mut db_cache = DatabaseCache::default(); - - let mut prox_graph = - RankingRuleGraph::::build(&index, &txn, &mut db_cache, graph).unwrap(); - - println!("{}", prox_graph.graphviz()); - - let mut state = KCheapestPathsState::new(&prox_graph).unwrap(); - - let mut path_tree = PathsMap::default(); - while state.next_cost() <= 6 { - let next_state = state.compute_paths_of_next_lowest_cost(&mut prox_graph, &empty_paths_cache, &mut path_tree); - if let Some(next_state) = next_state { - state = next_state; - } else { - break; - } - } - - let desc = path_tree.graphviz(&prox_graph); - println!("{desc}"); - - // let path = vec![u32 { from: 0, to: 2, edge_idx: 0 }, u32 { from: 2, to: 3, edge_idx: 0 }, u32 { from: 3, to: 4, edge_idx: 0 }, u32 { from: 4, to: 5, edge_idx: 0 }, u32 { from: 5, to: 8, edge_idx: 0 }, u32 { from: 8, to: 1, edge_idx: 0 }, u32 { from: 1, to: 10, edge_idx: 0 }]; - // println!("{}", psath_tree.contains_prefix_of_path(&path)); - - - // let path = vec![u32 { from: 0, to: 2, edge_idx: 0 }, u32 { from: 2, to: 3, edge_idx: 0 }, u32 { from: 3, to: 4, edge_idx: 0 }, u32 { from: 4, to: 5, edge_idx: 0 }, u32 { from: 5, to: 6, edge_idx: 0 }, u32 { from: 6, to: 7, edge_idx: 0 }, u32 { from: 7, to: 1, edge_idx: 0 }]; - - - // path_tree.iterate(|path, cost| { - // println!("cost {cost} for path: {path:?}"); - // }); - - // path_tree.remove_forbidden_prefix(&[ - // u32 { from: 0, to: 2, edge_idx: 0 }, - // u32 { from: 2, to: 3, edge_idx: 2 }, - // ]); - // let desc = path_tree.graphviz(); - // println!("{desc}"); - - // path_tree.remove_forbidden_edge(&u32 { from: 5, to: 6, cost: 1 }); - - // let desc = path_tree.graphviz(); - // println!("AFTER REMOVING 5-6 [1]:\n{desc}"); - - // path_tree.remove_forbidden_edge(&u32 { from: 3, to: 4, cost: 1 }); - - // let desc = path_tree.graphviz(); - // println!("AFTER REMOVING 3-4 [1]:\n{desc}"); - - // let p = path_tree.remove_first(); - // println!("PATH: {p:?}"); - // let desc = path_tree.graphviz(); - // println!("AFTER REMOVING: {desc}"); - - // let p = path_tree.remove_first(); - // println!("PATH: {p:?}"); - // let desc = path_tree.graphviz(); - // println!("AFTER REMOVING: {desc}"); - - // path_tree.remove_all_containing_edge(&u32 { from: 5, to: 6, cost: 2 }); - - // let desc = path_tree.graphviz(); - // println!("{desc}"); - - // let first_edges = path_tree.remove_first().unwrap(); - // println!("{first_edges:?}"); - // let desc = path_tree.graphviz(); - // println!("{desc}"); - - // let first_edges = path_tree.remove_first().unwrap(); - // println!("{first_edges:?}"); - // let desc = path_tree.graphviz(); - // println!("{desc}"); - - // let first_edges = path_tree.remove_first().unwrap(); - // println!("{first_edges:?}"); - // let desc = path_tree.graphviz(); - // println!("{desc}"); - - // println!("{path_tree:?}"); - } - - - #[test] - fn test_contains_prefix_of_path() { - - } } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 7149f8bf6..bfcac57ee 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -1,5 +1,8 @@ use std::collections::BTreeMap; +use heed::RoTxn; +use itertools::Itertools; + use super::ProximityEdge; use crate::new::db_cache::DatabaseCache; use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; @@ -7,8 +10,6 @@ use crate::new::ranking_rule_graph::proximity::WordPair; use crate::new::ranking_rule_graph::{Edge, EdgeDetails}; use crate::new::QueryNode; use crate::{Index, Result}; -use heed::RoTxn; -use itertools::Itertools; pub fn visit_from_node(from_node: &QueryNode) -> Result> { Ok(Some(match from_node { diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 325042761..51c6d6ad5 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,17 +1,17 @@ -use roaring::MultiOps; +use heed::RoTxn; +use roaring::{MultiOps, RoaringBitmap}; use super::{ProximityEdge, WordPair}; use crate::new::db_cache::DatabaseCache; -use crate::CboRoaringBitmapCodec; +use crate::{CboRoaringBitmapCodec, Result}; pub fn compute_docids<'transaction>( index: &crate::Index, - txn: &'transaction heed::RoTxn, + txn: &'transaction RoTxn, db_cache: &mut DatabaseCache<'transaction>, edge: &ProximityEdge, -) -> crate::Result { +) -> Result { let ProximityEdge { pairs, proximity } = edge; - // TODO: we should know already which pair of words to look for let mut pair_docids = vec![]; for pair in pairs.iter() { let bytes = match pair { @@ -25,7 +25,6 @@ pub fn compute_docids<'transaction>( bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default(); pair_docids.push(bitmap); } - pair_docids.sort_by_key(|rb| rb.len()); let docids = MultiOps::union(pair_docids); Ok(docids) } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index e4905ead9..16c2acf1f 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -1,12 +1,13 @@ pub mod build; pub mod compute_docids; +use heed::RoTxn; + use super::{Edge, EdgeDetails, RankingRuleGraphTrait}; use crate::new::db_cache::DatabaseCache; use crate::new::query_term::WordDerivations; use crate::new::QueryNode; use crate::{Index, Result}; -use heed::RoTxn; #[derive(Debug, Clone)] pub enum WordPair { diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs index 95cf4629b..d21ddcd86 100644 --- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs @@ -68,14 +68,6 @@ impl RankingRuleGraph { } path_bitmaps.push(path_bitmap); } - let docids = MultiOps::union(path_bitmaps); - Ok(docids) - // for each path, translate it to an intersection of cached roaring bitmaps - // then do a union for all paths - - // get the docids of the given paths in the proximity graph - // in the fastest possible way - // 1. roaring MultiOps (before we can do the Frozen+AST thing) - // 2. minimize number of operations + Ok(MultiOps::union(path_bitmaps)) } } diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index b980c1dc4..4ceaddb8a 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -97,49 +97,10 @@ pub fn get_start_universe<'transaction>( query_graph: &QueryGraph, term_matching_strategy: TermsMatchingStrategy, // filters: Filters, - // mut distinct: Option, ) -> Result { - // NOTE: - // - // There is a performance problem when using `distinct` + exhaustive number of hits, - // especially for search that yield many results (many ~= almost all of the - // dataset). - // - // We'll solve it later. Maybe there are smart ways to go about it. - // - // For example, if there are millions of possible values for the distinct attribute, - // then we could just look at the documents which share any distinct attribute with - // another one, and remove the later docids them from the universe. - // => NO! because we don't know which one to remove, only after the sorting is done can we know it - // => this kind of computation can be done, but only in the evaluation of the number - // of hits for the documents that aren't returned by the search. - // - // `Distinct` otherwise should always be computed during - + // TODO: actually compute the universe from the query graph let universe = index.documents_ids(txn).unwrap(); - - // resolve the whole query tree to retrieve an exhaustive list of documents matching the query. - // NOTE: this is wrong - // Instead, we should only compute the documents corresponding to the last remaining - // word, 2-gram, and 3-gran. - // let candidates = resolve_query_graph(index, txn, db_cache, query_graph, &universe)?; - - // Distinct should be lazy if placeholder? - // - // // because the initial_candidates should be an exhaustive count of the matching documents, - // // we precompute the distinct attributes. - // let initial_candidates = match &mut distinct { - // Some(distinct) => { - // let mut initial_candidates = RoaringBitmap::new(); - // for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) { - // initial_candidates.insert(c?); - // } - // initial_candidates - // } - // None => candidates.clone(), - // }; - - Ok(/*candidates*/ universe) + Ok(universe) } pub fn execute_search<'transaction>( @@ -306,43 +267,6 @@ mod tests { let primary_key = index.primary_key(&txn).unwrap().unwrap(); let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap(); - loop { - let start = Instant::now(); - - let mut db_cache = DatabaseCache::default(); - - let query_graph = make_query_graph( - &index, - &txn, - &mut db_cache, - "released from prison by the government", - ) - .unwrap(); - // println!("{}", query_graph.graphviz()); - - // TODO: filters + maybe distinct attributes? - let universe = get_start_universe( - &index, - &txn, - &mut db_cache, - &query_graph, - TermsMatchingStrategy::Last, - ) - .unwrap(); - // println!("universe: {universe:?}"); - - let results = execute_search( - &index, - &txn, - &mut db_cache, - &universe, - &query_graph, /* 0, 20 */ - ) - .unwrap(); - - let elapsed = start.elapsed(); - println!("{}us: {results:?}", elapsed.as_micros()); - } let start = Instant::now(); let mut db_cache = DatabaseCache::default(); @@ -350,7 +274,6 @@ mod tests { let query_graph = make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government") .unwrap(); - // println!("{}", query_graph.graphviz()); // TODO: filters + maybe distinct attributes? let universe = get_start_universe( @@ -361,7 +284,6 @@ mod tests { TermsMatchingStrategy::Last, ) .unwrap(); - // println!("universe: {universe:?}"); let results = execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /* 0, 20 */) @@ -396,7 +318,7 @@ mod tests { let start = Instant::now(); let mut s = Search::new(&txn, &index); - s.query("released from prison by the government"); + s.query("b b b b b b b b b b"); s.terms_matching_strategy(TermsMatchingStrategy::Last); s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased); let docs = s.execute().unwrap(); @@ -414,30 +336,14 @@ mod tests { let index = Index::new(options, "data_movies").unwrap(); let mut wtxn = index.write_txn().unwrap(); - // let primary_key = "id"; - // let searchable_fields = vec!["title", "overview"]; - // let filterable_fields = vec!["release_date", "genres"]; - // let sortable_fields = vec[]; - let config = IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); builder.set_min_word_len_one_typo(5); builder.set_min_word_len_two_typos(100); - // builder.set_primary_key(primary_key.to_owned()); - - // let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect(); - // builder.set_searchable_fields(searchable_fields); - - // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect(); - // builder.set_filterable_fields(filterable_fields); - builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]); - // let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect(); - // builder.set_sortable_fields(sortable_fields); - builder.execute(|_| (), || false).unwrap(); } @@ -452,7 +358,6 @@ mod tests { let primary_key = "id"; let searchable_fields = vec!["title", "overview"]; let filterable_fields = vec!["release_date", "genres"]; - // let sortable_fields = vec[]; let config = IndexerConfig::default(); let mut builder = Settings::new(&mut wtxn, &index, &config); diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 6110bae49..e752358a7 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -1,7 +1,8 @@ +use std::collections::{HashMap, HashSet, VecDeque}; + use fxhash::FxHashMap; use heed::{BytesDecode, RoTxn}; use roaring::{MultiOps, RoaringBitmap}; -use std::collections::{HashMap, HashSet, VecDeque}; use super::db_cache::DatabaseCache; use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations}; @@ -9,8 +10,6 @@ use super::QueryGraph; use crate::{Index, Result, RoaringBitmapCodec}; // TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc. - -// TODO: reuse NodeDocidsCache in between calls to resolve_query_graph #[derive(Default)] pub struct NodeDocIdsCache { pub cache: FxHashMap, @@ -55,11 +54,6 @@ impl NodeDocIdsCache { .into_iter() .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap()); MultiOps::union(derivations_iter) - // TODO: if `or` is empty, register that somewhere, and immediately return an empty bitmap - // On the other hand, `or` *cannot* be empty, only its intersection with the universe can - // - // TODO: Or we don't do anything and accumulate all these operations in a tree of operations - // between frozen roaring bitmap that is resolved only at the very end } }; let _ = self.cache.insert(node_idx, docids); @@ -79,10 +73,7 @@ pub fn resolve_query_graph<'transaction>( // TODO: there is definitely a faster way to compute this big // roaring bitmap expression - // resolve_query_graph_rec(index, txn, q, q.root_node, &mut docids, &mut cache)?; - let mut nodes_resolved = RoaringBitmap::new(); - // TODO: should be given as an argument and kept between invocations of resolve query graph let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()]; let mut next_nodes_to_visit = VecDeque::new(); @@ -123,100 +114,14 @@ pub fn resolve_query_graph<'transaction>( next_nodes_to_visit.push_back(succ); } } + // This is currently slow but could easily be implemented very efficiently for prec in q.edges[node as usize].predecessors.iter() { if q.edges[prec as usize].successors.is_subset(&nodes_resolved) { path_nodes_docids[prec as usize].clear(); } } - // println!("cached docids: {nodes_docids:?}"); } panic!() } - -#[cfg(test)] -mod tests { - use charabia::Tokenize; - - use super::resolve_query_graph; - use crate::db_snap; - use crate::index::tests::TempIndex; - use crate::new::db_cache::DatabaseCache; - use crate::new::resolve_query_graph::NodeDocIdsCache; - use crate::search::new::query_term::{word_derivations, LocatedQueryTerm}; - use crate::search::new::QueryGraph; - - #[test] - fn test_resolve_query_graph() { - let index = TempIndex::new(); - - index - .update_settings(|s| { - s.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - index - .add_documents(documents!([ - {"id": 0, "text": "0"}, - {"id": 1, "text": "1"}, - {"id": 2, "text": "2"}, - {"id": 3, "text": "3"}, - {"id": 4, "text": "4"}, - {"id": 5, "text": "5"}, - {"id": 6, "text": "6"}, - {"id": 7, "text": "7"}, - {"id": 8, "text": "0 1 2 3 4 5 6 7"}, - {"id": 9, "text": "7 6 5 4 3 2 1 0"}, - {"id": 10, "text": "01 234 56 7"}, - {"id": 11, "text": "7 56 0 1 23 5 4"}, - {"id": 12, "text": "0 1 2 3 4 5 6"}, - {"id": 13, "text": "01 23 4 5 7"}, - ])) - .unwrap(); - db_snap!(index, word_docids, @"7512d0b80659f6bf37d98b374ada8098"); - - let txn = index.read_txn().unwrap(); - let mut db_cache = DatabaseCache::default(); - let fst = index.words_fst(&txn).unwrap(); - let query = LocatedQueryTerm::from_query( - "no 0 1 2 3 no 4 5 6 7".tokenize(), - None, - |word, is_prefix| { - word_derivations( - &index, - &txn, - word, - if word.len() < 3 { - 0 - } else if word.len() < 6 { - 1 - } else { - 2 - }, - is_prefix, - &fst, - ) - }, - ) - .unwrap(); - let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap(); - println!("{}", graph.graphviz()); - let mut node_docids_cache = NodeDocIdsCache::default(); - let universe = index.documents_ids(&txn).unwrap(); - insta::assert_debug_snapshot!(universe, @"RoaringBitmap<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]>"); - let docids = resolve_query_graph( - &index, - &txn, - &mut db_cache, - &mut node_docids_cache, - &graph, - &universe, - ) - .unwrap(); - insta::assert_debug_snapshot!(docids, @"RoaringBitmap<[8, 9, 11]>"); - - // TODO: test with a reduced universe - } -} diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs index 9a48a49e7..3e3fe0faf 100644 --- a/milli/src/search/new/sort.rs +++ b/milli/src/search/new/sort.rs @@ -1,9 +1,10 @@ use heed::RoTxn; use roaring::RoaringBitmap; +use super::db_cache::DatabaseCache; use super::{ - db_cache::DatabaseCache, RankingRule, RankingRuleOutput, RankingRuleOutputIter, - RankingRuleOutputIterWrapper, RankingRuleQueryTrait, + RankingRule, RankingRuleOutput, RankingRuleOutputIter, RankingRuleOutputIterWrapper, + RankingRuleQueryTrait, }; use crate::{ // facet::FacetType, @@ -33,18 +34,6 @@ impl<'transaction, Query> Sort<'transaction, Query> { let fields_ids_map = index.fields_ids_map(rtxn)?; let field_id = fields_ids_map.id(&field_name); - // TODO: What is this, why? - // let faceted_candidates = match field_id { - // Some(field_id) => { - // let number_faceted = - // index.faceted_documents_ids(rtxn, field_id, FacetType::Number)?; - // let string_faceted = - // index.faceted_documents_ids(rtxn, field_id, FacetType::String)?; - // number_faceted | string_faceted - // } - // None => RoaringBitmap::default(), - // }; - Ok(Self { field_id, is_ascending, iter: None }) } } diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 5852d137a..1c03586fd 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -79,8 +79,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { return Ok(None); } let Some(query_graph) = &mut self.query_graph else { panic!() }; - // let graphviz = query_graph.graphviz(); - // println!("\n===={graphviz}\n===="); + let this_bucket = resolve_query_graph( index, txn, @@ -89,10 +88,8 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { query_graph, universe, )?; - // println!("WORDS: this bucket: {this_bucket:?}"); + let child_query_graph = query_graph.clone(); - // this_bucket is the one that must be returned now - // self.cur_bucket is set to the next bucket // TODO: Check whether a position exists in the graph before removing it and // returning the next bucket. // while graph.does_not_contain(positions_to_remove.last()) { positions_to_remove.pop() } @@ -118,41 +115,3 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words { self.positions_to_remove = vec![]; } } - -#[cfg(test)] -mod tests { - // use charabia::Tokenize; - // use roaring::RoaringBitmap; - - // use crate::{ - // index::tests::TempIndex, - // search::{criteria::CriteriaBuilder, new::QueryGraphOrPlaceholder}, - // }; - - // use super::Words; - - // fn placeholder() { - // let qt = QueryGraphOrPlaceholder::Placeholder; - // let index = TempIndex::new(); - // let rtxn = index.read_txn().unwrap(); - - // let query = "a beautiful summer house by the beach overlooking what seems"; - // // let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); - // // let (qt, parts, matching_words) = builder.build(query.tokenize()).unwrap().unwrap(); - - // // let cb = CriteriaBuilder::new(&rtxn, &index).unwrap(); - // // let x = cb - // // .build( - // // Some(qt), - // // Some(parts), - // // None, - // // None, - // // false, - // // None, - // // crate::CriterionImplementationStrategy::OnlySetBased, - // // ) - // // .unwrap(); - - // // let rr = Words::new(&index, &RoaringBitmap::from_sorted_iter(0..1000)).unwrap(); - // } -}