mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-27 04:25:06 +08:00
Remove noise in codebase
This commit is contained in:
parent
a938fbde4a
commit
c8e251bf24
@ -1,7 +1,8 @@
|
|||||||
use std::collections::hash_map::Entry;
|
use std::collections::hash_map::Entry;
|
||||||
|
|
||||||
use fxhash::FxHashMap;
|
use fxhash::FxHashMap;
|
||||||
use heed::{types::ByteSlice, RoTxn};
|
use heed::types::ByteSlice;
|
||||||
|
use heed::RoTxn;
|
||||||
|
|
||||||
use crate::{Index, Result};
|
use crate::{Index, Result};
|
||||||
|
|
||||||
@ -62,10 +63,7 @@ impl<'transaction> DatabaseCache<'transaction> {
|
|||||||
match self.word_pair_proximity_docids.entry(key.clone()) {
|
match self.word_pair_proximity_docids.entry(key.clone()) {
|
||||||
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
|
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
|
||||||
Entry::Vacant(entry) => {
|
Entry::Vacant(entry) => {
|
||||||
// Note that now, we really want to do a prefix iter over (w1, w2) to get all the possible proximities
|
// We shouldn't greedily access this DB at all
|
||||||
// but oh well
|
|
||||||
//
|
|
||||||
// Actually, we shouldn'transaction greedily access this DB at all
|
|
||||||
// a DB (w1, w2) -> [proximities] would be much better
|
// a DB (w1, w2) -> [proximities] would be much better
|
||||||
// We could even have a DB that is (w1) -> set of words such that (w1, w2) are in proximity
|
// We could even have a DB that is (w1) -> set of words such that (w1, w2) are in proximity
|
||||||
// And if we worked with words encoded as integers, the set of words could be a roaring bitmap
|
// And if we worked with words encoded as integers, the set of words could be a roaring bitmap
|
||||||
|
@ -1,20 +1,15 @@
|
|||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::{
|
use super::db_cache::DatabaseCache;
|
||||||
new::ranking_rule_graph::cheapest_paths::{self, Path},
|
use super::ranking_rule_graph::cheapest_paths::KCheapestPathsState;
|
||||||
Index, Result,
|
use super::ranking_rule_graph::edge_docids_cache::EdgeDocidsCache;
|
||||||
};
|
use super::ranking_rule_graph::empty_paths_cache::EmptyPathsCache;
|
||||||
|
use super::ranking_rule_graph::paths_map::PathsMap;
|
||||||
use super::{
|
use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait};
|
||||||
db_cache::DatabaseCache,
|
use super::{QueryGraph, RankingRule, RankingRuleOutput};
|
||||||
ranking_rule_graph::{
|
use crate::new::ranking_rule_graph::cheapest_paths::{self, Path};
|
||||||
cheapest_paths::KCheapestPathsState, edge_docids_cache::EdgeDocidsCache,
|
use crate::{Index, Result};
|
||||||
empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, RankingRuleGraph,
|
|
||||||
RankingRuleGraphTrait,
|
|
||||||
},
|
|
||||||
QueryGraph, RankingRule, RankingRuleOutput,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
|
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
|
||||||
state: Option<GraphBasedRankingRuleState<G>>,
|
state: Option<GraphBasedRankingRuleState<G>>,
|
||||||
@ -43,16 +38,8 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
|
|||||||
universe: &RoaringBitmap,
|
universe: &RoaringBitmap,
|
||||||
query_graph: &QueryGraph,
|
query_graph: &QueryGraph,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
// if let Some(state) = &mut self.state {
|
// TODO: update old state instead of starting from scratch
|
||||||
// // TODO: update the previous state
|
|
||||||
// // TODO: update the existing graph incrementally, based on a diff
|
|
||||||
|
|
||||||
// } else {
|
|
||||||
let graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?;
|
let graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?;
|
||||||
// println!("Initialized Proximity Ranking Rule.");
|
|
||||||
// println!("GRAPH:");
|
|
||||||
// let graphviz = graph.graphviz();
|
|
||||||
// println!("{graphviz}");
|
|
||||||
|
|
||||||
let cheapest_paths_state = KCheapestPathsState::new(&graph);
|
let cheapest_paths_state = KCheapestPathsState::new(&graph);
|
||||||
let state = GraphBasedRankingRuleState {
|
let state = GraphBasedRankingRuleState {
|
||||||
@ -62,13 +49,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
|
|||||||
empty_paths_cache: <_>::default(),
|
empty_paths_cache: <_>::default(),
|
||||||
};
|
};
|
||||||
|
|
||||||
// let desc = state.graph.graphviz_with_path(
|
|
||||||
// &state.cheapest_paths_state.as_ref().unwrap().kth_cheapest_path.clone(),
|
|
||||||
// );
|
|
||||||
// println!("Cheapest path: {desc}");
|
|
||||||
|
|
||||||
self.state = Some(state);
|
self.state = Some(state);
|
||||||
// }
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@ -86,17 +67,9 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
|
|||||||
let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else {
|
let Some(cheapest_paths_state) = state.cheapest_paths_state.take() else {
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
};
|
};
|
||||||
// println!("Proximity: Next Bucket");
|
|
||||||
|
|
||||||
let mut paths = PathsMap::default();
|
let mut paths = PathsMap::default();
|
||||||
|
|
||||||
// let desc = state.graph.dot_description_with_path(&cheapest_paths_state.kth_cheapest_path);
|
|
||||||
// println!("CHeapest Path: {desc}");
|
|
||||||
// TODO: when does it return None? -> when there is no cheapest path
|
|
||||||
// How to handle it? -> ... return all document ids from the universe?
|
|
||||||
//
|
|
||||||
// TODO: Give an empty_edge and empty_prefix argument to the
|
|
||||||
// compute_paths_of_next_lowest_cost function
|
|
||||||
if let Some(next_cheapest_paths_state) = cheapest_paths_state
|
if let Some(next_cheapest_paths_state) = cheapest_paths_state
|
||||||
.compute_paths_of_next_lowest_cost(
|
.compute_paths_of_next_lowest_cost(
|
||||||
&mut state.graph,
|
&mut state.graph,
|
||||||
@ -107,31 +80,12 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
|
|||||||
state.cheapest_paths_state = Some(next_cheapest_paths_state);
|
state.cheapest_paths_state = Some(next_cheapest_paths_state);
|
||||||
} else {
|
} else {
|
||||||
state.cheapest_paths_state = None;
|
state.cheapest_paths_state = None;
|
||||||
// If returns None if there are no longer any paths to compute
|
|
||||||
// BUT! paths_map may not be empty, and we need to compute the current bucket still
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// println!("PATHS: {}", paths.graphviz(&state.graph));
|
|
||||||
|
|
||||||
// paths.iterate(|path, cost| {
|
|
||||||
// let desc = state.graph.graphviz_with_path(&Path { edges: path.clone(), cost: *cost });
|
|
||||||
// println!("Path to resolve of cost {cost}: {desc}");
|
|
||||||
// });
|
|
||||||
|
|
||||||
// let desc = state.graph.dot_description_with_path(
|
|
||||||
// &state.cheapest_paths_state.as_ref().unwrap().kth_cheapest_path.clone(),
|
|
||||||
// );
|
|
||||||
// println!("Cheapest path: {desc}");
|
|
||||||
|
|
||||||
// TODO: verify that this is correct
|
|
||||||
// If the paths are empty, we should probably return the universe?
|
|
||||||
// BUT! Is there a case where the paths are empty AND the universe is
|
|
||||||
// not empty?
|
|
||||||
if paths.is_empty() {
|
if paths.is_empty() {
|
||||||
self.state = None;
|
self.state = None;
|
||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
// Here, log all the paths?
|
|
||||||
|
|
||||||
let bucket = state.graph.resolve_paths(
|
let bucket = state.graph.resolve_paths(
|
||||||
index,
|
index,
|
||||||
@ -142,10 +96,6 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
|
|||||||
universe,
|
universe,
|
||||||
paths,
|
paths,
|
||||||
)?;
|
)?;
|
||||||
// The call above also updated the graph such that it doesn't contain the empty edges anymore.
|
|
||||||
// println!("Resolved all the paths: {bucket:?} from universe {:?}", state.universe);
|
|
||||||
// let graphviz = state.graph.graphviz();
|
|
||||||
// println!("{graphviz}");
|
|
||||||
|
|
||||||
let next_query_graph = state.graph.query_graph.clone();
|
let next_query_graph = state.graph.query_graph.clone();
|
||||||
|
|
||||||
@ -160,7 +110,6 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
|
|||||||
_txn: &'transaction RoTxn,
|
_txn: &'transaction RoTxn,
|
||||||
_db_cache: &mut DatabaseCache<'transaction>,
|
_db_cache: &mut DatabaseCache<'transaction>,
|
||||||
) {
|
) {
|
||||||
// println!("PROXIMITY: end iteration");
|
|
||||||
self.state = None;
|
self.state = None;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -14,10 +14,8 @@ pub use query_graph::*;
|
|||||||
pub use ranking_rules::*;
|
pub use ranking_rules::*;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use self::{
|
use self::db_cache::DatabaseCache;
|
||||||
db_cache::DatabaseCache,
|
use self::query_term::{word_derivations, LocatedQueryTerm};
|
||||||
query_term::{word_derivations, LocatedQueryTerm},
|
|
||||||
};
|
|
||||||
use crate::{Index, Result};
|
use crate::{Index, Result};
|
||||||
|
|
||||||
pub enum BitmapOrAllRef<'s> {
|
pub enum BitmapOrAllRef<'s> {
|
||||||
|
@ -1,13 +1,12 @@
|
|||||||
|
use std::collections::HashSet;
|
||||||
|
use std::fmt;
|
||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
use std::{collections::HashSet, fmt};
|
|
||||||
|
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::{
|
use super::db_cache::DatabaseCache;
|
||||||
db_cache::DatabaseCache,
|
use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
|
||||||
query_term::{LocatedQueryTerm, QueryTerm, WordDerivations},
|
|
||||||
};
|
|
||||||
use crate::{Index, Result};
|
use crate::{Index, Result};
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
@ -20,8 +19,7 @@ pub enum QueryNode {
|
|||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct Edges {
|
pub struct Edges {
|
||||||
// TODO: use a tiny bitset instead
|
// TODO: use a tiny bitset instead, something like a simple Vec<u8> where most queries will see a vector of one element
|
||||||
// something like a simple Vec<u8> where most queries will see a vector of one element
|
|
||||||
pub predecessors: RoaringBitmap,
|
pub predecessors: RoaringBitmap,
|
||||||
pub successors: RoaringBitmap,
|
pub successors: RoaringBitmap,
|
||||||
}
|
}
|
||||||
@ -75,7 +73,6 @@ impl QueryGraph {
|
|||||||
|
|
||||||
impl QueryGraph {
|
impl QueryGraph {
|
||||||
// TODO: return the list of all matching words here as well
|
// TODO: return the list of all matching words here as well
|
||||||
|
|
||||||
pub fn from_query<'transaction>(
|
pub fn from_query<'transaction>(
|
||||||
index: &Index,
|
index: &Index,
|
||||||
txn: &RoTxn,
|
txn: &RoTxn,
|
||||||
@ -94,9 +91,7 @@ impl QueryGraph {
|
|||||||
let (mut prev2, mut prev1, mut prev0): (Vec<u32>, Vec<u32>, Vec<u32>) =
|
let (mut prev2, mut prev1, mut prev0): (Vec<u32>, Vec<u32>, Vec<u32>) =
|
||||||
(vec![], vec![], vec![graph.root_node]);
|
(vec![], vec![], vec![graph.root_node]);
|
||||||
|
|
||||||
// TODO: add all the word derivations found in the fst
|
// TODO: split words / synonyms
|
||||||
// and add split words / support phrases
|
|
||||||
|
|
||||||
for length in 1..=query.len() {
|
for length in 1..=query.len() {
|
||||||
let query = &query[..length];
|
let query = &query[..length];
|
||||||
|
|
||||||
@ -279,18 +274,6 @@ impl Debug for QueryNode {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
TODO:
|
|
||||||
|
|
||||||
1. Find the minimum number of words to check to resolve the 10 query trees at once.
|
|
||||||
(e.g. just 0 | 01 | 012 )
|
|
||||||
2. Simplify the query tree after removal of a node ✅
|
|
||||||
3. Create the proximity graph ✅
|
|
||||||
4. Assign different proximities for the ngrams ✅
|
|
||||||
5. Walk the proximity graph, finding all the potential paths of weight N from START to END ✅
|
|
||||||
(without checking the bitmaps)
|
|
||||||
|
|
||||||
*/
|
|
||||||
impl QueryGraph {
|
impl QueryGraph {
|
||||||
pub fn graphviz(&self) -> String {
|
pub fn graphviz(&self) -> String {
|
||||||
let mut desc = String::new();
|
let mut desc = String::new();
|
||||||
@ -317,91 +300,9 @@ node [shape = "record"]
|
|||||||
for edge in self.edges[node].successors.iter() {
|
for edge in self.edges[node].successors.iter() {
|
||||||
desc.push_str(&format!("{node} -> {edge};\n"));
|
desc.push_str(&format!("{node} -> {edge};\n"));
|
||||||
}
|
}
|
||||||
// for edge in self.edges[node].incoming.iter() {
|
|
||||||
// desc.push_str(&format!("{node} -> {edge} [color = grey];\n"));
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
desc.push('}');
|
desc.push('}');
|
||||||
desc
|
desc
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use charabia::Tokenize;
|
|
||||||
|
|
||||||
use super::{LocatedQueryTerm, QueryGraph, QueryNode};
|
|
||||||
use crate::index::tests::TempIndex;
|
|
||||||
use crate::new::db_cache::DatabaseCache;
|
|
||||||
use crate::search::new::query_term::word_derivations;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn build_graph() {
|
|
||||||
let mut index = TempIndex::new();
|
|
||||||
index.index_documents_config.autogenerate_docids = true;
|
|
||||||
index
|
|
||||||
.update_settings(|s| {
|
|
||||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
index
|
|
||||||
.add_documents(documents!({
|
|
||||||
"text": "0 1 2 3 4 5 6 7 01 23 234 56 79 709 7356",
|
|
||||||
}))
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
// let fst = fst::Set::from_iter(["01", "23", "234", "56"]).unwrap();
|
|
||||||
let txn = index.read_txn().unwrap();
|
|
||||||
let mut db_cache = DatabaseCache::default();
|
|
||||||
|
|
||||||
let fst = index.words_fst(&txn).unwrap();
|
|
||||||
let query = LocatedQueryTerm::from_query(
|
|
||||||
"0 no 1 2 3 4 5 6 7".tokenize(),
|
|
||||||
None,
|
|
||||||
|word, is_prefix| {
|
|
||||||
word_derivations(
|
|
||||||
&index,
|
|
||||||
&txn,
|
|
||||||
word,
|
|
||||||
if word.len() < 3 {
|
|
||||||
0
|
|
||||||
} else if word.len() < 6 {
|
|
||||||
1
|
|
||||||
} else {
|
|
||||||
2
|
|
||||||
},
|
|
||||||
is_prefix,
|
|
||||||
&fst,
|
|
||||||
)
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap();
|
|
||||||
println!("{}", graph.graphviz());
|
|
||||||
|
|
||||||
// let positions_to_remove = vec![3, 6, 0, 4];
|
|
||||||
// for p in positions_to_remove {
|
|
||||||
// graph.remove_words_at_position(p);
|
|
||||||
// println!("{}", graph.graphviz());
|
|
||||||
// }
|
|
||||||
|
|
||||||
// let proximities = |w1: &str, w2: &str| -> Vec<i8> {
|
|
||||||
// if matches!((w1, w2), ("56", "7")) {
|
|
||||||
// vec![]
|
|
||||||
// } else {
|
|
||||||
// vec![1, 2]
|
|
||||||
// }
|
|
||||||
// };
|
|
||||||
|
|
||||||
// let prox_graph = ProximityGraph::from_query_graph(graph, proximities);
|
|
||||||
|
|
||||||
// println!("{}", prox_graph.graphviz());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// fn remove_element_from_vector(v: &mut Vec<usize>, el: usize) {
|
|
||||||
// let position = v.iter().position(|&x| x == el).unwrap();
|
|
||||||
// v.swap_remove(position);
|
|
||||||
// }
|
|
||||||
|
@ -17,10 +17,6 @@ use crate::{Index, Result};
|
|||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub struct WordDerivations {
|
pub struct WordDerivations {
|
||||||
// TODO: should have a list for the words corresponding to the prefix as well!
|
|
||||||
// This is to implement the `exactness` ranking rule.
|
|
||||||
// However, we could also consider every term in `zero_typo` (except first one) to
|
|
||||||
// be words of that the original word is a prefix of
|
|
||||||
pub original: String,
|
pub original: String,
|
||||||
pub zero_typo: Vec<String>,
|
pub zero_typo: Vec<String>,
|
||||||
pub one_typo: Vec<String>,
|
pub one_typo: Vec<String>,
|
||||||
|
@ -46,8 +46,6 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// ranking_rule_graph.simplify();
|
|
||||||
|
|
||||||
Ok(ranking_rule_graph)
|
Ok(ranking_rule_graph)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,10 +3,9 @@ use std::collections::{BTreeMap, HashSet};
|
|||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::{
|
use super::empty_paths_cache::EmptyPathsCache;
|
||||||
empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, Edge, RankingRuleGraph,
|
use super::paths_map::PathsMap;
|
||||||
RankingRuleGraphTrait,
|
use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
|
||||||
};
|
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||||
pub struct Path {
|
pub struct Path {
|
||||||
|
@ -18,18 +18,12 @@ use crate::{Index, Result};
|
|||||||
|
|
||||||
pub struct EdgeDocidsCache<G: RankingRuleGraphTrait> {
|
pub struct EdgeDocidsCache<G: RankingRuleGraphTrait> {
|
||||||
pub cache: FxHashMap<u32, RoaringBitmap>,
|
pub cache: FxHashMap<u32, RoaringBitmap>,
|
||||||
|
|
||||||
// TODO: There is a big difference between `cache`, which is always valid, and
|
|
||||||
// `empty_path_prefixes`, which is only accurate for a particular universe
|
|
||||||
// ALSO, we should have a universe-specific `empty_edge` to use
|
|
||||||
// pub empty_path_prefixes: HashSet<Vec<u32>>,
|
|
||||||
_phantom: PhantomData<G>,
|
_phantom: PhantomData<G>,
|
||||||
}
|
}
|
||||||
impl<G: RankingRuleGraphTrait> Default for EdgeDocidsCache<G> {
|
impl<G: RankingRuleGraphTrait> Default for EdgeDocidsCache<G> {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
cache: Default::default(),
|
cache: Default::default(),
|
||||||
// empty_path_prefixes: Default::default(),
|
|
||||||
_phantom: Default::default(),
|
_phantom: Default::default(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -5,7 +5,6 @@ pub mod empty_paths_cache;
|
|||||||
pub mod paths_map;
|
pub mod paths_map;
|
||||||
pub mod proximity;
|
pub mod proximity;
|
||||||
pub mod resolve_paths;
|
pub mod resolve_paths;
|
||||||
|
|
||||||
use std::collections::{BTreeSet, HashSet};
|
use std::collections::{BTreeSet, HashSet};
|
||||||
use std::ops::ControlFlow;
|
use std::ops::ControlFlow;
|
||||||
|
|
||||||
@ -86,22 +85,10 @@ pub struct RankingRuleGraph<G: RankingRuleGraphTrait> {
|
|||||||
pub node_edges: Vec<RoaringBitmap>,
|
pub node_edges: Vec<RoaringBitmap>,
|
||||||
|
|
||||||
pub successors: Vec<RoaringBitmap>,
|
pub successors: Vec<RoaringBitmap>,
|
||||||
// to get the edges between two nodes:
|
// TODO: to get the edges between two nodes:
|
||||||
// 1. get node_outgoing_edges[from]
|
// 1. get node_outgoing_edges[from]
|
||||||
// 2. get node_incoming_edges[to]
|
// 2. get node_incoming_edges[to]
|
||||||
// 3. take intersection betweem the two
|
// 3. take intersection betweem the two
|
||||||
|
|
||||||
// TODO: node edges could be different I guess
|
|
||||||
// something like:
|
|
||||||
// pub node_edges: Vec<BitSet>
|
|
||||||
// where each index is the result of:
|
|
||||||
// the successor index in the top 16 bits, the edge index in the bottom 16 bits
|
|
||||||
|
|
||||||
// TODO:
|
|
||||||
// node_successors?
|
|
||||||
|
|
||||||
// pub removed_edges: HashSet<u32>,
|
|
||||||
// pub tmp_removed_edges: HashSet<u32>,
|
|
||||||
}
|
}
|
||||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||||
// Visit all edges between the two given nodes in order of increasing cost.
|
// Visit all edges between the two given nodes in order of increasing cost.
|
||||||
@ -142,50 +129,6 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
|||||||
}
|
}
|
||||||
self.successors[from_node as usize] = new_successors_from_node;
|
self.successors[from_node as usize] = new_successors_from_node;
|
||||||
}
|
}
|
||||||
// pub fn remove_nodes(&mut self, nodes: &[usize]) {
|
|
||||||
// for &node in nodes {
|
|
||||||
// let edge_indices = &mut self.node_edges[node];
|
|
||||||
// for edge_index in edge_indices.iter() {
|
|
||||||
// self.all_edges[*edge_index] = None;
|
|
||||||
// }
|
|
||||||
// edge_indices.clear();
|
|
||||||
|
|
||||||
// let preds = &self.query_graph.edges[node].incoming;
|
|
||||||
// for pred in preds {
|
|
||||||
// let edge_indices = &mut self.node_edges[*pred];
|
|
||||||
// for edge_index in edge_indices.iter() {
|
|
||||||
// let edge_opt = &mut self.all_edges[*edge_index];
|
|
||||||
// let Some(edge) = edge_opt else { continue; };
|
|
||||||
// if edge.to_node == node {
|
|
||||||
// *edge_opt = None;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// panic!("remove nodes is incorrect at the moment");
|
|
||||||
// edge_indices.clear();
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// self.query_graph.remove_nodes(nodes);
|
|
||||||
// }
|
|
||||||
// pub fn simplify(&mut self) {
|
|
||||||
// loop {
|
|
||||||
// let mut nodes_to_remove = vec![];
|
|
||||||
// for (node_idx, node) in self.query_graph.nodes.iter().enumerate() {
|
|
||||||
// if !matches!(node, QueryNode::End | QueryNode::Deleted)
|
|
||||||
// && self.node_edges[node_idx].is_empty()
|
|
||||||
// {
|
|
||||||
// nodes_to_remove.push(node_idx);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// if nodes_to_remove.is_empty() {
|
|
||||||
// break;
|
|
||||||
// } else {
|
|
||||||
// self.remove_nodes(&nodes_to_remove);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// fn is_removed_edge(&self, edge: u32) -> bool {
|
|
||||||
// self.removed_edges.contains(&edge) || self.tmp_removed_edges.contains(&edge)
|
|
||||||
// }
|
|
||||||
|
|
||||||
pub fn graphviz(&self) -> String {
|
pub fn graphviz(&self) -> String {
|
||||||
let mut desc = String::new();
|
let mut desc = String::new();
|
||||||
|
@ -6,14 +6,13 @@ use std::hash::{Hash, Hasher};
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use super::cheapest_paths::Path;
|
use super::cheapest_paths::Path;
|
||||||
use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait, Edge};
|
use super::{Edge, EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
|
||||||
use crate::new::QueryNode;
|
use crate::new::QueryNode;
|
||||||
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct PathsMap<V> {
|
pub struct PathsMap<V> {
|
||||||
nodes: Vec<(u32, PathsMap<V>)>,
|
nodes: Vec<(u32, PathsMap<V>)>,
|
||||||
value: Option<V>
|
value: Option<V>,
|
||||||
}
|
}
|
||||||
impl<V> Default for PathsMap<V> {
|
impl<V> Default for PathsMap<V> {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
@ -73,7 +72,7 @@ impl<V> PathsMap<V> {
|
|||||||
}
|
}
|
||||||
pub fn remove_first(&mut self) -> Option<(Vec<u32>, V)> {
|
pub fn remove_first(&mut self) -> Option<(Vec<u32>, V)> {
|
||||||
if self.is_empty() {
|
if self.is_empty() {
|
||||||
return None
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut result = vec![];
|
let mut result = vec![];
|
||||||
@ -163,7 +162,7 @@ impl<V> PathsMap<V> {
|
|||||||
let [first_edge, remaining_prefix @ ..] = prefix else {
|
let [first_edge, remaining_prefix @ ..] = prefix else {
|
||||||
return self.nodes.iter().map(|n| n.0).collect();
|
return self.nodes.iter().map(|n| n.0).collect();
|
||||||
};
|
};
|
||||||
for (edge, rest) in self.nodes.iter(){
|
for (edge, rest) in self.nodes.iter() {
|
||||||
if edge == first_edge {
|
if edge == first_edge {
|
||||||
return rest.edge_indices_after_prefix(remaining_prefix);
|
return rest.edge_indices_after_prefix(remaining_prefix);
|
||||||
}
|
}
|
||||||
@ -173,14 +172,12 @@ impl<V> PathsMap<V> {
|
|||||||
|
|
||||||
pub fn contains_prefix_of_path(&self, path: &[u32]) -> bool {
|
pub fn contains_prefix_of_path(&self, path: &[u32]) -> bool {
|
||||||
if self.value.is_some() {
|
if self.value.is_some() {
|
||||||
return true
|
return true;
|
||||||
}
|
}
|
||||||
match path {
|
match path {
|
||||||
[] => {
|
[] => false,
|
||||||
false
|
|
||||||
}
|
|
||||||
[first_edge, remaining_path @ ..] => {
|
[first_edge, remaining_path @ ..] => {
|
||||||
for (edge, rest) in self.nodes.iter(){
|
for (edge, rest) in self.nodes.iter() {
|
||||||
if edge == first_edge {
|
if edge == first_edge {
|
||||||
return rest.contains_prefix_of_path(remaining_path);
|
return rest.contains_prefix_of_path(remaining_path);
|
||||||
}
|
}
|
||||||
@ -197,7 +194,12 @@ impl<V> PathsMap<V> {
|
|||||||
desc.push_str("\n}\n");
|
desc.push_str("\n}\n");
|
||||||
desc
|
desc
|
||||||
}
|
}
|
||||||
fn graphviz_rec<G: RankingRuleGraphTrait>(&self, desc: &mut String, path_from: Vec<u64>, graph: &RankingRuleGraph<G>) {
|
fn graphviz_rec<G: RankingRuleGraphTrait>(
|
||||||
|
&self,
|
||||||
|
desc: &mut String,
|
||||||
|
path_from: Vec<u64>,
|
||||||
|
graph: &RankingRuleGraph<G>,
|
||||||
|
) {
|
||||||
let id_from = {
|
let id_from = {
|
||||||
let mut h = DefaultHasher::new();
|
let mut h = DefaultHasher::new();
|
||||||
path_from.hash(&mut h);
|
path_from.hash(&mut h);
|
||||||
@ -227,7 +229,6 @@ impl<V> PathsMap<V> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||||
|
|
||||||
pub fn graphviz_with_path(&self, path: &Path) -> String {
|
pub fn graphviz_with_path(&self, path: &Path) -> String {
|
||||||
let mut desc = String::new();
|
let mut desc = String::new();
|
||||||
desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n");
|
desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n");
|
||||||
@ -248,11 +249,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
|||||||
for (edge_idx, edge) in self.all_edges.iter().enumerate() {
|
for (edge_idx, edge) in self.all_edges.iter().enumerate() {
|
||||||
let Some(edge) = edge else { continue };
|
let Some(edge) = edge else { continue };
|
||||||
let Edge { from_node, to_node, cost, details } = edge;
|
let Edge { from_node, to_node, cost, details } = edge;
|
||||||
let color = if path.edges.contains(&(edge_idx as u32)) {
|
let color = if path.edges.contains(&(edge_idx as u32)) { "red" } else { "green" };
|
||||||
"red"
|
|
||||||
} else {
|
|
||||||
"green"
|
|
||||||
};
|
|
||||||
match &edge.details {
|
match &edge.details {
|
||||||
EdgeDetails::Unconditional => {
|
EdgeDetails::Unconditional => {
|
||||||
desc.push_str(&format!(
|
desc.push_str(&format!(
|
||||||
@ -273,157 +270,4 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
|||||||
desc.push('}');
|
desc.push('}');
|
||||||
desc
|
desc
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::PathsMap;
|
|
||||||
use crate::db_snap;
|
|
||||||
use crate::index::tests::TempIndex;
|
|
||||||
use crate::new::db_cache::DatabaseCache;
|
|
||||||
use crate::new::ranking_rule_graph::cheapest_paths::KCheapestPathsState;
|
|
||||||
use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache;
|
|
||||||
use crate::new::ranking_rule_graph::proximity::ProximityGraph;
|
|
||||||
use crate::new::ranking_rule_graph::RankingRuleGraph;
|
|
||||||
use crate::search::new::query_term::{word_derivations, LocatedQueryTerm};
|
|
||||||
use crate::search::new::QueryGraph;
|
|
||||||
use charabia::Tokenize;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn paths_tree() {
|
|
||||||
let mut index = TempIndex::new();
|
|
||||||
index.index_documents_config.autogenerate_docids = true;
|
|
||||||
index
|
|
||||||
.update_settings(|s| {
|
|
||||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
index
|
|
||||||
.add_documents(documents!([
|
|
||||||
{
|
|
||||||
"text": "0 1 2 3 4 5"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"text": "0 a 1 b 2 3 4 5"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"text": "0 a 1 b 3 a 4 b 5"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"text": "0 a a 1 b 2 3 4 5"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"text": "0 a a a a 1 b 3 45"
|
|
||||||
},
|
|
||||||
]))
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, word_pair_proximity_docids, @"679d1126b569b3e8b10dd937c3faedf9");
|
|
||||||
|
|
||||||
let txn = index.read_txn().unwrap();
|
|
||||||
let mut db_cache = DatabaseCache::default();
|
|
||||||
let fst = index.words_fst(&txn).unwrap();
|
|
||||||
let query =
|
|
||||||
LocatedQueryTerm::from_query("0 1 2 3 4 5".tokenize(), None, |word, is_prefix| {
|
|
||||||
word_derivations(&index, &txn, word, if word.len() < 3 {
|
|
||||||
0
|
|
||||||
} else if word.len() < 6 {
|
|
||||||
1
|
|
||||||
} else {
|
|
||||||
2
|
|
||||||
},is_prefix, &fst)
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap();
|
|
||||||
let empty_paths_cache = EmptyPathsCache::default();
|
|
||||||
let mut db_cache = DatabaseCache::default();
|
|
||||||
|
|
||||||
let mut prox_graph =
|
|
||||||
RankingRuleGraph::<ProximityGraph>::build(&index, &txn, &mut db_cache, graph).unwrap();
|
|
||||||
|
|
||||||
println!("{}", prox_graph.graphviz());
|
|
||||||
|
|
||||||
let mut state = KCheapestPathsState::new(&prox_graph).unwrap();
|
|
||||||
|
|
||||||
let mut path_tree = PathsMap::default();
|
|
||||||
while state.next_cost() <= 6 {
|
|
||||||
let next_state = state.compute_paths_of_next_lowest_cost(&mut prox_graph, &empty_paths_cache, &mut path_tree);
|
|
||||||
if let Some(next_state) = next_state {
|
|
||||||
state = next_state;
|
|
||||||
} else {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let desc = path_tree.graphviz(&prox_graph);
|
|
||||||
println!("{desc}");
|
|
||||||
|
|
||||||
// let path = vec![u32 { from: 0, to: 2, edge_idx: 0 }, u32 { from: 2, to: 3, edge_idx: 0 }, u32 { from: 3, to: 4, edge_idx: 0 }, u32 { from: 4, to: 5, edge_idx: 0 }, u32 { from: 5, to: 8, edge_idx: 0 }, u32 { from: 8, to: 1, edge_idx: 0 }, u32 { from: 1, to: 10, edge_idx: 0 }];
|
|
||||||
// println!("{}", psath_tree.contains_prefix_of_path(&path));
|
|
||||||
|
|
||||||
|
|
||||||
// let path = vec![u32 { from: 0, to: 2, edge_idx: 0 }, u32 { from: 2, to: 3, edge_idx: 0 }, u32 { from: 3, to: 4, edge_idx: 0 }, u32 { from: 4, to: 5, edge_idx: 0 }, u32 { from: 5, to: 6, edge_idx: 0 }, u32 { from: 6, to: 7, edge_idx: 0 }, u32 { from: 7, to: 1, edge_idx: 0 }];
|
|
||||||
|
|
||||||
|
|
||||||
// path_tree.iterate(|path, cost| {
|
|
||||||
// println!("cost {cost} for path: {path:?}");
|
|
||||||
// });
|
|
||||||
|
|
||||||
// path_tree.remove_forbidden_prefix(&[
|
|
||||||
// u32 { from: 0, to: 2, edge_idx: 0 },
|
|
||||||
// u32 { from: 2, to: 3, edge_idx: 2 },
|
|
||||||
// ]);
|
|
||||||
// let desc = path_tree.graphviz();
|
|
||||||
// println!("{desc}");
|
|
||||||
|
|
||||||
// path_tree.remove_forbidden_edge(&u32 { from: 5, to: 6, cost: 1 });
|
|
||||||
|
|
||||||
// let desc = path_tree.graphviz();
|
|
||||||
// println!("AFTER REMOVING 5-6 [1]:\n{desc}");
|
|
||||||
|
|
||||||
// path_tree.remove_forbidden_edge(&u32 { from: 3, to: 4, cost: 1 });
|
|
||||||
|
|
||||||
// let desc = path_tree.graphviz();
|
|
||||||
// println!("AFTER REMOVING 3-4 [1]:\n{desc}");
|
|
||||||
|
|
||||||
// let p = path_tree.remove_first();
|
|
||||||
// println!("PATH: {p:?}");
|
|
||||||
// let desc = path_tree.graphviz();
|
|
||||||
// println!("AFTER REMOVING: {desc}");
|
|
||||||
|
|
||||||
// let p = path_tree.remove_first();
|
|
||||||
// println!("PATH: {p:?}");
|
|
||||||
// let desc = path_tree.graphviz();
|
|
||||||
// println!("AFTER REMOVING: {desc}");
|
|
||||||
|
|
||||||
// path_tree.remove_all_containing_edge(&u32 { from: 5, to: 6, cost: 2 });
|
|
||||||
|
|
||||||
// let desc = path_tree.graphviz();
|
|
||||||
// println!("{desc}");
|
|
||||||
|
|
||||||
// let first_edges = path_tree.remove_first().unwrap();
|
|
||||||
// println!("{first_edges:?}");
|
|
||||||
// let desc = path_tree.graphviz();
|
|
||||||
// println!("{desc}");
|
|
||||||
|
|
||||||
// let first_edges = path_tree.remove_first().unwrap();
|
|
||||||
// println!("{first_edges:?}");
|
|
||||||
// let desc = path_tree.graphviz();
|
|
||||||
// println!("{desc}");
|
|
||||||
|
|
||||||
// let first_edges = path_tree.remove_first().unwrap();
|
|
||||||
// println!("{first_edges:?}");
|
|
||||||
// let desc = path_tree.graphviz();
|
|
||||||
// println!("{desc}");
|
|
||||||
|
|
||||||
// println!("{path_tree:?}");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_contains_prefix_of_path() {
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
|
|
||||||
|
use heed::RoTxn;
|
||||||
|
use itertools::Itertools;
|
||||||
|
|
||||||
use super::ProximityEdge;
|
use super::ProximityEdge;
|
||||||
use crate::new::db_cache::DatabaseCache;
|
use crate::new::db_cache::DatabaseCache;
|
||||||
use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
|
use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
|
||||||
@ -7,8 +10,6 @@ use crate::new::ranking_rule_graph::proximity::WordPair;
|
|||||||
use crate::new::ranking_rule_graph::{Edge, EdgeDetails};
|
use crate::new::ranking_rule_graph::{Edge, EdgeDetails};
|
||||||
use crate::new::QueryNode;
|
use crate::new::QueryNode;
|
||||||
use crate::{Index, Result};
|
use crate::{Index, Result};
|
||||||
use heed::RoTxn;
|
|
||||||
use itertools::Itertools;
|
|
||||||
|
|
||||||
pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations, i8)>> {
|
pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations, i8)>> {
|
||||||
Ok(Some(match from_node {
|
Ok(Some(match from_node {
|
||||||
|
@ -1,17 +1,17 @@
|
|||||||
use roaring::MultiOps;
|
use heed::RoTxn;
|
||||||
|
use roaring::{MultiOps, RoaringBitmap};
|
||||||
|
|
||||||
use super::{ProximityEdge, WordPair};
|
use super::{ProximityEdge, WordPair};
|
||||||
use crate::new::db_cache::DatabaseCache;
|
use crate::new::db_cache::DatabaseCache;
|
||||||
use crate::CboRoaringBitmapCodec;
|
use crate::{CboRoaringBitmapCodec, Result};
|
||||||
|
|
||||||
pub fn compute_docids<'transaction>(
|
pub fn compute_docids<'transaction>(
|
||||||
index: &crate::Index,
|
index: &crate::Index,
|
||||||
txn: &'transaction heed::RoTxn,
|
txn: &'transaction RoTxn,
|
||||||
db_cache: &mut DatabaseCache<'transaction>,
|
db_cache: &mut DatabaseCache<'transaction>,
|
||||||
edge: &ProximityEdge,
|
edge: &ProximityEdge,
|
||||||
) -> crate::Result<roaring::RoaringBitmap> {
|
) -> Result<RoaringBitmap> {
|
||||||
let ProximityEdge { pairs, proximity } = edge;
|
let ProximityEdge { pairs, proximity } = edge;
|
||||||
// TODO: we should know already which pair of words to look for
|
|
||||||
let mut pair_docids = vec![];
|
let mut pair_docids = vec![];
|
||||||
for pair in pairs.iter() {
|
for pair in pairs.iter() {
|
||||||
let bytes = match pair {
|
let bytes = match pair {
|
||||||
@ -25,7 +25,6 @@ pub fn compute_docids<'transaction>(
|
|||||||
bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default();
|
bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default();
|
||||||
pair_docids.push(bitmap);
|
pair_docids.push(bitmap);
|
||||||
}
|
}
|
||||||
pair_docids.sort_by_key(|rb| rb.len());
|
|
||||||
let docids = MultiOps::union(pair_docids);
|
let docids = MultiOps::union(pair_docids);
|
||||||
Ok(docids)
|
Ok(docids)
|
||||||
}
|
}
|
||||||
|
@ -1,12 +1,13 @@
|
|||||||
pub mod build;
|
pub mod build;
|
||||||
pub mod compute_docids;
|
pub mod compute_docids;
|
||||||
|
|
||||||
|
use heed::RoTxn;
|
||||||
|
|
||||||
use super::{Edge, EdgeDetails, RankingRuleGraphTrait};
|
use super::{Edge, EdgeDetails, RankingRuleGraphTrait};
|
||||||
use crate::new::db_cache::DatabaseCache;
|
use crate::new::db_cache::DatabaseCache;
|
||||||
use crate::new::query_term::WordDerivations;
|
use crate::new::query_term::WordDerivations;
|
||||||
use crate::new::QueryNode;
|
use crate::new::QueryNode;
|
||||||
use crate::{Index, Result};
|
use crate::{Index, Result};
|
||||||
use heed::RoTxn;
|
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub enum WordPair {
|
pub enum WordPair {
|
||||||
|
@ -68,14 +68,6 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
|||||||
}
|
}
|
||||||
path_bitmaps.push(path_bitmap);
|
path_bitmaps.push(path_bitmap);
|
||||||
}
|
}
|
||||||
let docids = MultiOps::union(path_bitmaps);
|
Ok(MultiOps::union(path_bitmaps))
|
||||||
Ok(docids)
|
|
||||||
// for each path, translate it to an intersection of cached roaring bitmaps
|
|
||||||
// then do a union for all paths
|
|
||||||
|
|
||||||
// get the docids of the given paths in the proximity graph
|
|
||||||
// in the fastest possible way
|
|
||||||
// 1. roaring MultiOps (before we can do the Frozen+AST thing)
|
|
||||||
// 2. minimize number of operations
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -97,49 +97,10 @@ pub fn get_start_universe<'transaction>(
|
|||||||
query_graph: &QueryGraph,
|
query_graph: &QueryGraph,
|
||||||
term_matching_strategy: TermsMatchingStrategy,
|
term_matching_strategy: TermsMatchingStrategy,
|
||||||
// filters: Filters,
|
// filters: Filters,
|
||||||
// mut distinct: Option<D>,
|
|
||||||
) -> Result<RoaringBitmap> {
|
) -> Result<RoaringBitmap> {
|
||||||
// NOTE:
|
// TODO: actually compute the universe from the query graph
|
||||||
//
|
|
||||||
// There is a performance problem when using `distinct` + exhaustive number of hits,
|
|
||||||
// especially for search that yield many results (many ~= almost all of the
|
|
||||||
// dataset).
|
|
||||||
//
|
|
||||||
// We'll solve it later. Maybe there are smart ways to go about it.
|
|
||||||
//
|
|
||||||
// For example, if there are millions of possible values for the distinct attribute,
|
|
||||||
// then we could just look at the documents which share any distinct attribute with
|
|
||||||
// another one, and remove the later docids them from the universe.
|
|
||||||
// => NO! because we don't know which one to remove, only after the sorting is done can we know it
|
|
||||||
// => this kind of computation can be done, but only in the evaluation of the number
|
|
||||||
// of hits for the documents that aren't returned by the search.
|
|
||||||
//
|
|
||||||
// `Distinct` otherwise should always be computed during
|
|
||||||
|
|
||||||
let universe = index.documents_ids(txn).unwrap();
|
let universe = index.documents_ids(txn).unwrap();
|
||||||
|
Ok(universe)
|
||||||
// resolve the whole query tree to retrieve an exhaustive list of documents matching the query.
|
|
||||||
// NOTE: this is wrong
|
|
||||||
// Instead, we should only compute the documents corresponding to the last remaining
|
|
||||||
// word, 2-gram, and 3-gran.
|
|
||||||
// let candidates = resolve_query_graph(index, txn, db_cache, query_graph, &universe)?;
|
|
||||||
|
|
||||||
// Distinct should be lazy if placeholder?
|
|
||||||
//
|
|
||||||
// // because the initial_candidates should be an exhaustive count of the matching documents,
|
|
||||||
// // we precompute the distinct attributes.
|
|
||||||
// let initial_candidates = match &mut distinct {
|
|
||||||
// Some(distinct) => {
|
|
||||||
// let mut initial_candidates = RoaringBitmap::new();
|
|
||||||
// for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) {
|
|
||||||
// initial_candidates.insert(c?);
|
|
||||||
// }
|
|
||||||
// initial_candidates
|
|
||||||
// }
|
|
||||||
// None => candidates.clone(),
|
|
||||||
// };
|
|
||||||
|
|
||||||
Ok(/*candidates*/ universe)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn execute_search<'transaction>(
|
pub fn execute_search<'transaction>(
|
||||||
@ -306,43 +267,6 @@ mod tests {
|
|||||||
let primary_key = index.primary_key(&txn).unwrap().unwrap();
|
let primary_key = index.primary_key(&txn).unwrap().unwrap();
|
||||||
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
|
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
|
||||||
|
|
||||||
loop {
|
|
||||||
let start = Instant::now();
|
|
||||||
|
|
||||||
let mut db_cache = DatabaseCache::default();
|
|
||||||
|
|
||||||
let query_graph = make_query_graph(
|
|
||||||
&index,
|
|
||||||
&txn,
|
|
||||||
&mut db_cache,
|
|
||||||
"released from prison by the government",
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
// println!("{}", query_graph.graphviz());
|
|
||||||
|
|
||||||
// TODO: filters + maybe distinct attributes?
|
|
||||||
let universe = get_start_universe(
|
|
||||||
&index,
|
|
||||||
&txn,
|
|
||||||
&mut db_cache,
|
|
||||||
&query_graph,
|
|
||||||
TermsMatchingStrategy::Last,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
// println!("universe: {universe:?}");
|
|
||||||
|
|
||||||
let results = execute_search(
|
|
||||||
&index,
|
|
||||||
&txn,
|
|
||||||
&mut db_cache,
|
|
||||||
&universe,
|
|
||||||
&query_graph, /* 0, 20 */
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let elapsed = start.elapsed();
|
|
||||||
println!("{}us: {results:?}", elapsed.as_micros());
|
|
||||||
}
|
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|
||||||
let mut db_cache = DatabaseCache::default();
|
let mut db_cache = DatabaseCache::default();
|
||||||
@ -350,7 +274,6 @@ mod tests {
|
|||||||
let query_graph =
|
let query_graph =
|
||||||
make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government")
|
make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government")
|
||||||
.unwrap();
|
.unwrap();
|
||||||
// println!("{}", query_graph.graphviz());
|
|
||||||
|
|
||||||
// TODO: filters + maybe distinct attributes?
|
// TODO: filters + maybe distinct attributes?
|
||||||
let universe = get_start_universe(
|
let universe = get_start_universe(
|
||||||
@ -361,7 +284,6 @@ mod tests {
|
|||||||
TermsMatchingStrategy::Last,
|
TermsMatchingStrategy::Last,
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
// println!("universe: {universe:?}");
|
|
||||||
|
|
||||||
let results =
|
let results =
|
||||||
execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /* 0, 20 */)
|
execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /* 0, 20 */)
|
||||||
@ -396,7 +318,7 @@ mod tests {
|
|||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
|
|
||||||
let mut s = Search::new(&txn, &index);
|
let mut s = Search::new(&txn, &index);
|
||||||
s.query("released from prison by the government");
|
s.query("b b b b b b b b b b");
|
||||||
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
|
s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
|
||||||
let docs = s.execute().unwrap();
|
let docs = s.execute().unwrap();
|
||||||
@ -414,30 +336,14 @@ mod tests {
|
|||||||
let index = Index::new(options, "data_movies").unwrap();
|
let index = Index::new(options, "data_movies").unwrap();
|
||||||
let mut wtxn = index.write_txn().unwrap();
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
|
||||||
// let primary_key = "id";
|
|
||||||
// let searchable_fields = vec!["title", "overview"];
|
|
||||||
// let filterable_fields = vec!["release_date", "genres"];
|
|
||||||
// let sortable_fields = vec[];
|
|
||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||||
|
|
||||||
builder.set_min_word_len_one_typo(5);
|
builder.set_min_word_len_one_typo(5);
|
||||||
builder.set_min_word_len_two_typos(100);
|
builder.set_min_word_len_two_typos(100);
|
||||||
|
|
||||||
// builder.set_primary_key(primary_key.to_owned());
|
|
||||||
|
|
||||||
// let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
|
|
||||||
// builder.set_searchable_fields(searchable_fields);
|
|
||||||
|
|
||||||
// let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
|
|
||||||
// builder.set_filterable_fields(filterable_fields);
|
|
||||||
|
|
||||||
builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
||||||
|
|
||||||
// let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect();
|
|
||||||
// builder.set_sortable_fields(sortable_fields);
|
|
||||||
|
|
||||||
builder.execute(|_| (), || false).unwrap();
|
builder.execute(|_| (), || false).unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -452,7 +358,6 @@ mod tests {
|
|||||||
let primary_key = "id";
|
let primary_key = "id";
|
||||||
let searchable_fields = vec!["title", "overview"];
|
let searchable_fields = vec!["title", "overview"];
|
||||||
let filterable_fields = vec!["release_date", "genres"];
|
let filterable_fields = vec!["release_date", "genres"];
|
||||||
// let sortable_fields = vec[];
|
|
||||||
|
|
||||||
let config = IndexerConfig::default();
|
let config = IndexerConfig::default();
|
||||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||||
|
@ -1,7 +1,8 @@
|
|||||||
|
use std::collections::{HashMap, HashSet, VecDeque};
|
||||||
|
|
||||||
use fxhash::FxHashMap;
|
use fxhash::FxHashMap;
|
||||||
use heed::{BytesDecode, RoTxn};
|
use heed::{BytesDecode, RoTxn};
|
||||||
use roaring::{MultiOps, RoaringBitmap};
|
use roaring::{MultiOps, RoaringBitmap};
|
||||||
use std::collections::{HashMap, HashSet, VecDeque};
|
|
||||||
|
|
||||||
use super::db_cache::DatabaseCache;
|
use super::db_cache::DatabaseCache;
|
||||||
use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
|
use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
|
||||||
@ -9,8 +10,6 @@ use super::QueryGraph;
|
|||||||
use crate::{Index, Result, RoaringBitmapCodec};
|
use crate::{Index, Result, RoaringBitmapCodec};
|
||||||
|
|
||||||
// TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.
|
// TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.
|
||||||
|
|
||||||
// TODO: reuse NodeDocidsCache in between calls to resolve_query_graph
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct NodeDocIdsCache {
|
pub struct NodeDocIdsCache {
|
||||||
pub cache: FxHashMap<u32, RoaringBitmap>,
|
pub cache: FxHashMap<u32, RoaringBitmap>,
|
||||||
@ -55,11 +54,6 @@ impl NodeDocIdsCache {
|
|||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap());
|
.map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap());
|
||||||
MultiOps::union(derivations_iter)
|
MultiOps::union(derivations_iter)
|
||||||
// TODO: if `or` is empty, register that somewhere, and immediately return an empty bitmap
|
|
||||||
// On the other hand, `or` *cannot* be empty, only its intersection with the universe can
|
|
||||||
//
|
|
||||||
// TODO: Or we don't do anything and accumulate all these operations in a tree of operations
|
|
||||||
// between frozen roaring bitmap that is resolved only at the very end
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
let _ = self.cache.insert(node_idx, docids);
|
let _ = self.cache.insert(node_idx, docids);
|
||||||
@ -79,10 +73,7 @@ pub fn resolve_query_graph<'transaction>(
|
|||||||
// TODO: there is definitely a faster way to compute this big
|
// TODO: there is definitely a faster way to compute this big
|
||||||
// roaring bitmap expression
|
// roaring bitmap expression
|
||||||
|
|
||||||
// resolve_query_graph_rec(index, txn, q, q.root_node, &mut docids, &mut cache)?;
|
|
||||||
|
|
||||||
let mut nodes_resolved = RoaringBitmap::new();
|
let mut nodes_resolved = RoaringBitmap::new();
|
||||||
// TODO: should be given as an argument and kept between invocations of resolve query graph
|
|
||||||
let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()];
|
let mut path_nodes_docids = vec![RoaringBitmap::new(); q.nodes.len()];
|
||||||
|
|
||||||
let mut next_nodes_to_visit = VecDeque::new();
|
let mut next_nodes_to_visit = VecDeque::new();
|
||||||
@ -123,100 +114,14 @@ pub fn resolve_query_graph<'transaction>(
|
|||||||
next_nodes_to_visit.push_back(succ);
|
next_nodes_to_visit.push_back(succ);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is currently slow but could easily be implemented very efficiently
|
// This is currently slow but could easily be implemented very efficiently
|
||||||
for prec in q.edges[node as usize].predecessors.iter() {
|
for prec in q.edges[node as usize].predecessors.iter() {
|
||||||
if q.edges[prec as usize].successors.is_subset(&nodes_resolved) {
|
if q.edges[prec as usize].successors.is_subset(&nodes_resolved) {
|
||||||
path_nodes_docids[prec as usize].clear();
|
path_nodes_docids[prec as usize].clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// println!("cached docids: {nodes_docids:?}");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
panic!()
|
panic!()
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use charabia::Tokenize;
|
|
||||||
|
|
||||||
use super::resolve_query_graph;
|
|
||||||
use crate::db_snap;
|
|
||||||
use crate::index::tests::TempIndex;
|
|
||||||
use crate::new::db_cache::DatabaseCache;
|
|
||||||
use crate::new::resolve_query_graph::NodeDocIdsCache;
|
|
||||||
use crate::search::new::query_term::{word_derivations, LocatedQueryTerm};
|
|
||||||
use crate::search::new::QueryGraph;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_resolve_query_graph() {
|
|
||||||
let index = TempIndex::new();
|
|
||||||
|
|
||||||
index
|
|
||||||
.update_settings(|s| {
|
|
||||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
index
|
|
||||||
.add_documents(documents!([
|
|
||||||
{"id": 0, "text": "0"},
|
|
||||||
{"id": 1, "text": "1"},
|
|
||||||
{"id": 2, "text": "2"},
|
|
||||||
{"id": 3, "text": "3"},
|
|
||||||
{"id": 4, "text": "4"},
|
|
||||||
{"id": 5, "text": "5"},
|
|
||||||
{"id": 6, "text": "6"},
|
|
||||||
{"id": 7, "text": "7"},
|
|
||||||
{"id": 8, "text": "0 1 2 3 4 5 6 7"},
|
|
||||||
{"id": 9, "text": "7 6 5 4 3 2 1 0"},
|
|
||||||
{"id": 10, "text": "01 234 56 7"},
|
|
||||||
{"id": 11, "text": "7 56 0 1 23 5 4"},
|
|
||||||
{"id": 12, "text": "0 1 2 3 4 5 6"},
|
|
||||||
{"id": 13, "text": "01 23 4 5 7"},
|
|
||||||
]))
|
|
||||||
.unwrap();
|
|
||||||
db_snap!(index, word_docids, @"7512d0b80659f6bf37d98b374ada8098");
|
|
||||||
|
|
||||||
let txn = index.read_txn().unwrap();
|
|
||||||
let mut db_cache = DatabaseCache::default();
|
|
||||||
let fst = index.words_fst(&txn).unwrap();
|
|
||||||
let query = LocatedQueryTerm::from_query(
|
|
||||||
"no 0 1 2 3 no 4 5 6 7".tokenize(),
|
|
||||||
None,
|
|
||||||
|word, is_prefix| {
|
|
||||||
word_derivations(
|
|
||||||
&index,
|
|
||||||
&txn,
|
|
||||||
word,
|
|
||||||
if word.len() < 3 {
|
|
||||||
0
|
|
||||||
} else if word.len() < 6 {
|
|
||||||
1
|
|
||||||
} else {
|
|
||||||
2
|
|
||||||
},
|
|
||||||
is_prefix,
|
|
||||||
&fst,
|
|
||||||
)
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap();
|
|
||||||
println!("{}", graph.graphviz());
|
|
||||||
let mut node_docids_cache = NodeDocIdsCache::default();
|
|
||||||
let universe = index.documents_ids(&txn).unwrap();
|
|
||||||
insta::assert_debug_snapshot!(universe, @"RoaringBitmap<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]>");
|
|
||||||
let docids = resolve_query_graph(
|
|
||||||
&index,
|
|
||||||
&txn,
|
|
||||||
&mut db_cache,
|
|
||||||
&mut node_docids_cache,
|
|
||||||
&graph,
|
|
||||||
&universe,
|
|
||||||
)
|
|
||||||
.unwrap();
|
|
||||||
insta::assert_debug_snapshot!(docids, @"RoaringBitmap<[8, 9, 11]>");
|
|
||||||
|
|
||||||
// TODO: test with a reduced universe
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use super::db_cache::DatabaseCache;
|
||||||
use super::{
|
use super::{
|
||||||
db_cache::DatabaseCache, RankingRule, RankingRuleOutput, RankingRuleOutputIter,
|
RankingRule, RankingRuleOutput, RankingRuleOutputIter, RankingRuleOutputIterWrapper,
|
||||||
RankingRuleOutputIterWrapper, RankingRuleQueryTrait,
|
RankingRuleQueryTrait,
|
||||||
};
|
};
|
||||||
use crate::{
|
use crate::{
|
||||||
// facet::FacetType,
|
// facet::FacetType,
|
||||||
@ -33,18 +34,6 @@ impl<'transaction, Query> Sort<'transaction, Query> {
|
|||||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||||
let field_id = fields_ids_map.id(&field_name);
|
let field_id = fields_ids_map.id(&field_name);
|
||||||
|
|
||||||
// TODO: What is this, why?
|
|
||||||
// let faceted_candidates = match field_id {
|
|
||||||
// Some(field_id) => {
|
|
||||||
// let number_faceted =
|
|
||||||
// index.faceted_documents_ids(rtxn, field_id, FacetType::Number)?;
|
|
||||||
// let string_faceted =
|
|
||||||
// index.faceted_documents_ids(rtxn, field_id, FacetType::String)?;
|
|
||||||
// number_faceted | string_faceted
|
|
||||||
// }
|
|
||||||
// None => RoaringBitmap::default(),
|
|
||||||
// };
|
|
||||||
|
|
||||||
Ok(Self { field_id, is_ascending, iter: None })
|
Ok(Self { field_id, is_ascending, iter: None })
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -79,8 +79,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
|
|||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
let Some(query_graph) = &mut self.query_graph else { panic!() };
|
let Some(query_graph) = &mut self.query_graph else { panic!() };
|
||||||
// let graphviz = query_graph.graphviz();
|
|
||||||
// println!("\n===={graphviz}\n====");
|
|
||||||
let this_bucket = resolve_query_graph(
|
let this_bucket = resolve_query_graph(
|
||||||
index,
|
index,
|
||||||
txn,
|
txn,
|
||||||
@ -89,10 +88,8 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
|
|||||||
query_graph,
|
query_graph,
|
||||||
universe,
|
universe,
|
||||||
)?;
|
)?;
|
||||||
// println!("WORDS: this bucket: {this_bucket:?}");
|
|
||||||
let child_query_graph = query_graph.clone();
|
let child_query_graph = query_graph.clone();
|
||||||
// this_bucket is the one that must be returned now
|
|
||||||
// self.cur_bucket is set to the next bucket
|
|
||||||
// TODO: Check whether a position exists in the graph before removing it and
|
// TODO: Check whether a position exists in the graph before removing it and
|
||||||
// returning the next bucket.
|
// returning the next bucket.
|
||||||
// while graph.does_not_contain(positions_to_remove.last()) { positions_to_remove.pop() }
|
// while graph.does_not_contain(positions_to_remove.last()) { positions_to_remove.pop() }
|
||||||
@ -118,41 +115,3 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
|
|||||||
self.positions_to_remove = vec![];
|
self.positions_to_remove = vec![];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
// use charabia::Tokenize;
|
|
||||||
// use roaring::RoaringBitmap;
|
|
||||||
|
|
||||||
// use crate::{
|
|
||||||
// index::tests::TempIndex,
|
|
||||||
// search::{criteria::CriteriaBuilder, new::QueryGraphOrPlaceholder},
|
|
||||||
// };
|
|
||||||
|
|
||||||
// use super::Words;
|
|
||||||
|
|
||||||
// fn placeholder() {
|
|
||||||
// let qt = QueryGraphOrPlaceholder::Placeholder;
|
|
||||||
// let index = TempIndex::new();
|
|
||||||
// let rtxn = index.read_txn().unwrap();
|
|
||||||
|
|
||||||
// let query = "a beautiful summer house by the beach overlooking what seems";
|
|
||||||
// // let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap();
|
|
||||||
// // let (qt, parts, matching_words) = builder.build(query.tokenize()).unwrap().unwrap();
|
|
||||||
|
|
||||||
// // let cb = CriteriaBuilder::new(&rtxn, &index).unwrap();
|
|
||||||
// // let x = cb
|
|
||||||
// // .build(
|
|
||||||
// // Some(qt),
|
|
||||||
// // Some(parts),
|
|
||||||
// // None,
|
|
||||||
// // None,
|
|
||||||
// // false,
|
|
||||||
// // None,
|
|
||||||
// // crate::CriterionImplementationStrategy::OnlySetBased,
|
|
||||||
// // )
|
|
||||||
// // .unwrap();
|
|
||||||
|
|
||||||
// // let rr = Words::new(&index, &RoaringBitmap::from_sorted_iter(0..1000)).unwrap();
|
|
||||||
// }
|
|
||||||
}
|
|
||||||
|
Loading…
Reference in New Issue
Block a user