diff --git a/Cargo.lock b/Cargo.lock index b960a47c9..6129fd497 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -292,12 +292,6 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" -[[package]] -name = "fixedbitset" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d" - [[package]] name = "flate2" version = "1.0.14" @@ -642,9 +636,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "1.3.2" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "076f042c5b7b98f31d205f1249267e12a6518c1481e9dae9764af19b707d2292" +checksum = "c398b2b113b55809ceb9ee3e753fcbac793f1956663f3c36549c1346015c2afe" dependencies = [ "autocfg 1.0.0", ] @@ -667,15 +661,6 @@ dependencies = [ "libc", ] -[[package]] -name = "itertools" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484" -dependencies = [ - "either", -] - [[package]] name = "itertools" version = "0.9.0" @@ -805,13 +790,13 @@ dependencies = [ "fst", "fxhash", "heed", - "itertools 0.9.0", + "indexmap", + "itertools", "jemallocator", "levenshtein_automata", "memmap", "once_cell", "oxidized-mtbl", - "pathfinding", "rayon", "roaring", "serde", @@ -988,15 +973,6 @@ dependencies = [ "winapi 0.3.8", ] -[[package]] -name = "num-traits" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c62be47e61d1842b9170f0fdeec8eba98e60e90e5446449a0545e5152acd7096" -dependencies = [ - "autocfg 1.0.0", -] - [[package]] name = "num_cpus" version = "1.13.0" @@ -1041,18 +1017,6 @@ dependencies = [ "winapi 0.3.8", ] -[[package]] -name = "pathfinding" -version = "2.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "86f4d8cc85ca67860ef4324faf86973a39e4e1c78338987eda29a8e6b6ec0c0e" -dependencies = [ - "fixedbitset", - "indexmap", - "itertools 0.8.2", - "num-traits", -] - [[package]] name = "percent-encoding" version = "2.1.0" @@ -1979,6 +1943,6 @@ checksum = "c442965efc45353be5a9b9969c9b0872fff6828c7e06d118dda2cb2d0bb11d5a" dependencies = [ "cc", "glob", - "itertools 0.9.0", + "itertools", "libc", ] diff --git a/Cargo.toml b/Cargo.toml index e19a3fd6e..168a9fb09 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,7 +28,7 @@ structopt = { version = "0.3.14", default-features = false } tempfile = "3.1.0" # best proximity -pathfinding = "2.0.4" +indexmap = "1.4.0" # to implement internally itertools = "0.9.0" diff --git a/src/best_proximity.rs b/src/best_proximity.rs index 49f90eb8e..b17f176c7 100644 --- a/src/best_proximity.rs +++ b/src/best_proximity.rs @@ -1,7 +1,7 @@ use std::cmp; use std::time::Instant; -use pathfinding::directed::astar::astar_bag; +use crate::iter_shortest_paths::astar_bag; const ONE_ATTRIBUTE: u32 = 1000; const MAX_DISTANCE: u32 = 8; @@ -37,6 +37,8 @@ enum Node { position: u32, // The total accumulated proximity until this node, used for skipping nodes. acc_proximity: u32, + // The parent position from the above layer. 
+ parent_position: u32, }, } @@ -44,35 +46,29 @@ impl Node { // TODO we must skip the successors that have already been seen // TODO we must skip the successors that doesn't return any documents // this way we are able to skip entire paths - fn successors( - &self, - positions: &[Vec], - best_proximity: u32, - mut contains_documents: F, - ) -> Vec<(Node, u32)> - where F: FnMut((usize, u32), (usize, u32)) -> bool, - { + fn successors(&self, positions: &[Vec], best_proximity: u32) -> Vec<(Node, u32)> { match self { Node::Uninit => { positions[0].iter().map(|p| { - (Node::Init { layer: 0, position: *p, acc_proximity: 0 }, 0) + (Node::Init { layer: 0, position: *p, acc_proximity: 0, parent_position: 0 }, 0) }).collect() }, // We reached the highest layer n @ Node::Init { .. } if n.is_complete(positions) => vec![], - Node::Init { layer, position, acc_proximity } => { + Node::Init { layer, position, acc_proximity, .. } => { positions[layer + 1].iter().filter_map(|p| { let proximity = positions_proximity(*position, *p); - let node = Node::Init { layer: layer + 1, position: *p, acc_proximity: acc_proximity + proximity }; - if (contains_documents)((*layer, *position), (layer + 1, *p)) { - // We do not produce the nodes we have already seen in previous iterations loops. - if node.is_complete(positions) && acc_proximity + proximity < best_proximity { - None - } else { - Some((node, proximity)) - } - } else { + let node = Node::Init { + layer: layer + 1, + position: *p, + acc_proximity: acc_proximity + proximity, + parent_position: *position, + }; + // We do not produce the nodes we have already seen in previous iterations loops. + if node.is_complete(positions) && acc_proximity + proximity < best_proximity { None + } else { + Some((node, proximity)) } }).collect() } @@ -92,6 +88,35 @@ impl Node { Node::Init { position, .. 
} => Some(*position), } } + + fn proximity(&self) -> u32 { + match self { + Node::Uninit => 0, + Node::Init { layer, position, acc_proximity, parent_position } => { + if layer.checked_sub(1).is_some() { + acc_proximity + positions_proximity(*position, *parent_position) + } else { + 0 + } + }, + } + } + + fn is_reachable(&self, mut contains_documents: F) -> bool + where F: FnMut((usize, u32), (usize, u32)) -> bool, + { + match self { + Node::Uninit => true, + Node::Init { layer, position, parent_position, .. } => { + match layer.checked_sub(1) { + Some(parent_layer) => { + (contains_documents)((parent_layer, *parent_position), (*layer, *position)) + }, + None => true, + } + }, + } + } } pub struct BestProximity { @@ -102,7 +127,7 @@ pub struct BestProximity { impl BestProximity { pub fn new(positions: Vec>, contains_documents: F) -> BestProximity { - let best_proximity = positions.len() as u32 - 1; + let best_proximity = (positions.len() as u32).saturating_sub(1); BestProximity { positions, best_proximity, contains_documents } } } @@ -121,9 +146,12 @@ where F: FnMut((usize, u32), (usize, u32)) -> bool + Copy, let result = astar_bag( &Node::Uninit, // start - |n| n.successors(&self.positions, self.best_proximity, self.contains_documents), + |n| n.successors(&self.positions, self.best_proximity), |_| 0, // heuristic - |n| n.is_complete(&self.positions), // success + |n| { // success + let c = n.is_complete(&self.positions) && n.proximity() >= self.best_proximity; + if n.is_reachable(self.contains_documents) { Some(c) } else { None } + }, ); eprintln!("BestProximity::next() took {:.02?}", before.elapsed()); diff --git a/src/iter_shortest_paths.rs b/src/iter_shortest_paths.rs new file mode 100644 index 000000000..f993ea674 --- /dev/null +++ b/src/iter_shortest_paths.rs @@ -0,0 +1,204 @@ +use std::cmp::Ordering; +use std::collections::{BinaryHeap, HashSet}; +use std::hash::Hash; +use std::usize; + +use indexmap::map::Entry::{Occupied, Vacant}; +use indexmap::IndexMap; + 
+pub fn astar_bag( + start: &N, + mut successors: FN, + mut heuristic: FH, + mut success: FS, +) -> Option<(AstarSolution, u32)> +where + N: Eq + Hash + Clone, + FN: FnMut(&N) -> IN, + IN: IntoIterator, + FH: FnMut(&N) -> u32, + FS: FnMut(&N) -> Option, +{ + let mut to_see = BinaryHeap::new(); + let mut min_cost = None; + let mut sinks = HashSet::new(); + to_see.push(SmallestCostHolder { + estimated_cost: heuristic(start), + cost: 0, + index: 0, + }); + let mut parents: IndexMap, u32)> = IndexMap::new(); + parents.insert(start.clone(), (HashSet::new(), 0)); + while let Some(SmallestCostHolder { cost, index, estimated_cost, .. }) = to_see.pop() { + if let Some(min_cost) = min_cost { + if estimated_cost > min_cost { + break; + } + } + let successors = { + let (node, &(_, c)) = parents.get_index(index).unwrap(); + // We check that the node is even reachable and if so if it is an answer. + // If this node is unreachable we skip it. + match success(node) { + Some(success) => if success { + min_cost = Some(cost); + sinks.insert(index); + }, + None => continue, + } + + // We may have inserted a node several time into the binary heap if we found + // a better way to access it. Ensure that we are currently dealing with the + // best path and discard the others. + if cost > c { + continue; + } + successors(node) + }; + for (successor, move_cost) in successors { + let new_cost = cost + move_cost; + let h; // heuristic(&successor) + let n; // index for successor + match parents.entry(successor) { + Vacant(e) => { + h = heuristic(e.key()); + n = e.index(); + let mut p = HashSet::new(); + p.insert(index); + e.insert((p, new_cost)); + } + Occupied(mut e) => { + if e.get().1 > new_cost { + h = heuristic(e.key()); + n = e.index(); + let s = e.get_mut(); + s.0.clear(); + s.0.insert(index); + s.1 = new_cost; + } else { + if e.get().1 == new_cost { + // New parent with an identical cost, this is not + // considered as an insertion. 
+ e.get_mut().0.insert(index); + } + continue; + } + } + } + + to_see.push(SmallestCostHolder { + estimated_cost: new_cost + h, + cost: new_cost, + index: n, + }); + } + } + + min_cost.map(|cost| { + let parents = parents + .into_iter() + .map(|(k, (ps, _))| (k, ps.into_iter().collect())) + .collect(); + ( + AstarSolution { + sinks: sinks.into_iter().collect(), + parents, + current: vec![], + terminated: false, + }, + cost, + ) + }) +} + +struct SmallestCostHolder { + estimated_cost: K, + cost: K, + index: usize, +} + +impl PartialEq for SmallestCostHolder { + fn eq(&self, other: &Self) -> bool { + self.estimated_cost.eq(&other.estimated_cost) && self.cost.eq(&other.cost) + } +} + +impl Eq for SmallestCostHolder {} + +impl PartialOrd for SmallestCostHolder { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for SmallestCostHolder { + fn cmp(&self, other: &Self) -> Ordering { + match other.estimated_cost.cmp(&self.estimated_cost) { + Ordering::Equal => self.cost.cmp(&other.cost), + s => s, + } + } +} + +/// Iterator structure created by the `astar_bag` function. 
/// Each call to `next` yields one complete path as a `Vec<N>`, ordered from a node that has
/// no parents down to one of the `sinks`.
#[derive(Clone)]
pub struct AstarSolution<N> {
    /// Indices into `parents` of the nodes the paths end on.
    sinks: Vec<usize>,
    /// Every known node paired with the indices of its parents.
    parents: Vec<(N, Vec<usize>)>,
    /// Backtracking stack: one list of candidate indices per depth, the last element of each
    /// list being the candidate currently selected at that depth.
    current: Vec<Vec<usize>>,
    /// Set once every path has been yielded.
    terminated: bool,
}

impl<N> AstarSolution<N> {
    /// Grows `current` from the selected candidate upwards until a node without parents is
    /// reached.
    fn complete(&mut self) {
        loop {
            let ps = match self.current.last() {
                None => self.sinks.clone(),
                Some(last) => {
                    let &top = last
                        .last()
                        .expect("candidate lists on the stack are never left empty");
                    self.parents(top).to_vec()
                }
            };
            if ps.is_empty() {
                break;
            }
            self.current.push(ps);
        }
    }

    /// Moves to the next unexplored candidate: exhausted levels (a single candidate left) are
    /// dropped, then the selected candidate of the deepest remaining level is discarded.
    fn next_vec(&mut self) {
        while self.current.last().map(Vec::len) == Some(1) {
            self.current.pop();
        }
        if let Some(candidates) = self.current.last_mut() {
            candidates.pop();
        }
    }

    /// Node stored at index `i`.
    fn node(&self, i: usize) -> &N {
        &self.parents[i].0
    }

    /// Indices of the parents of the node stored at index `i`.
    fn parents(&self, i: usize) -> &[usize] {
        &self.parents[i].1
    }
}

impl<N: Clone> Iterator for AstarSolution<N> {
    type Item = Vec<N>;

    fn next(&mut self) -> Option<Self::Item> {
        if self.terminated {
            return None;
        }
        self.complete();
        // The stack runs from sink to root, so it is walked backwards to emit root-first.
        let path = self
            .current
            .iter()
            .rev()
            .map(|candidates| {
                let &index = candidates
                    .last()
                    .expect("candidate lists on the stack are never left empty");
                self.node(index).clone()
            })
            .collect::<Vec<_>>();
        self.next_vec();
        self.terminated = self.current.is_empty();
        Some(path)
    }
}

// Remainder of the patch that shared this source line (src/lib.rs hunk), kept verbatim:
//
// diff --git a/src/lib.rs b/src/lib.rs
// index fee6831ef..d49621483 100644
// --- a/src/lib.rs
// +++ b/src/lib.rs
// @@ -1,4 +1,5 @@
//  mod best_proximity;
// +mod iter_shortest_paths;
//  mod query_tokens;
//
//  use std::borrow::Cow;