diff --git a/Cargo.lock b/Cargo.lock index a7590c0de..7899d6558 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -292,6 +292,12 @@ version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" +[[package]] +name = "fixedbitset" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37ab347416e802de484e4d03c7316c48f1ecb56574dfd4a46a80f173ce1de04d" + [[package]] name = "flate2" version = "1.0.14" @@ -661,6 +667,15 @@ dependencies = [ "libc", ] +[[package]] +name = "itertools" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.9.0" @@ -790,12 +805,13 @@ dependencies = [ "fst", "fxhash", "heed", - "itertools", + "itertools 0.9.0", "jemallocator", "levenshtein_automata", "memmap", "once_cell", "oxidized-mtbl", + "pathfinding", "rayon", "roaring", "serde", @@ -972,6 +988,15 @@ dependencies = [ "winapi 0.3.8", ] +[[package]] +name = "num-traits" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c62be47e61d1842b9170f0fdeec8eba98e60e90e5446449a0545e5152acd7096" +dependencies = [ + "autocfg 1.0.0", +] + [[package]] name = "num_cpus" version = "1.13.0" @@ -1016,6 +1041,18 @@ dependencies = [ "winapi 0.3.8", ] +[[package]] +name = "pathfinding" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86f4d8cc85ca67860ef4324faf86973a39e4e1c78338987eda29a8e6b6ec0c0e" +dependencies = [ + "fixedbitset", + "indexmap", + "itertools 0.8.2", + "num-traits", +] + [[package]] name = "percent-encoding" version = "2.1.0" @@ -1942,6 +1979,6 @@ checksum = "c442965efc45353be5a9b9969c9b0872fff6828c7e06d118dda2cb2d0bb11d5a" dependencies = [ "cc", "glob", - "itertools", + "itertools 0.9.0", "libc", ] diff --git a/Cargo.toml b/Cargo.toml index 7a2c0005d..3d166be15 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,9 @@ smallvec = "1.4.0" structopt = { version = "0.3.14", default-features = false } tempfile = "3.1.0" +# best proximity +pathfinding = "2.0.4" + # to implement internally itertools = "0.9.0" diff --git a/src/best_proximity.rs b/src/best_proximity.rs index 11fd4c0ba..5b3c2a64b 100644 --- a/src/best_proximity.rs +++ b/src/best_proximity.rs @@ -1,4 +1,5 @@ use std::cmp; +use pathfinding::directed::dijkstra::dijkstra; const ONE_ATTRIBUTE: u32 = 1000; const MAX_INDEX: u32 = ONE_ATTRIBUTE - 1; @@ -29,107 +30,40 @@ fn construct_position(attr: u32, index: u32) -> u32 { attr * ONE_ATTRIBUTE + index } -// TODO we should use an sdset::Set for `next_positions`. -// TODO We must not recursively search for the best proximity but return None if proximity is not found. -// Returns the positions to focus that will give the best possible proximity. -fn best_proximity_for(current_position: u32, proximity: u32, next_positions: &[u32]) -> Option<(u32, Vec)> { - let (current_attr, _) = extract_position(current_position); +#[derive(Debug, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)] +struct Path(Vec); - match proximity { - // look at i+0 - 0 => { - match next_positions.binary_search(¤t_position) { - Ok(_) => Some((0, vec![current_position])), - Err(_) => best_proximity_for(current_position, proximity + 1, next_positions), - } - }, - // look at i+1 - 1 => { - let position = current_position + 1; - let (attr, _) = extract_position(position); +impl Path { + fn new(positions: &[Vec]) -> Option { + let position = positions.first()?.first()?; + Some(Path(vec![*position])) + } - // We must check that we do not overflowed the current attribute. If so, - // we must check for a bigger proximity that we will be able to find behind. - if current_attr == attr { - match next_positions.binary_search(&position) { - Ok(_) => Some((1, vec![position])), - Err(_) => best_proximity_for(current_position, proximity + 1, next_positions), - } - } else { - best_proximity_for(current_position, proximity + 1, next_positions) - } - }, - // look at i-(p-1), i+p - 2..=7 => { - let mut output = Vec::new(); + fn successors(&self, _positions: &[Vec]) -> Vec<(Path, u32)> { + vec![] + } - // Behind the current_position - if let Some(position) = current_position.checked_sub(proximity - 1) { - let (attr, _) = extract_position(position); - // We must make sure we are not looking at a word at the end of another attribute. - if current_attr == attr && next_positions.binary_search(&position).is_ok() { - output.push(position); - } - } + fn proximity(&self) -> u32 { + self.0.windows(2).map(|ps| positions_proximity(ps[0], ps[1])).sum::() + } - // In front of the current_position - let position = current_position + proximity; - let (attr, _) = extract_position(position); - // We must make sure we are not looking at a word at the end of another attribute. - if current_attr == attr && next_positions.binary_search(&position).is_ok() { - output.push(position); - } - - if output.is_empty() { - best_proximity_for(current_position, proximity + 1, next_positions) - } else { - Some((proximity, output)) - } - }, - // look at i+8 and all above and i-(8-1) and all below - 8 => { - let mut output = Vec::new(); - - // Make sure we look at the latest index of the previous attr. - if let Some(previous_position) = construct_position(current_attr, 0).checked_sub(1) { - let position = current_position.saturating_sub(7).max(previous_position); - match dbg!(next_positions.binary_search(&position)) { - Ok(i) => output.extend_from_slice(&next_positions[..=i]), - Err(i) => if let Some(i) = i.checked_sub(1) { - if let Some(positions) = next_positions.get(..=i) { - output.extend_from_slice(positions) - } - }, - } - } - - // Make sure the position doesn't overflow to the next attribute. - let position = (current_position + 8).min(construct_position(current_attr + 1, 0)); - match next_positions.binary_search(&position) { - Ok(i) => output.extend_from_slice(&next_positions[i..]), - Err(i) => if let Some(positions) = next_positions.get(i..) { - output.extend_from_slice(positions); - }, - } - - if output.is_empty() { - None - } else { - Some((8, output)) - } - } - _ => None, + fn is_complete(&self, positions: &[Vec]) -> bool { + positions.len() == self.0.len() } } pub struct BestProximity { positions: Vec>, - best_proximities: Option>, + best_proximity: u32, } impl BestProximity { pub fn new(positions: Vec>) -> BestProximity { - BestProximity { positions, best_proximities: None } + BestProximity { positions, best_proximity: 0 } + } + + fn is_path_successful(&self, path: &Path) -> bool { + path.is_complete(&self.positions) && path.proximity() >= self.best_proximity } } @@ -137,59 +71,44 @@ impl Iterator for BestProximity { type Item = (u32, Vec>); fn next(&mut self) -> Option { - match &mut self.best_proximities { - Some(best_proximities) => { - let expected_proximity = best_proximities.iter().sum::() + 1; - dbg!(expected_proximity); + let mut output: Option<(u32, Vec>)> = None; - for (i, (win, proximity)) in self.positions.windows(2).zip(best_proximities.iter()).enumerate() { - let (posa, posb) = (&win[0], &win[1]); - dbg!(proximity, posa, posb); - let expected_proximity = proximity + 1; - let best_proximity = posa.iter().filter_map(|pa| { - best_proximity_for(*pa, expected_proximity, posb).map(|res| (*pa, res)) - }).min(); - dbg!(best_proximity); - } + unimplemented!("we must use and update self.best_proximity"); - None - }, - None => { - let expected_proximity = 0; - let mut best_results = Vec::new(); + loop { + let start = Path::new(&self.positions)?; + let result = dijkstra( + &start, + |p| p.successors(&self.positions), + |p| self.is_path_successful(p) && output.as_ref().map_or(true, |paths| !paths.1.contains(&p.0)), + ); - for win in self.positions.windows(2) { - let (posa, posb) = (&win[0], &win[1]); - match best_results.last() { - Some((start, _)) => { - // We know from where we must continue searching for the best path. - let (best_proximity, positions) = dbg!(best_proximity_for(*start, expected_proximity, posb).unwrap()); - best_results.push((positions[0], best_proximity)); + match result { + Some((mut paths, proximity)) => { + let positions = paths.pop().unwrap(); + + // If the current output is + match &mut output { + Some((best_proximity, paths)) => { + // If the shortest path we found is bigger than the one requested + // it means that we found all the paths with the same proximity and can + // return those to the user. + if proximity > *best_proximity { + break; + } + + // We add the new path to the output list as this path is known + // to be the requested distance. + paths.push(positions.0); }, - None => { - // This is the first loop, we need to find the best start of the path. - let best_proximity = posa.iter().filter_map(|pa| { - best_proximity_for(*pa, expected_proximity, posb).map(|res| (*pa, res)) - }).min(); - let (pa, (best_proximity, positions)) = best_proximity.unwrap(); - // We must save the best start of path we found. - best_results.push((pa, 0)); - // And the next associated position along with the proximity between those. - best_results.push((positions[0], best_proximity)); - } + None => output = Some((positions.proximity(), vec![positions.0])), } - } - - if best_results.is_empty() { - None - } else { - let proximity = best_results.windows(2).map(|ps| positions_proximity(ps[0].0, ps[1].0)).sum::(); - self.best_proximities = Some(best_results.iter().skip(1).map(|(_, p)| *p).collect()); - let best_positions = best_results.into_iter().map(|(x, _)| vec![x]).collect(); - Some((proximity, best_positions)) - } + }, + None => break, } } + + output } } @@ -217,26 +136,4 @@ mod tests { // assert_eq!(iter.next(), Some((4+5, vec![4, 1, 6]))); // 9 // assert_eq!(iter.next(), None); } - - #[test] - fn easy_best_proximity_for() { - // classic - assert_eq!(best_proximity_for(0, 0, &[0]), Some((0, vec![0]))); - assert_eq!(best_proximity_for(0, 1, &[0]), None); - assert_eq!(best_proximity_for(1, 1, &[0]), Some((2, vec![0]))); - assert_eq!(best_proximity_for(0, 1, &[0, 1]), Some((1, vec![1]))); - assert_eq!(best_proximity_for(1, 1, &[0, 2]), Some((1, vec![2]))); - assert_eq!(best_proximity_for(1, 2, &[0, 2]), Some((2, vec![0]))); - assert_eq!(best_proximity_for(1, 2, &[0, 3]), Some((2, vec![0, 3]))); - - // limits - assert_eq!(best_proximity_for(2, 7, &[0, 9]), Some((7, vec![9]))); - assert_eq!(best_proximity_for(12, 7, &[6, 19]), Some((7, vec![6, 19]))); - - // another attribute - assert_eq!(best_proximity_for(1000, 7, &[994, 1007]), Some((7, vec![1007]))); - assert_eq!(best_proximity_for(1004, 7, &[994, 1011]), Some((7, vec![1011]))); - assert_eq!(best_proximity_for(1004, 8, &[900, 913, 1000, 1012, 2012]), Some((8, vec![900, 913, 1012, 2012]))); - assert_eq!(best_proximity_for(1009, 8, &[900, 913, 1002, 1012, 2012]), Some((8, vec![900, 913, 1002, 2012]))); - } }