From 4afc4d075161fa38106e176348787462ec430270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 30 Aug 2020 12:02:06 +0200 Subject: [PATCH] Use the groups of four positions to speed up disjunctions tests --- src/bin/indexer.rs | 4 +- src/node.rs | 5 +++ src/search.rs | 101 ++++++++++++++++++++++++++++++++++----------- 3 files changed, 84 insertions(+), 26 deletions(-) diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs index cca89d926..d71efca6e 100644 --- a/src/bin/indexer.rs +++ b/src/bin/indexer.rs @@ -88,7 +88,7 @@ struct IndexerOpt { max_memory: Option, /// Size of the ARC cache when indexing. - #[structopt(long, default_value = "65535")] + #[structopt(long, default_value = "43690")] arc_cache_size: usize, /// The name of the compression algorithm to use when compressing intermediate @@ -184,7 +184,7 @@ impl Store { let position = position - position % 4; let word_vec = SmallVec32::from(word.as_bytes()); let ids = RoaringBitmap::from_iter(Some(id)); - let (_, lrus) = self.word_position_docids.insert((word_vec, position), ids, |old, new| old.union_with(&new)); + let (_, lrus) = self.word_four_positions_docids.insert((word_vec, position), ids, |old, new| old.union_with(&new)); Self::write_word_four_positions_docids(&mut self.sorter, lrus) } diff --git a/src/node.rs b/src/node.rs index 1779c821c..cbe6cbc59 100644 --- a/src/node.rs +++ b/src/node.rs @@ -24,6 +24,11 @@ pub fn extract_position(position: u32) -> (u32, u32) { (position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE) } +// Returns the group of four positions in which this position reside (i.e. 0, 4, 12). +pub fn group_of_four(position: u32) -> u32 { + position - position % 4 +} + #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] pub enum Node { // Is this node is the first node. diff --git a/src/search.rs b/src/search.rs index a9d2610ff..2e53ca5ed 100644 --- a/src/search.rs +++ b/src/search.rs @@ -176,6 +176,24 @@ impl<'a> Search<'a> { Ok(union_docids) } + /// Returns the union of the same gorup of four positions for all the given words. + fn union_word_four_positions( + rtxn: &heed::RoTxn, + index: &Index, + words: &[(String, u8, RoaringBitmap)], + group: Position, + ) -> anyhow::Result + { + let mut union_docids = RoaringBitmap::new(); + for (word, _distance, _positions) in words { + // TODO would be better to check if the group exist + if let Some(docids) = index.word_four_positions_docids.get(rtxn, &(word, group))? { + union_docids.union_with(&docids); + } + } + Ok(union_docids) + } + /// Returns the union of the same attribute for all the given words. fn union_word_attribute( rtxn: &heed::RoTxn, @@ -203,6 +221,8 @@ impl<'a> Search<'a> { derived_words: &[Vec<(String, u8, RoaringBitmap)>], union_cache: &mut HashMap<(usize, u32), RoaringBitmap>, non_disjoint_cache: &mut HashMap<((usize, u32), (usize, u32)), bool>, + group_four_union_cache: &mut HashMap<(usize, u32), RoaringBitmap>, + group_four_non_disjoint_cache: &mut HashMap<((usize, u32), (usize, u32)), bool>, attribute_union_cache: &mut HashMap<(usize, u32), RoaringBitmap>, attribute_non_disjoint_cache: &mut HashMap<((usize, u32), (usize, u32)), bool>, ) -> bool @@ -214,37 +234,68 @@ impl<'a> Search<'a> { let (rattr, _) = node::extract_position(rpos); if lattr == rattr { - // We retrieve or compute the intersection between the two given words and positions. - *non_disjoint_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| { - // We retrieve or compute the unions for the two words and positions. - union_cache.entry((lword, lpos)).or_insert_with(|| { - let words: &Vec<_> = &derived_words[lword]; - Self::union_word_position(rtxn, index, words, lpos).unwrap() - }); - union_cache.entry((rword, rpos)).or_insert_with(|| { - let words: &Vec<_> = &derived_words[rword]; - Self::union_word_position(rtxn, index, words, rpos).unwrap() - }); + // TODO move this function to a better place. + let lgroup = node::group_of_four(lpos); + let rgroup = node::group_of_four(rpos); - // TODO is there a way to avoid this double gets? - let lunion_docids = union_cache.get(&(lword, lpos)).unwrap(); - let runion_docids = union_cache.get(&(rword, rpos)).unwrap(); + // We can't compute a disjunction on a group of four positions if those + // two positions are in the same group, we must go down to the position. + if lgroup == rgroup { + // We retrieve or compute the intersection between the two given words and positions. + *non_disjoint_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| { + // We retrieve or compute the unions for the two words and positions. + union_cache.entry((lword, lpos)).or_insert_with(|| { + let words = &derived_words[lword]; + Self::union_word_position(rtxn, index, words, lpos).unwrap() + }); + union_cache.entry((rword, rpos)).or_insert_with(|| { + let words = &derived_words[rword]; + Self::union_word_position(rtxn, index, words, rpos).unwrap() + }); - // We first check that the docids of these unions are part of the candidates. - if lunion_docids.is_disjoint(candidates) { return false } - if runion_docids.is_disjoint(candidates) { return false } + // TODO is there a way to avoid this double gets? + let lunion_docids = union_cache.get(&(lword, lpos)).unwrap(); + let runion_docids = union_cache.get(&(rword, rpos)).unwrap(); - !lunion_docids.is_disjoint(&runion_docids) - }) + // We first check that the docids of these unions are part of the candidates. + if lunion_docids.is_disjoint(candidates) { return false } + if runion_docids.is_disjoint(candidates) { return false } + + !lunion_docids.is_disjoint(&runion_docids) + }) + } else { + // We retrieve or compute the intersection between the two given words and positions. + *group_four_non_disjoint_cache.entry(((lword, lgroup), (rword, rgroup))).or_insert_with(|| { + // We retrieve or compute the unions for the two words and group of four positions. + group_four_union_cache.entry((lword, lgroup)).or_insert_with(|| { + let words = &derived_words[lword]; + Self::union_word_four_positions(rtxn, index, words, lgroup).unwrap() + }); + group_four_union_cache.entry((rword, rgroup)).or_insert_with(|| { + let words = &derived_words[rword]; + Self::union_word_four_positions(rtxn, index, words, rgroup).unwrap() + }); + + // TODO is there a way to avoid this double gets? + let lunion_group_docids = group_four_union_cache.get(&(lword, lgroup)).unwrap(); + let runion_group_docids = group_four_union_cache.get(&(rword, rgroup)).unwrap(); + + // We first check that the docids of these unions are part of the candidates. + if lunion_group_docids.is_disjoint(candidates) { return false } + if runion_group_docids.is_disjoint(candidates) { return false } + + !lunion_group_docids.is_disjoint(&runion_group_docids) + }) + } } else { *attribute_non_disjoint_cache.entry(((lword, lattr), (rword, rattr))).or_insert_with(|| { // We retrieve or compute the unions for the two words and positions. attribute_union_cache.entry((lword, lattr)).or_insert_with(|| { - let words: &Vec<_> = &derived_words[lword]; + let words = &derived_words[lword]; Self::union_word_attribute(rtxn, index, words, lattr).unwrap() }); attribute_union_cache.entry((rword, rattr)).or_insert_with(|| { - let words: &Vec<_> = &derived_words[rword]; + let words = &derived_words[rword]; Self::union_word_attribute(rtxn, index, words, rattr).unwrap() }); @@ -290,6 +341,9 @@ impl<'a> Search<'a> { let union_cache = HashMap::new(); let mut non_disjoint_cache = HashMap::new(); + let mut group_four_union_cache = HashMap::new(); + let mut group_four_non_disjoint_cache = HashMap::new(); + let mut attribute_union_cache = HashMap::new(); let mut attribute_non_disjoint_cache = HashMap::new(); @@ -306,13 +360,13 @@ impl<'a> Search<'a> { &derived_words, &mut union_cache_cloned.borrow_mut(), &mut non_disjoint_cache, + &mut group_four_union_cache, + &mut group_four_non_disjoint_cache, &mut attribute_union_cache, &mut attribute_non_disjoint_cache, ) }; - // We instantiate an astar bag Iterator that returns the best paths incrementally, - // it means that it will first return the best paths then the next best paths... let astar_iter = AstarBagIter::new( Node::Uninit, // start |n| n.successors(&union_positions, &mut contains_documents), // successors @@ -322,7 +376,6 @@ impl<'a> Search<'a> { let mut documents = Vec::new(); for (paths, proximity) in astar_iter { - let mut union_cache = union_cache.borrow_mut(); let mut candidates = candidates.borrow_mut();