mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-19 01:18:31 +08:00
Use the groups of four positions to speed up disjunctions tests
This commit is contained in:
parent
605f75b56f
commit
4afc4d0751
@ -88,7 +88,7 @@ struct IndexerOpt {
|
|||||||
max_memory: Option<usize>,
|
max_memory: Option<usize>,
|
||||||
|
|
||||||
/// Size of the ARC cache when indexing.
|
/// Size of the ARC cache when indexing.
|
||||||
#[structopt(long, default_value = "65535")]
|
#[structopt(long, default_value = "43690")]
|
||||||
arc_cache_size: usize,
|
arc_cache_size: usize,
|
||||||
|
|
||||||
/// The name of the compression algorithm to use when compressing intermediate
|
/// The name of the compression algorithm to use when compressing intermediate
|
||||||
@ -184,7 +184,7 @@ impl Store {
|
|||||||
let position = position - position % 4;
|
let position = position - position % 4;
|
||||||
let word_vec = SmallVec32::from(word.as_bytes());
|
let word_vec = SmallVec32::from(word.as_bytes());
|
||||||
let ids = RoaringBitmap::from_iter(Some(id));
|
let ids = RoaringBitmap::from_iter(Some(id));
|
||||||
let (_, lrus) = self.word_position_docids.insert((word_vec, position), ids, |old, new| old.union_with(&new));
|
let (_, lrus) = self.word_four_positions_docids.insert((word_vec, position), ids, |old, new| old.union_with(&new));
|
||||||
Self::write_word_four_positions_docids(&mut self.sorter, lrus)
|
Self::write_word_four_positions_docids(&mut self.sorter, lrus)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -24,6 +24,11 @@ pub fn extract_position(position: u32) -> (u32, u32) {
|
|||||||
(position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE)
|
(position / ONE_ATTRIBUTE, position % ONE_ATTRIBUTE)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the first position of the group of four positions in which this position resides (e.g. 0, 4, 8, 12).
|
||||||
|
pub fn group_of_four(position: u32) -> u32 {
|
||||||
|
position - position % 4
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
pub enum Node {
|
pub enum Node {
|
||||||
// Whether this node is the first node.
|
// Whether this node is the first node.
|
||||||
|
@ -176,6 +176,24 @@ impl<'a> Search<'a> {
|
|||||||
Ok(union_docids)
|
Ok(union_docids)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the union of the same group of four positions for all the given words.
|
||||||
|
fn union_word_four_positions(
|
||||||
|
rtxn: &heed::RoTxn,
|
||||||
|
index: &Index,
|
||||||
|
words: &[(String, u8, RoaringBitmap)],
|
||||||
|
group: Position,
|
||||||
|
) -> anyhow::Result<RoaringBitmap>
|
||||||
|
{
|
||||||
|
let mut union_docids = RoaringBitmap::new();
|
||||||
|
for (word, _distance, _positions) in words {
|
||||||
|
// TODO it would be better to check if the group exists
|
||||||
|
if let Some(docids) = index.word_four_positions_docids.get(rtxn, &(word, group))? {
|
||||||
|
union_docids.union_with(&docids);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(union_docids)
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns the union of the same attribute for all the given words.
|
/// Returns the union of the same attribute for all the given words.
|
||||||
fn union_word_attribute(
|
fn union_word_attribute(
|
||||||
rtxn: &heed::RoTxn,
|
rtxn: &heed::RoTxn,
|
||||||
@ -203,6 +221,8 @@ impl<'a> Search<'a> {
|
|||||||
derived_words: &[Vec<(String, u8, RoaringBitmap)>],
|
derived_words: &[Vec<(String, u8, RoaringBitmap)>],
|
||||||
union_cache: &mut HashMap<(usize, u32), RoaringBitmap>,
|
union_cache: &mut HashMap<(usize, u32), RoaringBitmap>,
|
||||||
non_disjoint_cache: &mut HashMap<((usize, u32), (usize, u32)), bool>,
|
non_disjoint_cache: &mut HashMap<((usize, u32), (usize, u32)), bool>,
|
||||||
|
group_four_union_cache: &mut HashMap<(usize, u32), RoaringBitmap>,
|
||||||
|
group_four_non_disjoint_cache: &mut HashMap<((usize, u32), (usize, u32)), bool>,
|
||||||
attribute_union_cache: &mut HashMap<(usize, u32), RoaringBitmap>,
|
attribute_union_cache: &mut HashMap<(usize, u32), RoaringBitmap>,
|
||||||
attribute_non_disjoint_cache: &mut HashMap<((usize, u32), (usize, u32)), bool>,
|
attribute_non_disjoint_cache: &mut HashMap<((usize, u32), (usize, u32)), bool>,
|
||||||
) -> bool
|
) -> bool
|
||||||
@ -214,15 +234,22 @@ impl<'a> Search<'a> {
|
|||||||
let (rattr, _) = node::extract_position(rpos);
|
let (rattr, _) = node::extract_position(rpos);
|
||||||
|
|
||||||
if lattr == rattr {
|
if lattr == rattr {
|
||||||
|
// TODO move this function to a better place.
|
||||||
|
let lgroup = node::group_of_four(lpos);
|
||||||
|
let rgroup = node::group_of_four(rpos);
|
||||||
|
|
||||||
|
// We can't compute a disjunction on a group of four positions if those
|
||||||
|
// two positions are in the same group; we must go down to the position.
|
||||||
|
if lgroup == rgroup {
|
||||||
// We retrieve or compute the intersection between the two given words and positions.
|
// We retrieve or compute the intersection between the two given words and positions.
|
||||||
*non_disjoint_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| {
|
*non_disjoint_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| {
|
||||||
// We retrieve or compute the unions for the two words and positions.
|
// We retrieve or compute the unions for the two words and positions.
|
||||||
union_cache.entry((lword, lpos)).or_insert_with(|| {
|
union_cache.entry((lword, lpos)).or_insert_with(|| {
|
||||||
let words: &Vec<_> = &derived_words[lword];
|
let words = &derived_words[lword];
|
||||||
Self::union_word_position(rtxn, index, words, lpos).unwrap()
|
Self::union_word_position(rtxn, index, words, lpos).unwrap()
|
||||||
});
|
});
|
||||||
union_cache.entry((rword, rpos)).or_insert_with(|| {
|
union_cache.entry((rword, rpos)).or_insert_with(|| {
|
||||||
let words: &Vec<_> = &derived_words[rword];
|
let words = &derived_words[rword];
|
||||||
Self::union_word_position(rtxn, index, words, rpos).unwrap()
|
Self::union_word_position(rtxn, index, words, rpos).unwrap()
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -236,15 +263,39 @@ impl<'a> Search<'a> {
|
|||||||
|
|
||||||
!lunion_docids.is_disjoint(&runion_docids)
|
!lunion_docids.is_disjoint(&runion_docids)
|
||||||
})
|
})
|
||||||
|
} else {
|
||||||
|
// We retrieve or compute the intersection between the two given words and groups of four positions.
|
||||||
|
*group_four_non_disjoint_cache.entry(((lword, lgroup), (rword, rgroup))).or_insert_with(|| {
|
||||||
|
// We retrieve or compute the unions for the two words and group of four positions.
|
||||||
|
group_four_union_cache.entry((lword, lgroup)).or_insert_with(|| {
|
||||||
|
let words = &derived_words[lword];
|
||||||
|
Self::union_word_four_positions(rtxn, index, words, lgroup).unwrap()
|
||||||
|
});
|
||||||
|
group_four_union_cache.entry((rword, rgroup)).or_insert_with(|| {
|
||||||
|
let words = &derived_words[rword];
|
||||||
|
Self::union_word_four_positions(rtxn, index, words, rgroup).unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
|
// TODO is there a way to avoid these double gets?
|
||||||
|
let lunion_group_docids = group_four_union_cache.get(&(lword, lgroup)).unwrap();
|
||||||
|
let runion_group_docids = group_four_union_cache.get(&(rword, rgroup)).unwrap();
|
||||||
|
|
||||||
|
// We first check that the docids of these unions are part of the candidates.
|
||||||
|
if lunion_group_docids.is_disjoint(candidates) { return false }
|
||||||
|
if runion_group_docids.is_disjoint(candidates) { return false }
|
||||||
|
|
||||||
|
!lunion_group_docids.is_disjoint(&runion_group_docids)
|
||||||
|
})
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
*attribute_non_disjoint_cache.entry(((lword, lattr), (rword, rattr))).or_insert_with(|| {
|
*attribute_non_disjoint_cache.entry(((lword, lattr), (rword, rattr))).or_insert_with(|| {
|
||||||
// We retrieve or compute the unions for the two words and positions.
|
// We retrieve or compute the unions for the two words and positions.
|
||||||
attribute_union_cache.entry((lword, lattr)).or_insert_with(|| {
|
attribute_union_cache.entry((lword, lattr)).or_insert_with(|| {
|
||||||
let words: &Vec<_> = &derived_words[lword];
|
let words = &derived_words[lword];
|
||||||
Self::union_word_attribute(rtxn, index, words, lattr).unwrap()
|
Self::union_word_attribute(rtxn, index, words, lattr).unwrap()
|
||||||
});
|
});
|
||||||
attribute_union_cache.entry((rword, rattr)).or_insert_with(|| {
|
attribute_union_cache.entry((rword, rattr)).or_insert_with(|| {
|
||||||
let words: &Vec<_> = &derived_words[rword];
|
let words = &derived_words[rword];
|
||||||
Self::union_word_attribute(rtxn, index, words, rattr).unwrap()
|
Self::union_word_attribute(rtxn, index, words, rattr).unwrap()
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -290,6 +341,9 @@ impl<'a> Search<'a> {
|
|||||||
let union_cache = HashMap::new();
|
let union_cache = HashMap::new();
|
||||||
let mut non_disjoint_cache = HashMap::new();
|
let mut non_disjoint_cache = HashMap::new();
|
||||||
|
|
||||||
|
let mut group_four_union_cache = HashMap::new();
|
||||||
|
let mut group_four_non_disjoint_cache = HashMap::new();
|
||||||
|
|
||||||
let mut attribute_union_cache = HashMap::new();
|
let mut attribute_union_cache = HashMap::new();
|
||||||
let mut attribute_non_disjoint_cache = HashMap::new();
|
let mut attribute_non_disjoint_cache = HashMap::new();
|
||||||
|
|
||||||
@ -306,13 +360,13 @@ impl<'a> Search<'a> {
|
|||||||
&derived_words,
|
&derived_words,
|
||||||
&mut union_cache_cloned.borrow_mut(),
|
&mut union_cache_cloned.borrow_mut(),
|
||||||
&mut non_disjoint_cache,
|
&mut non_disjoint_cache,
|
||||||
|
&mut group_four_union_cache,
|
||||||
|
&mut group_four_non_disjoint_cache,
|
||||||
&mut attribute_union_cache,
|
&mut attribute_union_cache,
|
||||||
&mut attribute_non_disjoint_cache,
|
&mut attribute_non_disjoint_cache,
|
||||||
)
|
)
|
||||||
};
|
};
|
||||||
|
|
||||||
// We instantiate an astar bag Iterator that returns the best paths incrementally,
|
|
||||||
// it means that it will first return the best paths then the next best paths...
|
|
||||||
let astar_iter = AstarBagIter::new(
|
let astar_iter = AstarBagIter::new(
|
||||||
Node::Uninit, // start
|
Node::Uninit, // start
|
||||||
|n| n.successors(&union_positions, &mut contains_documents), // successors
|
|n| n.successors(&union_positions, &mut contains_documents), // successors
|
||||||
@ -322,7 +376,6 @@ impl<'a> Search<'a> {
|
|||||||
|
|
||||||
let mut documents = Vec::new();
|
let mut documents = Vec::new();
|
||||||
for (paths, proximity) in astar_iter {
|
for (paths, proximity) in astar_iter {
|
||||||
|
|
||||||
let mut union_cache = union_cache.borrow_mut();
|
let mut union_cache = union_cache.borrow_mut();
|
||||||
let mut candidates = candidates.borrow_mut();
|
let mut candidates = candidates.borrow_mut();
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user