diff --git a/src/best_proximity.rs b/src/best_proximity.rs index 0f4b7c233..8afd36ad4 100644 --- a/src/best_proximity.rs +++ b/src/best_proximity.rs @@ -10,11 +10,11 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 { if lhs <= rhs { cmp::min(rhs - lhs, MAX_DISTANCE) } else { - cmp::min(lhs - rhs, MAX_DISTANCE) + 1 + cmp::min((lhs - rhs) + 1, MAX_DISTANCE) } } -fn positions_proximity(lhs: u32, rhs: u32) -> u32 { +pub fn positions_proximity(lhs: u32, rhs: u32) -> u32 { let (lhs_attr, lhs_index) = extract_position(lhs); let (rhs_attr, rhs_index) = extract_position(rhs); if lhs_attr != rhs_attr { MAX_DISTANCE } diff --git a/src/lib.rs b/src/lib.rs index 151ddc6d8..d2dd7928d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -120,22 +120,37 @@ impl Index { let mut documents = Vec::new(); + let mut debug_intersects = HashMap::new(); let mut intersect_cache = HashMap::new(); - let contains_documents = |(lword, lpos): (usize, u32), (rword, rpos)| { + let contains_documents = |(lword, lpos): (usize, u32), (rword, rpos): (usize, u32)| { + let proximity = best_proximity::positions_proximity(lpos, rpos); + *intersect_cache.entry(((lword, lpos), (rword, rpos))).or_insert_with(|| { use std::iter::once; + let (nb_words, nb_docs_intersect, lnblookups, lnbbitmaps, rnblookups, rnbbitmaps) = + debug_intersects.entry((lword, lpos, rword, rpos, proximity)).or_default(); + let left = (&words[lword], lpos); let right = (&words[rword], rpos); + *nb_words = left.0.len() + right.0.len(); + + let mut l_lookups = 0; + let mut l_bitmaps = 0; + let mut r_lookups = 0; + let mut r_bitmaps = 0; + let mut intersect_docids: Option = None; - for (derived_words, pos) in once(left).chain(once(right)) { + for (i, (derived_words, pos))in once(left).chain(once(right)).enumerate() { let mut union_docids = RoaringBitmap::default(); // TODO re-enable the prefixes system for word in derived_words.iter() { + if i == 0 { l_lookups += 1 } else { r_lookups += 1 } let mut key = word.clone(); key.extend_from_slice(&pos.to_be_bytes()); if let Some(attrs) = self.postings_ids.get(rtxn, &key).unwrap() { + if i == 0 { l_bitmaps += 1 } else { r_bitmaps += 1 } let right = RoaringBitmap::deserialize_from(attrs).unwrap(); union_docids.union_with(&right); } @@ -147,6 +162,12 @@ impl Index { } } + *lnblookups = l_lookups; + *lnbbitmaps = l_bitmaps; + *rnblookups = r_lookups; + *rnbbitmaps = r_bitmaps; + *nb_docs_intersect += intersect_docids.as_ref().map_or(0, |i| i.len()); + intersect_docids.map_or(false, |i| !i.is_empty()) }) }; @@ -223,7 +244,63 @@ impl Index { } } + debug_intersects_to_csv(debug_intersects); + eprintln!("{} candidates", documents.iter().map(RoaringBitmap::len).sum::()); Ok(documents.iter().flatten().take(20).collect()) } } + +fn debug_intersects_to_csv(intersects: HashMap<(usize, u32, usize, u32, u32), (usize, u64, usize, usize, usize, usize)>) { + let mut wrt = csv::Writer::from_path("intersects-stats.csv").unwrap(); + wrt.write_record(&[ + "proximity", + "lword", + "lpos", + "rword", + "rpos", + "nb_derived_words", + "nb_docs_intersect", + "lnblookups", + "lnbbitmaps", + "rnblookups", + "rnbbitmaps", + ]).unwrap(); + + for ((lword, lpos, rword, rpos, proximity), vals) in intersects { + let ( + nb_derived_words, + nb_docs_intersect, + lnblookups, + lnbbitmaps, + rnblookups, + rnbbitmaps, + ) = vals; + + let proximity = proximity.to_string(); + let lword = lword.to_string(); + let lpos = lpos.to_string(); + let rword = rword.to_string(); + let rpos = rpos.to_string(); + let nb_derived_words = nb_derived_words.to_string(); + let nb_docs_intersect = nb_docs_intersect.to_string(); + let lnblookups = lnblookups.to_string(); + let lnbbitmaps = lnbbitmaps.to_string(); + let rnblookups = rnblookups.to_string(); + let rnbbitmaps = rnbbitmaps.to_string(); + + wrt.write_record(&[ + &proximity, + &lword, + &lpos, + &rword, + &rpos, + &nb_derived_words, + &nb_docs_intersect, + &lnblookups, + &lnbbitmaps, + &rnblookups, + &rnbbitmaps, + ]).unwrap(); + } +}