Remove proximity database, forcing us to remove phrase search and splitwords

This commit is contained in:
ManyTheFish 2023-10-03 16:58:26 +02:00
parent 5637978fe4
commit 42bbfebf70
4 changed files with 67 additions and 43 deletions

View File

@ -12,7 +12,8 @@ use super::Word;
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
use crate::{ use crate::{
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec,
RoaringBitmapLenCodec, SearchContext,
}; };
/// A cache storing pointers to values in the LMDB databases. /// A cache storing pointers to values in the LMDB databases.
@ -259,6 +260,7 @@ impl<'ctx> SearchContext<'ctx> {
word2: Interned<String>, word2: Interned<String>,
proximity: u8, proximity: u8,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
unreachable!();
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn, self.txn,
(proximity, word1, word2), (proximity, word1, word2),
@ -278,6 +280,7 @@ impl<'ctx> SearchContext<'ctx> {
word2: Interned<String>, word2: Interned<String>,
proximity: u8, proximity: u8,
) -> Result<Option<u64>> { ) -> Result<Option<u64>> {
unreachable!();
DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>( DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>(
self.txn, self.txn,
(proximity, word1, word2), (proximity, word1, word2),
@ -291,12 +294,23 @@ impl<'ctx> SearchContext<'ctx> {
) )
} }
/// Looks up the `word_docids` database entry for `word` and decodes it with
/// `RoaringBitmapLenCodec`, going through the `db_cache.word_docids` cache.
///
/// `word` is the interned word; its string form, resolved through
/// `self.word_interner`, is what is passed as the database key.
///
/// Returns whatever `DatabaseCache::get_value` yields for that codec.
/// NOTE(review): presumably `Some(n)` is the number of documents containing
/// the word and `None` means the word has no entry — confirm against
/// `DatabaseCache::get_value` and `RoaringBitmapLenCodec`.
pub fn get_db_word_docids_len(&mut self, word: Interned<String>) -> Result<Option<u64>> {
DatabaseCache::get_value::<_, _, RoaringBitmapLenCodec>(
self.txn,
// The interned handle is the key used on the cache side.
word,
// The actual string is the key used on the database side.
self.word_interner.get(word).as_str(),
&mut self.db_cache.word_docids,
// The database is remapped to `ByteSlice`; the codec named in the
// turbofish above is the one handed to `get_value` for decoding.
self.index.word_docids.remap_data_type::<ByteSlice>(),
)
}
pub fn get_db_word_prefix_pair_proximity_docids( pub fn get_db_word_prefix_pair_proximity_docids(
&mut self, &mut self,
word1: Interned<String>, word1: Interned<String>,
prefix2: Interned<String>, prefix2: Interned<String>,
proximity: u8, proximity: u8,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
unreachable!();
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn, self.txn,
(proximity, word1, prefix2), (proximity, word1, prefix2),
@ -315,6 +329,7 @@ impl<'ctx> SearchContext<'ctx> {
right: Interned<String>, right: Interned<String>,
proximity: u8, proximity: u8,
) -> Result<Option<RoaringBitmap>> { ) -> Result<Option<RoaringBitmap>> {
unreachable!();
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn, self.txn,
(proximity, left_prefix, right), (proximity, left_prefix, right),

View File

@ -295,11 +295,11 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
ranking_rules.push(Box::new(Typo::new(None))); ranking_rules.push(Box::new(Typo::new(None)));
} }
crate::Criterion::Proximity => { crate::Criterion::Proximity => {
if proximity { // if proximity {
continue; continue;
} // }
proximity = true; // proximity = true;
ranking_rules.push(Box::new(Proximity::new(None))); // ranking_rules.push(Box::new(Proximity::new(None)));
} }
crate::Criterion::Attribute => { crate::Criterion::Attribute => {
if attribute { if attribute {

View File

@ -265,11 +265,11 @@ pub fn partially_initialized_term_from_word(
} }
fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Interned<Phrase>>> { fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Interned<Phrase>>> {
if let Some((l, r)) = split_best_frequency(ctx, word)? { // if let Some((l, r)) = split_best_frequency(ctx, word)? {
Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] }))) // Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
} else { // } else {
Ok(None) Ok(None)
} // }
} }
impl Interned<QueryTerm> { impl Interned<QueryTerm> {
@ -416,11 +416,20 @@ fn split_best_frequency(
let left = ctx.word_interner.insert(left.to_owned()); let left = ctx.word_interner.insert(left.to_owned());
let right = ctx.word_interner.insert(right.to_owned()); let right = ctx.word_interner.insert(right.to_owned());
if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? { if let (Some(l_freq), Some(r_freq)) =
(ctx.get_db_word_docids_len(left)?, ctx.get_db_word_docids_len(right)?)
{
let frequency = l_freq.min(r_freq);
if best.map_or(true, |(old, _, _)| frequency > old) { if best.map_or(true, |(old, _, _)| frequency > old) {
best = Some((frequency, left, right)); best = Some((frequency, left, right));
} }
} }
// if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? {
// if best.map_or(true, |(old, _, _)| frequency > old) {
// best = Some((frequency, left, right));
// }
// }
} }
Ok(best.map(|(_, left, right)| (left, right))) Ok(best.map(|(_, left, right)| (left, right)))

View File

@ -82,41 +82,41 @@ pub fn located_query_terms_from_tokens(
position = position.wrapping_add(7); position = position.wrapping_add(7);
} }
phrase = 'phrase: { // phrase = 'phrase: {
let phrase = phrase.take(); // let phrase = phrase.take();
// If we have a hard separator inside a phrase, we immediately start a new phrase // // If we have a hard separator inside a phrase, we immediately start a new phrase
let phrase = if separator_kind == SeparatorKind::Hard { // let phrase = if separator_kind == SeparatorKind::Hard {
if let Some(phrase) = phrase { // if let Some(phrase) = phrase {
if let Some(located_query_term) = phrase.build(ctx) { // if let Some(located_query_term) = phrase.build(ctx) {
located_terms.push(located_query_term) // located_terms.push(located_query_term)
} // }
Some(PhraseBuilder::empty()) // Some(PhraseBuilder::empty())
} else { // } else {
None // None
} // }
} else { // } else {
phrase // phrase
}; // };
// We close and start a new phrase depending on the number of double quotes // // We close and start a new phrase depending on the number of double quotes
let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count(); // let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
if quote_count == 0 { // if quote_count == 0 {
break 'phrase phrase; // break 'phrase phrase;
} // }
// Consume the closing quote and the phrase // // Consume the closing quote and the phrase
if let Some(phrase) = phrase { // if let Some(phrase) = phrase {
// Per the check above, quote_count > 0 // // Per the check above, quote_count > 0
quote_count -= 1; // quote_count -= 1;
if let Some(located_query_term) = phrase.build(ctx) { // if let Some(located_query_term) = phrase.build(ctx) {
located_terms.push(located_query_term) // located_terms.push(located_query_term)
} // }
} // }
// Start new phrase if the token ends with an opening quote // // Start new phrase if the token ends with an opening quote
(quote_count % 2 == 1).then_some(PhraseBuilder::empty()) // (quote_count % 2 == 1).then_some(PhraseBuilder::empty())
}; // };
} }
_ => (), _ => (),
} }