From 42bbfebf70ad5638266ff6513613a2b4a57c7c9a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 3 Oct 2023 16:58:26 +0200 Subject: [PATCH] Remove proximity database, forcing us to remove phrase search and splitwords --- milli/src/search/new/db_cache.rs | 17 ++++- milli/src/search/new/mod.rs | 10 +-- .../new/query_term/compute_derivations.rs | 21 +++++-- .../src/search/new/query_term/parse_query.rs | 62 +++++++++---------- 4 files changed, 67 insertions(+), 43 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index e0a2ba3cf..0d21d2089 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -12,7 +12,8 @@ use super::Word; use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; use crate::{ - CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, + CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, + RoaringBitmapLenCodec, SearchContext, }; /// A cache storing pointers to values in the LMDB databases. @@ -259,6 +260,7 @@ impl<'ctx> SearchContext<'ctx> { word2: Interned, proximity: u8, ) -> Result> { + unreachable!(); DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, (proximity, word1, word2), @@ -278,6 +280,7 @@ impl<'ctx> SearchContext<'ctx> { word2: Interned, proximity: u8, ) -> Result> { + unreachable!(); DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>( self.txn, (proximity, word1, word2), @@ -291,12 +294,23 @@ impl<'ctx> SearchContext<'ctx> { ) } + pub fn get_db_word_docids_len(&mut self, word: Interned) -> Result> { + DatabaseCache::get_value::<_, _, RoaringBitmapLenCodec>( + self.txn, + word, + self.word_interner.get(word).as_str(), + &mut self.db_cache.word_docids, + self.index.word_docids.remap_data_type::(), + ) + } + pub fn get_db_word_prefix_pair_proximity_docids( &mut self, word1: Interned, prefix2: Interned, proximity: u8, ) -> Result> { + unreachable!(); DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, (proximity, word1, prefix2), @@ -315,6 +329,7 @@ impl<'ctx> SearchContext<'ctx> { right: Interned, proximity: u8, ) -> Result> { + unreachable!(); DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, (proximity, left_prefix, right), diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 361804426..70dbc9614 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -295,11 +295,11 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( ranking_rules.push(Box::new(Typo::new(None))); } crate::Criterion::Proximity => { - if proximity { - continue; - } - proximity = true; - ranking_rules.push(Box::new(Proximity::new(None))); + // if proximity { + continue; + // } + // proximity = true; + // ranking_rules.push(Box::new(Proximity::new(None))); } crate::Criterion::Attribute => { if attribute { diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs index d5dfbbcd0..1f78ffbf3 100644 --- a/milli/src/search/new/query_term/compute_derivations.rs +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -265,11 +265,11 @@ pub fn partially_initialized_term_from_word( } fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result>> { - if let Some((l, r)) = split_best_frequency(ctx, word)? { - Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] }))) - } else { - Ok(None) - } + // if let Some((l, r)) = split_best_frequency(ctx, word)? { + // Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] }))) + // } else { + Ok(None) + // } } impl Interned { @@ -416,11 +416,20 @@ fn split_best_frequency( let left = ctx.word_interner.insert(left.to_owned()); let right = ctx.word_interner.insert(right.to_owned()); - if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? { + if let (Some(l_freq), Some(r_freq)) = + (ctx.get_db_word_docids_len(left)?, ctx.get_db_word_docids_len(right)?) + { + let frequency = l_freq.min(r_freq); if best.map_or(true, |(old, _, _)| frequency > old) { best = Some((frequency, left, right)); } } + + // if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? { + // if best.map_or(true, |(old, _, _)| frequency > old) { + // best = Some((frequency, left, right)); + // } + // } } Ok(best.map(|(_, left, right)| (left, right))) diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 64fe07a31..22f7313e5 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -82,41 +82,41 @@ pub fn located_query_terms_from_tokens( position = position.wrapping_add(7); } - phrase = 'phrase: { - let phrase = phrase.take(); + // phrase = 'phrase: { + // let phrase = phrase.take(); - // If we have a hard separator inside a phrase, we immediately start a new phrase - let phrase = if separator_kind == SeparatorKind::Hard { - if let Some(phrase) = phrase { - if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term) - } - Some(PhraseBuilder::empty()) - } else { - None - } - } else { - phrase - }; + // // If we have a hard separator inside a phrase, we immediately start a new phrase + // let phrase = if separator_kind == SeparatorKind::Hard { + // if let Some(phrase) = phrase { + // if let Some(located_query_term) = phrase.build(ctx) { + // located_terms.push(located_query_term) + // } + // Some(PhraseBuilder::empty()) + // } else { + // None + // } + // } else { + // phrase + // }; - // We close and start a new phrase depending on the number of double quotes - let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count(); - if quote_count == 0 { - break 'phrase phrase; - } + // // We close and start a new phrase depending on the number of double quotes + // let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count(); + // if quote_count == 0 { + // break 'phrase phrase; + // } - // Consume the closing quote and the phrase - if let Some(phrase) = phrase { - // Per the check above, quote_count > 0 - quote_count -= 1; - if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term) - } - } + // // Consume the closing quote and the phrase + // if let Some(phrase) = phrase { + // // Per the check above, quote_count > 0 + // quote_count -= 1; + // if let Some(located_query_term) = phrase.build(ctx) { + // located_terms.push(located_query_term) + // } + // } - // Start new phrase if the token ends with an opening quote - (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) - }; + // // Start new phrase if the token ends with an opening quote + // (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) + // }; } _ => (), }