From 345c99d5bd5cd098133e627c6d8f1714bcfcd9f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 21 Feb 2023 09:49:25 +0100
Subject: [PATCH] Introduce the words ranking rule working with the new search structures

---
 milli/src/search/new/words.rs | 178 ++++++++++++++++++++++++++++++++++
 1 file changed, 178 insertions(+)
 create mode 100644 milli/src/search/new/words.rs

diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs
new file mode 100644
index 000000000..4d812d9ff
--- /dev/null
+++ b/milli/src/search/new/words.rs
@@ -0,0 +1,178 @@
+use std::collections::BTreeSet;
+
+use heed::RoTxn;
+use roaring::RoaringBitmap;
+
+use super::db_cache::DatabaseCache;
+use super::resolve_query_graph::resolve_query_graph;
+use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput};
+use crate::{Index, Result, TermsMatchingStrategy};
+
+/// The `words` ranking rule working with the new search structures.
+///
+/// Each bucket is the result of resolving the current query graph. After a
+/// bucket is produced, the words at one more position are removed from the
+/// graph, so the following bucket is resolved against a relaxed query.
+pub struct Words {
+    /// `true` when no bucket is left to return for the current iteration.
+    exhausted: bool,
+    /// Working copy of the parent query graph, relaxed between buckets.
+    query_graph: Option<QueryGraph>,
+    /// `true` between `start_iteration` and `end_iteration`.
+    iterating: bool,
+    /// Positions still to remove, in ascending order; consumed from the back.
+    // NOTE(review): the element type was lost in this copy of the patch and is
+    // restored as `i8`; confirm against the type of a term's `positions`.
+    positions_to_remove: Vec<i8>,
+    /// Decides which positions, if any, may be dropped from the query.
+    terms_matching_strategy: TermsMatchingStrategy,
+}
+
+impl Words {
+    /// Creates an idle rule; `start_iteration` must be called before use.
+    pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
+        Self {
+            exhausted: true,
+            query_graph: None,
+            iterating: false,
+            positions_to_remove: vec![],
+            terms_matching_strategy,
+        }
+    }
+}
+
+impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
+    /// Starts an iteration from `parent_query_graph`.
+    ///
+    /// With `TermsMatchingStrategy::Last`, every position covered by a term node
+    /// of the parent graph is scheduled for removal, highest position first.
+    /// With `TermsMatchingStrategy::All`, nothing is scheduled, so a single
+    /// bucket is produced.
+    fn start_iteration(
+        &mut self,
+        _index: &Index,
+        _txn: &'transaction RoTxn,
+        _db_cache: &mut DatabaseCache<'transaction>,
+        _parent_candidates: &RoaringBitmap,
+        parent_query_graph: &QueryGraph,
+    ) -> Result<()> {
+        self.exhausted = false;
+        self.query_graph = Some(parent_query_graph.clone());
+
+        // TODO: a phrase can cover many positions but is represented by a single
+        // node, which this position-based removal does not account for.
+        self.positions_to_remove = match self.terms_matching_strategy {
+            TermsMatchingStrategy::Last => {
+                // The set deduplicates positions shared by several nodes and
+                // yields them in ascending order, so `pop` returns the highest.
+                let mut all_positions = BTreeSet::new();
+                for node in parent_query_graph.nodes.iter() {
+                    match node {
+                        QueryNode::Term(term) => all_positions.extend(term.positions.clone()),
+                        QueryNode::Deleted | QueryNode::Start | QueryNode::End => {}
+                    }
+                }
+                all_positions.into_iter().collect()
+            }
+            TermsMatchingStrategy::All => vec![],
+        };
+        self.iterating = true;
+        Ok(())
+    }
+
+    /// Returns the next bucket, or `None` once the rule is exhausted.
+    ///
+    /// The bucket holds the result of resolving the current query graph within
+    /// `universe`, together with a copy of that graph. The graph is then relaxed
+    /// by removing the words at the next scheduled position, if there is one.
+    ///
+    /// # Panics
+    ///
+    /// Panics when called outside of an iteration, or when `universe` holds
+    /// fewer than two entries.
+    fn next_bucket(
+        &mut self,
+        index: &Index,
+        txn: &'transaction RoTxn,
+        db_cache: &mut DatabaseCache<'transaction>,
+        universe: &RoaringBitmap,
+    ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
+        assert!(self.iterating);
+        assert!(universe.len() > 1);
+        if self.exhausted {
+            return Ok(None);
+        }
+        let Some(query_graph) = &mut self.query_graph else {
+            panic!("the query graph must be set by start_iteration before next_bucket")
+        };
+        let this_bucket = resolve_query_graph(index, txn, db_cache, query_graph, universe)?;
+        // Keep the graph that produced `this_bucket` before relaxing it.
+        let child_query_graph = query_graph.clone();
+        // TODO: check whether a position exists in the graph before removing it
+        // and returning the next bucket, e.g.:
+        // while graph.does_not_contain(positions_to_remove.last()) { positions_to_remove.pop() }
+        if let Some(position) = self.positions_to_remove.pop() {
+            query_graph.remove_words_at_position(position);
+        } else {
+            // Nothing left to relax: this bucket is the last one.
+            self.exhausted = true;
+        }
+
+        Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket }))
+    }
+
+    /// Ends the current iteration and releases the state it held.
+    fn end_iteration(
+        &mut self,
+        _index: &Index,
+        _txn: &'transaction RoTxn,
+        _db_cache: &mut DatabaseCache<'transaction>,
+    ) {
+        self.iterating = false;
+        self.exhausted = true;
+        self.positions_to_remove.clear();
+        // The working graph is rebuilt by the next `start_iteration`.
+        self.query_graph = None;
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    // NOTE(review): disabled draft. Its last line calls `Words::new` with
+    // arguments that do not match the constructor defined above.
+
+    // use charabia::Tokenize;
+    // use roaring::RoaringBitmap;
+
+    // use crate::{
+    //     index::tests::TempIndex,
+    //     search::{criteria::CriteriaBuilder, new::QueryGraphOrPlaceholder},
+    // };
+
+    // use super::Words;
+
+    // fn placeholder() {
+    //     let qt = QueryGraphOrPlaceholder::Placeholder;
+    //     let index = TempIndex::new();
+    //     let rtxn = index.read_txn().unwrap();
+
+    //     let query = "a beautiful summer house by the beach overlooking what seems";
+    //     // let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap();
+    //     // let (qt, parts, matching_words) = builder.build(query.tokenize()).unwrap().unwrap();
+
+    //     // let cb = CriteriaBuilder::new(&rtxn, &index).unwrap();
+    //     // let x = cb
+    //     //     .build(
+    //     //         Some(qt),
+    //     //         Some(parts),
+    //     //         None,
+    //     //         None,
+    //     //         false,
+    //     //         None,
+    //     //         crate::CriterionImplementationStrategy::OnlySetBased,
+    //     //     )
+    //     //     .unwrap();
+
+    //     // let rr = Words::new(&index, &RoaringBitmap::from_sorted_iter(0..1000)).unwrap();
+    // }
+}