diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs
new file mode 100644
index 000000000..ce883ad6a
--- /dev/null
+++ b/milli/src/search/new/ranking_rules.rs
@@ -0,0 +1,523 @@
+use heed::RoTxn;
+use roaring::RoaringBitmap;
+
+use super::db_cache::DatabaseCache;
+use super::resolve_query_graph::resolve_query_graph;
+use super::QueryGraph;
+use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
+use crate::new::ranking_rule_graph::proximity::ProximityGraph;
+use crate::new::words::Words;
+// use crate::search::new::sort::Sort;
+use crate::{Index, Result, TermsMatchingStrategy};
+
+pub trait RankingRuleOutputIter<'transaction, Query> {
+    fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>>;
+}
+
+pub struct RankingRuleOutputIterWrapper<'transaction, Query> {
+    iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'transaction>,
+}
+impl<'transaction, Query> RankingRuleOutputIterWrapper<'transaction, Query> {
+    pub fn new(
+        iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'transaction>,
+    ) -> Self {
+        Self { iter }
+    }
+}
+impl<'transaction, Query> RankingRuleOutputIter<'transaction, Query>
+    for RankingRuleOutputIterWrapper<'transaction, Query>
+{
+    fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>> {
+        match self.iter.next() {
+            Some(x) => x.map(Some),
+            None => Ok(None),
+        }
+    }
+}
+
+pub trait RankingRuleQueryTrait: Sized + Clone + 'static {}
+#[derive(Clone)]
+pub struct PlaceholderQuery;
+impl RankingRuleQueryTrait for PlaceholderQuery {}
+impl RankingRuleQueryTrait for QueryGraph {}
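+
+/// A ranking rule sorts a set of candidate documents into successive buckets,
+/// from best-ranked to worst-ranked. It is driven as follows:
+/// 1. `start_iteration` is called with the universe of candidates and the query;
+/// 2. `next_bucket` is called repeatedly, each call returning the next-best
+///    subset of the given universe, until it returns `Ok(None)`;
+/// 3. `end_iteration` is called to clean up before the rule is restarted.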
+pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
+    // TODO: add an update_candidates function to deal with distinct
+    // attributes?
+
+    fn start_iteration(
+        &mut self,
+        index: &Index,
+        txn: &'transaction RoTxn,
+        db_cache: &mut DatabaseCache<'transaction>,
+        universe: &RoaringBitmap,
+        query: &Query,
+    ) -> Result<()>;
+
+    fn next_bucket(
+        &mut self,
+        index: &Index,
+        txn: &'transaction RoTxn,
+        db_cache: &mut DatabaseCache<'transaction>,
+        universe: &RoaringBitmap,
+    ) -> Result<Option<RankingRuleOutput<Query>>>;
+
+    fn end_iteration(
+        &mut self,
+        index: &Index,
+        txn: &'transaction RoTxn,
+        db_cache: &mut DatabaseCache<'transaction>,
+    );
+}
+
+#[derive(Debug)]
+pub struct RankingRuleOutput<Q> {
+    /// The query tree that must be used by the child ranking rule to fetch candidates.
+    pub query: Q,
+    /// The allowed candidates for the child ranking rule
+    pub candidates: RoaringBitmap,
+}
+
+#[allow(unused)]
+pub fn get_start_universe<'transaction>(
+    index: &Index,
+    txn: &'transaction RoTxn,
+    db_cache: &mut DatabaseCache<'transaction>,
+    query_graph: &QueryGraph,
+    term_matching_strategy: TermsMatchingStrategy,
+    // filters: Filters,
+    // mut distinct: Option<D>,
+) -> Result<RoaringBitmap> {
+    // NOTE:
+    //
+    // There is a performance problem when using `distinct` + an exhaustive number of hits,
+    // especially for searches that yield many results (many ~= almost all of the
+    // dataset).
+    //
+    // We'll solve it later. Maybe there are smart ways to go about it.
+    //
+    // For example, if there are millions of possible values for the distinct attribute,
+    // then we could just look at the documents which share any distinct attribute with
+    // another one, and remove the latter docids from the universe.
+    // => NO! Because we don't know which one to remove; only after the sorting is done can we know it.
+    // => this kind of computation can be done, but only in the evaluation of the number
+    //    of hits for the documents that aren't returned by the search.
+    //
+    // `Distinct` otherwise should always be computed during
+
+    let universe = index.documents_ids(txn).unwrap();
+
+    // Resolve the whole query tree to retrieve an exhaustive list of documents matching the query.
+    // NOTE: this is wrong.
+    // Instead, we should only compute the documents corresponding to the last remaining
+    // word, 2-gram, and 3-gram.
+    // let candidates = resolve_query_graph(index, txn, db_cache, query_graph, &universe)?;
+
+    // Distinct should be lazy if placeholder?
+    //
+    // // because the initial_candidates should be an exhaustive count of the matching documents,
+    // // we precompute the distinct attributes.
+    // let initial_candidates = match &mut distinct {
+    //     Some(distinct) => {
+    //         let mut initial_candidates = RoaringBitmap::new();
+    //         for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) {
+    //             initial_candidates.insert(c?);
+    //         }
+    //         initial_candidates
+    //     }
+    //     None => candidates.clone(),
+    // };
+
+    Ok(/*candidates*/ universe)
+}
+
+pub fn execute_search<'transaction>(
+    index: &Index,
+    txn: &'transaction heed::RoTxn,
+    // TODO: ranking rules parameter
+    db_cache: &mut DatabaseCache<'transaction>,
+    universe: &RoaringBitmap,
+    query_graph: &QueryGraph,
+    // _from: usize,
+    // _length: usize,
+) -> Result<Vec<u32>> {
+    let words = Words::new(TermsMatchingStrategy::Last);
+    // let sort = Sort::new(index, txn, "sort1".to_owned(), true)?;
+    let proximity = GraphBasedRankingRule::<ProximityGraph>::default();
+    // TODO: ranking rules given as argument
+    let mut ranking_rules: Vec<Box<dyn RankingRule<'transaction, QueryGraph>>> =
+        vec![Box::new(words), Box::new(proximity) /* Box::new(sort) */];
+
+    let ranking_rules_len = ranking_rules.len();
+    ranking_rules[0].start_iteration(index, txn, db_cache, universe, query_graph)?;
+
+    // TODO: parent_candidates could be used only during debugging?
+    let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
+    candidates[0] = universe.clone();
+
+    let mut cur_ranking_rule_index = 0;
+
+    macro_rules! back {
+        () => {
+            candidates[cur_ranking_rule_index].clear();
+            ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache);
+            if cur_ranking_rule_index == 0 {
+                break;
+            } else {
+                cur_ranking_rule_index -= 1;
+            }
+        };
+    }
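+
+    // Bucket-sort descent: repeatedly ask the current ranking rule for its next
+    // bucket of candidates, refine that bucket through the child rule, and pop
+    // back to the parent (with `back!`) once a rule runs out of buckets.
+    // For example, with the [words, proximity] rules above, `words` yields
+    // buckets of documents matching progressively fewer query terms, and any
+    // bucket with more than one document is sorted further by `proximity`.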
+    let mut results = vec![];
+    // TODO: skip buckets when we want to start from an offset
+    while results.len() < 20 {
+        // The universe for this bucket is zero or one element, so we don't need to sort
+        // anything, just extend the results and go back to the parent ranking rule.
+        if candidates[cur_ranking_rule_index].len() <= 1 {
+            results.extend(&candidates[cur_ranking_rule_index]);
+            back!();
+            continue;
+        }
+        let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, &candidates[cur_ranking_rule_index])? else {
+            back!();
+            continue;
+        };
+
+        candidates[cur_ranking_rule_index] -= &next_bucket.candidates;
+
+        if next_bucket.candidates.len() <= 1 {
+            // Only zero or one candidate, no need to sort through the child ranking rule.
+            results.extend(next_bucket.candidates);
+            continue;
+        } else {
+            // Many candidates: give them to the next ranking rule, if any.
+            if cur_ranking_rule_index == ranking_rules_len - 1 {
+                // TODO: don't extend too much, up to the limit only
+                results.extend(next_bucket.candidates);
+            } else {
+                cur_ranking_rule_index += 1;
+                candidates[cur_ranking_rule_index] = next_bucket.candidates.clone();
+                ranking_rules[cur_ranking_rule_index].start_iteration(
+                    index,
+                    txn,
+                    db_cache,
+                    &next_bucket.candidates,
+                    &next_bucket.query,
+                )?;
+            }
+        }
+    }
+
+    Ok(results)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::fs::File;
+    use std::io::{BufRead, BufReader, Cursor, Seek};
+    use std::time::Instant;
+
+    use heed::EnvOpenOptions;
+
+    use super::{execute_search, get_start_universe};
+    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
+    use crate::index::tests::TempIndex;
+    use crate::new::db_cache::DatabaseCache;
+    use crate::new::make_query_graph;
+    use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
+    use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy};
+
+    #[test]
+    fn execute_new_search() {
+        let index = TempIndex::new();
+        index
+            .add_documents(documents!([
+                {
+                    "id": 7,
+                    "text": "the super quick super brown fox jumps over",
+                },
+                {
+                    "id": 8,
+                    "text": "the super quick brown fox jumps over",
+                },
+                {
+                    "id": 9,
+                    "text": "the quick super brown fox jumps over",
+                },
+                {
+                    "id": 10,
+                    "text": "the quick brown fox jumps over",
+                },
+                {
+                    "id": 11,
+                    "text": "the quick brown fox jumps over the lazy dog",
+                },
+                {
+                    "id": 12,
+                    "text": "the quick brown cat jumps over the lazy dog",
+                },
+            ]))
+            .unwrap();
+        let txn = index.read_txn().unwrap();
+
+        let mut db_cache = DatabaseCache::default();
+
+        let query_graph =
+            make_query_graph(&index, &txn, &mut db_cache, "the quick brown fox jumps over")
+                .unwrap();
+        println!("{}", query_graph.graphviz());
+
+        // TODO: filters + maybe distinct attributes?
+        let universe = get_start_universe(
+            &index,
+            &txn,
+            &mut db_cache,
+            &query_graph,
+            TermsMatchingStrategy::Last,
+        )
+        .unwrap();
+        println!("universe: {universe:?}");
+
+        let results =
+            execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /* 0, 20 */)
+                .unwrap();
+        println!("{results:?}");
+    }
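+
+    // NOTE: the `search_movies_*` tests below assume a persistent index in the
+    // `data_movies` directory, built beforehand with `_index_movies` (and tuned
+    // with `_settings_movies`) at the end of this module.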
+    #[test]
+    fn search_movies_new() {
+        let mut options = EnvOpenOptions::new();
+        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+
+        let index = Index::new(options, "data_movies").unwrap();
+        let txn = index.read_txn().unwrap();
+
+        let primary_key = index.primary_key(&txn).unwrap().unwrap();
+        let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
+
+        // loop {
+        //     let start = Instant::now();
+
+        //     let mut db_cache = DatabaseCache::default();
+
+        //     let query_graph = make_query_graph(
+        //         &index,
+        //         &txn,
+        //         &mut db_cache,
+        //         "released from prison by the government",
+        //     )
+        //     .unwrap();
+        //     // println!("{}", query_graph.graphviz());
+
+        //     // TODO: filters + maybe distinct attributes?
+        //     let universe = get_start_universe(
+        //         &index,
+        //         &txn,
+        //         &mut db_cache,
+        //         &query_graph,
+        //         TermsMatchingStrategy::Last,
+        //     )
+        //     .unwrap();
+        //     // println!("universe: {universe:?}");
+
+        //     let results = execute_search(
+        //         &index,
+        //         &txn,
+        //         &mut db_cache,
+        //         &universe,
+        //         &query_graph, /* 0, 20 */
+        //     )
+        //     .unwrap();
+
+        //     let elapsed = start.elapsed();
+        //     println!("{}us: {results:?}", elapsed.as_micros());
+        // }
+        let start = Instant::now();
+
+        let mut db_cache = DatabaseCache::default();
+
+        let query_graph =
+            make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government")
+                .unwrap();
+        // println!("{}", query_graph.graphviz());
+
+        // TODO: filters + maybe distinct attributes?
+        let universe = get_start_universe(
+            &index,
+            &txn,
+            &mut db_cache,
+            &query_graph,
+            TermsMatchingStrategy::Last,
+        )
+        .unwrap();
+        // println!("universe: {universe:?}");
+
+        let results =
+            execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /* 0, 20 */)
+                .unwrap();
+
+        let elapsed = start.elapsed();
+
+        let ids = index
+            .documents(&txn, results.iter().copied())
+            .unwrap()
+            .into_iter()
+            .map(|x| {
+                let obkv = &x.1;
+                let id = obkv.get(primary_key).unwrap();
+                let id: serde_json::Value = serde_json::from_slice(id).unwrap();
+                id.as_str().unwrap().to_owned()
+            })
+            .collect::<Vec<_>>();
+
+        println!("{}us: {results:?}", elapsed.as_micros());
+        println!("external ids: {ids:?}");
+    }
+
+    #[test]
+    fn search_movies_old() {
+        let mut options = EnvOpenOptions::new();
+        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+
+        let index = Index::new(options, "data_movies").unwrap();
+        let txn = index.read_txn().unwrap();
+
+        let start = Instant::now();
+
+        let mut s = Search::new(&txn, &index);
+        s.query("released from prison by the government");
+        s.terms_matching_strategy(TermsMatchingStrategy::Last);
+        // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
+        let docs = s.execute().unwrap();
+
+        let elapsed = start.elapsed();
+
+        println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
+    }
+
+    #[test]
+    fn _settings_movies() {
+        let mut options = EnvOpenOptions::new();
+        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+
+        let index = Index::new(options, "data_movies").unwrap();
+        let mut wtxn = index.write_txn().unwrap();
+
+        // let primary_key = "id";
+        // let searchable_fields = vec!["title", "overview"];
+        // let filterable_fields = vec!["release_date", "genres"];
+        // let sortable_fields = vec![];
+
+        let config = IndexerConfig::default();
+        let mut builder = Settings::new(&mut wtxn, &index, &config);
+
+        builder.set_min_word_len_one_typo(5);
+        builder.set_min_word_len_two_typos(100);
+
+        // builder.set_primary_key(primary_key.to_owned());
+
+        // let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
+        // builder.set_searchable_fields(searchable_fields);
+
+        // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
+        // builder.set_filterable_fields(filterable_fields);
+
+        builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
+
+        // let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect();
+        // builder.set_sortable_fields(sortable_fields);
+
+        builder.execute(|_| (), || false).unwrap();
+    }
+
+    // #[test]
+    fn _index_movies() {
+        let mut options = EnvOpenOptions::new();
+        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
+
+        let index = Index::new(options, "data_movies").unwrap();
+        let mut wtxn = index.write_txn().unwrap();
+
+        let primary_key = "id";
+        let searchable_fields = vec!["title", "overview"];
+        let filterable_fields = vec!["release_date", "genres"];
+        // let sortable_fields = vec![];
+
+        let config = IndexerConfig::default();
+        let mut builder = Settings::new(&mut wtxn, &index, &config);
+
+        builder.set_primary_key(primary_key.to_owned());
+
+        let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
+        builder.set_searchable_fields(searchable_fields);
+
+        let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
+        builder.set_filterable_fields(filterable_fields);
+
+        builder.set_criteria(vec![Criterion::Words]);
+
+        // let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect();
+        // builder.set_sortable_fields(sortable_fields);
+
+        builder.execute(|_| (), || false).unwrap();
+
+        let config = IndexerConfig::default();
+        let indexing_config = IndexDocumentsConfig::default();
+        let builder =
+            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
+                .unwrap();
+
+        let documents = documents_from(
+            "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json",
+            "json",
+        );
+        let (builder, user_error) = builder.add_documents(documents).unwrap();
+        user_error.unwrap();
+        builder.execute().unwrap();
+        wtxn.commit().unwrap();
+
+        index.prepare_for_closing().wait();
+    }
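+
+    // Helpers that read a csv/json/jsonl dataset file into a
+    // `DocumentsBatchReader`, as used by `_index_movies` above.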
+    fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
+        let reader = File::open(filename)
+            .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
+        let reader = BufReader::new(reader);
+        let documents = match filetype {
+            "csv" => documents_from_csv(reader).unwrap(),
+            "json" => documents_from_json(reader).unwrap(),
+            "jsonl" => documents_from_jsonl(reader).unwrap(),
+            otherwise => panic!("invalid update format {:?}", otherwise),
+        };
+        DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
+    }
+
+    fn documents_from_jsonl(reader: impl BufRead) -> crate::Result<Vec<u8>> {
+        let mut documents = DocumentsBatchBuilder::new(Vec::new());
+
+        for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
+            let object = result.unwrap();
+            documents.append_json_object(&object)?;
+        }
+
+        documents.into_inner().map_err(Into::into)
+    }
+
+    fn documents_from_json(reader: impl BufRead) -> crate::Result<Vec<u8>> {
+        let mut documents = DocumentsBatchBuilder::new(Vec::new());
+
+        documents.append_json_array(reader)?;
+
+        documents.into_inner().map_err(Into::into)
+    }
+
+    fn documents_from_csv(reader: impl BufRead) -> crate::Result<Vec<u8>> {
+        let csv = csv::Reader::from_reader(reader);
+
+        let mut documents = DocumentsBatchBuilder::new(Vec::new());
+        documents.append_csv(csv)?;
+
+        documents.into_inner().map_err(Into::into)
+    }
+}