Introduce a common way to manage the coordination between ranking rules
This commit is contained in: parent 5065d8b0c1, commit ce0d1e0e13
milli/src/search/new/ranking_rules.rs (new file, 523 lines)
@@ -0,0 +1,523 @@
use heed::RoTxn;
use roaring::RoaringBitmap;

use super::db_cache::DatabaseCache;
use super::resolve_query_graph::resolve_query_graph;
use super::QueryGraph;
use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
use crate::new::ranking_rule_graph::proximity::ProximityGraph;
use crate::new::words::Words;
// use crate::search::new::sort::Sort;
use crate::{Index, Result, TermsMatchingStrategy};

pub trait RankingRuleOutputIter<'transaction, Query> {
    fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>>;
}

pub struct RankingRuleOutputIterWrapper<'transaction, Query> {
    iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'transaction>,
}
impl<'transaction, Query> RankingRuleOutputIterWrapper<'transaction, Query> {
    pub fn new(
        iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'transaction>,
    ) -> Self {
        Self { iter }
    }
}
impl<'transaction, Query> RankingRuleOutputIter<'transaction, Query>
    for RankingRuleOutputIterWrapper<'transaction, Query>
{
    fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>> {
        match self.iter.next() {
            Some(x) => x.map(Some),
            None => Ok(None),
        }
    }
}
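
// An illustrative sketch (not part of this commit): adapting a plain iterator
// of precomputed buckets into a `RankingRuleOutputIter`. The `buckets` vector
// below is hypothetical; any iterator of `Result<RankingRuleOutput<Q>>` that
// lives at least as long as the transaction can be boxed the same way.
//
// let buckets: Vec<Result<RankingRuleOutput<PlaceholderQuery>>> = vec![Ok(RankingRuleOutput {
//     query: PlaceholderQuery,
//     candidates: (0..4).collect(),
// })];
// let mut iter = RankingRuleOutputIterWrapper::new(Box::new(buckets.into_iter()));
// while let Some(bucket) = iter.next_bucket()? {
//     // each call yields the next ranked bucket, or None when exhausted
// }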

pub trait RankingRuleQueryTrait: Sized + Clone + 'static {}
#[derive(Clone)]
pub struct PlaceholderQuery;
impl RankingRuleQueryTrait for PlaceholderQuery {}
impl RankingRuleQueryTrait for QueryGraph {}

pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
    // TODO: add an update_candidates function to deal with distinct
    // attributes?

    fn start_iteration(
        &mut self,
        index: &Index,
        txn: &'transaction RoTxn,
        db_cache: &mut DatabaseCache<'transaction>,
        universe: &RoaringBitmap,
        query: &Query,
    ) -> Result<()>;

    fn next_bucket(
        &mut self,
        index: &Index,
        txn: &'transaction RoTxn,
        db_cache: &mut DatabaseCache<'transaction>,
        universe: &RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<Query>>>;

    fn end_iteration(
        &mut self,
        index: &Index,
        txn: &'transaction RoTxn,
        db_cache: &mut DatabaseCache<'transaction>,
    );
}
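
// An illustrative sketch (not part of this commit) of the smallest possible
// ranking rule: it hands the whole universe back as a single bucket, i.e. it
// does not discriminate between documents at all. Real rules such as `Words`
// or the proximity rule instead split the universe into several successively
// worse buckets. `PassthroughRule` is a hypothetical name used only here.
//
// struct PassthroughRule {
//     query: Option<QueryGraph>,
// }
// impl<'transaction> RankingRule<'transaction, QueryGraph> for PassthroughRule {
//     fn start_iteration(
//         &mut self,
//         _index: &Index,
//         _txn: &'transaction RoTxn,
//         _db_cache: &mut DatabaseCache<'transaction>,
//         _universe: &RoaringBitmap,
//         query: &QueryGraph,
//     ) -> Result<()> {
//         self.query = Some(query.clone());
//         Ok(())
//     }
//     fn next_bucket(
//         &mut self,
//         _index: &Index,
//         _txn: &'transaction RoTxn,
//         _db_cache: &mut DatabaseCache<'transaction>,
//         universe: &RoaringBitmap,
//     ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
//         // Emit one bucket covering everything, then signal exhaustion by
//         // returning None on the next call (the query is taken, not cloned).
//         Ok(self
//             .query
//             .take()
//             .map(|query| RankingRuleOutput { query, candidates: universe.clone() }))
//     }
//     fn end_iteration(
//         &mut self,
//         _index: &Index,
//         _txn: &'transaction RoTxn,
//         _db_cache: &mut DatabaseCache<'transaction>,
//     ) {
//         self.query = None;
//     }
// }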

#[derive(Debug)]
pub struct RankingRuleOutput<Q> {
    /// The query tree that must be used by the child ranking rule to fetch candidates.
    pub query: Q,
    /// The allowed candidates for the child ranking rule
    pub candidates: RoaringBitmap,
}

#[allow(unused)]
pub fn get_start_universe<'transaction>(
    index: &Index,
    txn: &'transaction RoTxn,
    db_cache: &mut DatabaseCache<'transaction>,
    query_graph: &QueryGraph,
    term_matching_strategy: TermsMatchingStrategy,
    // filters: Filters,
    // mut distinct: Option<D>,
) -> Result<RoaringBitmap> {
    // NOTE:
    //
    // There is a performance problem when using `distinct` + an exhaustive number of hits,
    // especially for searches that yield many results (many ~= almost all of the
    // dataset).
    //
    // We'll solve it later. Maybe there are smart ways to go about it.
    //
    // For example, if there are millions of possible values for the distinct attribute,
    // then we could just look at the documents which share any distinct attribute with
    // another one, and remove the latter docids from the universe.
    // => NO! because we don't know which one to remove, only after the sorting is done can we know it
    // => this kind of computation can be done, but only in the evaluation of the number
    // of hits for the documents that aren't returned by the search.
    //
    // `Distinct` otherwise should always be computed during

    let universe = index.documents_ids(txn)?;

    // Resolve the whole query tree to retrieve an exhaustive list of documents matching the query.
    // NOTE: this is wrong.
    // Instead, we should only compute the documents corresponding to the last remaining
    // word, 2-gram, and 3-gram.
    // let candidates = resolve_query_graph(index, txn, db_cache, query_graph, &universe)?;

    // Distinct should be lazy if placeholder?
    //
    // // because the initial_candidates should be an exhaustive count of the matching documents,
    // // we precompute the distinct attributes.
    // let initial_candidates = match &mut distinct {
    //     Some(distinct) => {
    //         let mut initial_candidates = RoaringBitmap::new();
    //         for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) {
    //             initial_candidates.insert(c?);
    //         }
    //         initial_candidates
    //     }
    //     None => candidates.clone(),
    // };

    Ok(/*candidates*/ universe)
}

pub fn execute_search<'transaction>(
    index: &Index,
    txn: &'transaction heed::RoTxn,
    // TODO: ranking rules parameter
    db_cache: &mut DatabaseCache<'transaction>,
    universe: &RoaringBitmap,
    query_graph: &QueryGraph,
    // _from: usize,
    // _length: usize,
) -> Result<Vec<u32>> {
    let words = Words::new(TermsMatchingStrategy::Last);
    // let sort = Sort::new(index, txn, "sort1".to_owned(), true)?;
    let proximity = GraphBasedRankingRule::<ProximityGraph>::default();
    // TODO: ranking rules given as argument
    let mut ranking_rules: Vec<Box<dyn RankingRule<'transaction, QueryGraph>>> =
        vec![Box::new(words), Box::new(proximity) /* Box::new(sort) */];

    let ranking_rules_len = ranking_rules.len();
    ranking_rules[0].start_iteration(index, txn, db_cache, universe, query_graph)?;

    // TODO: parent_candidates could be used only during debugging?
    let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
    candidates[0] = universe.clone();

    let mut cur_ranking_rule_index = 0;

    macro_rules! back {
        () => {
            candidates[cur_ranking_rule_index].clear();
            ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache);
            if cur_ranking_rule_index == 0 {
                break;
            } else {
                cur_ranking_rule_index -= 1;
            }
        };
    }
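
    // Note on `back!`: presumably a macro rather than a helper function so
    // that it can `break` out of the `while` loop below and mutate
    // `cur_ranking_rule_index` directly. "Going back" means the current rule
    // has no more buckets to give: its candidates are cleared, it is told the
    // iteration is over, and control returns to its parent rule (or the whole
    // search stops if we are already at the first rule).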

    let mut results = vec![];
    // TODO: skip buckets when we want to start from an offset
    while results.len() < 20 {
        // The universe for this bucket is zero or one element, so we don't need to sort
        // anything, just extend the results and go back to the parent ranking rule.
        if candidates[cur_ranking_rule_index].len() <= 1 {
            results.extend(&candidates[cur_ranking_rule_index]);
            back!();
            continue;
        }
        let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(
            index,
            txn,
            db_cache,
            &candidates[cur_ranking_rule_index],
        )?
        else {
            back!();
            continue;
        };

        candidates[cur_ranking_rule_index] -= &next_bucket.candidates;

        if next_bucket.candidates.len() <= 1 {
            // Only zero or one candidate, no need to sort through the child ranking rule.
            results.extend(next_bucket.candidates);
            continue;
        } else {
            // Many candidates: hand them over to the next ranking rule, if any.
            if cur_ranking_rule_index == ranking_rules_len - 1 {
                // TODO: don't extend too much, up to the limit only
                results.extend(next_bucket.candidates);
            } else {
                cur_ranking_rule_index += 1;
                candidates[cur_ranking_rule_index] = next_bucket.candidates.clone();
                ranking_rules[cur_ranking_rule_index].start_iteration(
                    index,
                    txn,
                    db_cache,
                    &next_bucket.candidates,
                    &next_bucket.query,
                )?;
            }
        }
    }

    Ok(results)
}
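
// An illustrative trace (not part of this commit) of the loop above with the
// two installed rules, `words` then `proximity`, using hypothetical document
// ids. The traversal is a depth-first walk over nested buckets:
//
//   words.next_bucket()      -> { q1, candidates: {3, 7, 9} }   three docs: descend
//     proximity.start_iteration(universe = {3, 7, 9}, q1)
//     proximity.next_bucket() -> { _, candidates: {7} }         one doc: push 7
//     proximity.next_bucket() -> { _, candidates: {3, 9} }      last rule: push 3, then 9
//     (proximity has no candidates left: back! to words)
//   words.next_bucket()      -> { q2, candidates: {12} }        one doc: push 12
//   words.next_bucket()      -> None                            back! at rule 0: break
//
//   results == [7, 3, 9, 12]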

#[cfg(test)]
mod tests {
    use std::fs::File;
    use std::io::{BufRead, BufReader, Cursor, Seek};
    use std::time::Instant;

    use heed::EnvOpenOptions;

    use super::{execute_search, get_start_universe};
    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
    use crate::index::tests::TempIndex;
    use crate::new::db_cache::DatabaseCache;
    use crate::new::make_query_graph;
    use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
    use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy};

    #[test]
    fn execute_new_search() {
        let index = TempIndex::new();
        index
            .add_documents(documents!([
                {
                    "id": 7,
                    "text": "the super quick super brown fox jumps over",
                },
                {
                    "id": 8,
                    "text": "the super quick brown fox jumps over",
                },
                {
                    "id": 9,
                    "text": "the quick super brown fox jumps over",
                },
                {
                    "id": 10,
                    "text": "the quick brown fox jumps over",
                },
                {
                    "id": 11,
                    "text": "the quick brown fox jumps over the lazy dog",
                },
                {
                    "id": 12,
                    "text": "the quick brown cat jumps over the lazy dog",
                },
            ]))
            .unwrap();
        let txn = index.read_txn().unwrap();

        let mut db_cache = DatabaseCache::default();

        let query_graph =
            make_query_graph(&index, &txn, &mut db_cache, "the quick brown fox jumps over")
                .unwrap();
        println!("{}", query_graph.graphviz());

        // TODO: filters + maybe distinct attributes?
        let universe = get_start_universe(
            &index,
            &txn,
            &mut db_cache,
            &query_graph,
            TermsMatchingStrategy::Last,
        )
        .unwrap();
        println!("universe: {universe:?}");

        let results =
            execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /* 0, 20 */)
                .unwrap();
        println!("{results:?}")
    }

    #[test]
    fn search_movies_new() {
        let mut options = EnvOpenOptions::new();
        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB

        let index = Index::new(options, "data_movies").unwrap();
        let txn = index.read_txn().unwrap();

        let primary_key = index.primary_key(&txn).unwrap().unwrap();
        let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();

        // loop {
        //     let start = Instant::now();

        //     let mut db_cache = DatabaseCache::default();

        //     let query_graph = make_query_graph(
        //         &index,
        //         &txn,
        //         &mut db_cache,
        //         "released from prison by the government",
        //     )
        //     .unwrap();
        //     // println!("{}", query_graph.graphviz());

        //     // TODO: filters + maybe distinct attributes?
        //     let universe = get_start_universe(
        //         &index,
        //         &txn,
        //         &mut db_cache,
        //         &query_graph,
        //         TermsMatchingStrategy::Last,
        //     )
        //     .unwrap();
        //     // println!("universe: {universe:?}");

        //     let results = execute_search(
        //         &index,
        //         &txn,
        //         &mut db_cache,
        //         &universe,
        //         &query_graph, /* 0, 20 */
        //     )
        //     .unwrap();

        //     let elapsed = start.elapsed();
        //     println!("{}us: {results:?}", elapsed.as_micros());
        // }
        let start = Instant::now();

        let mut db_cache = DatabaseCache::default();

        let query_graph =
            make_query_graph(&index, &txn, &mut db_cache, "released from prison by the government")
                .unwrap();
        // println!("{}", query_graph.graphviz());

        // TODO: filters + maybe distinct attributes?
        let universe = get_start_universe(
            &index,
            &txn,
            &mut db_cache,
            &query_graph,
            TermsMatchingStrategy::Last,
        )
        .unwrap();
        // println!("universe: {universe:?}");

        let results =
            execute_search(&index, &txn, &mut db_cache, &universe, &query_graph /* 0, 20 */)
                .unwrap();

        let elapsed = start.elapsed();

        let ids = index
            .documents(&txn, results.iter().copied())
            .unwrap()
            .into_iter()
            .map(|x| {
                let obkv = &x.1;
                let id = obkv.get(primary_key).unwrap();
                let id: serde_json::Value = serde_json::from_slice(id).unwrap();
                id.as_str().unwrap().to_owned()
            })
            .collect::<Vec<_>>();

        println!("{}us: {results:?}", elapsed.as_micros());
        println!("external ids: {ids:?}");
    }

    #[test]
    fn search_movies_old() {
        let mut options = EnvOpenOptions::new();
        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB

        let index = Index::new(options, "data_movies").unwrap();
        let txn = index.read_txn().unwrap();

        let start = Instant::now();

        let mut s = Search::new(&txn, &index);
        s.query("released from prison by the government");
        s.terms_matching_strategy(TermsMatchingStrategy::Last);
        // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
        let docs = s.execute().unwrap();

        let elapsed = start.elapsed();

        println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
    }

    #[test]
    fn _settings_movies() {
        let mut options = EnvOpenOptions::new();
        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB

        let index = Index::new(options, "data_movies").unwrap();
        let mut wtxn = index.write_txn().unwrap();

        // let primary_key = "id";
        // let searchable_fields = vec!["title", "overview"];
        // let filterable_fields = vec!["release_date", "genres"];
        // let sortable_fields = vec![];

        let config = IndexerConfig::default();
        let mut builder = Settings::new(&mut wtxn, &index, &config);

        builder.set_min_word_len_one_typo(5);
        builder.set_min_word_len_two_typos(100);

        // builder.set_primary_key(primary_key.to_owned());

        // let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
        // builder.set_searchable_fields(searchable_fields);

        // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
        // builder.set_filterable_fields(filterable_fields);

        builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);

        // let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect();
        // builder.set_sortable_fields(sortable_fields);

        builder.execute(|_| (), || false).unwrap();
    }

    // #[test]
    fn _index_movies() {
        let mut options = EnvOpenOptions::new();
        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB

        let index = Index::new(options, "data_movies").unwrap();
        let mut wtxn = index.write_txn().unwrap();

        let primary_key = "id";
        let searchable_fields = vec!["title", "overview"];
        let filterable_fields = vec!["release_date", "genres"];
        // let sortable_fields = vec![];

        let config = IndexerConfig::default();
        let mut builder = Settings::new(&mut wtxn, &index, &config);

        builder.set_primary_key(primary_key.to_owned());

        let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
        builder.set_searchable_fields(searchable_fields);

        let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
        builder.set_filterable_fields(filterable_fields);

        builder.set_criteria(vec![Criterion::Words]);

        // let sortable_fields = sortable_fields.iter().map(|s| s.to_string()).collect();
        // builder.set_sortable_fields(sortable_fields);

        builder.execute(|_| (), || false).unwrap();

        let config = IndexerConfig::default();
        let indexing_config = IndexDocumentsConfig::default();
        let builder =
            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
                .unwrap();

        let documents = documents_from(
            "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json",
            "json",
        );
        let (builder, user_error) = builder.add_documents(documents).unwrap();
        user_error.unwrap();
        builder.execute().unwrap();
        wtxn.commit().unwrap();

        index.prepare_for_closing().wait();
    }

    fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
        let reader = File::open(filename)
            .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
        let reader = BufReader::new(reader);
        let documents = match filetype {
            "csv" => documents_from_csv(reader).unwrap(),
            "json" => documents_from_json(reader).unwrap(),
            "jsonl" => documents_from_jsonl(reader).unwrap(),
            otherwise => panic!("invalid update format {:?}", otherwise),
        };
        DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
    }

    fn documents_from_jsonl(reader: impl BufRead) -> crate::Result<Vec<u8>> {
        let mut documents = DocumentsBatchBuilder::new(Vec::new());

        for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
            let object = result.unwrap();
            documents.append_json_object(&object)?;
        }

        documents.into_inner().map_err(Into::into)
    }

    fn documents_from_json(reader: impl BufRead) -> crate::Result<Vec<u8>> {
        let mut documents = DocumentsBatchBuilder::new(Vec::new());

        documents.append_json_array(reader)?;

        documents.into_inner().map_err(Into::into)
    }

    fn documents_from_csv(reader: impl BufRead) -> crate::Result<Vec<u8>> {
        let csv = csv::Reader::from_reader(reader);

        let mut documents = DocumentsBatchBuilder::new(Vec::new());
        documents.append_csv(csv)?;

        documents.into_inner().map_err(Into::into)
    }
}