meilisearch/milli/src/search/new/ranking_rule_graph/proximity/build.rs

use super::ProximityEdge;
use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
use crate::new::ranking_rule_graph::proximity::WordPair;
use crate::new::ranking_rule_graph::EdgeDetails;
use crate::new::{QueryNode, SearchContext};
use crate::Result;
use itertools::Itertools;
use std::collections::BTreeMap;

/// Extracts, from the source node of a potential proximity edge, the word
/// derivations and the position used to pair it with a destination node.
///
/// - `QueryNode::Start`: derivations whose `original` is the interned empty
///   string and whose derivation lists are all empty, with the position `-100`
///   (presumably a sentinel that never sits right before a real term; see the
///   `pos1 + 1 != pos2` check in `visit_to_node`).
/// - Word term: a clone of the term's derivations and the end of its positions.
/// - Phrase term: exact-only derivations for the last word of the phrase and
///   the end of the phrase's positions, or `None` when that last slot holds no
///   word.
/// - Any other node: `None`.
///
/// # Panics
///
/// Panics if a phrase term contains no words at all.
pub fn visit_from_node(
    ctx: &mut SearchContext,
    from_node: &QueryNode,
) -> Result<Option<(WordDerivations, i8)>> {
    let from_node_data = match from_node {
        QueryNode::Start => {
            let empty_word = ctx.word_interner.insert(String::new());
            let derivations = WordDerivations {
                original: empty_word,
                zero_typo: Box::new([]),
                one_typo: Box::new([]),
                two_typos: Box::new([]),
                use_prefix_db: false,
                synonyms: Box::new([]),
                split_words: None,
            };
            (derivations, -100)
        }
        QueryNode::Term(LocatedQueryTerm {
            value: QueryTerm::Word { derivations },
            positions,
        }) => (derivations.clone(), *positions.end()),
        QueryNode::Term(LocatedQueryTerm { value: QueryTerm::Phrase { phrase }, positions }) => {
            let last_word = *ctx.phrase_interner.get(*phrase).words.last().unwrap();
            match last_word {
                // No word pairs if the phrase does not end with a regular word.
                None => return Ok(None),
                Some(original) => {
                    // Only the exact last word of the phrase is paired: no typos,
                    // no synonyms, no prefix database.
                    let derivations = WordDerivations {
                        original,
                        zero_typo: Box::new([original]),
                        one_typo: Box::new([]),
                        two_typos: Box::new([]),
                        use_prefix_db: false,
                        synonyms: Box::new([]),
                        split_words: None,
                    };
                    (derivations, *positions.end())
                }
            }
        }
        _ => return Ok(None),
    };
    Ok(Some(from_node_data))
}

/// Builds the proximity edges going from a source node (summarised by
/// `from_node_data`, as produced by `visit_from_node`) to `to_node`.
///
/// Each returned element is a `(cost, details)` pair:
///
/// - `QueryNode::End`: a single unconditional edge of cost 0.
/// - `QueryNode::Deleted` / `QueryNode::Start`: no edge.
/// - Phrase whose first slot holds no word: no edge.
/// - Terms that are not at consecutive positions: a single unconditional edge
///   of cost 0.
/// - Otherwise: one data edge per `(cost, proximity)` combination for which at
///   least one word pair has an entry in the proximity databases, followed by
///   an unconditional edge of cost `8 + (ngram_len2 - 1)`.
///
/// The `proximity` stored in a [`ProximityEdge`] is the proximity at which its
/// pairs were found in the database. A pair found in swapped order at
/// proximity `p - 1` is given the same cost as a pair found in query order at
/// proximity `p`, but is stored under proximity `p - 1`.
///
/// # Errors
///
/// Propagates any error returned by the database lookups done through `ctx`.
///
/// # Panics
///
/// Panics if the source derivations have `use_prefix_db` set, or if a phrase
/// term contains no words at all.
pub fn visit_to_node<'search, 'from_data>(
    ctx: &mut SearchContext<'search>,
    to_node: &QueryNode,
    from_node_data: &'from_data (WordDerivations, i8),
) -> Result<Vec<(u8, EdgeDetails<ProximityEdge>)>> {
    let (derivations1, pos1) = from_node_data;
    let term2 = match &to_node {
        QueryNode::End => return Ok(vec![(0, EdgeDetails::Unconditional)]),
        QueryNode::Deleted | QueryNode::Start => return Ok(vec![]),
        QueryNode::Term(term) => term,
    };
    let LocatedQueryTerm { value: value2, positions: pos2 } = term2;

    let (derivations2, pos2, ngram_len2) = match value2 {
        QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()),
        QueryTerm::Phrase { phrase: phrase2 } => {
            let phrase2 = ctx.phrase_interner.get(*phrase2);
            if let Some(original) = *phrase2.words.first().unwrap() {
                (
                    // Only the exact first word of the phrase is paired: no typos,
                    // no synonyms, no prefix database.
                    WordDerivations {
                        original,
                        zero_typo: Box::new([original]),
                        one_typo: Box::new([]),
                        two_typos: Box::new([]),
                        use_prefix_db: false,
                        synonyms: Box::new([]),
                        split_words: None,
                    },
                    *pos2.start(),
                    1,
                )
            } else {
                // No word pairs if the phrase does not have a regular word as its first term
                return Ok(vec![]);
            }
        }
    };

    if pos1 + 1 != pos2 {
        // TODO: how should this actually be handled?
        // We want to effectively ignore this pair of terms
        // Unconditionally walk through the edge without computing the docids
        // But also what should the cost be?
        return Ok(vec![(0, EdgeDetails::Unconditional)]);
    }

    let updb1 = derivations1.use_prefix_db;
    let updb2 = derivations2.use_prefix_db;

    // left term cannot be a prefix
    assert!(!updb1);

    let derivations1 = derivations1.all_derivations_except_prefix_db();
    // TODO: eventually, we want to get rid of the uses from `original`
    // cost -> database proximity -> word pairs found at that proximity
    let mut cost_proximity_word_pairs = BTreeMap::<u8, BTreeMap<u8, Vec<WordPair>>>::new();

    if updb2 {
        for word1 in derivations1.clone() {
            for proximity in 1..=(8 - ngram_len2) {
                let cost = (proximity + ngram_len2 - 1) as u8;
                // TODO: if we had access to the universe here, we could already check whether
                // the bitmap corresponding to this word pair is disjoint with the universe or not
                if ctx
                    .get_word_prefix_pair_proximity_docids(
                        word1,
                        derivations2.original,
                        proximity as u8,
                    )?
                    .is_some()
                {
                    cost_proximity_word_pairs
                        .entry(cost)
                        .or_default()
                        .entry(proximity as u8)
                        .or_default()
                        .push(WordPair::WordPrefix {
                            left: word1,
                            right_prefix: derivations2.original,
                        });
                }
                // Swapped order: same handling as the swapped branch of the
                // non-prefix loop below. The pair is looked up at
                // `proximity - 1` (skipped when that would be 0) and stored
                // under that same proximity, so the edge's proximity matches
                // the one the pair was found at.
                if proximity > 1
                    && ctx
                        .get_prefix_word_pair_proximity_docids(
                            derivations2.original,
                            word1,
                            proximity as u8 - 1,
                        )?
                        .is_some()
                {
                    cost_proximity_word_pairs
                        .entry(cost)
                        .or_default()
                        .entry(proximity as u8 - 1)
                        .or_default()
                        .push(WordPair::WordPrefixSwapped {
                            left_prefix: derivations2.original,
                            right: word1,
                        });
                }
            }
        }
    }

    let derivations2 = derivations2.all_derivations_except_prefix_db();
    // TODO: add safeguard in case the cartesian product is too large?
    let product_derivations = derivations1.cartesian_product(derivations2);

    for (word1, word2) in product_derivations {
        for proximity in 1..=(8 - ngram_len2) {
            let cost = (proximity + ngram_len2 - 1) as u8;
            // Query order: word1 followed by word2.
            if ctx.get_word_pair_proximity_docids(word1, word2, proximity as u8)?.is_some() {
                cost_proximity_word_pairs
                    .entry(cost)
                    .or_default()
                    .entry(proximity as u8)
                    .or_default()
                    .push(WordPair::Words { left: word1, right: word2 });
            }
            // Swapped order: word2 followed by word1, one step closer for the same cost.
            if proximity > 1
                && ctx.get_word_pair_proximity_docids(word2, word1, proximity as u8 - 1)?.is_some()
            {
                cost_proximity_word_pairs
                    .entry(cost)
                    .or_default()
                    .entry(proximity as u8 - 1)
                    .or_default()
                    .push(WordPair::Words { left: word2, right: word1 });
            }
        }
    }
    // Flatten the two-level map into one data edge per (cost, proximity) combination.
    // The BTreeMaps yield them in increasing cost, then increasing proximity.
    let mut new_edges = cost_proximity_word_pairs
        .into_iter()
        .flat_map(|(cost, proximity_word_pairs)| {
            let mut edges = vec![];
            for (proximity, word_pairs) in proximity_word_pairs {
                edges.push((
                    cost,
                    EdgeDetails::Data(ProximityEdge {
                        pairs: word_pairs.into_boxed_slice(),
                        proximity,
                    }),
                ))
            }
            edges
        })
        .collect::<Vec<_>>();
    // Fallback edge that is always available, one cost unit above the largest data-edge cost.
    new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeDetails::Unconditional));
    Ok(new_edges)
}
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`use super::ProximityEdge;`
			`use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};`
			`use crate::new::ranking_rule_graph::proximity::WordPair;`
Remove warnings 2023-02-28 11:49:24 +01:00			`use crate::new::ranking_rule_graph::EdgeDetails;`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`use crate::new::{QueryNode, SearchContext};`
			`use crate::Result;`
			`use itertools::Itertools;`
			`use std::collections::BTreeMap;`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`pub fn visit_from_node(`
			`ctx: &mut SearchContext,`
			`from_node: &QueryNode,`
			`) -> Result<Option<(WordDerivations, i8)>> {`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`Ok(Some(match from_node {`
Fix code visibility issue + unimplemented detail in proximity rule 2023-03-05 20:07:37 +01:00			`QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => match value1 {`
			`QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()),`
			`QueryTerm::Phrase { phrase: phrase1 } => {`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`let phrase1 = ctx.phrase_interner.get(*phrase1);`
			`if let Some(original) = *phrase1.words.last().unwrap() {`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`(`
			`WordDerivations {`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`original,`
			`zero_typo: Box::new([original]),`
			`one_typo: Box::new([]),`
			`two_typos: Box::new([]),`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`use_prefix_db: false,`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`synonyms: Box::new([]),`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`split_words: None,`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`},`
			`*pos1.end(),`
			`)`
Fix code visibility issue + unimplemented detail in proximity rule 2023-03-05 20:07:37 +01:00			`} else {`
			`// No word pairs if the phrase does not have a regular word as its last term`
			`return Ok(None);`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`}`
			`}`
Fix code visibility issue + unimplemented detail in proximity rule 2023-03-05 20:07:37 +01:00			`},`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`QueryNode::Start => (`
			`WordDerivations {`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`original: ctx.word_interner.insert(String::new()),`
			`zero_typo: Box::new([]),`
			`one_typo: Box::new([]),`
			`two_typos: Box::new([]),`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`use_prefix_db: false,`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`synonyms: Box::new([]),`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`split_words: None,`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`},`
			`-100,`
			`),`
			`_ => return Ok(None),`
			`}))`
			`}`

Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`pub fn visit_to_node<'search, 'from_data>(`
			`ctx: &mut SearchContext<'search>,`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`to_node: &QueryNode,`
			`from_node_data: &'from_data (WordDerivations, i8),`
Add some documentation and use bitmaps instead of hashmaps when possible 2023-02-21 12:33:32 +01:00			`) -> Result<Vec<(u8, EdgeDetails<ProximityEdge>)>> {`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`let (derivations1, pos1) = from_node_data;`
			`let term2 = match &to_node {`
Add some documentation and use bitmaps instead of hashmaps when possible 2023-02-21 12:33:32 +01:00			`QueryNode::End => return Ok(vec![(0, EdgeDetails::Unconditional)]),`
			`QueryNode::Deleted \| QueryNode::Start => return Ok(vec![]),`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`QueryNode::Term(term) => term,`
			`};`
			`let LocatedQueryTerm { value: value2, positions: pos2 } = term2;`

			`let (derivations2, pos2, ngram_len2) = match value2 {`
			`QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()),`
Rewrite cheapest path algorithm and empty path cache It is now much simpler and has much better performance. 2023-03-02 21:27:42 +01:00			`QueryTerm::Phrase { phrase: phrase2 } => {`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`let phrase2 = ctx.phrase_interner.get(*phrase2);`
			`if let Some(original) = *phrase2.words.first().unwrap() {`
Fix code visibility issue + unimplemented detail in proximity rule 2023-03-05 20:07:37 +01:00			`(`
			`WordDerivations {`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`original,`
			`zero_typo: Box::new([original]),`
			`one_typo: Box::new([]),`
			`two_typos: Box::new([]),`
Fix code visibility issue + unimplemented detail in proximity rule 2023-03-05 20:07:37 +01:00			`use_prefix_db: false,`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`synonyms: Box::new([]),`
Fix code visibility issue + unimplemented detail in proximity rule 2023-03-05 20:07:37 +01:00			`split_words: None,`
			`},`
			`*pos2.start(),`
			`1,`
			`)`
			`} else {`
			`// No word pairs if the phrase does not have a regular word as its first term`
			`return Ok(vec![]);`
			`}`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`}`
			`};`

			`if pos1 + 1 != pos2 {`
			`// TODO: how should this actually be handled?`
			`// We want to effectively ignore this pair of terms`
			`// Unconditionally walk through the edge without computing the docids`
			`// But also what should the cost be?`
Add some documentation and use bitmaps instead of hashmaps when possible 2023-02-21 12:33:32 +01:00			`return Ok(vec![(0, EdgeDetails::Unconditional)]);`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`}`

			`let updb1 = derivations1.use_prefix_db;`
			`let updb2 = derivations2.use_prefix_db;`

			`// left term cannot be a prefix`
			`assert!(!updb1);`

			`let derivations1 = derivations1.all_derivations_except_prefix_db();`
Fix: computation of initial universe, code organisation 2023-03-06 08:35:01 +01:00			// TODO: eventually, we want to get rid of the uses from `original`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`let mut cost_proximity_word_pairs = BTreeMap::<u8, BTreeMap<u8, Vec<WordPair>>>::new();`

			`if updb2 {`
			`for word1 in derivations1.clone() {`
Fix proximity graph edge builder to include all proximities 2023-03-05 19:49:39 +01:00			`for proximity in 1..=(8 - ngram_len2) {`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`let cost = (proximity + ngram_len2 - 1) as u8;`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`// TODO: if we had access to the universe here, we could already check whether`
			`// the bitmap corresponding to this word pair is disjoint with the universe or not`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`if ctx`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`.get_word_prefix_pair_proximity_docids(`
			`word1,`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`derivations2.original,`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`proximity as u8,`
			`)?`
			`.is_some()`
			`{`
			`cost_proximity_word_pairs`
			`.entry(cost)`
			`.or_default()`
			`.entry(proximity as u8)`
			`.or_default()`
			`.push(WordPair::WordPrefix {`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`left: word1,`
			`right_prefix: derivations2.original,`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`});`
			`}`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`if ctx`
Fix code visibility issue + unimplemented detail in proximity rule 2023-03-05 20:07:37 +01:00			`.get_prefix_word_pair_proximity_docids(`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`derivations2.original,`
			`word1,`
Fix code visibility issue + unimplemented detail in proximity rule 2023-03-05 20:07:37 +01:00			`proximity as u8 - 1,`
			`)?`
			`.is_some()`
			`{`
			`cost_proximity_word_pairs`
			`.entry(cost)`
			`.or_default()`
			`.entry(proximity as u8)`
			`.or_default()`
			`.push(WordPair::WordPrefixSwapped {`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`left_prefix: derivations2.original,`
			`right: word1,`
Fix code visibility issue + unimplemented detail in proximity rule 2023-03-05 20:07:37 +01:00			`});`
			`}`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`}`
			`}`
			`}`

			`let derivations2 = derivations2.all_derivations_except_prefix_db();`
Fix code visibility issue + unimplemented detail in proximity rule 2023-03-05 20:07:37 +01:00			`// TODO: add safeguard in case the cartesian product is too large?`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`let product_derivations = derivations1.cartesian_product(derivations2);`

			`for (word1, word2) in product_derivations {`
Fix proximity graph edge builder to include all proximities 2023-03-05 19:49:39 +01:00			`for proximity in 1..=(8 - ngram_len2) {`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`let cost = (proximity + ngram_len2 - 1) as u8;`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`if ctx.get_word_pair_proximity_docids(word1, word2, proximity as u8)?.is_some() {`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`cost_proximity_word_pairs`
			`.entry(cost)`
			`.or_default()`
			`.entry(proximity as u8)`
			`.or_default()`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`.push(WordPair::Words { left: word1, right: word2 });`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`}`
Fix code visibility issue + unimplemented detail in proximity rule 2023-03-05 20:07:37 +01:00			`if proximity > 1`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`&& ctx.get_word_pair_proximity_docids(word2, word1, proximity as u8 - 1)?.is_some()`
Fix code visibility issue + unimplemented detail in proximity rule 2023-03-05 20:07:37 +01:00			`{`
			`cost_proximity_word_pairs`
			`.entry(cost)`
			`.or_default()`
			`.entry(proximity as u8 - 1)`
			`.or_default()`
Intern all strings and phrases in the search logic 2023-03-06 19:21:55 +01:00			`.push(WordPair::Words { left: word2, right: word1 });`
Fix code visibility issue + unimplemented detail in proximity rule 2023-03-05 20:07:37 +01:00			`}`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`}`
			`}`
			`let mut new_edges = cost_proximity_word_pairs`
			`.into_iter()`
			`.flat_map(\|(cost, proximity_word_pairs)\| {`
			`let mut edges = vec![];`
			`for (proximity, word_pairs) in proximity_word_pairs {`
Apply a few optimisations for graph-based ranking rules 2023-03-07 14:42:58 +01:00			`edges.push((`
			`cost,`
			`EdgeDetails::Data(ProximityEdge {`
			`pairs: word_pairs.into_boxed_slice(),`
			`proximity,`
			`}),`
			`))`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`}`
			`edges`
			`})`
			`.collect::<Vec<_>>();`
			`new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeDetails::Unconditional));`
Add some documentation and use bitmaps instead of hashmaps when possible 2023-02-21 12:33:32 +01:00			`Ok(new_edges)`
Introduce the proximity ranking rule as a graph-based ranking rule 2023-02-21 09:49:05 +01:00			`}`