WIP on split words and synonyms support

2024-11-23 02:27:40 +08:00 · 2023-03-02 21:27:57 +01:00 · 2023-03-02 21:27:57 +01:00 · 1db152046e
commit 1db152046e
parent c27ea2677f
5 changed files with 233 additions and 142 deletions
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -31,22 +31,27 @@ pub fn make_query_graph<'transaction>(
    query: &str,
 ) -> Result<QueryGraph> {
    assert!(!query.is_empty());
-    let fst = index.words_fst(txn).unwrap();
-    let query = LocatedQueryTerm::from_query(query.tokenize(), None, |word, is_prefix| {
-        word_derivations(
-            index,
-            txn,
-            word,
-            if word.len() < 4 {
-                0
-            } else if word.len() < 100 {
-                1
-            } else {
-                2
-            },
-            is_prefix,
-            &fst,
-        )
+    let authorize_typos = index.authorize_typos(txn)?;
+    let min_len_one_typo = index.min_word_len_one_typo(txn)?;
+    let min_len_two_typos = index.min_word_len_two_typos(txn)?;
+
+    let exact_words = index.exact_words(txn)?;
+    let fst = index.words_fst(txn)?;
+
+    // TODO: get rid of this closure
+    // also, ngrams can have one typo?
+    let query = LocatedQueryTerm::from_query(query.tokenize(), None, move |word, is_prefix| {
+        let typos = if !authorize_typos
+            || word.len() < min_len_one_typo as usize
+            || exact_words.as_ref().map_or(false, |fst| fst.contains(word))
+        {
+            0
+        } else if word.len() < min_len_two_typos as usize {
+            1
+        } else {
+            2
+        };
+        word_derivations(index, txn, word, typos, is_prefix, &fst)
    })
    .unwrap();
    let graph = QueryGraph::from_query(index, txn, db_cache, query)?;
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@ -7,7 +7,7 @@ use super::db_cache::DatabaseCache;
 use super::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
 use crate::{Index, Result};

-#[derive(Clone)]
+#[derive(Debug, Clone)]
 pub enum QueryNode {
    Term(LocatedQueryTerm),
    Deleted,
@ -31,7 +31,7 @@ pub struct QueryGraph {
 }

 fn _assert_sizes() {
-    let _: [u8; 112] = [0; std::mem::size_of::<QueryNode>()];
+    let _: [u8; 184] = [0; std::mem::size_of::<QueryNode>()];
    let _: [u8; 48] = [0; std::mem::size_of::<Edges>()];
 }

@ -116,6 +116,8 @@ impl QueryGraph {
                                    one_typo: vec![],
                                    two_typos: vec![],
                                    use_prefix_db: false,
+                                    synonyms: vec![],  // TODO: ngram synonyms
+                                    split_words: None, // TODO: maybe ngram split words?
                                },
                            },
                            positions: ngram2_pos,
@ -141,6 +143,8 @@ impl QueryGraph {
                                    one_typo: vec![],
                                    two_typos: vec![],
                                    use_prefix_db: false,
+                                    synonyms: vec![],  // TODO: ngram synonyms
+                                    split_words: None, // TODO: maybe ngram split words?
                                },
                            },
                            positions: ngram3_pos,
@ -188,19 +192,20 @@ impl QueryGraph {
                Edges { predecessors: RoaringBitmap::new(), successors: RoaringBitmap::new() };
        }
    }
-    pub fn remove_words_at_position(&mut self, position: i8) {
+    /// Removes every `QueryNode::Term` node whose position range starts at `position`.
+    ///
+    /// The indices of the matching nodes are collected first, then handed to
+    /// `remove_nodes_keep_edges`, after which `simplify` is called on the graph.
+    ///
+    /// Returns `true` if at least one node matched `position`, `false` otherwise, so
+    /// that callers can tell whether the call changed anything.
+    pub fn remove_words_at_position(&mut self, position: i8) -> bool {
        let mut nodes_to_remove_keeping_edges = vec![];
        for (node_idx, node) in self.nodes.iter().enumerate() {
            let node_idx = node_idx as u32;
            let QueryNode::Term(LocatedQueryTerm { value: _, positions }) = node else { continue };
            if positions.start() == &position {
-                nodes_to_remove_keeping_edges.push(node_idx)
+                nodes_to_remove_keeping_edges.push(node_idx);
            }
        }

        self.remove_nodes_keep_edges(&nodes_to_remove_keeping_edges);

        self.simplify();
+        // Report whether any node was selected for removal.
+        !nodes_to_remove_keeping_edges.is_empty()
    }

    fn simplify(&mut self) {
@ -223,80 +228,3 @@ impl QueryGraph {
        }
    }
 }
-impl Debug for QueryNode {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            QueryNode::Term(term @ LocatedQueryTerm { value, positions: _ }) => match value {
-                QueryTerm::Word {
-                    derivations:
-                        WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db },
-                } => {
-                    if term.is_empty() {
-                        write!(f, "\"{original} (∅)\"")
-                    } else {
-                        let derivations = std::iter::once(original.clone())
-                            .chain(zero_typo.iter().map(|s| format!("T0 .. {s}")))
-                            .chain(one_typo.iter().map(|s| format!("T1 .. {s}")))
-                            .chain(two_typos.iter().map(|s| format!("T2 .. {s}")))
-                            .collect::<Vec<String>>()
-                            .join(" | ");
-
-                        write!(f, "\"{derivations}")?;
-                        if *use_prefix_db {
-                            write!(f, " | +prefix_db")?;
-                        }
-                        write!(f, " | pos:{}..={}", term.positions.start(), term.positions.end())?;
-                        write!(f, "\"")?;
-                        /*
-                        "beautiful" [label = "<f0> beautiful | beauiful | beautifol"]
-                        */
-                        Ok(())
-                    }
-                }
-                QueryTerm::Phrase(ws) => {
-                    let joined =
-                        ws.iter().filter_map(|x| x.clone()).collect::<Vec<String>>().join(" ");
-                    let in_quotes = format!("\"{joined}\"");
-                    let escaped = in_quotes.escape_default().collect::<String>();
-                    write!(f, "\"{escaped}\"")
-                }
-            },
-            QueryNode::Start => write!(f, "\"START\""),
-            QueryNode::End => write!(f, "\"END\""),
-            QueryNode::Deleted => write!(f, "\"_deleted_\""),
-        }
-    }
-}
-
-impl QueryGraph {
-    pub fn graphviz(&self) -> String {
-        let mut desc = String::new();
-        desc.push_str(
-            r#"
-digraph G {
-rankdir = LR;
-node [shape = "record"]
-"#,
-        );
-
-        for node in 0..self.nodes.len() {
-            if matches!(self.nodes[node], QueryNode::Deleted) {
-                continue;
-            }
-            desc.push_str(&format!("{node} [label = {:?}]", &self.nodes[node],));
-            if node == self.root_node as usize {
-                desc.push_str("[color = blue]");
-            } else if node == self.end_node as usize {
-                desc.push_str("[color = red]");
-            }
-            desc.push_str(";\n");
-
-            for edge in self.edges[node].successors.iter() {
-                desc.push_str(&format!("{node} -> {edge};\n"));
-            }
-        }
-
-        desc.push('}');
-        desc
-    }
-}
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@ -10,14 +10,28 @@ use fst::automaton::Str;
 use fst::{Automaton, IntoStreamer, Streamer};
 use heed::types::DecodeIgnore;
 use heed::RoTxn;
+use itertools::Itertools;

 use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
 use crate::search::{build_dfa, get_first};
-use crate::{Index, Result};
+use crate::{CboRoaringBitmapLenCodec, Index, Result};
+
+/// An ordered list of optional words making up a phrase.
+///
+/// `None` slots are skipped by [`Phrase::description`].
+// NOTE(review): what a `None` slot stands for (e.g. a removed stop word) is not
+// visible from this definition — confirm against the code that builds phrases.
+#[derive(Debug, Default, Clone)]
+pub struct Phrase {
+    /// The words of the phrase, in order.
+    pub words: Vec<Option<String>>,
+}
+impl Phrase {
+    /// Returns the present (`Some`) words joined by single spaces.
+    pub fn description(&self) -> String {
+        // `flatten` drops the `None` slots; `join` comes from `itertools::Itertools`.
+        self.words.iter().flatten().join(" ")
+    }
+}

 #[derive(Debug, Clone)]
 pub struct WordDerivations {
    pub original: String,
+    // TODO: pub prefix_of: Vec<String>,
+    pub synonyms: Vec<Phrase>,
+    pub split_words: Option<(String, String)>,
    pub zero_typo: Vec<String>,
    pub one_typo: Vec<String>,
    pub two_typos: Vec<String>,
@ -114,19 +128,63 @@ pub fn word_derivations(
            }
        }
    }
+    let split_words = split_best_frequency(index, txn, word)?;

-    Ok(WordDerivations { original: word.to_owned(), zero_typo, one_typo, two_typos, use_prefix_db })
+    let synonyms = index.synonyms(txn)?;
+    let synonyms = synonyms
+        .get(&vec![word.to_owned()])
+        .cloned()
+        .unwrap_or_default()
+        .into_iter()
+        .map(|words| Phrase { words: words.into_iter().map(Some).collect() })
+        .collect();
+
+    Ok(WordDerivations {
+        original: word.to_owned(),
+        synonyms,
+        split_words,
+        zero_typo,
+        one_typo,
+        two_typos,
+        use_prefix_db,
+    })
+}
+
+/// Looks for the most frequent way to split `original` into two words.
+///
+/// Every char boundary after the first character is tried as a split point. For
+/// each `(left, right)` pair, the `word_pair_proximity_docids` database is queried
+/// with the key `(1, left, right)` and the value is read through
+/// `CboRoaringBitmapLenCodec`; a missing entry counts as a frequency of `0`.
+///
+/// Returns the split with the highest non-zero frequency (the earliest split wins
+/// ties), or `None` when no split has a non-zero frequency. A word with fewer than
+/// two characters has no split point and therefore yields `None`.
+///
+/// Database errors are propagated to the caller.
+// NOTE(review): the leading `1` in the key is presumably the proximity (adjacent
+// words), and the len codec presumably yields a document count without decoding
+// the whole bitmap — confirm.
+fn split_best_frequency(
+    index: &Index,
+    txn: &RoTxn,
+    original: &str,
+) -> Result<Option<(String, String)>> {
+    // Skip byte offset 0 so that `left` is never empty; `right` is never empty
+    // either because every yielded offset is strictly inside the string.
+    let chars = original.char_indices().skip(1);
+    let mut best = None;
+
+    for (i, _) in chars {
+        // `i` is a char boundary, so `split_at` cannot panic here.
+        let (left, right) = original.split_at(i);
+
+        let key = (1, left, right);
+        let frequency = index
+            .word_pair_proximity_docids
+            .remap_data_type::<CboRoaringBitmapLenCodec>()
+            .get(txn, &key)?
+            .unwrap_or(0);
+
+        // Strictly greater: on equal frequency the earlier split is kept.
+        if frequency != 0 && best.map_or(true, |(old, _, _)| frequency > old) {
+            best = Some((frequency, left, right));
+        }
+    }
+
+    // Only allocate owned strings for the winning split.
+    Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned())))
 }

 #[derive(Debug, Clone)]
 pub enum QueryTerm {
-    Phrase(Vec<Option<String>>),
+    Phrase { phrase: Phrase },
    Word { derivations: WordDerivations },
 }
 impl QueryTerm {
    pub fn original_single_word(&self) -> Option<&str> {
        match self {
-            QueryTerm::Phrase(_) => None,
+            QueryTerm::Phrase { phrase: _ } => None,
            QueryTerm::Word { derivations } => {
                if derivations.is_empty() {
                    None
@ -140,14 +198,14 @@ impl QueryTerm {

 #[derive(Debug, Clone)]
 pub struct LocatedQueryTerm {
-    pub value: QueryTerm, // value should be able to contain the word derivations as well
+    pub value: QueryTerm,
    pub positions: RangeInclusive<i8>,
 }

 impl LocatedQueryTerm {
    pub fn is_empty(&self) -> bool {
        match &self.value {
-            QueryTerm::Phrase(_) => false,
+            QueryTerm::Phrase { phrase: _ } => false,
            QueryTerm::Word { derivations, .. } => derivations.is_empty(),
        }
    }
@ -156,6 +214,7 @@ impl LocatedQueryTerm {
    pub fn from_query(
        query: NormalizedTokenIter<Vec<u8>>,
        words_limit: Option<usize>,
+        // TODO: use index + txn + ? instead of closure
        derivations: impl Fn(&str, bool) -> Result<WordDerivations>,
    ) -> Result<Vec<LocatedQueryTerm>> {
        let mut primitive_query = Vec::new();
@ -232,7 +291,9 @@ impl LocatedQueryTerm {
                        && (quote_count > 0 || separator_kind == SeparatorKind::Hard)
                    {
                        let located_query_term = LocatedQueryTerm {
-                            value: QueryTerm::Phrase(mem::take(&mut phrase)),
+                            value: QueryTerm::Phrase {
+                                phrase: Phrase { words: mem::take(&mut phrase) },
+                            },
                            positions: phrase_start..=phrase_end,
                        };
                        primitive_query.push(located_query_term);
@ -245,7 +306,7 @@ impl LocatedQueryTerm {
        // If a quote is never closed, we consider all of the end of the query as a phrase.
        if !phrase.is_empty() {
            let located_query_term = LocatedQueryTerm {
-                value: QueryTerm::Phrase(mem::take(&mut phrase)),
+                value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } },
                positions: phrase_start..=phrase_end,
            };
            primitive_query.push(located_query_term);
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@ -5,9 +5,10 @@ use heed::{BytesDecode, RoTxn};
 use roaring::{MultiOps, RoaringBitmap};

 use super::db_cache::DatabaseCache;
-use super::query_term::{QueryTerm, WordDerivations};
-use super::QueryGraph;
-use crate::{Index, Result, RoaringBitmapCodec};
+use super::query_term::{Phrase, QueryTerm, WordDerivations};
+use super::{QueryGraph, QueryNode};
+
+use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};

 // TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.
 #[derive(Default)]
@ -27,33 +28,49 @@ impl NodeDocIdsCache {
            return Ok(&self.cache[&node_idx]);
        };
        let docids = match term {
-            QueryTerm::Phrase(_) => {
-                todo!("resolve phrase")
-            }
+            QueryTerm::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase)?,
            QueryTerm::Word {
                derivations:
-                    WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db },
+                    WordDerivations {
+                        original,
+                        zero_typo,
+                        one_typo,
+                        two_typos,
+                        use_prefix_db,
+                        synonyms,
+                        split_words,
+                    },
            } => {
-                let derivations_docids = {
-                    let mut or_docids = vec![];
-                    for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) {
-                        if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? {
-                            or_docids.push(word_docids);
-                        }
+                let mut or_docids = vec![];
+                for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) {
+                    if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? {
+                        or_docids.push(word_docids);
                    }
-                    if *use_prefix_db {
-                        if let Some(prefix_docids) =
-                            db_cache.get_prefix_docids(index, txn, original.as_str())?
-                        {
-                            or_docids.push(prefix_docids);
-                        }
+                }
+                if *use_prefix_db {
+                    if let Some(prefix_docids) =
+                        db_cache.get_prefix_docids(index, txn, original.as_str())?
+                    {
+                        or_docids.push(prefix_docids);
                    }
-                    or_docids
-                };
-                let derivations_iter = derivations_docids
+                }
+                let mut docids = or_docids
                    .into_iter()
-                    .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap());
-                MultiOps::union(derivations_iter)
+                    .map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap())
+                    .collect::<Vec<_>>();
+                for synonym in synonyms {
+                    // TODO: cache resolve_phrase?
+                    docids.push(resolve_phrase(index, txn, db_cache, synonym)?);
+                }
+                if let Some((left, right)) = split_words {
+                    if let Some(split_word_docids) =
+                        db_cache.get_word_pair_proximity_docids(index, txn, left, right, 1)?
+                    {
+                        docids.push(CboRoaringBitmapCodec::deserialize_from(split_word_docids)?);
+                    }
+                }
+
+                MultiOps::union(docids)
            }
        };
        let _ = self.cache.insert(node_idx, docids);
@ -90,19 +107,19 @@ pub fn resolve_query_graph<'transaction>(
        let predecessors_docids = MultiOps::union(predecessors_iter);

        let n = &q.nodes[node as usize];
-        // println!("resolving {node} {n:?}, predecessors: {predecessors:?}, their docids: {predecessors_docids:?}");
+
        let node_docids = match n {
-            super::QueryNode::Term(located_term) => {
+            QueryNode::Term(located_term) => {
                let term = &located_term.value;
                let derivations_docids =
                    node_docids_cache.get_docids(index, txn, db_cache, term, node)?;
                predecessors_docids & derivations_docids
            }
-            super::QueryNode::Deleted => {
+            QueryNode::Deleted => {
                panic!()
            }
-            super::QueryNode::Start => universe.clone(),
-            super::QueryNode::End => {
+            QueryNode::Start => universe.clone(),
+            QueryNode::End => {
                return Ok(predecessors_docids);
            }
        };
@ -125,3 +142,80 @@ pub fn resolve_query_graph<'transaction>(

    panic!()
 }
+
+/// Computes the set of documents matching `phrase`.
+///
+/// The phrase is scanned with a sliding window of at most three words. Inside each
+/// window, every ordered pair of present (`Some`) words is looked up in the
+/// word-pair-proximity database through `db_cache`:
+/// - for two consecutive slots, only proximity `1` is accepted;
+/// - for slots further apart, the docids of every proximity from `1` up to their
+///   distance in the window are unioned.
+///
+/// All the resulting bitmaps are intersected into the returned candidates.
+///
+/// Special cases:
+/// - an empty phrase yields an empty bitmap;
+/// - a one-word phrase has no pair to look up and is resolved from the word's own
+///   docids (an empty bitmap if the slot is `None` or the word is unknown);
+/// - as soon as one required pair has no document, an empty bitmap is returned.
+///
+/// Database and decoding errors are propagated to the caller.
+// NOTE(review): a window in which no word pair can be formed (e.g. a two-word
+// phrase with a `None` slot) contributes no bitmap; if that is true of every
+// window the result is empty — confirm this is the intended behaviour.
+pub fn resolve_phrase<'transaction>(
+    index: &Index,
+    txn: &'transaction RoTxn,
+    db_cache: &mut DatabaseCache<'transaction>,
+    phrase: &Phrase,
+) -> Result<RoaringBitmap> {
+    let Phrase { words } = phrase;
+    let mut candidates = RoaringBitmap::new();
+    let mut first_iter = true;
+    let winsize = words.len().min(3);
+
+    // `windows(0)` would panic, so the empty phrase must be handled up front.
+    if words.is_empty() {
+        return Ok(candidates);
+    }
+
+    // A one-word phrase has no word pair to look up: the window loop below would
+    // never produce a bitmap and the phrase would wrongly match no document.
+    // Resolve it from the word's own docids instead.
+    if let [single_word] = words.as_slice() {
+        let Some(word) = single_word else { return Ok(candidates) };
+        return Ok(match db_cache.get_word_docids(index, txn, word)? {
+            // Same decoding as the one used for word derivations.
+            Some(bytes) => RoaringBitmapCodec::bytes_decode(bytes).unwrap(),
+            None => candidates,
+        });
+    }
+
+    for win in words.windows(winsize) {
+        // Get all the documents with the matching distance for each word pair.
+        let mut bitmaps = Vec::with_capacity(winsize.pow(2));
+        for (offset, s1) in win
+            .iter()
+            .enumerate()
+            .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
+        {
+            // `enumerate` runs before `filter_map`, so `None` slots still count
+            // towards the distance between `s1` and `s2`.
+            for (dist, s2) in win
+                .iter()
+                .skip(offset + 1)
+                .enumerate()
+                .filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
+            {
+                if dist == 0 {
+                    match db_cache.get_word_pair_proximity_docids(index, txn, s1, s2, 1)? {
+                        Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?),
+                        // If there are no documents for this pair, there will be no
+                        // results for the phrase query.
+                        None => return Ok(RoaringBitmap::new()),
+                    }
+                } else {
+                    let mut bitmap = RoaringBitmap::new();
+                    for dist in 0..=dist {
+                        if let Some(m) = db_cache.get_word_pair_proximity_docids(
+                            index,
+                            txn,
+                            s1,
+                            s2,
+                            dist as u8 + 1,
+                        )? {
+                            bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?;
+                        }
+                    }
+                    if bitmap.is_empty() {
+                        return Ok(bitmap);
+                    } else {
+                        bitmaps.push(bitmap);
+                    }
+                }
+            }
+        }
+
+        // We sort the bitmaps so that we perform the small intersections first, which is faster.
+        bitmaps.sort_unstable_by_key(|a| a.len());
+
+        for bitmap in bitmaps {
+            if first_iter {
+                candidates = bitmap;
+                first_iter = false;
+            } else {
+                candidates &= bitmap;
+            }
+            // There will be no match, return early
+            if candidates.is_empty() {
+                break;
+            }
+        }
+    }
+    Ok(candidates)
+}
--- a/milli/src/search/new/words.rs
+++ b/milli/src/search/new/words.rs
@ -99,14 +99,17 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
        )?;

        let child_query_graph = query_graph.clone();
-        // TODO: Check whether a position exists in the graph before removing it and
-        // returning the next bucket.
-        // while graph.does_not_contain(positions_to_remove.last()) { positions_to_remove.pop() }
-        if self.positions_to_remove.is_empty() {
-            self.exhausted = true;
-        } else {
-            let position_to_remove = self.positions_to_remove.pop().unwrap();
-            query_graph.remove_words_at_position(position_to_remove);
+        loop {
+            if self.positions_to_remove.is_empty() {
+                self.exhausted = true;
+                break;
+            } else {
+                let position_to_remove = self.positions_to_remove.pop().unwrap();
+                let did_delete_any_node = query_graph.remove_words_at_position(position_to_remove);
+                if did_delete_any_node {
+                    break;
+                }
+            }
        }

        Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket }))