Add documentation

2025-02-20 17:45:54 +08:00 · 2023-03-08 13:26:29 +01:00 · 2023-03-08 13:26:29 +01:00 · c232cdabf5
commit c232cdabf5
parent 4e266211bf
10 changed files with 358 additions and 548 deletions
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@ -1,15 +1,21 @@
 use std::collections::hash_map::Entry;
+use std::hash::Hash;

 use fxhash::FxHashMap;
 use heed::types::ByteSlice;
+use heed::{BytesEncode, Database, RoTxn};

 use super::interner::Interned;
 use super::SearchContext;
 use crate::Result;

+/// A cache storing pointers to values in the LMDB databases.
+///
+/// Used for performance reasons only. By using this cache, we avoid performing a
+/// database lookup and instead get a direct reference to the value using a fast
+/// local HashMap lookup.
 #[derive(Default)]
 pub struct DatabaseCache<'search> {
-    // TODO: interner for all database cache keys?
    pub word_pair_proximity_docids:
        FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
    pub word_prefix_pair_proximity_docids:
@ -21,36 +27,50 @@ pub struct DatabaseCache<'search> {
    pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
 }
 impl<'search> SearchContext<'search> {
-    pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
-        let bitmap_ptr = match self.db_cache.word_docids.entry(word) {
+    fn get_value<'v, K1, KC>(
+        txn: &'search RoTxn,
+        cache_key: K1,
+        db_key: &'v KC::EItem,
+        cache: &mut FxHashMap<K1, Option<&'search [u8]>>,
+        db: Database<KC, ByteSlice>,
+    ) -> Result<Option<&'search [u8]>>
+    where
+        K1: Copy + Eq + Hash,
+        KC: BytesEncode<'v>,
+    {
+        let bitmap_ptr = match cache.entry(cache_key) {
            Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
            Entry::Vacant(entry) => {
-                let bitmap_ptr = self
-                    .index
-                    .word_docids
-                    .remap_data_type::<ByteSlice>()
-                    .get(self.txn, self.word_interner.get(word))?;
+                let bitmap_ptr = db.get(txn, db_key)?;
                entry.insert(bitmap_ptr);
                bitmap_ptr
            }
        };
        Ok(bitmap_ptr)
    }
-    pub fn get_prefix_docids(&mut self, prefix: Interned<String>) -> Result<Option<&'search [u8]>> {
-        // In the future, this will be a frozen roaring bitmap
-        let bitmap_ptr = match self.db_cache.word_prefix_docids.entry(prefix) {
-            Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
-            Entry::Vacant(entry) => {
-                let bitmap_ptr = self
-                    .index
-                    .word_prefix_docids
-                    .remap_data_type::<ByteSlice>()
-                    .get(self.txn, self.word_interner.get(prefix))?;
-                entry.insert(bitmap_ptr);
-                bitmap_ptr
-            }
-        };
-        Ok(bitmap_ptr)
+
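+    /// Retrieve the value associated with the given word in the `word_docids` database.
+    /// The cache is consulted first and is filled with the result of any database lookup.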
+    pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
+        Self::get_value(
+            self.txn,
+            word,
+            self.word_interner.get(word).as_str(),
+            &mut self.db_cache.word_docids,
+            self.index.word_docids.remap_data_type::<ByteSlice>(),
+        )
+    }
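+    /// Retrieve the value associated with the given prefix in the `word_prefix_docids` database.
+    /// The cache is consulted first and is filled with the result of any database lookup.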
+    pub fn get_word_prefix_docids(
+        &mut self,
+        prefix: Interned<String>,
+    ) -> Result<Option<&'search [u8]>> {
+        Self::get_value(
+            self.txn,
+            prefix,
+            self.word_interner.get(prefix).as_str(),
+            &mut self.db_cache.word_prefix_docids,
+            self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
+        )
    }

    pub fn get_word_pair_proximity_docids(
@ -59,40 +79,17 @@ impl<'search> SearchContext<'search> {
        word2: Interned<String>,
        proximity: u8,
    ) -> Result<Option<&'search [u8]>> {
-        let key = (proximity, word1, word2);
-        match self.db_cache.word_pair_proximity_docids.entry(key) {
-            Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
-            Entry::Vacant(entry) => {
-                // We shouldn't greedily access this DB at all
-                // a DB (w1, w2) -> [proximities] would be much better
-                // We could even have a DB that is (w1) -> set of words such that (w1, w2) are in proximity
-                // And if we worked with words encoded as integers, the set of words could be a roaring bitmap
-                // Then, to find all the proximities between two list of words, we'd do:
-
-                // inputs:
-                //    - words1 (roaring bitmap)
-                //    - words2 (roaring bitmap)
-                // output:
-                //    - [(word1, word2, [proximities])]
-                // algo:
-                //  let mut ouput = vec![];
-                //  for word1 in words1 {
-                //      let all_words_in_proximity_of_w1 = pair_words_db.get(word1);
-                //      let words_in_proximity_of_w1 = all_words_in_proximity_of_w1 & words2;
-                //      for word2 in words_in_proximity_of_w1 {
-                //          let proximties = prox_db.get(word1, word2);
-                //          output.push(word1, word2, proximities);
-                //      }
-                //  }
-                let bitmap_ptr =
-                    self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().get(
-                        self.txn,
-                        &(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
-                    )?;
-                entry.insert(bitmap_ptr);
-                Ok(bitmap_ptr)
-            }
-        }
+        Self::get_value(
+            self.txn,
+            (proximity, word1, word2),
+            &(
+                proximity,
+                self.word_interner.get(word1).as_str(),
+                self.word_interner.get(word2).as_str(),
+            ),
+            &mut self.db_cache.word_pair_proximity_docids,
+            self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
+        )
    }

    pub fn get_word_prefix_pair_proximity_docids(
@ -101,22 +98,17 @@ impl<'search> SearchContext<'search> {
        prefix2: Interned<String>,
        proximity: u8,
    ) -> Result<Option<&'search [u8]>> {
-        let key = (proximity, word1, prefix2);
-        match self.db_cache.word_prefix_pair_proximity_docids.entry(key) {
-            Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
-            Entry::Vacant(entry) => {
-                let bitmap_ptr = self
-                    .index
-                    .word_prefix_pair_proximity_docids
-                    .remap_data_type::<ByteSlice>()
-                    .get(
-                        self.txn,
-                        &(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
-                    )?;
-                entry.insert(bitmap_ptr);
-                Ok(bitmap_ptr)
-            }
-        }
+        Self::get_value(
+            self.txn,
+            (proximity, word1, prefix2),
+            &(
+                proximity,
+                self.word_interner.get(word1).as_str(),
+                self.word_interner.get(prefix2).as_str(),
+            ),
+            &mut self.db_cache.word_prefix_pair_proximity_docids,
+            self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
+        )
    }
    pub fn get_prefix_word_pair_proximity_docids(
        &mut self,
@ -124,25 +116,16 @@ impl<'search> SearchContext<'search> {
        right: Interned<String>,
        proximity: u8,
    ) -> Result<Option<&'search [u8]>> {
-        let key = (proximity, left_prefix, right);
-        match self.db_cache.prefix_word_pair_proximity_docids.entry(key) {
-            Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
-            Entry::Vacant(entry) => {
-                let bitmap_ptr = self
-                    .index
-                    .prefix_word_pair_proximity_docids
-                    .remap_data_type::<ByteSlice>()
-                    .get(
-                        self.txn,
-                        &(
-                            proximity,
-                            self.word_interner.get(left_prefix),
-                            self.word_interner.get(right),
-                        ),
-                    )?;
-                entry.insert(bitmap_ptr);
-                Ok(bitmap_ptr)
-            }
-        }
+        Self::get_value(
+            self.txn,
+            (proximity, left_prefix, right),
+            &(
+                proximity,
+                self.word_interner.get(left_prefix).as_str(),
+                self.word_interner.get(right).as_str(),
+            ),
+            &mut self.db_cache.prefix_word_pair_proximity_docids,
+            self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
+        )
    }
 }
--- a/milli/src/search/new/graph_based_ranking_rule.rs
+++ b/milli/src/search/new/graph_based_ranking_rule.rs
@ -1,3 +1,41 @@
+/*! Implementation of a generic graph-based ranking rule.
+
+A graph-based ranking rule is a ranking rule that works by representing
+its possible operations and their relevancy cost as a directed acyclic multi-graph
+built on top of the query graph. It then computes its buckets by finding the
+cheapest paths from the start node to the end node and computing the document ids
+that satisfy those paths.
+
+For example, the proximity ranking rule builds a graph where the edges between two
+nodes represent a condition that the term of the source node is in a certain proximity
+to the term of the destination node. With the query "pretty house by" where the term
+"pretty" has three possible proximities to the term "house" and "house" has two
+proximities to "by", the graph will look like this:
+
+```txt
+┌───────┐     ┌───────┐─────1────▶┌───────┐──1──▶┌─────┐    ┌───────┐
+│ START │──0─▶│pretty │─────2────▶│ house │      │ by  │─0─▶│  END  │
+└───────┘     └───────┘─────3────▶└───────┘──2──▶└─────┘    └───────┘
+```
+The proximity ranking rule's first bucket will be determined by the union of all
+the shortest paths from START to END, which in this case is:
+```txt
+START --0--> pretty --1--> house --1--> by --0--> END
+```
+The path's corresponding document ids are found by taking the intersection of the
+document ids of each edge. That is, we find the documents where both `pretty` is
+1-close to `house` AND `house` is 1-close to `by`.
+
+For the second bucket, we get the union of the second-cheapest paths, which are:
+```txt
+START --0--> pretty --1--> house --2--> by --0--> END
+START --0--> pretty --2--> house --1--> by --0--> END
+```
+That is, we find the documents where either:
+- `pretty` is 1-close to `house` AND `house` is 2-close to `by`
+- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by`
+*/
+
 use roaring::RoaringBitmap;

 use super::logger::SearchLogger;
@ -8,24 +46,38 @@ use super::small_bitmap::SmallBitmap;
 use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
 use crate::Result;

+/// A generic graph-based ranking rule
 pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
    id: String,
+    // When the ranking rule is not iterating over its buckets,
+    // its state is `None`.
    state: Option<GraphBasedRankingRuleState<G>>,
 }
 impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
+    /// Creates the ranking rule with the given identifier
    pub fn new(id: String) -> Self {
        Self { id, state: None }
    }
 }

+/// The internal state of a graph-based ranking rule during iteration
 pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
+    /// The current graph
    graph: RankingRuleGraph<G>,
+    /// Cache to retrieve the docids associated with each edge
    edge_docids_cache: EdgeDocidsCache<G>,
+    /// Cache used to optimistically discard paths that resolve to no documents.
    empty_paths_cache: EmptyPathsCache,
+    /// A structure giving the list of possible costs from each node to the end node,
+    /// along with a set of unavoidable edges that must be traversed to achieve that distance.
    all_distances: Vec<Vec<(u16, SmallBitmap)>>,
+    /// An index into the root node's entry of `all_distances`, giving the cost of the next bucket
    cur_distance_idx: usize,
 }

+/// Traverse each edge of the graph, compute its associated document ids,
+/// and remove the edge from the graph if its docids are disjoint with the
+/// given universe.
 fn remove_empty_edges<'search, G: RankingRuleGraphTrait>(
    ctx: &mut SearchContext<'search>,
    graph: &mut RankingRuleGraph<G>,
@ -70,6 +122,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
        let mut edge_docids_cache = EdgeDocidsCache::default();
        let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len() as u16);

+        // First simplify the graph as much as possible, by computing the docids of the edges
+        // within the rule's universe and removing the edges that have no associated docids.
        remove_empty_edges(
            ctx,
            &mut graph,
@ -77,6 +131,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
            universe,
            &mut empty_paths_cache,
        )?;
+
+        // Then pre-compute the cost of all paths from each node to the end node
        let all_distances = graph.initialize_distances_with_necessary_edges();

        let state = GraphBasedRankingRuleState {
@ -98,9 +154,14 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
        logger: &mut dyn SearchLogger<QueryGraph>,
        universe: &RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
+        // If universe.len() <= 1, the bucket sort algorithm
+        // should not have called this function.
        assert!(universe.len() > 1);
+        // Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`,
+        // should never happen
        let mut state = self.state.take().unwrap();

+        // TODO: does this have a real positive performance impact?
        remove_empty_edges(
            ctx,
            &mut state.graph,
@ -109,12 +170,16 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
            &mut state.empty_paths_cache,
        )?;

+        // If the cur_distance_idx does not point to a valid cost in the `all_distances`
+        // structure, then we have computed all the buckets and can return.
        if state.cur_distance_idx
            >= state.all_distances[state.graph.query_graph.root_node as usize].len()
        {
            self.state = None;
            return Ok(None);
        }
+
+        // Retrieve the cost of the paths to compute
        let (cost, _) =
            state.all_distances[state.graph.query_graph.root_node as usize][state.cur_distance_idx];
        state.cur_distance_idx += 1;
@ -129,22 +194,38 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
            cur_distance_idx: _,
        } = &mut state;

-        let mut paths = vec![];
        let original_universe = universe;
        let mut universe = universe.clone();

        // TODO: remove this unnecessary clone
        let original_graph = graph.clone();
+        // and this vector as well
+        let mut paths = vec![];
+
+        // For each path of the given cost, we will compute its associated
+        // document ids.
+        // In case the path does not resolve to any document id, we try to figure out why
+        // and update the `empty_paths_cache` accordingly.
+        // For example, it may be that the path is empty because one of its edges is disjoint
+        // with the universe, or because a prefix of the path is disjoint with the universe, or because
+        // the path contains two edges that are disjoint from each other within the universe.
+        // Updating the empty_paths_cache helps speed up the execution of `visit_paths_of_cost` and reduces
+        // the number of future candidate paths given by that same function.
        graph.visit_paths_of_cost(
            graph.query_graph.root_node as usize,
            cost,
            all_distances,
            empty_paths_cache,
            |path, graph, empty_paths_cache| {
+                // Accumulate the path for logging purposes only
                paths.push(path.to_vec());
                let mut path_docids = universe.clone();
+
+                // We store the edges and their docids in vectors in case the path turns out to be
+                // empty and we need to figure out why it was empty.
                let mut visited_edges = vec![];
                let mut cached_edge_docids = vec![];
+
                for &edge_index in path {
                    visited_edges.push(edge_index);
                    let edge_docids =
@ -154,21 +235,29 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
                        BitmapOrAllRef::All => continue,
                    };
                    cached_edge_docids.push((edge_index, edge_docids.clone()));
+
+                    // If the edge is empty, then the path will be empty as well, we update the graph
+                    // and caches accordingly and skip to the next candidate path.
                    if edge_docids.is_disjoint(&universe) {
                        // 1. Store in the cache that this edge is empty for this universe
                        empty_paths_cache.forbid_edge(edge_index);
                        // 2. remove this edge from the ranking rule graph
                        graph.remove_edge(edge_index);
+                        // 3. Also remove the entry from the edge_docids_cache, since we don't need it anymore
                        edge_docids_cache.cache.remove(&edge_index);
                        return Ok(());
                    }
                    path_docids &= edge_docids;

+                    // If the (sub)path is empty, we try to figure out why and update the caches accordingly.
                    if path_docids.is_disjoint(&universe) {
-                        // empty_paths_cache.forbid_prefix(&visited_edges);
-                        // if the intersection between this edge and any
+                        // First, we know that this path is empty, and thus any path
+                        // that is a superset of it will also be empty.
+                        empty_paths_cache.forbid_prefix(&visited_edges);
+                        // Second, if the intersection between this edge and any
                        // previous one is disjoint with the universe,
-                        // then we add these two edges to the empty_path_cache
+                        // then we also know that any path containing the same couple of
+                        // edges will also be empty.
                        for (edge_index2, edge_docids2) in
                            cached_edge_docids[..cached_edge_docids.len() - 1].iter()
                        {
@ -181,6 +270,7 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
                    }
                }
                bucket |= &path_docids;
+                // Reduce the size of the universe so that we can more optimistically discard candidate paths
                universe -= path_docids;
                Ok(())
            },
@ -196,6 +286,8 @@ impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
            logger,
        );

+        // TODO: Graph-based ranking rules do not (yet) modify the query graph. We could, however,
+        // remove nodes and/or terms within nodes that weren't present in any of the paths.
        let next_query_graph = state.graph.query_graph.clone();

        self.state = Some(state);
--- a/milli/src/search/new/interner.rs
+++ b/milli/src/search/new/interner.rs
@ -3,6 +3,7 @@ use std::marker::PhantomData;

 use fxhash::FxHashMap;

+/// An index within an [`Interner<T>`] structure.
 pub struct Interned<T> {
    idx: u32,
    _phantom: PhantomData<T>,
@ -13,7 +14,10 @@ impl<T> Interned<T> {
        Self { idx, _phantom: PhantomData }
    }
 }
-
+/// An [`Interner`] is used to store a unique copy of a value of type `T`. This value
+/// is then identified by a lightweight index of type [`Interned<T>`], which can
+/// be copied, compared, and hashed efficiently. An immutable reference to the original value
+/// can be retrieved using `self.get(interned)`.
 pub struct Interner<T> {
    stable_store: Vec<T>,
    lookup: FxHashMap<T, Interned<T>>,
--- a/milli/src/search/new/logger/mod.rs
+++ b/milli/src/search/new/logger/mod.rs
@ -7,7 +7,82 @@ use super::ranking_rule_graph::{EmptyPathsCache, ProximityGraph, RankingRuleGrap
 use super::small_bitmap::SmallBitmap;
 use super::{RankingRule, RankingRuleQueryTrait};

+/// Trait for structures that log the execution of a search query.
+pub trait SearchLogger<Q: RankingRuleQueryTrait> {
+    /// Logs the initial query
+    fn initial_query(&mut self, query: &Q);
+
+    /// Logs the query that was used to compute the set of all candidates
+    fn query_for_universe(&mut self, query: &Q);
+
+    /// Logs the value of the initial set of all candidates
+    fn initial_universe(&mut self, universe: &RoaringBitmap);
+
+    /// Logs the ranking rules used to perform the search query
+    fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule<Q>]);
+
+    /// Logs the start of a ranking rule's iteration.
+    fn start_iteration_ranking_rule<'transaction>(
+        &mut self,
+        ranking_rule_idx: usize,
+        ranking_rule: &dyn RankingRule<'transaction, Q>,
+        query: &Q,
+        universe: &RoaringBitmap,
+    );
+    /// Logs the end of the computation of a ranking rule bucket
+    fn next_bucket_ranking_rule<'transaction>(
+        &mut self,
+        ranking_rule_idx: usize,
+        ranking_rule: &dyn RankingRule<'transaction, Q>,
+        universe: &RoaringBitmap,
+        candidates: &RoaringBitmap,
+    );
+    /// Logs the skipping of a ranking rule bucket
+    fn skip_bucket_ranking_rule<'transaction>(
+        &mut self,
+        ranking_rule_idx: usize,
+        ranking_rule: &dyn RankingRule<'transaction, Q>,
+        candidates: &RoaringBitmap,
+    );
+    /// Logs the end of a ranking rule's iteration.
+    fn end_iteration_ranking_rule<'transaction>(
+        &mut self,
+        ranking_rule_idx: usize,
+        ranking_rule: &dyn RankingRule<'transaction, Q>,
+        universe: &RoaringBitmap,
+    );
+    /// Logs the addition of document ids to the final results
+    fn add_to_results(&mut self, docids: &[u32]);
+
+    /// Logs the internal state of the words ranking rule
+    fn log_words_state(&mut self, query_graph: &Q);
+
+    /// Logs the internal state of the proximity ranking rule
+    fn log_proximity_state(
+        &mut self,
+        query_graph: &RankingRuleGraph<ProximityGraph>,
+        paths: &[Vec<u16>],
+        empty_paths_cache: &EmptyPathsCache,
+        universe: &RoaringBitmap,
+        distances: Vec<Vec<(u16, SmallBitmap)>>,
+        cost: u16,
+    );
+
+    /// Logs the internal state of the typo ranking rule
+    fn log_typo_state(
+        &mut self,
+        query_graph: &RankingRuleGraph<TypoGraph>,
+        paths: &[Vec<u16>],
+        empty_paths_cache: &EmptyPathsCache,
+        universe: &RoaringBitmap,
+        distances: Vec<Vec<(u16, SmallBitmap)>>,
+        cost: u16,
+    );
+}
+
+/// A dummy [`SearchLogger`] which does nothing.
 pub struct DefaultSearchLogger;
+
 impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
    fn initial_query(&mut self, _query: &Q) {}

@ -76,63 +151,3 @@ impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
    ) {
    }
 }
-
-pub trait SearchLogger<Q: RankingRuleQueryTrait> {
-    fn initial_query(&mut self, query: &Q);
-
-    fn query_for_universe(&mut self, query: &Q);
-
-    fn initial_universe(&mut self, universe: &RoaringBitmap);
-
-    fn ranking_rules(&mut self, rr: &[&mut dyn RankingRule<Q>]);
-
-    fn start_iteration_ranking_rule<'transaction>(
-        &mut self,
-        ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, Q>,
-        query: &Q,
-        universe: &RoaringBitmap,
-    );
-    fn next_bucket_ranking_rule<'transaction>(
-        &mut self,
-        ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, Q>,
-        universe: &RoaringBitmap,
-        candidates: &RoaringBitmap,
-    );
-    fn skip_bucket_ranking_rule<'transaction>(
-        &mut self,
-        ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, Q>,
-        candidates: &RoaringBitmap,
-    );
-    fn end_iteration_ranking_rule<'transaction>(
-        &mut self,
-        ranking_rule_idx: usize,
-        ranking_rule: &dyn RankingRule<'transaction, Q>,
-        universe: &RoaringBitmap,
-    );
-    fn add_to_results(&mut self, docids: &[u32]);
-
-    fn log_words_state(&mut self, query_graph: &Q);
-
-    fn log_proximity_state(
-        &mut self,
-        query_graph: &RankingRuleGraph<ProximityGraph>,
-        paths: &[Vec<u16>],
-        empty_paths_cache: &EmptyPathsCache,
-        universe: &RoaringBitmap,
-        distances: Vec<Vec<(u16, SmallBitmap)>>,
-        cost: u16,
-    );
-
-    fn log_typo_state(
-        &mut self,
-        query_graph: &RankingRuleGraph<TypoGraph>,
-        paths: &[Vec<u16>],
-        empty_paths_cache: &EmptyPathsCache,
-        universe: &RoaringBitmap,
-        distances: Vec<Vec<(u16, SmallBitmap)>>,
-        cost: u16,
-    );
-}
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -88,7 +88,7 @@ fn resolve_maximally_reduced_query_graph<'search>(
            break;
        } else {
            let position_to_remove = positions_to_remove.pop().unwrap();
-            let _ = graph.remove_words_at_position(position_to_remove);
+            let _ = graph.remove_words_starting_at_position(position_to_remove);
        }
    }
    logger.query_for_universe(&graph);
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@ -3,6 +3,17 @@ use super::small_bitmap::SmallBitmap;
 use super::SearchContext;
 use crate::Result;

+const QUERY_GRAPH_NODE_LENGTH_LIMIT: u16 = 64;
+
+/// A node of the [`QueryGraph`].
+///
+/// There are four types of nodes:
+/// 1. `Start` : unique, represents the start of the query
+/// 2. `End` : unique, represents the end of a query
+/// 3. `Deleted` : represents a node that was deleted.
+/// All deleted nodes are unreachable from the start node.
+/// 4. `Term` is a regular node representing a word or combination of words
+/// from the user query.
 #[derive(Clone)]
 pub enum QueryNode {
    Term(LocatedQueryTerm),
@ -11,34 +22,84 @@ pub enum QueryNode {
    End,
 }

+/// The edges associated with a node in the query graph.
 #[derive(Clone)]
 pub struct Edges {
-    // TODO: use a tiny bitset instead, something like a simple Vec<u8> where most queries will see a vector of one element
+    /// Set of nodes which have an edge going to the current node
    pub predecessors: SmallBitmap,
+    /// Set of nodes which are reached by an edge from the current node
    pub successors: SmallBitmap,
 }

+/**
+A graph representing all the ways to interpret the user's search query.
+
+## Important
+At the moment, a query graph has a hardcoded limit of [`QUERY_GRAPH_NODE_LENGTH_LIMIT`] nodes.
+
+## Example 1
+For the search query `sunflower`, we need to register the following things:
+- we need to look for the exact word `sunflower`
+- but also any word which is 1 or 2 typos apart from `sunflower`
+- and every word that contains the prefix `sunflower`
+- and also the couple of adjacent words `sun flower`
+- as well as all the user-defined synonyms of `sunflower`
+
+All these derivations of a word will be stored in [`WordDerivations`].
+
+## Example 2
+For the search query `summer house by`, we also look for all word derivations
+of each term. In addition, we need to consider the potential n-grams
+`summerhouse`, `summerhouseby`, and `houseby`.
+Furthermore, we need to know which words these ngrams replace. This is done by creating the
+following graph, where each node also contains a list of derivations:
+```txt
+                        ┌───────┐
+                      ┌─│houseby│─────────┐
+                      │ └───────┘         │
+┌───────┐   ┌───────┐ │ ┌───────┐  ┌────┐ │ ┌───────┐
+│ START │─┬─│summer │─┴─│ house │┌─│ by │─┼─│  END  │
+└───────┘ │ └───────┘   └───────┘│ └────┘ │ └───────┘
+          │ ┌────────────┐       │        │
+          ├─│summerhouse │───────┘        │
+          │ └────────────┘                │
+          │         ┌─────────────┐       │
+          └─────────│summerhouseby│───────┘
+                    └─────────────┘
+```
+Note also that each node has a range of positions associated with it,
+such that `summer` is known to be a word at the positions `0..=0` and `houseby`
+is registered with the positions `1..=2`. When two nodes are connected by an edge,
+it means that they are potentially next to each other in the user's search query
+(depending on the [`TermsMatchingStrategy`](crate::search::TermsMatchingStrategy)
+and the transformations that were done on the query graph).
+*/
 #[derive(Clone)]
 pub struct QueryGraph {
+    /// The index of the start node within `self.nodes`
    pub root_node: u16,
+    /// The index of the end node within `self.nodes`
    pub end_node: u16,
+    /// The list of all query nodes
    pub nodes: Vec<QueryNode>,
+    /// The list of all node edges
    pub edges: Vec<Edges>,
 }

-fn _assert_sizes() {
-    // TODO: QueryNodes are too big now, 88B is a bit too big
-    let _: [u8; 88] = [0; std::mem::size_of::<QueryNode>()];
-    let _: [u8; 32] = [0; std::mem::size_of::<Edges>()];
-}
-
 impl Default for QueryGraph {
    /// Create a new QueryGraph with two disconnected nodes: the root and end nodes.
    fn default() -> Self {
        let nodes = vec![QueryNode::Start, QueryNode::End];
        let edges = vec![
-            Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) },
-            Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) },
+            Edges {
+                predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+                successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+            },
+            Edges {
+                predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+                successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+            },
        ];

        Self { root_node: 0, end_node: 1, nodes, edges }
@ -46,33 +107,31 @@ impl Default for QueryGraph {
 }

 impl QueryGraph {
+    /// Connect all the given predecessor nodes to the given successor node
     fn connect_to_node(&mut self, from_nodes: &[u16], to_node: u16) {
         for &from_node in from_nodes {
             self.edges[from_node as usize].successors.insert(to_node);
             self.edges[to_node as usize].predecessors.insert(from_node);
         }
     }
+    /// Add the given node to the graph and connect it to all the given predecessor nodes.
+    ///
+    /// Returns the index of the new node within `self.nodes`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the graph already contains `QUERY_GRAPH_NODE_LENGTH_LIMIT` nodes.
     fn add_node(&mut self, from_nodes: &[u16], node: QueryNode) -> u16 {
         let new_node_idx = self.nodes.len() as u16;
+        // Every edge bitmap is created with `SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT)`,
+        // so a node index must be strictly below the limit to be representable in it.
+        // NOTE(review): assumes the bitmap universe is `0..LIMIT` (exclusive) — confirm
+        // against `SmallBitmap`.
+        assert!(new_node_idx < QUERY_GRAPH_NODE_LENGTH_LIMIT);
         self.nodes.push(node);
         self.edges.push(Edges {
-            predecessors: SmallBitmap::from_array(from_nodes, 64),
-            successors: SmallBitmap::new(64),
+            predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+            successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
         });
-        for from_node in from_nodes {
-            self.edges[*from_node as usize].successors.insert(new_node_idx);
-        }
+        self.connect_to_node(from_nodes, new_node_idx);
+
         new_node_idx
     }
 }

 impl QueryGraph {
-    // TODO: return the list of all matching words here as well
+    /// Build the query graph from the parsed user search query.
    pub fn from_query(ctx: &mut SearchContext, terms: Vec<LocatedQueryTerm>) -> Result<QueryGraph> {
-        // TODO: maybe empty nodes should not be removed here, to compute
-        // the score of the `words` ranking rule correctly
-        // it is very easy to traverse the graph and remove afterwards anyway
-        // Still, I'm keeping this here as a demo
        let mut empty_nodes = vec![];

        let word_set = ctx.index.words_fst(ctx.txn)?;
@ -81,7 +140,6 @@ impl QueryGraph {
        let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
            (vec![], vec![], vec![graph.root_node]);

-        // TODO: split words / synonyms
        for length in 1..=terms.len() {
            let query = &terms[..length];

@ -156,6 +214,8 @@ impl QueryGraph {

        Ok(graph)
    }
+
+    /// Remove the given nodes and all their edges from the query graph.
    pub fn remove_nodes(&mut self, nodes: &[u16]) {
        for &node in nodes {
            self.nodes[node as usize] = QueryNode::Deleted;
@ -166,10 +226,13 @@ impl QueryGraph {
            for succ in edges.successors.iter() {
                self.edges[succ as usize].predecessors.remove(node);
            }
-            self.edges[node as usize] =
-                Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) };
+            self.edges[node as usize] = Edges {
+                predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+                successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+            };
        }
    }
+    /// Remove the given nodes, connecting all their predecessors to all their successors.
    pub fn remove_nodes_keep_edges(&mut self, nodes: &[u16]) {
        for &node in nodes {
            self.nodes[node as usize] = QueryNode::Deleted;
@ -182,11 +245,17 @@ impl QueryGraph {
                self.edges[succ as usize].predecessors.remove(node);
                self.edges[succ as usize].predecessors.union(&edges.predecessors);
            }
-            self.edges[node as usize] =
-                Edges { predecessors: SmallBitmap::new(64), successors: SmallBitmap::new(64) };
+            self.edges[node as usize] = Edges {
+                predecessors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+                successors: SmallBitmap::new(QUERY_GRAPH_NODE_LENGTH_LIMIT),
+            };
        }
    }
-    pub fn remove_words_at_position(&mut self, position: i8) -> bool {
+
+    /// Remove all the nodes that correspond to a word starting at the given position, and connect
+    /// the predecessors of these nodes to their successors.
+    /// Return `true` if any node was removed.
+    pub fn remove_words_starting_at_position(&mut self, position: i8) -> bool {
        let mut nodes_to_remove_keeping_edges = vec![];
        for (node_idx, node) in self.nodes.iter().enumerate() {
            let node_idx = node_idx as u16;
@ -202,14 +271,15 @@ impl QueryGraph {
        !nodes_to_remove_keeping_edges.is_empty()
    }

+    /// Simplify the query graph by removing all nodes that are disconnected from
+    /// the start or end nodes.
     fn simplify(&mut self) {
         loop {
             let mut nodes_to_remove = vec![];
             for (node_idx, node) in self.nodes.iter().enumerate() {
-                if (!matches!(node, QueryNode::End | QueryNode::Deleted)
-                    && self.edges[node_idx].successors.is_empty())
-                    || (!matches!(node, QueryNode::Start | QueryNode::Deleted)
-                        && self.edges[node_idx].predecessors.is_empty())
+                // The start node is created without predecessors and the end node
+                // without successors, so each one is exempt from exactly one of the
+                // two checks. Testing both conditions on the start node would remove
+                // it, and then every node reachable from it.
+                let node_edges = &self.edges[node_idx];
+                if (!matches!(node, QueryNode::End | QueryNode::Deleted)
+                    && node_edges.successors.is_empty())
+                    || (!matches!(node, QueryNode::Start | QueryNode::Deleted)
+                        && node_edges.predecessors.is_empty())
                 {
                     nodes_to_remove.push(node_idx as u16);
                 }
--- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@ -53,7 +53,7 @@ impl RankingRuleGraphTrait for TypoGraph {
                    docids |= bitmap;
                }
                if *nbr_typos == 0 {
-                    if let Some(bytes) = ctx.get_prefix_docids(derivations.original)? {
+                    if let Some(bytes) = ctx.get_word_prefix_docids(derivations.original)? {
                        // TODO: deserialize bitmap within a universe
                        let bitmap = universe
                            & RoaringBitmapCodec::bytes_decode(bytes)
--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@ -114,7 +114,7 @@ pub fn apply_ranking_rules<'search>(
    logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe);
    ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?;

-    let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
+    let mut candidates: Vec<RoaringBitmap> = vec![RoaringBitmap::default(); ranking_rules_len];
    candidates[0] = universe.clone();

    let mut cur_ranking_rule_index = 0;
@ -174,7 +174,7 @@ pub fn apply_ranking_rules<'search>(
                    }
                } else {
                    let candidates =
-                        candidates.iter().take(length - results.len()).collect::<Vec<_>>();
+                        candidates.iter().take(length - results.len()).collect::<Vec<u32>>();
                    logger.add_to_results(&candidates);
                    results.extend(&candidates);
                }
@ -234,358 +234,3 @@ pub fn apply_ranking_rules<'search>(

    Ok(results)
 }
-
-#[cfg(test)]
-mod tests {
-    // use crate::allocator::ALLOC;
-    use std::fs::File;
-    use std::io::{BufRead, BufReader, Cursor, Seek};
-    use std::time::Instant;
-
-    use big_s::S;
-    use heed::EnvOpenOptions;
-    use maplit::hashset;
-
-    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-    // use crate::search::new::logger::detailed::DetailedSearchLogger;
-    use crate::search::new::logger::DefaultSearchLogger;
-    use crate::search::new::{execute_search, SearchContext};
-    use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
-    use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy};
-
-    #[test]
-    fn search_wiki_new() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_wiki").unwrap();
-        let txn = index.read_txn().unwrap();
-
-        println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len());
-
-        // loop {
-        let start = Instant::now();
-
-        // let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
-        let mut ctx = SearchContext::new(&index, &txn);
-        let results = execute_search(
-            &mut ctx,
-            "which a the releases from poison by the government",
-            None,
-            0,
-            20,
-            &mut DefaultSearchLogger,
-            // &mut logger,
-        )
-        .unwrap();
-
-        // logger.write_d2_description(&mut ctx);
-
-        let elapsed = start.elapsed();
-        println!("{}us", elapsed.as_micros());
-
-        let _documents = index
-            .documents(&txn, results.iter().copied())
-            .unwrap()
-            .into_iter()
-            .map(|(id, obkv)| {
-                let mut object = serde_json::Map::default();
-                for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
-                    let value = obkv.get(fid).unwrap();
-                    let value: serde_json::Value = serde_json::from_slice(value).unwrap();
-                    object.insert(fid_name.to_owned(), value);
-                }
-                (id, serde_json::to_string_pretty(&object).unwrap())
-            })
-            .collect::<Vec<_>>();
-
-        println!("{}us: {:?}", elapsed.as_micros(), results);
-        // }
-        // for (id, _document) in documents {
-        //     println!("{id}:");
-        //     // println!("{document}");
-        // }
-    }
-
-    #[test]
-    fn search_wiki_old() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_wiki").unwrap();
-
-        let txn = index.read_txn().unwrap();
-
-        let rr = index.criteria(&txn).unwrap();
-        println!("{rr:?}");
-
-        let start = Instant::now();
-
-        let mut s = Search::new(&txn, &index);
-        s.query("which a the releases from poison by the government");
-        s.terms_matching_strategy(TermsMatchingStrategy::Last);
-        s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
-        let docs = s.execute().unwrap();
-
-        let elapsed = start.elapsed();
-
-        let documents = index
-            .documents(&txn, docs.documents_ids.iter().copied())
-            .unwrap()
-            .into_iter()
-            .map(|(id, obkv)| {
-                let mut object = serde_json::Map::default();
-                for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
-                    let value = obkv.get(fid).unwrap();
-                    let value: serde_json::Value = serde_json::from_slice(value).unwrap();
-                    object.insert(fid_name.to_owned(), value);
-                }
-                (id, serde_json::to_string_pretty(&object).unwrap())
-            })
-            .collect::<Vec<_>>();
-
-        println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
-        for (id, _document) in documents {
-            println!("{id}:");
-            // println!("{document}");
-        }
-    }
-    #[test]
-    fn search_movies_new() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-        let txn = index.read_txn().unwrap();
-
-        // let primary_key = index.primary_key(&txn).unwrap().unwrap();
-        // let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
-        // loop {
-        let start = Instant::now();
-
-        let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
-        let mut ctx = SearchContext::new(&index, &txn);
-        let results = execute_search(
-            &mut ctx,
-            "releases from poison by the government",
-            None,
-            0,
-            20,
-            // &mut DefaultSearchLogger,
-            &mut logger,
-        )
-        .unwrap();
-
-        logger.write_d2_description(&mut ctx);
-
-        let elapsed = start.elapsed();
-
-        // let ids = index
-        //     .documents(&txn, results.iter().copied())
-        //     .unwrap()
-        //     .into_iter()
-        //     .map(|x| {
-        //         let obkv = &x.1;
-        //         let id = obkv.get(primary_key).unwrap();
-        //         let id: serde_json::Value = serde_json::from_slice(id).unwrap();
-        //         id.as_str().unwrap().to_owned()
-        //     })
-        //     .collect::<Vec<_>>();
-
-        println!("{}us: {results:?}", elapsed.as_micros());
-        // println!("external ids: {ids:?}");
-        // }
-    }
-
-    #[test]
-    fn search_movies_old() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-
-        let txn = index.read_txn().unwrap();
-
-        let rr = index.criteria(&txn).unwrap();
-        println!("{rr:?}");
-
-        let primary_key = index.primary_key(&txn).unwrap().unwrap();
-        let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
-
-        let start = Instant::now();
-
-        let mut s = Search::new(&txn, &index);
-        s.query("which a the releases from poison by the government");
-        s.terms_matching_strategy(TermsMatchingStrategy::Last);
-        s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
-        let docs = s.execute().unwrap();
-
-        let elapsed = start.elapsed();
-
-        let ids = index
-            .documents(&txn, docs.documents_ids.iter().copied())
-            .unwrap()
-            .into_iter()
-            .map(|x| {
-                let obkv = &x.1;
-                let id = obkv.get(primary_key).unwrap();
-                let id: serde_json::Value = serde_json::from_slice(id).unwrap();
-                id.as_str().unwrap().to_owned()
-            })
-            .collect::<Vec<_>>();
-
-        println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
-        println!("external ids: {ids:?}");
-    }
-
-    #[test]
-    fn _settings_movies() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-        let mut wtxn = index.write_txn().unwrap();
-
-        let config = IndexerConfig::default();
-        let mut builder = Settings::new(&mut wtxn, &index, &config);
-
-        builder.set_min_word_len_one_typo(5);
-        builder.set_min_word_len_two_typos(100);
-        builder.set_sortable_fields(hashset! { S("release_date") });
-        builder.set_criteria(vec![
-            Criterion::Words,
-            Criterion::Typo,
-            Criterion::Proximity,
-            Criterion::Asc("release_date".to_owned()),
-        ]);
-
-        builder.execute(|_| (), || false).unwrap();
-        wtxn.commit().unwrap();
-    }
-
-    #[test]
-    fn _index_movies() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-        let mut wtxn = index.write_txn().unwrap();
-
-        let primary_key = "id";
-        let searchable_fields = vec!["title", "overview"];
-        let filterable_fields = vec!["release_date", "genres"];
-
-        let config = IndexerConfig::default();
-        let mut builder = Settings::new(&mut wtxn, &index, &config);
-        builder.set_primary_key(primary_key.to_owned());
-        let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
-        builder.set_searchable_fields(searchable_fields);
-        let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
-        builder.set_filterable_fields(filterable_fields);
-
-        builder.set_min_word_len_one_typo(5);
-        builder.set_min_word_len_two_typos(100);
-        builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
-        builder.execute(|_| (), || false).unwrap();
-
-        let config = IndexerConfig::default();
-        let indexing_config = IndexDocumentsConfig::default();
-        let builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
-                .unwrap();
-
-        let documents = documents_from(
-            "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json",
-            "json",
-        );
-        let (builder, user_error) = builder.add_documents(documents).unwrap();
-        user_error.unwrap();
-        builder.execute().unwrap();
-        wtxn.commit().unwrap();
-
-        index.prepare_for_closing().wait();
-    }
-    #[test]
-    fn _index_wiki() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_wiki").unwrap();
-        let mut wtxn = index.write_txn().unwrap();
-
-        // let primary_key = "id";
-        let searchable_fields = vec!["body", "title", "url"];
-        // let filterable_fields = vec![];
-        let config = IndexerConfig::default();
-        let mut builder = Settings::new(&mut wtxn, &index, &config);
-        // builder.set_primary_key(primary_key.to_owned());
-        let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
-        builder.set_searchable_fields(searchable_fields);
-        // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
-        // builder.set_filterable_fields(filterable_fields);
-
-        // builder.set_min_word_len_one_typo(5);
-        // builder.set_min_word_len_two_typos(100);
-        builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
-        builder.execute(|_| (), || false).unwrap();
-
-        let config = IndexerConfig::default();
-        let indexing_config =
-            IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
-        let builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
-                .unwrap();
-
-        let documents = documents_from(
-            "/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv",
-            "csv",
-        );
-        let (builder, user_error) = builder.add_documents(documents).unwrap();
-        user_error.unwrap();
-        builder.execute().unwrap();
-        wtxn.commit().unwrap();
-
-        index.prepare_for_closing().wait();
-    }
-
-    fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
-        let reader = File::open(filename)
-            .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
-        let reader = BufReader::new(reader);
-        let documents = match filetype {
-            "csv" => documents_from_csv(reader).unwrap(),
-            "json" => documents_from_json(reader).unwrap(),
-            "jsonl" => documents_from_jsonl(reader).unwrap(),
-            otherwise => panic!("invalid update format {:?}", otherwise),
-        };
-        DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
-    }
-
-    fn documents_from_jsonl(reader: impl BufRead) -> crate::Result<Vec<u8>> {
-        let mut documents = DocumentsBatchBuilder::new(Vec::new());
-
-        for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
-            let object = result.unwrap();
-            documents.append_json_object(&object)?;
-        }
-
-        documents.into_inner().map_err(Into::into)
-    }
-
-    fn documents_from_json(reader: impl BufRead) -> crate::Result<Vec<u8>> {
-        let mut documents = DocumentsBatchBuilder::new(Vec::new());
-
-        documents.append_json_array(reader)?;
-
-        documents.into_inner().map_err(Into::into)
-    }
-
-    fn documents_from_csv(reader: impl BufRead) -> crate::Result<Vec<u8>> {
-        let csv = csv::Reader::from_reader(reader);
-
-        let mut documents = DocumentsBatchBuilder::new(Vec::new());
-        documents.append_csv(csv)?;
-
-        documents.into_inner().map_err(Into::into)
-    }
-}
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@ -46,7 +46,7 @@ impl<'search> SearchContext<'search> {
                    }
                }
                if *use_prefix_db {
-                    if let Some(prefix_docids) = self.get_prefix_docids(*original)? {
+                    if let Some(prefix_docids) = self.get_word_prefix_docids(*original)? {
                        or_docids.push(prefix_docids);
                    }
                }
--- a/milli/src/search/new/words.rs
+++ b/milli/src/search/new/words.rs
@ -88,7 +88,8 @@ impl<'search> RankingRule<'search, QueryGraph> for Words {
                break;
            } else {
                let position_to_remove = self.positions_to_remove.pop().unwrap();
-                let did_delete_any_node = query_graph.remove_words_at_position(position_to_remove);
+                let did_delete_any_node =
+                    query_graph.remove_words_starting_at_position(position_to_remove);
                if did_delete_any_node {
                    break;
                }