Work in progress: It seems like we support synonyms, split and concat words

2024-11-26 12:05:05 +08:00 · 2019-11-30 16:53:34 +01:00 · 2019-11-30 16:53:34 +01:00 · 902625601a
commit 902625601a
parent d17d4dc5ec
9 changed files with 1026 additions and 48 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -257,6 +257,11 @@ dependencies = [
 "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]

+[[package]]
+name = "compact_arena"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
 [[package]]
 name = "const-random"
 version = "0.1.6"
@ -937,6 +942,7 @@ dependencies = [
 "bincode 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
 "chrono 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)",
+ "compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "crossbeam-channel 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
@ -946,6 +952,7 @@ dependencies = [
 "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
 "heed 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
 "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
 "meilisearch-schema 0.8.4",
@ -2648,6 +2655,7 @@ dependencies = [
 "checksum chunked_transfer 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f98beb6554de08a14bd7b5c6014963c79d6a25a1c66b1d4ecb9e733ccba51d6c"
 "checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
 "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
+"checksum compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4ab08c5bed92075075d5db5149887a477b2dc0318c40882a0dfbd34315ac6141"
 "checksum const-random 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7b641a8c9867e341f3295564203b1c250eb8ce6cb6126e007941f78c4d2ed7fe"
 "checksum const-random-macro 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c750ec12b83377637110d5a57f5ae08e895b06c4b16e2bdbf1a94ef717428c59"
 "checksum cookie 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "888604f00b3db336d2af898ec3c1d5d0ddf5e6d462220f2ededc33a87ac4bbd5"
--- a/meilisearch-core/Cargo.toml
+++ b/meilisearch-core/Cargo.toml
@ -10,6 +10,7 @@ arc-swap = "0.4.3"
 bincode = "1.1.4"
 byteorder = "1.3.2"
 chrono = { version = "0.4.9", features = ["serde"] }
+compact_arena = "0.4.0"
 crossbeam-channel = "0.4.0"
 deunicode = "1.0.0"
 env_logger = "0.7.0"
@ -35,6 +36,7 @@ assert_matches = "1.3"
 criterion = "0.3"
 csv = "1.0.7"
 indexmap = { version = "1.2.0", features = ["serde-1"] }
+jemallocator = "0.3.2"
 rustyline = { version = "5.0.0", default-features = false }
 structopt = "0.3.2"
 tempfile = "3.1.0"
--- a/meilisearch-core/examples/from_file.rs
+++ b/meilisearch-core/examples/from_file.rs
@ -1,5 +1,5 @@
-use std::collections::btree_map::{BTreeMap, Entry};
 use std::collections::HashSet;
+use std::collections::btree_map::{BTreeMap, Entry};
 use std::error::Error;
 use std::io::{Read, Write};
 use std::iter::FromIterator;
@ -15,6 +15,10 @@ use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
 use meilisearch_core::{Database, Highlight, ProcessedUpdateResult};
 use meilisearch_schema::SchemaAttr;

+// #[cfg(target_os = "linux")]
+#[global_allocator]
+static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
+
 #[derive(Debug, StructOpt)]
 struct IndexCommand {
    /// The destination where the database must be created.
--- a/meilisearch-core/src/automaton/mod.rs
+++ b/meilisearch-core/src/automaton/mod.rs
@ -13,11 +13,11 @@ use crate::database::MainT;
 use crate::error::MResult;
 use crate::store;

-use self::dfa::{build_dfa, build_prefix_dfa};
+pub use self::dfa::{build_dfa, build_prefix_dfa};
 pub use self::query_enhancer::QueryEnhancer;
-use self::query_enhancer::QueryEnhancerBuilder;
+pub use self::query_enhancer::QueryEnhancerBuilder;

-const NGRAMS: usize = 3;
+pub const NGRAMS: usize = 3;

 pub struct AutomatonProducer {
    automatons: Vec<AutomatonGroup>,
@ -145,7 +145,7 @@ pub fn normalize_str(string: &str) -> String {
    string
 }

-fn split_best_frequency<'a>(
+pub fn split_best_frequency<'a>(
    reader: &heed::RoTxn<MainT>,
    word: &'a str,
    postings_lists_store: store::PostingsLists,
--- a/meilisearch-core/src/automaton/query_enhancer.rs
+++ b/meilisearch-core/src/automaton/query_enhancer.rs
@ -143,8 +143,7 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
        // we need to pad real query indices
        let real_range = real..real + replacement.len().max(range.len());
        let real_length = replacement.len();
-        self.real_to_origin
-            .push((real_range, (range.start, real_length)));
+        self.real_to_origin.push((real_range, (range.start, real_length)));
    }

    pub fn build(self) -> QueryEnhancer {
@ -162,7 +161,7 @@ pub struct QueryEnhancer {
 }

 impl QueryEnhancer {
-    /// Returns the query indices to use to replace this real query index.
+    /// Returns the query indices that represent this real query index.
    pub fn replacement(&self, real: u32) -> Range<u32> {
        let real = real as usize;

--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@ -0,0 +1,467 @@
+use std::ops::Deref;
+use std::borrow::Cow;
+use std::cmp::Ordering;
+use std::collections::HashSet;
+use std::io::Write;
+use std::mem;
+use std::ops::Range;
+use std::rc::Rc;
+use std::time::{Duration, Instant};
+
+use compact_arena::{SmallArena, Idx32, mk_arena};
+use fst::{IntoStreamer, Streamer};
+use levenshtein_automata::DFA;
+use log::debug;
+use meilisearch_tokenizer::{is_cjk, split_query_string};
+use meilisearch_types::{DocIndex, Highlight};
+use sdset::Set;
+use slice_group_by::{GroupBy, GroupByMut};
+
+use crate::automaton::NGRAMS;
+use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
+use crate::automaton::{build_dfa, build_prefix_dfa};
+use crate::automaton::{normalize_str, split_best_frequency};
+
+use crate::criterion2::*;
+use crate::levenshtein::prefix_damerau_levenshtein;
+use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
+use crate::{store, Document, DocumentId, MResult};
+
+/// Runs a search for `query` and returns the ranked documents that fall
+/// inside `range`.
+///
+/// The pipeline is:
+/// 1. build the word automatons and the query enhancer for the query,
+/// 2. fetch every postings list matched by those automatons,
+/// 3. group the resulting bare matches by document id,
+/// 4. refine the ordering one criterion after the other (bucket sort): each
+///    criterion sorts the groups left by the previous one and splits them
+///    into smaller groups of documents it considers equal,
+/// 5. build the highlights of the documents that fall inside `range`.
+///
+/// `documents_fields_counts_store` is currently not used by this function.
+///
+/// # Errors
+///
+/// Propagates the errors returned while building the automatons or while
+/// fetching the postings lists.
+pub fn bucket_sort<'c>(
+    reader: &heed::RoTxn<MainT>,
+    query: &str,
+    range: Range<usize>,
+    main_store: store::Main,
+    postings_lists_store: store::PostingsLists,
+    documents_fields_counts_store: store::DocumentsFieldsCounts,
+    synonyms_store: store::Synonyms,
+) -> MResult<Vec<Document>>
+{
+    let (automatons, query_enhancer) =
+        construct_automatons2(reader, query, main_store, postings_lists_store, synonyms_store)?;
+
+    let before_postings_lists_fetching = Instant::now();
+    // the arena owns every postings list view, bare matches only keep an index into it
+    mk_arena!(arena);
+    let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
+    debug!("bare matches ({}) retrieved in {:.02?}",
+        bare_matches.len(),
+        before_postings_lists_fetching.elapsed(),
+    );
+
+    // sorting by document id makes the matches of one document contiguous
+    let before_raw_documents_presort = Instant::now();
+    bare_matches.sort_unstable_by_key(|sm| sm.document_id);
+    debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());
+
+    let before_raw_documents_building = Instant::now();
+    let mut raw_documents = Vec::new();
+    for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
+        raw_documents.push(RawDocument { raw_matches, processed_matches: None });
+    }
+    debug!("creating {} candidates documents took {:.02?}",
+        raw_documents.len(),
+        before_raw_documents_building.elapsed(),
+    );
+
+    // we start with one single group that contains every candidate document
+    let mut groups = vec![raw_documents.as_mut_slice()];
+
+    let criteria = [
+        Box::new(Typo) as Box<dyn Criterion>,
+        Box::new(Words),
+        Box::new(Proximity),
+        Box::new(Attribute),
+        Box::new(WordsPosition),
+        Box::new(Exact),
+        Box::new(StableDocId),
+    ];
+
+    'criteria: for criterion in &criteria {
+        let tmp_groups = mem::replace(&mut groups, Vec::new());
+        let mut documents_seen = 0;
+
+        for mut group in tmp_groups {
+            let before_criterion_preparation = Instant::now();
+            criterion.prepare(&mut group, &mut arena, &query_enhancer);
+            debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());
+
+            let before_criterion_sort = Instant::now();
+            group.sort_unstable_by(|a, b| criterion.evaluate(a, b, &arena));
+            debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());
+
+            for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b, &arena)) {
+                debug!("{:?} produced a group of size {}", criterion.name(), group.len());
+
+                documents_seen += group.len();
+                groups.push(group);
+
+                // we have sorted enough documents once the last sorted document
+                // is past the end of the requested range, the remaining groups
+                // can be ignored and we can move on to the next criterion
+                if documents_seen >= range.end {
+                    continue 'criteria;
+                }
+            }
+        }
+    }
+
+    let iter = raw_documents.into_iter().skip(range.start).take(range.len());
+    let iter = iter.map(|d| {
+        let highlights = d.raw_matches.iter().flat_map(|sm| {
+            let postings_list = &arena[sm.postings_list];
+            let input = postings_list.input();
+            let query = &automatons[sm.query_index as usize].query;
+            postings_list.iter().map(move |m| {
+                // the indexed word is highlighted entirely when it is shorter than
+                // the query word, otherwise only the part reported by the prefix
+                // damerau-levenshtein computation is highlighted
+                let covered_area = if query.len() > input.len() {
+                    input.len()
+                } else {
+                    prefix_damerau_levenshtein(query.as_bytes(), input).1
+                };
+                Highlight { attribute: m.attribute, char_index: m.char_index, char_length: covered_area as u16 }
+            })
+        }).collect();
+
+        Document {
+            id: d.raw_matches[0].document_id,
+            highlights,
+            #[cfg(test)] matches: Vec::new(),
+        }
+    });
+
+    Ok(iter.collect())
+}
+
+pub struct RawDocument<'a, 'tag> { // The matches of one candidate document, as grouped by document id in `bucket_sort`.
+    pub raw_matches: &'a mut [BareMatch<'tag>], // slice of the bare matches that belong to this document
+    pub processed_matches: Option<Vec<SimpleMatch>>, // per-position matches, `None` until a criterion computes them
+}
+
+pub struct BareMatch<'tag> { // One match between a query automaton and a document, produced by `fetch_matches`.
+    pub document_id: DocumentId,
+    pub query_index: u16, // position of the matching automaton in the automatons list
+    pub distance: u8, // distance evaluated by the automaton DFA on the matched word
+    pub is_exact: bool, // automaton flagged exact, distance of zero and matched word as long as the query word
+    pub postings_list: Idx32<'tag>, // arena index of the postings list view restricted to this document
+}
+
+// TODO remove that
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
+pub struct SimpleMatch { // One matched position of a query word inside a document.
+    pub query_index: u16, // copied from the `BareMatch` this position comes from
+    pub distance: u8, // copied from the `BareMatch` this position comes from
+    pub attribute: u16, // attribute of the matched `DocIndex`
+    pub word_index: u16, // word index of the matched `DocIndex`
+    pub is_exact: bool, // copied from the `BareMatch` this position comes from
+}
+
+#[derive(Clone)]
+pub struct PostingsListView<'txn> { // A window (`offset`, `len`) over a reference-counted postings list.
+    input: Rc<[u8]>, // bytes of the indexed word the postings list was fetched for
+    postings_list: Rc<Cow<'txn, Set<DocIndex>>>, // the whole postings list, shared between the views
+    offset: usize, // index of the first entry of the window inside `postings_list`
+    len: usize, // number of entries of the window
+}
+
+impl<'txn> PostingsListView<'txn> {
+    /// Creates a view that covers the whole `postings_list`, `input` being
+    /// the bytes to report through [`input`](Self::input).
+    pub fn new(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
+        PostingsListView {
+            offset: 0,
+            len: postings_list.len(),
+            input,
+            postings_list,
+        }
+    }
+
+    /// Number of entries visible through this view.
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// The bytes given when the original view was created.
+    pub fn input(&self) -> &[u8] {
+        &self.input
+    }
+
+    /// Returns a sub-view of `len` entries starting `offset` entries after
+    /// the start of this view, the underlying data is shared, not copied.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the requested window does not fit inside this view.
+    pub fn range(&self, offset: usize, len: usize) -> PostingsListView<'txn> {
+        assert!(offset + len <= self.len);
+        // cloning only bumps the reference counters of the shared data
+        let mut view = self.clone();
+        view.offset += offset;
+        view.len = len;
+        view
+    }
+}
+
+impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
+    /// Borrows the entries of the window as a set.
+    fn as_ref(&self) -> &Set<DocIndex> {
+        let end = self.offset + self.len;
+        let window = &self.postings_list[self.offset..end];
+        // no ordering check is run on the window
+        Set::new_unchecked(window)
+    }
+}
+
+impl Deref for PostingsListView<'_> {
+    type Target = Set<DocIndex>;
+
+    /// Gives access to the entries of the window as a set.
+    fn deref(&self) -> &Set<DocIndex> {
+        let start = self.offset;
+        let end = start + self.len;
+        // no ordering check is run on the window
+        Set::new_unchecked(&self.postings_list[start..end])
+    }
+}
+
+/// Fetches the postings lists of every word of the words FST accepted by
+/// one of the `automatons` and returns one `BareMatch` per
+/// (automaton, word, document) combination.
+///
+/// Each postings list is split by document id, every part is stored in
+/// `arena` and referenced by index from the returned bare matches.
+///
+/// Returns an empty list when the main store has no words FST.
+///
+/// # Errors
+///
+/// Propagates the errors of the stores.
+fn fetch_matches<'txn, 'tag>(
+    reader: &'txn heed::RoTxn<MainT>,
+    automatons: &[QueryWordAutomaton],
+    arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
+    main_store: store::Main,
+    postings_lists_store: store::PostingsLists,
+) -> MResult<Vec<BareMatch<'tag>>>
+{
+    let before_words_fst = Instant::now();
+    let words = match main_store.words_fst(reader)? {
+        Some(words) => words,
+        None => return Ok(Vec::new()),
+    };
+    debug!("words fst took {:.02?}", before_words_fst.elapsed());
+
+    let mut total_postings_lists = Vec::new();
+
+    // accumulated timings, only used by the debug logs
+    let mut dfa_time = Duration::default();
+    let mut stream_next_time = Duration::default();
+    let mut postings_lists_fetching_time = Duration::default();
+
+    for (query_index, automaton) in automatons.iter().enumerate() {
+        let before_dfa = Instant::now();
+        let dfa = automaton.dfa();
+        let QueryWordAutomaton { query, is_exact, .. } = automaton;
+        dfa_time += before_dfa.elapsed();
+
+        let mut number_of_words = 0;
+
+        let before_fst_search = Instant::now();
+        let mut stream = words.search(&dfa).into_stream();
+        debug!("fst search took {:.02?}", before_fst_search.elapsed());
+
+        // a plain `loop` is used instead of a `while let`
+        // to be able to time every call to `stream.next()`
+        loop {
+            let before_stream_next = Instant::now();
+            let input = match stream.next() {
+                Some(input) => input,
+                None => break,
+            };
+            stream_next_time += before_stream_next.elapsed();
+
+            number_of_words += 1;
+
+            let distance = dfa.eval(input).to_u8();
+            // a match is exact only for an automaton flagged exact that matched,
+            // without any typo, a word of the same length as the query word
+            let is_exact = *is_exact && distance == 0 && input.len() == query.len();
+
+            let before_postings_lists_fetching = Instant::now();
+            if let Some(postings_list) = postings_lists_store.postings_list(reader, input)? {
+
+                let input = Rc::from(input);
+                let postings_list = Rc::new(postings_list);
+                let postings_list_view = PostingsListView::new(input, postings_list);
+                let mut offset = 0;
+                // one bare match per document, each one references
+                // its own window of the shared postings list
+                for group in postings_list_view.linear_group_by_key(|di| di.document_id) {
+
+                    let posting_list_index = arena.add(postings_list_view.range(offset, group.len()));
+                    let document_id = group[0].document_id;
+                    let stuffed = BareMatch {
+                        document_id,
+                        query_index: query_index as u16,
+                        distance,
+                        is_exact,
+                        postings_list: posting_list_index,
+                    };
+
+                    total_postings_lists.push(stuffed);
+                    offset += group.len();
+                }
+            }
+            postings_lists_fetching_time += before_postings_lists_fetching.elapsed();
+        }
+
+        debug!("{:?} gives {} words", query, number_of_words);
+    }
+
+    debug!("stream next took {:.02?}", stream_next_time);
+    debug!("postings lists fetching took {:.02?}", postings_lists_fetching_time);
+    debug!("dfa creation took {:.02?}", dfa_time);
+
+    Ok(total_postings_lists)
+}
+
+#[derive(Debug)]
+pub struct QueryWordAutomaton { // Description of one word to search for, its DFA is built on demand by `dfa`.
+    index: usize, // index given by the caller at construction
+    query: String, // the word the DFA is built from
+    /// Is it a word that must be considered exact
+    /// or is it some derived word (i.e. a synonym)
+    is_exact: bool,
+    is_prefix: bool, // when true `dfa` builds a prefix DFA instead of a plain one
+}
+
+impl QueryWordAutomaton {
+    /// Creates an automaton flagged as exact that is not searched as a prefix.
+    pub fn exact(query: &str, index: usize) -> QueryWordAutomaton {
+        let query = query.to_string();
+        QueryWordAutomaton { index, query, is_exact: true, is_prefix: false }
+    }
+
+    /// Creates an automaton flagged as exact that is searched as a prefix.
+    pub fn exact_prefix(query: &str, index: usize) -> QueryWordAutomaton {
+        let query = query.to_string();
+        QueryWordAutomaton { index, query, is_exact: true, is_prefix: true }
+    }
+
+    /// Creates an automaton that is neither flagged as exact nor searched as a prefix.
+    pub fn non_exact(query: &str, index: usize) -> QueryWordAutomaton {
+        let query = query.to_string();
+        QueryWordAutomaton { index, query, is_exact: false, is_prefix: false }
+    }
+
+    /// Builds the DFA of the query word, a prefix DFA when `is_prefix` is set.
+    pub fn dfa(&self) -> DFA {
+        match self.is_prefix {
+            true => build_prefix_dfa(&self.query),
+            false => build_dfa(&self.query),
+        }
+    }
+}
+
+// fn construct_automatons(query: &str) -> Vec<QueryWordAutomaton> {
+//     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
+//     let mut original_words = split_query_string(query).map(str::to_lowercase).peekable();
+//     let mut automatons = Vec::new();
+
+//     while let Some(word) = original_words.next() {
+//         let has_following_word = original_words.peek().is_some();
+//         let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
+
+//         let automaton = if not_prefix_dfa {
+//             QueryWordAutomaton::exact(word)
+//         } else {
+//             QueryWordAutomaton::exact_prefix(word)
+//         };
+
+//         automatons.push(automaton);
+//     }
+
+//     automatons
+// }
+
+fn construct_automatons2( // Builds the automatons of `query` (original words, synonyms, split and concatenated words) and the query enhancer that maps the derived ones back to the original words.
+    reader: &heed::RoTxn<MainT>,
+    query: &str,
+    main_store: store::Main,
+    postings_lists_store: store::PostingsLists,
+    synonym_store: store::Synonyms,
+) -> MResult<(Vec<QueryWordAutomaton>, QueryEnhancer)> {
+    let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); // when true the last word is never searched as a prefix
+    let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
+    let synonyms = match main_store.synonyms_fst(reader)? {
+        Some(synonym) => synonym,
+        None => fst::Set::default(), // no synonyms FST stored: use a default set
+    };
+
+    let mut automaton_index = 0; // index given to the next automaton pushed in `automatons`
+    let mut automatons = Vec::new();
+    let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
+
+    // We must not declare the original words to the query enhancer
+    // *but* we need to push them in the automatons list first
+    let mut original_words = query_words.iter().peekable();
+    while let Some(word) = original_words.next() {
+        let has_following_word = original_words.peek().is_some();
+        let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk); // only a last word, not followed by a whitespace and not entirely CJK, is a prefix
+
+        let automaton = if not_prefix_dfa {
+            QueryWordAutomaton::exact(word, automaton_index)
+        } else {
+            QueryWordAutomaton::exact_prefix(word, automaton_index)
+        };
+        automaton_index += 1;
+        automatons.push(automaton);
+    }
+
+    for n in 1..=NGRAMS { // n is the number of consecutive query words of each ngram
+        let mut ngrams = query_words.windows(n).enumerate().peekable();
+        while let Some((query_index, ngram_slice)) = ngrams.next() {
+            let query_range = query_index..query_index + n; // range of the original query words covered by this ngram
+            let ngram_nb_words = ngram_slice.len();
+            let ngram = ngram_slice.join(" ");
+
+            let has_following_word = ngrams.peek().is_some();
+            let not_prefix_dfa =
+                has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
+
+            // automaton of synonyms of the ngrams
+            let normalized = normalize_str(&ngram);
+            let lev = if not_prefix_dfa {
+                build_dfa(&normalized)
+            } else {
+                build_prefix_dfa(&normalized)
+            };
+
+            let mut stream = synonyms.search(&lev).into_stream();
+            while let Some(base) = stream.next() {
+                // only trigger alternatives when the last word has been typed
+                // i.e. "new " do not but "new yo" triggers alternatives to "new york"
+                let base = std::str::from_utf8(base).unwrap(); // panics if a key of the synonyms FST is not valid UTF-8
+                let base_nb_words = split_query_string(base).count();
+                if ngram_nb_words != base_nb_words {
+                    continue;
+                }
+
+                if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
+                    let mut stream = synonyms.into_stream();
+                    while let Some(synonyms) = stream.next() {
+                        let synonyms = std::str::from_utf8(synonyms).unwrap(); // panics if a stored synonym is not valid UTF-8
+                        let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
+                        let nb_synonym_words = synonyms_words.len();
+
+                        let real_query_index = automaton_index; // index of the first automaton of this synonym
+                        enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);
+
+                        for synonym in synonyms_words {
+                            let automaton = if nb_synonym_words == 1 { // only the single word synonyms are flagged as exact
+                                QueryWordAutomaton::exact(synonym, automaton_index)
+                            } else {
+                                QueryWordAutomaton::non_exact(synonym, automaton_index)
+                            };
+                            automaton_index += 1;
+                            automatons.push(automaton);
+                        }
+                    }
+                }
+            }
+
+            if n == 1 { // single word: try to split it into a left and a right word
+                if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
+                    let left_automaton = QueryWordAutomaton::exact(left, automaton_index);
+                    enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
+                    automaton_index += 1;
+                    automatons.push(left_automaton);
+
+                    let right_automaton = QueryWordAutomaton::exact(right, automaton_index);
+                    enhancer_builder.declare(query_range.clone(), automaton_index, &[right]);
+                    automaton_index += 1;
+                    automatons.push(right_automaton);
+
+                }
+            } else {
+                // automaton of concatenation of query words
+                let concat = ngram_slice.concat();
+                let normalized = normalize_str(&concat);
+
+                let real_query_index = automaton_index;
+                enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
+
+                let automaton = QueryWordAutomaton::exact(&normalized, automaton_index);
+                automaton_index += 1;
+                automatons.push(automaton);
+            }
+        }
+    }
+
+    // // order automatons, the most important first,
+    // // we keep the original automatons at the front.
+    // automatons[1..].sort_by_key(|group| {
+    //     let a = group.automatons.first().unwrap();
+    //     (
+    //         Reverse(a.is_exact),
+    //         a.ngram,
+    //         Reverse(group.automatons.len()),
+    //     )
+    // });
+
+    Ok((automatons, enhancer_builder.build()))
+}
--- a/meilisearch-core/src/criterion2.rs
+++ b/meilisearch-core/src/criterion2.rs
@ -0,0 +1,479 @@
+use std::cmp::{self, Ordering, Reverse};
+use std::borrow::Cow;
+use std::sync::atomic::{self, AtomicUsize};
+
+use slice_group_by::{GroupBy, GroupByMut};
+use compact_arena::SmallArena;
+use sdset::{Set, SetBuf};
+
+use crate::{DocIndex, DocumentId};
+use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView};
+use crate::automaton::QueryEnhancer;
+
+type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>;
+
+/// A ranking rule able to order two candidate documents.
+pub trait Criterion {
+    /// Name of the criterion.
+    fn name(&self) -> &str;
+
+    /// Gives the criterion the opportunity to reorganize or precompute
+    /// the matches of `documents` before any call to `evaluate`.
+    fn prepare<'a, 'tag, 'txn>(
+        &self,
+        documents: &mut [RawDocument<'a, 'tag>],
+        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
+        query_enhancer: &QueryEnhancer,
+    );
+
+    /// Compares two documents according to this criterion.
+    fn evaluate<'a, 'tag, 'txn>(
+        &self,
+        lhs: &RawDocument<'a, 'tag>,
+        rhs: &RawDocument<'a, 'tag>,
+        postings_lists: &PostingsListsArena<'tag, 'txn>,
+    ) -> Ordering;
+
+    /// Tells whether this criterion considers the two documents equal,
+    /// by default when `evaluate` returns `Ordering::Equal`.
+    #[inline]
+    fn eq<'a, 'tag, 'txn>(
+        &self,
+        lhs: &RawDocument<'a, 'tag>,
+        rhs: &RawDocument<'a, 'tag>,
+        postings_lists: &PostingsListsArena<'tag, 'txn>,
+    ) -> bool
+    {
+        match self.evaluate(lhs, rhs, postings_lists) {
+            Ordering::Equal => true,
+            _ => false,
+        }
+    }
+}
+
+/// Ranks first the documents whose query words are matched with the fewest typos.
+pub struct Typo;
+
+impl Criterion for Typo {
+    fn name(&self) -> &str { "typo" }
+
+    /// Orders the matches of every document by query index then by distance,
+    /// the first match of a query word is then its least distant one.
+    fn prepare(
+        &self,
+        documents: &mut [RawDocument],
+        postings_lists: &mut PostingsListsArena,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        documents.iter_mut().for_each(|document| {
+            document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, bm.distance));
+        });
+    }
+
+    /// Compares the typo scores of the two documents, the highest score first.
+    fn evaluate(
+        &self,
+        lhs: &RawDocument,
+        rhs: &RawDocument,
+        postings_lists: &PostingsListsArena,
+    ) -> Ordering
+    {
+        // Shifted base 10 logarithm: returns log10(n + 1) from a lookup table.
+        // Panicking above 3 is accepted, the number of typos
+        // is never expected to be bigger than that.
+        #[inline]
+        fn custom_log10(n: u8) -> f32 {
+            match n {
+                0 => 0.0,     // log(1)
+                1 => 0.30102, // log(2)
+                2 => 0.47712, // log(3)
+                3 => 0.60205, // log(4)
+                _ => panic!("invalid number"),
+            }
+        }
+
+        // Score that grows with the number of matched query words
+        // and shrinks with the typos of the best match of each one.
+        #[inline]
+        fn compute_typos(matches: &[BareMatch]) -> usize {
+            let (number_words, sum_typos) = matches
+                .linear_group_by_key(|bm| bm.query_index)
+                .fold((0usize, 0.0f32), |(count, sum), group| {
+                    (count + 1, sum + custom_log10(group[0].distance))
+                });
+
+            (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
+        }
+
+        let left_score = compute_typos(&lhs.raw_matches);
+        let right_score = compute_typos(&rhs.raw_matches);
+
+        // reversed comparison: the biggest score comes first
+        right_score.cmp(&left_score)
+    }
+}
+
+/// Ranks first the documents that match the most distinct query indices.
+pub struct Words;
+
+impl Criterion for Words {
+    fn name(&self) -> &str { "words" }
+
+    /// Orders the matches of every document by query index so that
+    /// the matches of one query index are contiguous.
+    fn prepare(
+        &self,
+        documents: &mut [RawDocument],
+        postings_lists: &mut PostingsListsArena,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        documents.iter_mut().for_each(|document| {
+            document.raw_matches.sort_unstable_by_key(|bm| bm.query_index);
+        });
+    }
+
+    /// Compares the number of distinct query indices matched, the biggest first.
+    fn evaluate(
+        &self,
+        lhs: &RawDocument,
+        rhs: &RawDocument,
+        postings_lists: &PostingsListsArena,
+    ) -> Ordering
+    {
+        // every run of matches that share a query index counts for one word
+        #[inline]
+        fn number_of_query_words(matches: &[BareMatch]) -> usize {
+            let mut count = 0;
+            for _ in matches.linear_group_by_key(|bm| bm.query_index) {
+                count += 1;
+            }
+            count
+        }
+
+        let left_count = number_of_query_words(&lhs.raw_matches);
+        let right_count = number_of_query_words(&rhs.raw_matches);
+
+        // reversed comparison: the biggest count comes first
+        right_count.cmp(&left_count)
+    }
+}
+
+/// Computes the `processed_matches` of every document that does not have
+/// them yet.
+///
+/// Every bare match is expanded into one `SimpleMatch` per entry of its
+/// postings list, the result is then rewritten by `multiword_rewrite_matches`
+/// with the help of the `query_enhancer`. Documents that already carry
+/// processed matches are left untouched.
+fn process_raw_matches<'a, 'tag, 'txn>(
+    documents: &mut [RawDocument<'a, 'tag>],
+    postings_lists: &mut PostingsListsArena<'tag, 'txn>,
+    query_enhancer: &QueryEnhancer,
+) {
+    for document in documents {
+        // already computed by a previous criterion
+        if document.processed_matches.is_some() { continue }
+
+        let mut processed = Vec::new();
+
+        for m in document.raw_matches.iter() {
+            let postings_list = &postings_lists[m.postings_list];
+            processed.reserve(postings_list.len());
+            for di in postings_list.as_ref() {
+                let simple_match = SimpleMatch {
+                    query_index: m.query_index,
+                    distance: m.distance,
+                    attribute: di.attribute,
+                    word_index: di.word_index,
+                    is_exact: m.is_exact,
+                };
+                processed.push(simple_match);
+            }
+        }
+
+        let processed = multiword_rewrite_matches(&mut processed, query_enhancer);
+        document.processed_matches = Some(processed.into_vec());
+    }
+}
+
+/// Ranks first the documents in which the matched query words are the closest to each other.
+pub struct Proximity;
+
+impl Criterion for Proximity {
+    fn name(&self) -> &str { "proximity" }
+
+    /// Makes sure the processed matches of the documents are available.
+    fn prepare<'a, 'tag, 'txn>(
+        &self,
+        documents: &mut [RawDocument<'a, 'tag>],
+        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        process_raw_matches(documents, postings_lists, query_enhancer);
+    }
+
+    /// Compares the proximity of the two documents, the smallest first.
+    ///
+    /// Panics if the processed matches of a document are missing.
+    fn evaluate<'a, 'tag, 'txn>(
+        &self,
+        lhs: &RawDocument<'a, 'tag>,
+        rhs: &RawDocument<'a, 'tag>,
+        postings_lists: &PostingsListsArena<'tag, 'txn>,
+    ) -> Ordering
+    {
+        const MAX_DISTANCE: u16 = 8;
+
+        // distance between two word indices capped to MAX_DISTANCE,
+        // one more is added when `rhs` is not after `lhs`
+        fn index_proximity(lhs: u16, rhs: u16) -> u16 {
+            match lhs.cmp(&rhs) {
+                Ordering::Less => cmp::min(rhs - lhs, MAX_DISTANCE),
+                Ordering::Equal | Ordering::Greater => cmp::min(lhs - rhs, MAX_DISTANCE) + 1,
+            }
+        }
+
+        // matches of different attributes are always MAX_DISTANCE away
+        fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 {
+            if lhs.attribute == rhs.attribute {
+                index_proximity(lhs.word_index, rhs.word_index)
+            } else {
+                MAX_DISTANCE
+            }
+        }
+
+        // smallest proximity of all the pairs made of one match of each side
+        fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 {
+            lhs.iter()
+                .flat_map(|a| rhs.iter().map(move |b| attribute_proximity(*a, *b)))
+                .min()
+                .unwrap_or(u16::max_value())
+        }
+
+        // sum of the minimum proximities of every pair of consecutive query index groups
+        fn matches_proximity(matches: &[SimpleMatch],) -> u16 {
+            let mut groups = matches.linear_group_by_key(|m| m.query_index);
+
+            let mut previous = match groups.next() {
+                Some(group) => group,
+                None => return 0,
+            };
+
+            let mut total = 0;
+            for current in groups {
+                total += min_proximity(previous, current);
+                previous = current;
+            }
+
+            total
+        }
+
+        let left_proximity = matches_proximity(&lhs.processed_matches.as_ref().unwrap());
+        let right_proximity = matches_proximity(&rhs.processed_matches.as_ref().unwrap());
+
+        left_proximity.cmp(&right_proximity)
+    }
+}
+
+/// Ranks first the documents whose query words are matched in the attributes with the smallest numbers.
+pub struct Attribute;
+
+impl Criterion for Attribute {
+    fn name(&self) -> &str { "attribute" }
+
+    /// Makes sure the processed matches of the documents are available.
+    fn prepare<'a, 'tag, 'txn>(
+        &self,
+        documents: &mut [RawDocument<'a, 'tag>],
+        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        process_raw_matches(documents, postings_lists, query_enhancer);
+    }
+
+    /// Compares the attribute sums of the two documents, the smallest first.
+    ///
+    /// Panics if the processed matches of a document are missing.
+    fn evaluate<'a, 'tag, 'txn>(
+        &self,
+        lhs: &RawDocument<'a, 'tag>,
+        rhs: &RawDocument<'a, 'tag>,
+        postings_lists: &PostingsListsArena<'tag, 'txn>,
+    ) -> Ordering
+    {
+        // sums the attribute of the first match of every query index group
+        #[inline]
+        fn sum_attribute(matches: &[SimpleMatch]) -> usize {
+            matches
+                .linear_group_by_key(|bm| bm.query_index)
+                .map(|group| group[0].attribute as usize)
+                .sum()
+        }
+
+        let left_sum = sum_attribute(&lhs.processed_matches.as_ref().unwrap());
+        let right_sum = sum_attribute(&rhs.processed_matches.as_ref().unwrap());
+
+        left_sum.cmp(&right_sum)
+    }
+}
+
+/// Ranks first the documents whose query words are matched at the smallest word indices.
+pub struct WordsPosition;
+
+impl Criterion for WordsPosition {
+    fn name(&self) -> &str { "words position" }
+
+    /// Makes sure the processed matches of the documents are available.
+    fn prepare<'a, 'tag, 'txn>(
+        &self,
+        documents: &mut [RawDocument<'a, 'tag>],
+        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        process_raw_matches(documents, postings_lists, query_enhancer);
+    }
+
+    /// Compares the word index sums of the two documents, the smallest first.
+    ///
+    /// Panics if the processed matches of a document are missing.
+    fn evaluate<'a, 'tag, 'txn>(
+        &self,
+        lhs: &RawDocument<'a, 'tag>,
+        rhs: &RawDocument<'a, 'tag>,
+        postings_lists: &PostingsListsArena<'tag, 'txn>,
+    ) -> Ordering
+    {
+        // sums the word index of the first match of every query index group
+        #[inline]
+        fn sum_words_position(matches: &[SimpleMatch]) -> usize {
+            matches
+                .linear_group_by_key(|bm| bm.query_index)
+                .map(|group| group[0].word_index as usize)
+                .sum()
+        }
+
+        let left_sum = sum_words_position(&lhs.processed_matches.as_ref().unwrap());
+        let right_sum = sum_words_position(&rhs.processed_matches.as_ref().unwrap());
+
+        left_sum.cmp(&right_sum)
+    }
+}
+
+/// Criterion comparing documents on the number of query-index groups
+/// whose first raw match is flagged `is_exact`.
+pub struct Exact;
+
+impl Criterion for Exact {
+    fn name(&self) -> &str {
+        "exact"
+    }
+
+    fn prepare(
+        &self,
+        documents: &mut [RawDocument],
+        postings_lists: &mut PostingsListsArena,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        // order the raw matches of each document by query index and, inside
+        // a same query index, put the matches flagged `is_exact` first: the head
+        // of every query-index group then tells if the group holds an exact match
+        documents.iter_mut().for_each(|document| {
+            document
+                .raw_matches
+                .sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact)));
+        });
+    }
+
+    /// Compares the number of exact query words of both documents,
+    /// the document with the most exact words compares as `Ordering::Less`.
+    fn evaluate(
+        &self,
+        lhs: &RawDocument,
+        rhs: &RawDocument,
+        postings_lists: &PostingsListsArena,
+    ) -> Ordering
+    {
+        // counts the query-index groups whose head match is exact
+        #[inline]
+        fn exact_words_count(matches: &[BareMatch]) -> usize {
+            matches
+                .linear_group_by_key(|bm| bm.query_index)
+                .map(|group| group[0].is_exact as usize)
+                .sum()
+        }
+
+        let left = exact_words_count(&lhs.raw_matches);
+        let right = exact_words_count(&rhs.raw_matches);
+
+        // operands are swapped on purpose: a bigger count must come out as `Less`
+        right.cmp(&left)
+    }
+}
+
+/// Criterion comparing documents on the `document_id` of their first raw match.
+pub struct StableDocId;
+
+impl Criterion for StableDocId {
+    fn name(&self) -> &str {
+        "stable document id"
+    }
+
+    fn prepare(
+        &self,
+        documents: &mut [RawDocument],
+        postings_lists: &mut PostingsListsArena,
+        query_enhancer: &QueryEnhancer,
+    ) {
+        // nothing to prepare: `evaluate` only reads the first raw match
+    }
+
+    /// Compares the document ids found in the first raw match of each document.
+    ///
+    /// Panics if the `raw_matches` of one of the documents is empty.
+    fn evaluate(
+        &self,
+        lhs: &RawDocument,
+        rhs: &RawDocument,
+        postings_lists: &PostingsListsArena,
+    ) -> Ordering
+    {
+        let left = &lhs.raw_matches[0];
+        let right = &rhs.raw_matches[0];
+
+        left.document_id.cmp(&right.document_id)
+    }
+}
+
+/// Rewrites the matches of one document according to the replacements
+/// given by the `query_enhancer` and returns them as a `SetBuf`.
+///
+/// The `matches` slice is sorted in place by `(attribute, word_index)`.
+/// Then, attribute by attribute, every match is looked up with
+/// `query_enhancer.replacement(match.query_index)`:
+///
+/// - the first replacement query index produces a copy of the match with that
+///   query index and a word index shifted by the padding accumulated so far
+///   in the attribute,
+/// - the following replacement query indices produce "padding" matches placed
+///   on the next word indices, the look-ahead over the following matches
+///   decides how many of them are pushed.
+///
+/// The padding accumulated for an attribute is reset for the next attribute.
+///
+/// NOTE(review): presumably `replacement` yields the query indices a query word
+/// expands to (synonyms, split and concatenated words); verify in `QueryEnhancer`.
+///
+/// NOTE(review): `rep.next().unwrap()` panics when the replacement of a
+/// looked-ahead match is empty and `replacement_len - 1` underflows when the
+/// replacement of the current match is empty; confirm that `replacement`
+/// never returns an empty iterator.
+///
+/// NOTE(review): word indices are computed with `as u16` casts and plain `+`,
+/// confirm that the padded word indices always fit in a `u16`.
+pub fn multiword_rewrite_matches(
+    matches: &mut [SimpleMatch],
+    query_enhancer: &QueryEnhancer,
+) -> SetBuf<SimpleMatch>
+{
+    // at least one match is pushed for every input match that has a replacement
+    let mut padded_matches = Vec::with_capacity(matches.len());
+
+    // let before_sort = Instant::now();
+    // we sort the matches by word index to make them rewritable
+    matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
+    // debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
+
+    // let before_padding = Instant::now();
+    // for each attribute of each document
+    for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {
+        // padding will only be applied
+        // to word indices in the same attribute
+        let mut padding = 0;
+        let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index);
+
+        // for each match at the same position
+        // in this document attribute
+        //
+        // this must stay a `while let`: `iter.remainder()` is called
+        // in the body to look at the groups not yet consumed
+        while let Some(same_word_index) = iter.next() {
+            // find the biggest padding
+            let mut biggest = 0;
+            for match_ in same_word_index {
+                let mut replacement = query_enhancer.replacement(match_.query_index as u32);
+                // the length is read before the first replacement is consumed
+                let replacement_len = replacement.len();
+                // the groups of matches located after the current word index
+                let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
+
+                // the first replacement takes the place of the match itself,
+                // shifted by the padding accumulated so far
+                if let Some(query_index) = replacement.next() {
+                    let word_index = match_.word_index + padding as u16;
+                    let query_index = query_index as u16;
+                    let match_ = SimpleMatch { query_index, word_index, ..*match_ };
+                    padded_matches.push(match_);
+                }
+
+                // becomes true once a following match
+                // corresponds to one of the padding words
+                let mut found = false;
+
+                // look ahead and if there already is a match
+                // corresponding to this padding word, abort the padding
+                //
+                // the x-th following group is only compared to the remaining
+                // replacements starting at position x (`skip(x)`)
+                'padding: for (x, next_group) in nexts.enumerate() {
+                    for (i, query_index) in replacement.clone().enumerate().skip(x) {
+                        // the i-th remaining replacement lands i + 1 words after the match
+                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
+                        let query_index = query_index as u16;
+                        let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
+
+                        for nmatch_ in next_group {
+                            // a following match corresponds to the padding word when its
+                            // first replacement is the query index of the padding word
+                            let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
+                            let query_index = rep.next().unwrap() as u16;
+                            if query_index == padmatch.query_index {
+                                if !found {
+                                    // if we find a corresponding padding for the
+                                    // first time we must push preceding paddings
+                                    for (i, query_index) in replacement.clone().enumerate().take(i)
+                                    {
+                                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
+                                        let query_index = query_index as u16;
+                                        let match_ = SimpleMatch { query_index, word_index, ..*match_ };
+                                        padded_matches.push(match_);
+                                        biggest = biggest.max(i + 1);
+                                    }
+                                }
+
+                                padded_matches.push(padmatch);
+                                found = true;
+                                // this group is handled, look at the next one
+                                continue 'padding;
+                            }
+                        }
+                    }
+
+                    // if we do not find a corresponding padding in this
+                    // following group, stop looking ahead and keep what was found
+                    break;
+                }
+
+                if !found {
+                    // if no padding was found in the following matches
+                    // we must insert the entire padding
+                    for (i, query_index) in replacement.enumerate() {
+                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
+                        let query_index = query_index as u16;
+                        let match_ = SimpleMatch { query_index, word_index, ..*match_ };
+                        padded_matches.push(match_);
+                    }
+
+                    // the first replacement took the place of the match itself,
+                    // only the other ones are counted as padding
+                    biggest = biggest.max(replacement_len - 1);
+                }
+            }
+
+            // the matches located after this word index
+            // are shifted by the biggest padding inserted here
+            padding += biggest;
+        }
+    }
+
+    // debug!("padding matches took {:.02?}", before_padding.elapsed());
+
+    // With this check we can see that the loop above takes something
+    // like 43% of the search time even when no rewrite is needed.
+    // assert_eq!(before_matches, padded_matches);
+
+    // NOTE(review): `from_dirty` presumably sorts and deduplicates
+    // the padded matches, confirm against the sdset documentation
+    SetBuf::from_dirty(padded_matches)
+}
--- a/meilisearch-core/src/lib.rs
+++ b/meilisearch-core/src/lib.rs
@ -18,6 +18,10 @@ pub mod serde;
 pub mod store;
 mod update;

+// TODO replace
+mod bucket_sort;
+mod criterion2;
+
 pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT};
 pub use self::error::{Error, MResult};
 pub use self::number::{Number, ParseNumberError};
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@ -10,7 +10,7 @@ use log::debug;
 use sdset::SetBuf;
 use slice_group_by::{GroupBy, GroupByMut};

-use crate::database::MainT;
+use crate::{bucket_sort::bucket_sort, database::MainT};
 use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
 use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
 use crate::levenshtein::prefix_damerau_levenshtein;
@ -34,19 +34,14 @@ fn multiword_rewrite_matches(
    mut matches: Vec<(DocumentId, TmpMatch)>,
    query_enhancer: &QueryEnhancer,
 ) -> SetBuf<(DocumentId, TmpMatch)> {
-    if true {
-        let before_sort = Instant::now();
-        matches.sort_unstable();
-        let matches = SetBuf::new_unchecked(matches);
-        debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
-        return matches;
-    }
-
    let mut padded_matches = Vec::with_capacity(matches.len());

+    let before_sort = Instant::now();
    // we sort the matches by word index to make them rewritable
    matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index));
+    debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());

+    let before_padding = Instant::now();
    // for each attribute of each document
    for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
        // padding will only be applied
@ -145,6 +140,8 @@ fn multiword_rewrite_matches(
        document_matches.sort_unstable();
    }

+    debug!("padding matches took {:.02?}", before_padding.elapsed());
+
    // With this check we can see that the loop above takes something
    // like 43% of the search time even when no rewrite is needed.
    // assert_eq!(before_matches, padded_matches);
@ -163,7 +160,18 @@ fn fetch_raw_documents(
    let mut matches = Vec::new();
    let mut highlights = Vec::new();

+    let words = match main_store.words_fst(reader)? {
+        Some(words) => words,
+        None => return Ok(Vec::new()),
+    };
+
    let before_automatons_groups_loop = Instant::now();
+    let mut doc_indexes_rewrite = Duration::default();
+    let mut retrieve_postings_lists = Duration::default();
+    let mut stream_reserve = Duration::default();
+    let mut covered_area_time = Duration::default();
+    let mut eval_time = Duration::default();
+
    for group in automatons_groups {
        let AutomatonGroup { is_phrase_query, automatons } = group;
        let phrase_query_len = automatons.len();
@ -173,29 +181,39 @@ fn fetch_raw_documents(
            let Automaton { index, is_exact, query_len, query, .. } = automaton;
            let dfa = automaton.dfa();

-            let words = match main_store.words_fst(reader)? {
-                Some(words) => words,
-                None => return Ok(Vec::new()),
-            };
+            let before_stream_loop = Instant::now();
+            let mut stream_count = 0;

            let mut stream = words.search(&dfa).into_stream();
            while let Some(input) = stream.next() {
+                let before_eval_time = Instant::now();
                let distance = dfa.eval(input).to_u8();
+                eval_time += before_eval_time.elapsed();
+
                let is_exact = *is_exact && distance == 0 && input.len() == *query_len;

+                stream_count += 1;
+
+                let before_covered_area = Instant::now();
                let covered_area = if *query_len > input.len() {
                    input.len()
                } else {
                    prefix_damerau_levenshtein(query.as_bytes(), input).1
                };
+                covered_area_time += before_covered_area.elapsed();

+                let before_retrieve_postings_lists = Instant::now();
                let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
                    Some(doc_indexes) => doc_indexes,
                    None => continue,
                };
+                retrieve_postings_lists += before_retrieve_postings_lists.elapsed();

+                let before_stream_reserve = Instant::now();
                tmp_matches.reserve(doc_indexes.len());
+                stream_reserve += before_stream_reserve.elapsed();

+                let before_doc_indexes_rewrite = Instant::now();
                for di in doc_indexes.as_ref() {
                    let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
                    if let Some(attribute) = attribute {
@ -219,7 +237,9 @@ fn fetch_raw_documents(
                        tmp_matches.push((di.document_id, id, match_, highlight));
                    }
                }
+                doc_indexes_rewrite += before_doc_indexes_rewrite.elapsed();
            }
+            debug!("{:?} took {:.02?} ({} words)", query, before_stream_loop.elapsed(), stream_count);
        }

        if *is_phrase_query {
@ -249,6 +269,10 @@ fn fetch_raw_documents(
            }
        } else {
            let before_rerewrite = Instant::now();
+
+            matches.reserve(tmp_matches.len());
+            highlights.reserve(tmp_matches.len());
+
            for (id, _, match_, highlight) in tmp_matches {
                matches.push((id, match_));
                highlights.push((id, highlight));
@ -257,13 +281,18 @@ fn fetch_raw_documents(
        }
    }
    debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed());
+    debug!("doc_indexes_rewrite took {:.02?}", doc_indexes_rewrite);
+    debug!("retrieve_postings_lists took {:.02?}", retrieve_postings_lists);
+    debug!("stream reserve took {:.02?}", stream_reserve);
+    debug!("covered area took {:.02?}", covered_area_time);
+    debug!("eval value took {:.02?}", eval_time);

-    {
-        let mut cloned = matches.clone();
-        let before_sort_test = Instant::now();
-        cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance));
-        debug!("sorting test took {:.02?}", before_sort_test.elapsed());
-    }
+    // {
+    //     let mut cloned = matches.clone();
+    //     let before_sort_test = Instant::now();
+    //     cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance));
+    //     debug!("sorting test took {:.02?}", before_sort_test.elapsed());
+    // }

    let before_multiword_rewrite_matches = Instant::now();
    debug!("number of matches before rewrite {}", matches.len());
@ -279,7 +308,6 @@ fn fetch_raw_documents(
    };
    debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed());

-
    let before_raw_documents = Instant::now();
    let raw_documents = raw_documents_from(matches, highlights);
    debug!("raw_documents took {:.02?}", before_raw_documents.elapsed());
@ -356,29 +384,12 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
        range: Range<usize>,
    ) -> MResult<Vec<Document>> {
        match self.distinct {
-            Some((distinct, distinct_size)) => raw_query_with_distinct(
+            Some((distinct, distinct_size)) => unimplemented!("distinct"),
+            None => bucket_sort(
                reader,
                query,
                range,
-                self.filter,
-                distinct,
-                distinct_size,
-                self.timeout,
-                self.criteria,
-                self.searchable_attrs,
-                self.main_store,
-                self.postings_lists_store,
-                self.documents_fields_counts_store,
-                self.synonyms_store,
-            ),
-            None => raw_query(
-                reader,
-                query,
-                range,
-                self.filter,
-                self.timeout,
-                self.criteria,
-                self.searchable_attrs,
+                // self.criteria,
                self.main_store,
                self.postings_lists_store,
                self.documents_fields_counts_store,
@ -472,6 +483,8 @@ where
            }
        }

+        let before_bucket_sort = Instant::now();
+
        let mut groups = vec![raw_documents.as_mut_slice()];

        'criteria: for criterion in criteria.as_ref() {
@ -520,6 +533,8 @@ where
            }
        }

+        debug!("bucket_sort took {:.02?}", before_bucket_sort.elapsed());
+
        // once we classified the documents related to the current
        // automatons we save that as the next valid result
        let iter = raw_documents