meilisearch/src/lib.rs

use std::collections::HashMap;
use std::hash::BuildHasherDefault;
use std::time::Instant;

use cow_utils::CowUtils;
use fst::{IntoStreamer, Streamer};
use fxhash::FxHasher32;
use heed::types::*;
use heed::{PolyDatabase, Database};
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
use once_cell::sync::OnceCell;
use roaring::RoaringBitmap;
use slice_group_by::StrGroupBy;

// Process-wide caches for the Levenshtein automaton builders, one per maximum
// edit distance (0, 1 and 2). They start empty and are filled on first use by
// `Index::search` through `get_or_init`, with `LevBuilder::new(<distance>, true)`.
static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();
static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();
static LEVDIST2: OnceCell<LevBuilder> = OnceCell::new();

/// A `HashMap` that hashes its keys with `FxHasher32` instead of the default hasher.
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
/// A `SmallString` backed by a 32-byte inline array.
pub type SmallString32 = smallstr::SmallString<[u8; 32]>;
/// A `SmallVec` of bytes backed by a 32-byte inline array.
pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>;
/// A big-endian `u32`; used as the key type of the `documents` database.
pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;
/// Identifier of a document, as returned by `Index::search`.
pub type DocumentId = u32;

/// Yields the alphanumeric tokens of `string`, in order of appearance.
///
/// The input is cut into consecutive groups of characters that share the same
/// `char::is_alphanumeric` answer; only the groups whose first character is
/// alphanumeric are kept, so separators (whitespace, punctuation, ...) are dropped.
pub fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {
    string
        .linear_group_by_key(|ch| ch.is_alphanumeric())
        // An empty group has no first character and is therefore rejected too.
        .filter(|group| group.starts_with(char::is_alphanumeric))
}

/// Handles to the databases that make up a search index.
///
/// All of them are opened (or created) by [`Index::new`].
#[derive(Clone)]
pub struct Index {
    /// The unnamed database; this file reads the `"headers"` and `"words-fst"`
    /// entries from it.
    pub main: PolyDatabase,
    /// The `"postings-ids"` database: maps a word to bytes that `Index::search`
    /// deserializes as a `RoaringBitmap` of document ids.
    pub postings_ids: Database<Str, ByteSlice>,
    /// The `"prefix-postings-ids"` database: maps a string key to bytes that
    /// `Index::search` deserializes as a `RoaringBitmap` of document ids.
    pub prefix_postings_ids: Database<Str, ByteSlice>,
    /// The `"documents"` database: maps a big-endian `u32` key to raw bytes.
    pub documents: Database<OwnedType<BEU32>, ByteSlice>,
}

impl Index {
    /// Creates the databases of the index inside `env`: the unnamed main
    /// database plus `"postings-ids"`, `"prefix-postings-ids"` and `"documents"`.
    ///
    /// # Errors
    ///
    /// Returns the `heed` error raised by any of the database creations.
    pub fn new(env: &heed::Env) -> heed::Result<Index> {
        let main = env.create_poly_database(None)?;
        let postings_ids = env.create_database(Some("postings-ids"))?;
        let prefix_postings_ids = env.create_database(Some("prefix-postings-ids"))?;
        let documents = env.create_database(Some("documents"))?;

        Ok(Index { main, postings_ids, prefix_postings_ids, documents })
    }

    /// Returns the raw bytes stored under the `"headers"` key of the main
    /// database, or `None` when that key is absent.
    ///
    /// The returned slice borrows from `rtxn`.
    pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result<Option<&'t [u8]>> {
        self.main.get::<_, Str, ByteSlice>(rtxn, "headers")
    }

    /// Returns at most 20 ids of documents that match every word of `query`.
    ///
    /// The query is split with [`alphanumeric_tokens`] and lowercased. Each word
    /// is resolved to a set of document ids and the sets are intersected:
    ///
    /// * The last word is treated as a prefix unless the query ends with a
    ///   whitespace character; every other word must match as a whole word.
    /// * The tolerated edit distance depends on the byte length of the word:
    ///   0 for up to 4 bytes, 1 for 5 to 8 bytes, 2 above.
    /// * A prefix word of at most 4 bytes is answered directly from
    ///   `prefix_postings_ids`; any other word is expanded through the words FST
    ///   and the postings of every matching word are unioned.
    ///
    /// An empty vector is returned when the `"words-fst"` entry is missing or
    /// when the query contains no alphanumeric token. Timing information is
    /// written to stderr.
    ///
    /// # Errors
    ///
    /// Fails when a database read fails, when the stored FST or a stored bitmap
    /// cannot be decoded, or when the FST yields a word that is not valid UTF-8.
    pub fn search(&self, rtxn: &heed::RoTxn, query: &str) -> anyhow::Result<Vec<DocumentId>> {
        let fst = match self.main.get::<_, Str, ByteSlice>(rtxn, "words-fst")? {
            Some(bytes) => fst::Set::new(bytes)?,
            None => return Ok(Vec::new()),
        };

        // Building these factories is not free, so they are cached process-wide.
        let lev0 = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
        let lev1 = LEVDIST1.get_or_init(|| LevBuilder::new(1, true));
        let lev2 = LEVDIST2.get_or_init(|| LevBuilder::new(2, true));

        let words: Vec<_> = alphanumeric_tokens(query).collect();
        // `next_back` only decodes the final character instead of walking the whole query.
        let ends_with_whitespace = query.chars().next_back().map_or(false, char::is_whitespace);
        let number_of_words = words.len();

        let mut intersect_result: Option<RoaringBitmap> = None;
        for (i, word) in words.into_iter().enumerate() {
            let word = word.cow_to_lowercase();
            let is_last = i + 1 == number_of_words;
            let is_prefix = is_last && !ends_with_whitespace;

            let before = Instant::now();

            let mut union_result = RoaringBitmap::default();
            if is_prefix && word.len() <= 4 {
                // Short prefixes have their postings precomputed: no need to
                // build an automaton nor to walk the FST.
                if let Some(ids) = self.prefix_postings_ids.get(rtxn, &word[..])? {
                    union_result = RoaringBitmap::deserialize_from(ids)?;
                }
            } else {
                // The automaton is only built here, where it is actually used.
                // Short words that are NOT prefixes also take this path so that
                // they match as whole words instead of as prefixes.
                let builder = match word.len() {
                    0..=4 => lev0,
                    5..=8 => lev1,
                    _ => lev2,
                };
                let dfa = if is_prefix {
                    builder.build_prefix_dfa(&word)
                } else {
                    builder.build_dfa(&word)
                };

                let mut count = 0;
                let mut stream = fst.search(dfa).into_stream();
                while let Some(matched) = stream.next() {
                    count += 1;
                    let matched = std::str::from_utf8(matched)?;
                    if let Some(ids) = self.postings_ids.get(rtxn, matched)? {
                        let right = RoaringBitmap::deserialize_from(ids)?;
                        union_result.union_with(&right);
                    }
                }
                eprint!("with {:?} words ", count);
            }
            eprintln!("union for {:?} took {:.02?}", word, before.elapsed());

            intersect_result = match intersect_result.take() {
                Some(mut left) => {
                    let before = Instant::now();
                    let left_len = left.len();
                    left.intersect_with(&union_result);
                    eprintln!("intersect between {:?} and {:?} gives {:?} took {:.02?}",
                        left_len, union_result.len(), left.len(), before.elapsed());
                    Some(left)
                },
                // First word: its candidates seed the intersection.
                None => Some(union_result),
            };
        }

        eprintln!("{} candidates", intersect_result.as_ref().map_or(0, |r| r.len()));

        Ok(intersect_result.unwrap_or_default().iter().take(20).collect())
    }
}
Support multiple space seperated words 2020-05-31 22:09:34 +08:00			`use std::collections::HashMap;`
			`use std::hash::BuildHasherDefault;`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`use std::time::Instant;`
Support multiple space seperated words 2020-05-31 22:09:34 +08:00
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`use cow_utils::CowUtils;`
			`use fst::{IntoStreamer, Streamer};`
Support multiple space seperated words 2020-05-31 22:09:34 +08:00			`use fxhash::FxHasher32;`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`use heed::types::*;`
			`use heed::{PolyDatabase, Database};`
Use OnceCell to cache levenshtein builders 2020-06-01 00:21:24 +08:00			`use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;`
			`use once_cell::sync::OnceCell;`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`use roaring::RoaringBitmap;`
Support multiple space seperated words 2020-05-31 22:09:34 +08:00			`use slice_group_by::StrGroupBy;`

Use OnceCell to cache levenshtein builders 2020-06-01 00:21:24 +08:00			`static LEVDIST0: OnceCell<LevBuilder> = OnceCell::new();`
			`static LEVDIST1: OnceCell<LevBuilder> = OnceCell::new();`
			`static LEVDIST2: OnceCell<LevBuilder> = OnceCell::new();`

Support multiple space seperated words 2020-05-31 22:09:34 +08:00			`pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;`
			`pub type SmallString32 = smallstr::SmallString<[u8; 32]>;`
			`pub type SmallVec32 = smallvec::SmallVec<[u8; 32]>;`
			`pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>;`
			`pub type DocumentId = u32;`

			`pub fn alphanumeric_tokens(string: &str) -> impl Iterator<Item = &str> {`
			`let is_alphanumeric = \|s: &&str\| s.chars().next().map_or(false, char::is_alphanumeric);`
			`string.linear_group_by_key(\|c\| c.is_alphanumeric()).filter(is_alphanumeric)`
			`}`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00
			`#[derive(Clone)]`
			`pub struct Index {`
			`pub main: PolyDatabase,`
			`pub postings_ids: Database<Str, ByteSlice>,`
Introduce prefix postings ids for better perfs 2020-06-01 00:20:49 +08:00			`pub prefix_postings_ids: Database<Str, ByteSlice>,`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`pub documents: Database<OwnedType<BEU32>, ByteSlice>,`
			`}`

			`impl Index {`
			`pub fn new(env: &heed::Env) -> heed::Result<Index> {`
			`let main = env.create_poly_database(None)?;`
			`let postings_ids = env.create_database(Some("postings-ids"))?;`
Introduce prefix postings ids for better perfs 2020-06-01 00:20:49 +08:00			`let prefix_postings_ids = env.create_database(Some("prefix-postings-ids"))?;`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`let documents = env.create_database(Some("documents"))?;`

Use OnceCell to cache levenshtein builders 2020-06-01 00:21:24 +08:00			`Ok(Index { main, postings_ids, prefix_postings_ids, documents })`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`}`

			`pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result<Option<&'t [u8]>> {`
			`self.main.get::<_, Str, ByteSlice>(rtxn, "headers")`
			`}`

			`pub fn search(&self, rtxn: &heed::RoTxn, query: &str) -> anyhow::Result<Vec<DocumentId>> {`
			`let fst = match self.main.get::<_, Str, ByteSlice>(rtxn, "words-fst")? {`
			`Some(bytes) => fst::Set::new(bytes)?,`
			`None => return Ok(Vec::new()),`
			`};`

			`// Building these factories is not free.`
Use OnceCell to cache levenshtein builders 2020-06-01 00:21:24 +08:00			`let lev0 = LEVDIST0.get_or_init(\|\| LevBuilder::new(0, true));`
			`let lev1 = LEVDIST1.get_or_init(\|\| LevBuilder::new(1, true));`
			`let lev2 = LEVDIST2.get_or_init(\|\| LevBuilder::new(2, true));`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00
			`let words: Vec<_> = alphanumeric_tokens(query).collect();`
Disable prefix search when query is ended by a whitespace 2020-06-05 00:36:30 +08:00			`let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace);`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`let number_of_words = words.len();`
			`let dfas = words.into_iter().enumerate().map(\|(i, word)\| {`
			`let word = word.cow_to_lowercase();`
			`let is_last = i + 1 == number_of_words;`
Disable prefix search when query is ended by a whitespace 2020-06-05 00:36:30 +08:00			`let is_prefix = is_last && !ends_with_whitespace;`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`let dfa = match word.len() {`
Disable prefix search when query is ended by a whitespace 2020-06-05 00:36:30 +08:00			`0..=4 => if is_prefix { lev0.build_prefix_dfa(&word) } else { lev0.build_dfa(&word) },`
			`5..=8 => if is_prefix { lev1.build_prefix_dfa(&word) } else { lev1.build_dfa(&word) },`
			`_ => if is_prefix { lev2.build_prefix_dfa(&word) } else { lev2.build_dfa(&word) },`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`};`
			`(word, dfa)`
			`});`

			`let mut intersect_result: Option<RoaringBitmap> = None;`
			`for (word, dfa) in dfas {`
			`let before = Instant::now();`
Introduce prefix postings ids for better perfs 2020-06-01 00:20:49 +08:00
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`let mut union_result = RoaringBitmap::default();`
Introduce prefix postings ids for better perfs 2020-06-01 00:20:49 +08:00			`if word.len() <= 4 {`
Add a little bit more debug 2020-06-01 23:52:43 +08:00			`if let Some(ids) = self.prefix_postings_ids.get(rtxn, &word[..word.len().min(5)])? {`
Introduce prefix postings ids for better perfs 2020-06-01 00:20:49 +08:00			`union_result = RoaringBitmap::deserialize_from(ids)?;`
			`}`
			`} else {`
Add a little bit more debug 2020-06-01 23:52:43 +08:00			`let mut count = 0;`
Introduce prefix postings ids for better perfs 2020-06-01 00:20:49 +08:00			`let mut stream = fst.search(dfa).into_stream();`
			`while let Some(word) = stream.next() {`
Add a little bit more debug 2020-06-01 23:52:43 +08:00			`count += 1;`
Introduce prefix postings ids for better perfs 2020-06-01 00:20:49 +08:00			`let word = std::str::from_utf8(word)?;`
			`if let Some(ids) = self.postings_ids.get(rtxn, word)? {`
			`let right = RoaringBitmap::deserialize_from(ids)?;`
			`union_result.union_with(&right);`
			`}`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`}`
Add a little bit more debug 2020-06-01 23:52:43 +08:00			`eprint!("with {:?} words ", count);`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`}`
Use OnceCell to cache levenshtein builders 2020-06-01 00:21:24 +08:00			`eprintln!("union for {:?} took {:.02?}", word, before.elapsed());`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00
			`intersect_result = match intersect_result.take() {`
			`Some(mut left) => {`
			`let before = Instant::now();`
			`let left_len = left.len();`
			`left.intersect_with(&union_result);`
Add a little bit more debug 2020-06-01 23:52:43 +08:00			`eprintln!("intersect between {:?} and {:?} gives {:?} took {:.02?}",`
			`left_len, union_result.len(), left.len(), before.elapsed());`
Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`Some(left)`
			`},`
			`None => Some(union_result),`
			`};`
			`}`

Add a little bit more debug 2020-06-01 23:52:43 +08:00			`eprintln!("{} candidates", intersect_result.as_ref().map_or(0, \|r\| r.len()));`

Reintroduce a simple HTTP server 2020-05-31 23:48:13 +08:00			`Ok(intersect_result.unwrap_or_default().iter().take(20).collect())`
			`}`
			`}`