meilisearch/meilidb-data/src/indexer.rs

use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;

use deunicode::deunicode_with_tofu;
use meilidb_core::{DocumentId, DocIndex};
use meilidb_schema::SchemaAttr;
use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
use sdset::SetBuf;

/// Bytes of an indexed word; used as the key of the words-to-positions map.
type Word = Vec<u8>; // TODO make it be a SmallVec

/// Accumulates, token by token, the words found in document texts.
///
/// The collected data stays unsorted until `build` turns it into an `Indexed`.
pub struct Indexer {
    word_limit: usize, // the maximum number of indexed words
    /// Every position recorded so far for each word, in insertion order.
    words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
    /// Every word recorded so far for each document; may contain duplicates.
    docs_words: HashMap<DocumentId, Vec<Word>>,
}

/// Finalized output of an `Indexer`, as produced by `Indexer::build`.
pub struct Indexed {
    /// For each word, the set of positions at which it was indexed.
    pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
    /// For each document, the set of words that were indexed for it.
    pub docs_words: HashMap<DocumentId, fst::Set>,
}

impl Indexer {
    pub fn new() -> Indexer {
        Indexer::with_word_limit(1000)
    }

    pub fn with_word_limit(limit: usize) -> Indexer {
        Indexer {
            word_limit: limit,
            words_doc_indexes: BTreeMap::new(),
            docs_words: HashMap::new(),
        }
    }

    pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
        for token in Tokenizer::new(text) {
            let must_continue = index_token(
                token,
                id,
                attr,
                self.word_limit,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );

            if !must_continue { break }
        }
    }

    pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
    where I: IntoIterator<Item=&'a str>,
    {
        let iter = iter.into_iter();
        for token in SeqTokenizer::new(iter) {
            let must_continue = index_token(
                token,
                id,
                attr,
                self.word_limit,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );

            if !must_continue { break }
        }
    }

    pub fn build(self) -> Indexed {
        let words_doc_indexes = self.words_doc_indexes
            .into_iter()
            .map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
            .collect();

        let docs_words = self.docs_words
            .into_iter()
            .map(|(id, mut words)| {
                words.sort_unstable();
                words.dedup();
                (id, fst::Set::from_iter(words).unwrap())
            })
            .collect();

        Indexed { words_doc_indexes, docs_words }
    }
}

/// Indexes one token of attribute `attr` of document `id`.
///
/// The lowercased word is always recorded. When it contains no CJK
/// character, its `deunicode` transliteration is recorded as well, provided
/// the transliteration is non-empty and differs from the lowercased word.
///
/// Returns `false` when the caller must stop feeding tokens: either the
/// token's `word_index` reached `word_limit`, or the token could not be
/// converted into a `DocIndex`. Returns `true` otherwise.
fn index_token(
    token: Token,
    id: DocumentId,
    attr: SchemaAttr,
    word_limit: usize,
    words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
    docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool
{
    /// Records `token` in both maps; returns `false` when the token cannot
    /// be converted into a `DocIndex`, in which case nothing is recorded.
    fn record(
        token: Token,
        id: DocumentId,
        attr: SchemaAttr,
        words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
        docs_words: &mut HashMap<DocumentId, Vec<Word>>,
    ) -> bool
    {
        match token_to_docindex(id, attr, token) {
            Some(docindex) => {
                let word = Vec::from(token.word);
                words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
                docs_words.entry(id).or_insert_with(Vec::new).push(word);
                true
            },
            None => false,
        }
    }

    if token.word_index >= word_limit { return false }

    let lower = token.word.to_lowercase();
    let token = Token { word: &lower, ..token };
    if !record(token, id, attr, words_doc_indexes, docs_words) { return false }

    // Words containing CJK characters are never transliterated.
    if lower.contains(is_cjk) { return true }

    let unidecoded = deunicode_with_tofu(&lower, "");

    // With an empty placeholder the transliteration can come out empty;
    // indexing an empty word would only pollute the index, so skip it.
    if unidecoded.is_empty() { return true }

    if unidecoded != lower {
        let token = Token { word: &unidecoded, ..token };
        return record(token, id, attr, words_doc_indexes, docs_words);
    }

    true
}

fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
    let word_index = u16::try_from(token.word_index).ok()?;
    let char_index = u16::try_from(token.char_index).ok()?;
    let char_length = u16::try_from(token.word.chars().count()).ok()?;

    let docindex = DocIndex {
        document_id: id,
        attribute: attr.0,
        word_index,
        char_index,
        char_length,
    };

    Some(docindex)
}
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`use std::collections::{BTreeMap, HashMap};`
feat: Introduce the Indexer struct 2019-04-15 21:16:53 +08:00			`use std::convert::TryFrom;`

feat: Index unidecoded words 2019-04-25 03:06:00 +08:00			`use deunicode::deunicode_with_tofu;`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`use meilidb_core::{DocumentId, DocIndex};`
feat: Move the Schema to its own workspace crate 2019-05-29 21:26:18 +08:00			`use meilidb_schema::SchemaAttr;`
feat: Index unidecoded words 2019-04-25 03:06:00 +08:00			`use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};`
feat: Introduce the DocumentsAddition type 2019-05-09 20:23:39 +08:00			`use sdset::SetBuf;`
feat: Introduce the Indexer struct 2019-04-15 21:16:53 +08:00
			`type Word = Vec<u8>; // TODO make it be a SmallVec`

			`pub struct Indexer {`
			`word_limit: usize, // the maximum number of indexed words`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,`
feat: Store all documents words by document rather than by attribute 2019-05-13 22:22:36 +08:00			`docs_words: HashMap<DocumentId, Vec<Word>>,`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`}`

			`pub struct Indexed {`
			`pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,`
feat: Store all documents words by document rather than by attribute 2019-05-13 22:22:36 +08:00			`pub docs_words: HashMap<DocumentId, fst::Set>,`
feat: Introduce the Indexer struct 2019-04-15 21:16:53 +08:00			`}`

			`impl Indexer {`
			`pub fn new() -> Indexer {`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`Indexer::with_word_limit(1000)`
feat: Introduce the Indexer struct 2019-04-15 21:16:53 +08:00			`}`

feat: Introduce the Serializer Which will serialize documents fields as message pack in the kv-store 2019-04-19 21:50:53 +08:00			`pub fn with_word_limit(limit: usize) -> Indexer {`
			`Indexer {`
			`word_limit: limit,`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`words_doc_indexes: BTreeMap::new(),`
feat: Store all documents words by document rather than by attribute 2019-05-13 22:22:36 +08:00			`docs_words: HashMap::new(),`
feat: Introduce the Serializer Which will serialize documents fields as message pack in the kv-store 2019-04-19 21:50:53 +08:00			`}`
			`}`

feat: Introduce the Indexer struct 2019-04-15 21:16:53 +08:00			`pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {`
			`for token in Tokenizer::new(text) {`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`let must_continue = index_token(`
			`token,`
			`id,`
			`attr,`
			`self.word_limit,`
			`&mut self.words_doc_indexes,`
feat: Store all documents words by document rather than by attribute 2019-05-13 22:22:36 +08:00			`&mut self.docs_words,`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`);`

feat: Index unidecoded words 2019-04-25 03:06:00 +08:00			`if !must_continue { break }`
feat: Introduce the Indexer struct 2019-04-15 21:16:53 +08:00			`}`
			`}`

			`pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)`
			`where I: IntoIterator<Item=&'a str>,`
			`{`
			`let iter = iter.into_iter();`
			`for token in SeqTokenizer::new(iter) {`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`let must_continue = index_token(`
			`token,`
			`id,`
			`attr,`
			`self.word_limit,`
			`&mut self.words_doc_indexes,`
feat: Store all documents words by document rather than by attribute 2019-05-13 22:22:36 +08:00			`&mut self.docs_words,`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`);`

feat: Index unidecoded words 2019-04-25 03:06:00 +08:00			`if !must_continue { break }`
feat: Introduce the Indexer struct 2019-04-15 21:16:53 +08:00			`}`
			`}`

feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`pub fn build(self) -> Indexed {`
			`let words_doc_indexes = self.words_doc_indexes`
			`.into_iter()`
feat: Use safest SetBuf constructor instead of new_unchecked 2019-05-22 00:02:14 +08:00			`.map(\|(word, indexes)\| (word, SetBuf::from_dirty(indexes)))`
			`.collect();`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00
feat: Store all documents words by document rather than by attribute 2019-05-13 22:22:36 +08:00			`let docs_words = self.docs_words`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`.into_iter()`
feat: Store all documents words by document rather than by attribute 2019-05-13 22:22:36 +08:00			`.map(\|(id, mut words)\| {`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`words.sort_unstable();`
			`words.dedup();`
feat: Store all documents words by document rather than by attribute 2019-05-13 22:22:36 +08:00			`(id, fst::Set::from_iter(words).unwrap())`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`})`
			`.collect();`

feat: Store all documents words by document rather than by attribute 2019-05-13 22:22:36 +08:00			`Indexed { words_doc_indexes, docs_words }`
feat: Introduce the Indexer struct 2019-04-15 21:16:53 +08:00			`}`
			`}`

feat: Index unidecoded words 2019-04-25 03:06:00 +08:00			`fn index_token(`
			`token: Token,`
			`id: DocumentId,`
			`attr: SchemaAttr,`
			`word_limit: usize,`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,`
feat: Store all documents words by document rather than by attribute 2019-05-13 22:22:36 +08:00			`docs_words: &mut HashMap<DocumentId, Vec<Word>>,`
feat: Index unidecoded words 2019-04-25 03:06:00 +08:00			`) -> bool`
			`{`
			`if token.word_index >= word_limit { return false }`

			`let lower = token.word.to_lowercase();`
			`let token = Token { word: &lower, ..token };`
			`match token_to_docindex(id, attr, token) {`
			`Some(docindex) => {`
			`let word = Vec::from(token.word);`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);`
feat: Store all documents words by document rather than by attribute 2019-05-13 22:22:36 +08:00			`docs_words.entry(id).or_insert_with(Vec::new).push(word);`
feat: Index unidecoded words 2019-04-25 03:06:00 +08:00			`},`
			`None => return false,`
			`}`

			`if !lower.contains(is_cjk) {`
			`let unidecoded = deunicode_with_tofu(&lower, "");`
			`if unidecoded != lower {`
			`let token = Token { word: &unidecoded, ..token };`
			`match token_to_docindex(id, attr, token) {`
			`Some(docindex) => {`
			`let word = Vec::from(token.word);`
feat: Introduce the DocumentsDeletion type 2019-05-09 22:16:07 +08:00			`words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);`
feat: Store all documents words by document rather than by attribute 2019-05-13 22:22:36 +08:00			`docs_words.entry(id).or_insert_with(Vec::new).push(word);`
feat: Index unidecoded words 2019-04-25 03:06:00 +08:00			`},`
			`None => return false,`
			`}`
			`}`
			`}`

			`true`
			`}`

			`fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {`
feat: Introduce the Indexer struct 2019-04-15 21:16:53 +08:00			`let word_index = u16::try_from(token.word_index).ok()?;`
			`let char_index = u16::try_from(token.char_index).ok()?;`
			`let char_length = u16::try_from(token.word.chars().count()).ok()?;`

			`let docindex = DocIndex {`
			`document_id: id,`
			`attribute: attr.0,`
chore: Do a little clippy pass 2019-05-22 17:00:58 +08:00			`word_index,`
			`char_index,`
			`char_length,`
feat: Introduce the Indexer struct 2019-04-15 21:16:53 +08:00			`};`

			`Some(docindex)`
			`}`