meilisearch/meilidb-data/src/indexer.rs

use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;

use deunicode::deunicode_with_tofu;
use meilidb_core::{DocumentId, DocIndex};
use meilidb_schema::SchemaAttr;
use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
use sdset::SetBuf;

/// Bytes of an indexed word, used as the key type of the indexer's maps.
type Word = Vec<u8>; // TODO make it be a SmallVec

/// Accumulates the words found in the texts of documents until `build`
/// converts the collected data into an `Indexed`.
pub struct Indexer {
    // Indexing of a text stops at the first token whose `word_index`
    // reaches this value.
    word_limit: usize, // the maximum number of indexed words
    // For each word, every `DocIndex` recorded for it, in insertion order;
    // the list may contain duplicates until `build` is called.
    words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
    // For each document, every word recorded for it, in insertion order;
    // sorted and deduplicated only in `build`.
    docs_words: HashMap<DocumentId, Vec<Word>>,
}

/// Result of `Indexer::build`: the accumulated words in their final
/// set-based representations.
pub struct Indexed {
    /// For each word, the set of `DocIndex` recorded for it.
    pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
    /// For each document, the set of words recorded for it.
    pub docs_words: HashMap<DocumentId, fst::Set>,
}

impl Indexer {
    /// Creates an indexer with a word limit of 1000.
    pub fn new() -> Indexer {
        Indexer::with_word_limit(1000)
    }

    /// Creates an indexer that stops indexing a text at the first token
    /// whose `word_index` reaches `limit`.
    pub fn with_word_limit(limit: usize) -> Indexer {
        Indexer {
            word_limit: limit,
            words_doc_indexes: BTreeMap::new(),
            docs_words: HashMap::new(),
        }
    }

    /// Indexes the words of `text` for the attribute `attr` of document `id`.
    ///
    /// The lowercased text is always indexed. When it contains no CJK
    /// character and its deunicoded form differs from it, that deunicoded
    /// form is indexed as well.
    pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
        let lowercase_text = text.to_lowercase();

        // The deunicoded form is never used for CJK text, so it is only
        // computed once the CJK check has passed.
        let next = if lowercase_text.contains(is_cjk) {
            None
        } else {
            let deunicoded = deunicode_with_tofu(&lowercase_text, "");
            if lowercase_text != deunicoded { Some(deunicoded) } else { None }
        };
        let iter = Some(lowercase_text).into_iter().chain(next);

        for text in iter {
            for token in Tokenizer::new(&text) {
                let must_continue = index_token(
                    token,
                    id,
                    attr,
                    self.word_limit,
                    &mut self.words_doc_indexes,
                    &mut self.docs_words,
                );

                // Only the current variant is abandoned; the other variant
                // of the text, if any, is still tokenized.
                if !must_continue { break }
            }
        }
    }

    /// Indexes a sequence of texts for the attribute `attr` of document `id`.
    ///
    /// The lowercased sequence is always indexed. The sequence is indexed a
    /// second time, with every non-CJK element replaced by its deunicoded
    /// form, when at least one element is changed by that replacement.
    pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
    where I: IntoIterator<Item=&'a str, IntoIter=IT>,
          IT: Iterator<Item = &'a str> + Clone,
    {
        // TODO serialize this to one call to the SeqTokenizer loop

        let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect();
        let iter = lowercased.iter().map(|t| t.as_str());

        for token in SeqTokenizer::new(iter) {
            let must_continue = index_token(
                token,
                id,
                attr,
                self.word_limit,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );

            if !must_continue { break }
        }

        let mut any_deunicoded = false;
        let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| {
            if lowercase_text.contains(is_cjk) { return lowercase_text }
            let deunicoded = deunicode_with_tofu(&lowercase_text, "");
            if lowercase_text != deunicoded {
                any_deunicoded = true;
                deunicoded
            } else {
                lowercase_text
            }
        }).collect();

        // When no element changed, a second pass would feed the very same
        // sequence to the tokenizer again and only push duplicates that
        // `build` removes afterwards.
        if !any_deunicoded { return }

        let iter = deunicoded.iter().map(|t| t.as_str());

        for token in SeqTokenizer::new(iter) {
            let must_continue = index_token(
                token,
                id,
                attr,
                self.word_limit,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );

            if !must_continue { break }
        }
    }

    /// Consumes the indexer and converts everything accumulated so far into
    /// an `Indexed`.
    ///
    /// # Panics
    ///
    /// Panics if `fst::Set::from_iter` rejects the sorted and deduplicated
    /// words of a document.
    pub fn build(self) -> Indexed {
        let words_doc_indexes = self.words_doc_indexes
            .into_iter()
            .map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
            .collect();

        let docs_words = self.docs_words
            .into_iter()
            .map(|(id, mut words)| {
                // The words are handed over sorted and without duplicates.
                words.sort_unstable();
                words.dedup();
                let words = fst::Set::from_iter(words)
                    .expect("sorted and deduplicated words must build a valid fst::Set");
                (id, words)
            })
            .collect();

        Indexed { words_doc_indexes, docs_words }
    }
}

impl Default for Indexer {
    /// Same as `Indexer::new`.
    fn default() -> Indexer {
        Indexer::new()
    }
}

fn index_token(
    token: Token,
    id: DocumentId,
    attr: SchemaAttr,
    word_limit: usize,
    words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
    docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool
{
    if token.word_index >= word_limit { return false }

    match token_to_docindex(id, attr, token) {
        Some(docindex) => {
            let word = Vec::from(token.word);
            words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
            docs_words.entry(id).or_insert_with(Vec::new).push(word);
        },
        None => return false,
    }

    true
}

/// Builds the `DocIndex` describing `token` inside the attribute `attr` of
/// document `id`.
///
/// Returns `None` when the token's word index, char index or length in
/// chars does not fit in a `u16`.
fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
    Some(DocIndex {
        document_id: id,
        attribute: attr.0,
        word_index: u16::try_from(token.word_index).ok()?,
        char_index: u16::try_from(token.char_index).ok()?,
        // the length is counted in chars, not in bytes
        char_length: u16::try_from(token.word.chars().count()).ok()?,
    })
}

#[cfg(test)]
mod tests {
    use super::*;

    // Sentence indexed by both tests; it is written with the typographic
    // apostrophe rather than the ASCII one.
    const SENTENCE: &str = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";

    /// Checks the words that indexing `SENTENCE` must have produced.
    fn assert_expected_words(indexed: Indexed) {
        let Indexed { words_doc_indexes, .. } = indexed;

        for word in &["l", "aspirateur", "ai", "eteindre"] {
            assert!(words_doc_indexes.contains_key(word.as_bytes()), "missing word {:?}", word);
        }

        // with the ugly apostrophe...
        assert!(words_doc_indexes.contains_key("l’éteindre".as_bytes()));
    }

    #[test]
    fn strange_apostrophe() {
        let mut indexer = Indexer::new();
        indexer.index_text(DocumentId(0), SchemaAttr(0), SENTENCE);

        assert_expected_words(indexer.build());
    }

    #[test]
    fn strange_apostrophe_in_sequence() {
        let mut indexer = Indexer::new();
        indexer.index_text_seq(DocumentId(0), SchemaAttr(0), vec![SENTENCE]);

        assert_expected_words(indexer.build());
    }
}
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								use std::collections::{BTreeMap, HashMap};
-												feat: Introduce the Indexer struct

											
										
										
											2019-04-15 21:16:53 +08:00
+								use std::convert::TryFrom;
-												feat: Index unidecoded words

											
										
										
											2019-04-25 03:06:00 +08:00
+								use deunicode::deunicode_with_tofu;
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								use meilidb_core::{DocumentId, DocIndex};
-												feat: Move the Schema to its own workspace crate

											
										
										
											2019-05-29 21:26:18 +08:00
+								use meilidb_schema::SchemaAttr;
-												feat: Index unidecoded words

											
										
										
											2019-04-25 03:06:00 +08:00
+								use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
-												feat: Introduce the DocumentsAddition type

											
										
										
											2019-05-09 20:23:39 +08:00
+								use sdset::SetBuf;
-												feat: Introduce the Indexer struct

											
										
										
											2019-04-15 21:16:53 +08:00
 								type Word = Vec<u8>; // TODO make it be a SmallVec
 								pub struct Indexer {
 								    word_limit: usize, // the maximum number of indexed words
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								    words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
-												feat: Store all documents words by document rather than by attribute

											
										
										
											2019-05-13 22:22:36 +08:00
+								    docs_words: HashMap<DocumentId, Vec<Word>>,
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								}
 								pub struct Indexed {
 								    pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
-												feat: Store all documents words by document rather than by attribute

											
										
										
											2019-05-13 22:22:36 +08:00
+								    pub docs_words: HashMap<DocumentId, fst::Set>,
-												feat: Introduce the Indexer struct

											
										
										
											2019-04-15 21:16:53 +08:00
+								}
 								impl Indexer {
 								    pub fn new() -> Indexer {
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								        Indexer::with_word_limit(1000)
-												feat: Introduce the Indexer struct

											
										
										
											2019-04-15 21:16:53 +08:00
+								    }
-												feat: Introduce the Serializer

Which will serialize documents fields as message pack in the kv-store

											
										
										
											2019-04-19 21:50:53 +08:00
+								    pub fn with_word_limit(limit: usize) -> Indexer {
 								        Indexer {
 								            word_limit: limit,
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								            words_doc_indexes: BTreeMap::new(),
-												feat: Store all documents words by document rather than by attribute

											
										
										
											2019-05-13 22:22:36 +08:00
+								            docs_words: HashMap::new(),
-												feat: Introduce the Serializer

Which will serialize documents fields as message pack in the kv-store

											
										
										
											2019-04-19 21:50:53 +08:00
+								        }
 								    }
-												feat: Introduce the Indexer struct

											
										
										
											2019-04-15 21:16:53 +08:00
+								    pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
-												feat: Improve the tokenizer by split after deunicode

											
										
										
											2019-09-02 22:54:54 +08:00
+								        let lowercase_text = text.to_lowercase();
 								        let deunicoded = deunicode_with_tofu(&lowercase_text, "");
 								        // TODO compute the deunicoded version after the cjk check
 								        let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded {
 								            Some(deunicoded)
 								        } else {
 								            None
 								        };
 								        let iter = Some(lowercase_text).into_iter().chain(next);
 								        for text in iter {
 								            for token in Tokenizer::new(&text) {
 								                let must_continue = index_token(
 								                    token,
 								                    id,
 								                    attr,
 								                    self.word_limit,
 								                    &mut self.words_doc_indexes,
 								                    &mut self.docs_words,
 								                );
 								                if !must_continue { break }
 								            }
 								        }
 								    }
 								    pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
 								    where I: IntoIterator<Item=&'a str, IntoIter=IT>,
 								          IT: Iterator<Item = &'a str> + Clone,
 								    {
 								        // TODO serialize this to one call to the SeqTokenizer loop
 								        let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect();
 								        let iter = lowercased.iter().map(|t| t.as_str());
 								        for token in SeqTokenizer::new(iter) {
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								            let must_continue = index_token(
 								                token,
 								                id,
 								                attr,
 								                self.word_limit,
 								                &mut self.words_doc_indexes,
-												feat: Store all documents words by document rather than by attribute

											
										
										
											2019-05-13 22:22:36 +08:00
+								                &mut self.docs_words,
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								            );
-												feat: Index unidecoded words

											
										
										
											2019-04-25 03:06:00 +08:00
+								            if !must_continue { break }
-												feat: Introduce the Indexer struct

											
										
										
											2019-04-15 21:16:53 +08:00
+								        }
-												feat: Improve the tokenizer by split after deunicode

											
										
										
											2019-09-02 22:54:54 +08:00
+								        let deunicoded: Vec<_> = lowercased.into_iter().map(|lowercase_text| {
 								            if lowercase_text.contains(is_cjk) { return lowercase_text }
 								            let deunicoded = deunicode_with_tofu(&lowercase_text, "");
 								            if lowercase_text != deunicoded { deunicoded } else { lowercase_text }
 								        }).collect();
 								        let iter = deunicoded.iter().map(|t| t.as_str());
-												feat: Introduce the Indexer struct

											
										
										
											2019-04-15 21:16:53 +08:00
+								        for token in SeqTokenizer::new(iter) {
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								            let must_continue = index_token(
 								                token,
 								                id,
 								                attr,
 								                self.word_limit,
 								                &mut self.words_doc_indexes,
-												feat: Store all documents words by document rather than by attribute

											
										
										
											2019-05-13 22:22:36 +08:00
+								                &mut self.docs_words,
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								            );
-												feat: Index unidecoded words

											
										
										
											2019-04-25 03:06:00 +08:00
+								            if !must_continue { break }
-												feat: Introduce the Indexer struct

											
										
										
											2019-04-15 21:16:53 +08:00
+								        }
 								    }
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								    pub fn build(self) -> Indexed {
 								        let words_doc_indexes = self.words_doc_indexes
 								            .into_iter()
-												feat: Use safest SetBuf constructor instead of new_unchecked

											
										
										
											2019-05-22 00:02:14 +08:00
+								            .map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
 								            .collect();
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
-												feat: Store all documents words by document rather than by attribute

											
										
										
											2019-05-13 22:22:36 +08:00
+								        let docs_words = self.docs_words
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								            .into_iter()
-												feat: Store all documents words by document rather than by attribute

											
										
										
											2019-05-13 22:22:36 +08:00
+								            .map(|(id, mut words)| {
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								                words.sort_unstable();
 								                words.dedup();
-												feat: Store all documents words by document rather than by attribute

											
										
										
											2019-05-13 22:22:36 +08:00
+								                (id, fst::Set::from_iter(words).unwrap())
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								            })
 								            .collect();
-												feat: Store all documents words by document rather than by attribute

											
										
										
											2019-05-13 22:22:36 +08:00
+								        Indexed { words_doc_indexes, docs_words }
-												feat: Introduce the Indexer struct

											
										
										
											2019-04-15 21:16:53 +08:00
+								    }
 								}
-												feat: Index unidecoded words

											
										
										
											2019-04-25 03:06:00 +08:00
+								fn index_token(
 								    token: Token,
 								    id: DocumentId,
 								    attr: SchemaAttr,
 								    word_limit: usize,
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								    words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
-												feat: Store all documents words by document rather than by attribute

											
										
										
											2019-05-13 22:22:36 +08:00
+								    docs_words: &mut HashMap<DocumentId, Vec<Word>>,
-												feat: Index unidecoded words

											
										
										
											2019-04-25 03:06:00 +08:00
+								) -> bool
 								{
 								    if token.word_index >= word_limit { return false }
 								    match token_to_docindex(id, attr, token) {
 								        Some(docindex) => {
 								            let word = Vec::from(token.word);
-												feat: Introduce the DocumentsDeletion type

											
										
										
											2019-05-09 22:16:07 +08:00
+								            words_doc_indexes.entry(word.clone()).or_insert_with(Vec::new).push(docindex);
-												feat: Store all documents words by document rather than by attribute

											
										
										
											2019-05-13 22:22:36 +08:00
+								            docs_words.entry(id).or_insert_with(Vec::new).push(word);
-												feat: Index unidecoded words

											
										
										
											2019-04-25 03:06:00 +08:00
+								        },
 								        None => return false,
 								    }
 								    true
 								}
 								fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
-												feat: Introduce the Indexer struct

											
										
										
											2019-04-15 21:16:53 +08:00
+								    let word_index = u16::try_from(token.word_index).ok()?;
 								    let char_index = u16::try_from(token.char_index).ok()?;
 								    let char_length = u16::try_from(token.word.chars().count()).ok()?;
 								    let docindex = DocIndex {
 								        document_id: id,
 								        attribute: attr.0,
-												chore: Do a little clippy pass

											
										
										
											2019-05-22 17:00:58 +08:00
+								        word_index,
 								        char_index,
 								        char_length,
-												feat: Introduce the Indexer struct

											
										
										
											2019-04-15 21:16:53 +08:00
+								    };
 								    Some(docindex)
 								}
-												feat: Improve the tokenizer by split after deunicode

											
										
										
											2019-09-02 22:54:54 +08:00
 								#[cfg(test)]
 								mod tests {
 								    use super::*;
 								    #[test]
 								    fn strange_apostrophe() {
 								        let mut indexer = Indexer::new();
 								        let docid = DocumentId(0);
 								        let attr = SchemaAttr(0);
 								        let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
 								        indexer.index_text(docid, attr, text);
 								        let Indexed { words_doc_indexes, .. } = indexer.build();
 								        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
 								        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
 								        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
 								        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
 								        // with the ugly apostrophe...
 								        assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some());
 								    }
 								    #[test]
 								    fn strange_apostrophe_in_sequence() {
 								        let mut indexer = Indexer::new();
 								        let docid = DocumentId(0);
 								        let attr = SchemaAttr(0);
 								        let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
 								        indexer.index_text_seq(docid, attr, text);
 								        let Indexed { words_doc_indexes, .. } = indexer.build();
 								        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
 								        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
 								        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
 								        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
 								        // with the ugly apostrophe...
 								        assert!(words_doc_indexes.get(&"l’éteindre".to_owned().into_bytes()).is_some());
 								    }
 								}