use std::borrow::Cow;
use std::collections::{BTreeMap, HashMap};
use std::convert::TryFrom;

use deunicode::deunicode_with_tofu;
use meilisearch_schema::IndexedPos;
use meilisearch_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer};
use sdset::SetBuf;

use crate::{DocIndex, DocumentId};
use crate::FstSetCow;

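// Maximum length, in bytes, of a word accepted into the index; longer words
// are skipped in `index_token` below.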
const WORD_LENGTH_LIMIT: usize = 80;

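/// A single indexed word, stored as raw UTF-8 bytes.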
type Word = Vec<u8>; // TODO make it be a SmallVec

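/// Accumulates, for a batch of documents, which words appear in which
/// documents and at which positions.
///
/// Text is fed in with `index_text` or `index_text_seq`, and `build`
/// consumes the indexer to produce the final `Indexed` result. A minimal
/// usage sketch, mirroring the tests at the bottom of this file:
///
/// ```ignore
/// let mut indexer = RawIndexer::new(fst::Set::default());
/// indexer.index_text(DocumentId(0), IndexedPos(0), "some text to index");
/// let Indexed { words_doc_indexes, docs_words } = indexer.build();
/// ```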
pub struct RawIndexer<A> {
    word_limit: usize, // the maximum number of indexed words
    stop_words: fst::Set<A>,
    words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
    docs_words: HashMap<DocumentId, Vec<Word>>,
}

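/// The final output of a `RawIndexer`: the `DocIndex` set for every indexed
/// word, and an fst set of words for every document.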
pub struct Indexed<'a> {
    pub words_doc_indexes: BTreeMap<Word, SetBuf<DocIndex>>,
    pub docs_words: HashMap<DocumentId, FstSetCow<'a>>,
}

impl<A> RawIndexer<A> {
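    /// Creates an indexer that uses the given stop words and the default
    /// limit of 1000 indexed words per text.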
    pub fn new(stop_words: fst::Set<A>) -> RawIndexer<A> {
        RawIndexer::with_word_limit(stop_words, 1000)
    }

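    /// Same as `new`, but with an explicit limit on the number of words
    /// indexed per text.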
    pub fn with_word_limit(stop_words: fst::Set<A>, limit: usize) -> RawIndexer<A> {
        RawIndexer {
            word_limit: limit,
            stop_words,
            words_doc_indexes: BTreeMap::new(),
            docs_words: HashMap::new(),
        }
    }
}

impl<A: AsRef<[u8]>> RawIndexer<A> {
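    /// Tokenizes `text` and indexes each token for document `id` at the
    /// attribute position `indexed_pos`. Returns the number of tokens that
    /// were processed.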
    pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
        let mut number_of_words = 0;

        for token in Tokenizer::new(text) {
            let must_continue = index_token(
                token,
                id,
                indexed_pos,
                self.word_limit,
                &self.stop_words,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );

            number_of_words += 1;

            if !must_continue {
                break;
            }
        }

        number_of_words
    }

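    /// Indexes a sequence of text fragments for the same document and
    /// attribute, feeding them through a single `SeqTokenizer`.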
    pub fn index_text_seq<'s, I>(&mut self, id: DocumentId, indexed_pos: IndexedPos, iter: I)
    where
        I: IntoIterator<Item = &'s str>,
    {
        let iter = iter.into_iter();
        for token in SeqTokenizer::new(iter) {
            let must_continue = index_token(
                token,
                id,
                indexed_pos,
                self.word_limit,
                &self.stop_words,
                &mut self.words_doc_indexes,
                &mut self.docs_words,
            );

            if !must_continue {
                break;
            }
        }
    }

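    /// Consumes the indexer and produces the final `Indexed` value: the
    /// `DocIndex` set for every word, plus an fst set of words for every
    /// document.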
    pub fn build(self) -> Indexed<'static> {
        let words_doc_indexes = self
            .words_doc_indexes
            .into_iter()
            .map(|(word, indexes)| (word, SetBuf::from_dirty(indexes)))
            .collect();

        let docs_words = self
            .docs_words
            .into_iter()
            .map(|(id, mut words)| {
                words.sort_unstable();
                words.dedup();
                let fst = fst::Set::from_iter(words).unwrap().map_data(Cow::Owned).unwrap();
                (id, fst)
            })
            .collect();

        Indexed {
            words_doc_indexes,
            docs_words,
        }
    }
}

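/// Indexes a single token for the given document and attribute.
///
/// The token is lowercased and checked against the stop words; accepted
/// words are stored in their lowercased form and, for non-CJK words, also in
/// a deunicoded form. Returns `false` when indexing should stop: either the
/// word limit was reached or the token could not be converted to a
/// `DocIndex`.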
fn index_token<A>(
    token: Token,
    id: DocumentId,
    indexed_pos: IndexedPos,
    word_limit: usize,
    stop_words: &fst::Set<A>,
    words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
    docs_words: &mut HashMap<DocumentId, Vec<Word>>,
) -> bool
where A: AsRef<[u8]>,
{
    if token.word_index >= word_limit {
        return false;
    }

    let lower = token.word.to_lowercase();
    let token = Token {
        word: &lower,
        ..token
    };

    if !stop_words.contains(&token.word) {
        match token_to_docindex(id, indexed_pos, token) {
            Some(docindex) => {
                let word = Vec::from(token.word);

                if word.len() <= WORD_LENGTH_LIMIT {
                    words_doc_indexes
                        .entry(word.clone())
                        .or_insert_with(Vec::new)
                        .push(docindex);
                    docs_words.entry(id).or_insert_with(Vec::new).push(word);

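                    // Also index a transliterated (deunicoded) copy of the
                    // word, so that accented and unaccented spellings match.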
                    if !lower.contains(is_cjk) {
                        let unidecoded = deunicode_with_tofu(&lower, "");
                        if unidecoded != lower && !unidecoded.is_empty() {
                            let word = Vec::from(unidecoded);
                            if word.len() <= WORD_LENGTH_LIMIT {
                                words_doc_indexes
                                    .entry(word.clone())
                                    .or_insert_with(Vec::new)
                                    .push(docindex);
                                docs_words.entry(id).or_insert_with(Vec::new).push(word);
                            }
                        }
                    }
                }
            }
            None => return false,
        }
    }

    true
}

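/// Converts a token into a `DocIndex`, returning `None` if any of its
/// positions does not fit in a `u16`.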
fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: Token) -> Option<DocIndex> {
    let word_index = u16::try_from(token.word_index).ok()?;
    let char_index = u16::try_from(token.char_index).ok()?;
    let char_length = u16::try_from(token.word.chars().count()).ok()?;

    let docindex = DocIndex {
        document_id: id,
        attribute: indexed_pos.0,
        word_index,
        char_index,
        char_length,
    };

    Some(docindex)
}

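// The tests below cover apostrophe splitting, stop words, and the
// deunicode fallback for accented words and emoji-only input.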
#[cfg(test)]
mod tests {
    use super::*;
    use meilisearch_schema::IndexedPos;

    #[test]
    fn strange_apostrophe() {
        let mut indexer = RawIndexer::new(fst::Set::default());

        let docid = DocumentId(0);
        let indexed_pos = IndexedPos(0);
        let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
        indexer.index_text(docid, indexed_pos, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
        assert!(words_doc_indexes
            .get(&"éteindre".to_owned().into_bytes())
            .is_some());
    }

    #[test]
    fn strange_apostrophe_in_sequence() {
        let mut indexer = RawIndexer::new(fst::Set::default());

        let docid = DocumentId(0);
        let indexed_pos = IndexedPos(0);
        let text = vec!["Zut, l’aspirateur, j’ai oublié de l’éteindre !"];
        indexer.index_text_seq(docid, indexed_pos, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&b"l"[..]).is_some());
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
        assert!(words_doc_indexes
            .get(&"éteindre".to_owned().into_bytes())
            .is_some());
    }

    #[test]
    fn basic_stop_words() {
        let stop_words = sdset::SetBuf::from_dirty(vec!["l", "j", "ai", "de"]);
        let stop_words = fst::Set::from_iter(stop_words).unwrap();

        let mut indexer = RawIndexer::new(stop_words);

        let docid = DocumentId(0);
        let indexed_pos = IndexedPos(0);
        let text = "Zut, l’aspirateur, j’ai oublié de l’éteindre !";
        indexer.index_text(docid, indexed_pos, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes.get(&b"l"[..]).is_none());
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        assert!(words_doc_indexes.get(&b"j"[..]).is_none());
        assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
        assert!(words_doc_indexes.get(&b"de"[..]).is_none());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
        assert!(words_doc_indexes
            .get(&"éteindre".to_owned().into_bytes())
            .is_some());
    }

    #[test]
    fn no_empty_unidecode() {
        let mut indexer = RawIndexer::new(fst::Set::default());

        let docid = DocumentId(0);
        let indexed_pos = IndexedPos(0);
        let text = "🇯🇵";
        indexer.index_text(docid, indexed_pos, text);

        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        assert!(words_doc_indexes
            .get(&"🇯🇵".to_owned().into_bytes())
            .is_some());
    }
}