mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-18 08:48:32 +08:00
Make the RawIndexer support stop words
This commit is contained in:
parent
a226fd23c3
commit
ff7dde7522
@ -11,6 +11,7 @@ type Word = Vec<u8>; // TODO make it be a SmallVec
|
|||||||
|
|
||||||
pub struct RawIndexer {
|
pub struct RawIndexer {
|
||||||
word_limit: usize, // the maximum number of indexed words
|
word_limit: usize, // the maximum number of indexed words
|
||||||
|
stop_words: fst::Set,
|
||||||
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
|
words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
|
||||||
docs_words: HashMap<DocumentId, Vec<Word>>,
|
docs_words: HashMap<DocumentId, Vec<Word>>,
|
||||||
}
|
}
|
||||||
@ -21,13 +22,14 @@ pub struct Indexed {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl RawIndexer {
|
impl RawIndexer {
|
||||||
pub fn new() -> RawIndexer {
|
pub fn new(stop_words: fst::Set) -> RawIndexer {
|
||||||
RawIndexer::with_word_limit(1000)
|
RawIndexer::with_word_limit(stop_words, 1000)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn with_word_limit(limit: usize) -> RawIndexer {
|
pub fn with_word_limit(stop_words: fst::Set, limit: usize) -> RawIndexer {
|
||||||
RawIndexer {
|
RawIndexer {
|
||||||
word_limit: limit,
|
word_limit: limit,
|
||||||
|
stop_words,
|
||||||
words_doc_indexes: BTreeMap::new(),
|
words_doc_indexes: BTreeMap::new(),
|
||||||
docs_words: HashMap::new(),
|
docs_words: HashMap::new(),
|
||||||
}
|
}
|
||||||
@ -56,6 +58,7 @@ impl RawIndexer {
|
|||||||
id,
|
id,
|
||||||
attr,
|
attr,
|
||||||
self.word_limit,
|
self.word_limit,
|
||||||
|
&self.stop_words,
|
||||||
&mut self.words_doc_indexes,
|
&mut self.words_doc_indexes,
|
||||||
&mut self.docs_words,
|
&mut self.docs_words,
|
||||||
);
|
);
|
||||||
@ -87,6 +90,7 @@ impl RawIndexer {
|
|||||||
id,
|
id,
|
||||||
attr,
|
attr,
|
||||||
self.word_limit,
|
self.word_limit,
|
||||||
|
&self.stop_words,
|
||||||
&mut self.words_doc_indexes,
|
&mut self.words_doc_indexes,
|
||||||
&mut self.docs_words,
|
&mut self.docs_words,
|
||||||
);
|
);
|
||||||
@ -118,6 +122,7 @@ impl RawIndexer {
|
|||||||
id,
|
id,
|
||||||
attr,
|
attr,
|
||||||
self.word_limit,
|
self.word_limit,
|
||||||
|
&self.stop_words,
|
||||||
&mut self.words_doc_indexes,
|
&mut self.words_doc_indexes,
|
||||||
&mut self.docs_words,
|
&mut self.docs_words,
|
||||||
);
|
);
|
||||||
@ -152,17 +157,12 @@ impl RawIndexer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for RawIndexer {
|
|
||||||
fn default() -> Self {
|
|
||||||
Self::new()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn index_token(
|
fn index_token(
|
||||||
token: Token,
|
token: Token,
|
||||||
id: DocumentId,
|
id: DocumentId,
|
||||||
attr: SchemaAttr,
|
attr: SchemaAttr,
|
||||||
word_limit: usize,
|
word_limit: usize,
|
||||||
|
stop_words: &fst::Set,
|
||||||
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
|
words_doc_indexes: &mut BTreeMap<Word, Vec<DocIndex>>,
|
||||||
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
|
docs_words: &mut HashMap<DocumentId, Vec<Word>>,
|
||||||
) -> bool {
|
) -> bool {
|
||||||
@ -170,6 +170,10 @@ fn index_token(
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if stop_words.contains(&token.word) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
match token_to_docindex(id, attr, token) {
|
match token_to_docindex(id, attr, token) {
|
||||||
Some(docindex) => {
|
Some(docindex) => {
|
||||||
let word = Vec::from(token.word);
|
let word = Vec::from(token.word);
|
||||||
@ -207,7 +211,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn strange_apostrophe() {
|
fn strange_apostrophe() {
|
||||||
let mut indexer = RawIndexer::new();
|
let mut indexer = RawIndexer::new(fst::Set::default());
|
||||||
|
|
||||||
let docid = DocumentId(0);
|
let docid = DocumentId(0);
|
||||||
let attr = SchemaAttr(0);
|
let attr = SchemaAttr(0);
|
||||||
@ -231,7 +235,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn strange_apostrophe_in_sequence() {
|
fn strange_apostrophe_in_sequence() {
|
||||||
let mut indexer = RawIndexer::new();
|
let mut indexer = RawIndexer::new(fst::Set::default());
|
||||||
|
|
||||||
let docid = DocumentId(0);
|
let docid = DocumentId(0);
|
||||||
let attr = SchemaAttr(0);
|
let attr = SchemaAttr(0);
|
||||||
|
@ -87,7 +87,6 @@ pub fn apply_documents_addition(
|
|||||||
addition: Vec<serde_json::Value>,
|
addition: Vec<serde_json::Value>,
|
||||||
) -> MResult<()> {
|
) -> MResult<()> {
|
||||||
let mut documents_additions = HashMap::new();
|
let mut documents_additions = HashMap::new();
|
||||||
let mut indexer = RawIndexer::new();
|
|
||||||
|
|
||||||
let schema = match main_store.schema(writer)? {
|
let schema = match main_store.schema(writer)? {
|
||||||
Some(schema) => schema,
|
Some(schema) => schema,
|
||||||
@ -124,7 +123,14 @@ pub fn apply_documents_addition(
|
|||||||
None => RankedMap::default(),
|
None => RankedMap::default(),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let stop_words = match main_store.stop_words_fst(writer)? {
|
||||||
|
Some(stop_words) => stop_words,
|
||||||
|
None => fst::Set::default(),
|
||||||
|
};
|
||||||
|
|
||||||
// 3. index the documents fields in the stores
|
// 3. index the documents fields in the stores
|
||||||
|
let mut indexer = RawIndexer::new(stop_words);
|
||||||
|
|
||||||
for (document_id, document) in documents_additions {
|
for (document_id, document) in documents_additions {
|
||||||
let serializer = Serializer {
|
let serializer = Serializer {
|
||||||
txn: writer,
|
txn: writer,
|
||||||
@ -180,8 +186,13 @@ pub fn reindex_all_documents(
|
|||||||
postings_lists_store.clear(writer)?;
|
postings_lists_store.clear(writer)?;
|
||||||
docs_words_store.clear(writer)?;
|
docs_words_store.clear(writer)?;
|
||||||
|
|
||||||
|
let stop_words = match main_store.stop_words_fst(writer)? {
|
||||||
|
Some(stop_words) => stop_words,
|
||||||
|
None => fst::Set::default(),
|
||||||
|
};
|
||||||
|
|
||||||
// 3. re-index one document by one document (otherwise we make the borrow checker unhappy)
|
// 3. re-index one document by one document (otherwise we make the borrow checker unhappy)
|
||||||
let mut indexer = RawIndexer::new();
|
let mut indexer = RawIndexer::new(stop_words);
|
||||||
let mut ram_store = HashMap::new();
|
let mut ram_store = HashMap::new();
|
||||||
|
|
||||||
for document_id in documents_ids_to_reindex {
|
for document_id in documents_ids_to_reindex {
|
||||||
|
Loading…
Reference in New Issue
Block a user