mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-27 04:25:06 +08:00
feat: Do not save the unidecoded form of words containing CJK characters
This commit is contained in:
parent
24f265a963
commit
c9e0ad132c
@ -8,7 +8,7 @@ use crate::database::serde::SerializerError;
|
|||||||
use crate::database::schema::SchemaAttr;
|
use crate::database::schema::SchemaAttr;
|
||||||
use crate::tokenizer::TokenizerBuilder;
|
use crate::tokenizer::TokenizerBuilder;
|
||||||
use crate::tokenizer::Token;
|
use crate::tokenizer::Token;
|
||||||
use crate::{DocumentId, DocIndex};
|
use crate::{is_cjk, DocumentId, DocIndex};
|
||||||
|
|
||||||
pub struct IndexerSerializer<'a, 'b, B> {
|
pub struct IndexerSerializer<'a, 'b, B> {
|
||||||
pub tokenizer_builder: &'a B,
|
pub tokenizer_builder: &'a B,
|
||||||
@ -65,13 +65,16 @@ where B: TokenizerBuilder
|
|||||||
if self.stop_words.contains(&word_lower) { continue }
|
if self.stop_words.contains(&word_lower) { continue }
|
||||||
|
|
||||||
// and the unidecoded lowercased version
|
// and the unidecoded lowercased version
|
||||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
if !word_lower.chars().any(is_cjk) {
|
||||||
if word_lower != word_unidecoded {
|
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||||
let char_index = char_index as u32;
|
let word_unidecoded = word_unidecoded.trim();
|
||||||
let char_length = length;
|
if word_lower != word_unidecoded {
|
||||||
|
let char_index = char_index as u32;
|
||||||
|
let char_length = length;
|
||||||
|
|
||||||
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
|
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
|
||||||
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index)?;
|
self.update.insert_doc_index(word_unidecoded.as_bytes().to_vec(), doc_index)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let char_index = char_index as u32;
|
let char_index = char_index as u32;
|
||||||
|
Loading…
Reference in New Issue
Block a user