mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-26 03:55:07 +08:00
Update charabia
This commit is contained in:
parent
c26bd68de5
commit
cc02920f2b
37
Cargo.lock
generated
37
Cargo.lock
generated
@ -934,19 +934,15 @@ dependencies = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "charabia"
|
name = "charabia"
|
||||||
version = "0.8.12"
|
version = "0.8.12"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#a95a9217265cee515708a679a2ed08ced1ac58a3"
|
||||||
checksum = "9868a22f10dee80498a8a2b6c641d80bf28ea4495fcf71c2dc4836c2dd23958c"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aho-corasick",
|
"aho-corasick",
|
||||||
"cow-utils",
|
|
||||||
"csv",
|
"csv",
|
||||||
"deunicode",
|
|
||||||
"either",
|
"either",
|
||||||
"fst",
|
"fst",
|
||||||
"irg-kvariants",
|
"irg-kvariants",
|
||||||
"jieba-rs",
|
"jieba-rs",
|
||||||
"lindera",
|
"lindera",
|
||||||
"litemap",
|
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"pinyin",
|
"pinyin",
|
||||||
"serde",
|
"serde",
|
||||||
@ -954,7 +950,6 @@ dependencies = [
|
|||||||
"unicode-normalization",
|
"unicode-normalization",
|
||||||
"wana_kana",
|
"wana_kana",
|
||||||
"whatlang",
|
"whatlang",
|
||||||
"zerovec",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -1145,12 +1140,6 @@ version = "0.8.4"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
|
checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "cow-utils"
|
|
||||||
version = "0.1.2"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cpufeatures"
|
name = "cpufeatures"
|
||||||
version = "0.2.12"
|
version = "0.2.12"
|
||||||
@ -1551,12 +1540,6 @@ dependencies = [
|
|||||||
"syn 2.0.60",
|
"syn 2.0.60",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "deunicode"
|
|
||||||
version = "1.6.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "339544cc9e2c4dc3fc7149fd630c5f22263a4fdf18a98afd0075784968b5cf00"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "digest"
|
name = "digest"
|
||||||
version = "0.10.7"
|
version = "0.10.7"
|
||||||
@ -2666,8 +2649,7 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "irg-kvariants"
|
name = "irg-kvariants"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#a95a9217265cee515708a679a2ed08ced1ac58a3"
|
||||||
checksum = "ef2af7c331f2536964a32b78a7d2e0963d78b42f4a76323b16cc7d94b1ddce26"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"csv",
|
"csv",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
@ -3278,12 +3260,6 @@ dependencies = [
|
|||||||
"unicode-segmentation",
|
"unicode-segmentation",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "litemap"
|
|
||||||
version = "0.7.3"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "lmdb-master-sys"
|
name = "lmdb-master-sys"
|
||||||
version = "0.2.2"
|
version = "0.2.2"
|
||||||
@ -6506,15 +6482,6 @@ dependencies = [
|
|||||||
"syn 2.0.60",
|
"syn 2.0.60",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "zerovec"
|
|
||||||
version = "0.10.4"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
|
|
||||||
dependencies = [
|
|
||||||
"zerofrom",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "zip"
|
name = "zip"
|
||||||
version = "1.1.4"
|
version = "1.1.4"
|
||||||
|
@ -17,7 +17,7 @@ bincode = "1.3.3"
|
|||||||
bstr = "1.9.1"
|
bstr = "1.9.1"
|
||||||
bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] }
|
bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] }
|
||||||
byteorder = "1.5.0"
|
byteorder = "1.5.0"
|
||||||
charabia = { version = "0.8.12", default-features = false }
|
charabia = { git = "https://github.com/meilisearch/charabia.git", branch = "simplify-lang-detection", default-features = false }
|
||||||
concat-arrays = "0.1.2"
|
concat-arrays = "0.1.2"
|
||||||
crossbeam-channel = "0.5.13"
|
crossbeam-channel = "0.5.13"
|
||||||
deserr = "0.6.2"
|
deserr = "0.6.2"
|
||||||
|
@ -1604,6 +1604,29 @@ impl Index {
|
|||||||
Ok(script_language)
|
Ok(script_language)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the languages that are significantly represented in the index.
///
/// Every entry of `script_language_docids` is read; its script is ignored and
/// its document count (`docids.len()`) is recorded next to its language. A
/// language is returned when the count of its entry is strictly greater than
/// 5% of `total`, the sum of the counts of all entries.
///
/// Notes:
/// - `total` is a sum of per-entry counts, so a document listed in several
///   entries is counted once per entry.
/// - Entries are not merged by language: a language stored under several
///   scripts is evaluated (and may be returned) once per entry.
///   NOTE(review): confirm whether the database can actually hold the same
///   language under more than one script.
/// - The threshold uses integer division, so it is 0 while `total < 20`; in
///   that case every non-empty entry passes the filter.
///
/// # Errors
///
/// Propagates any `heed` error raised while opening the iterator or while
/// reading an entry.
pub fn languages(&self, rtxn: &RoTxn<'_>) -> heed::Result<Vec<Language>> {
|
||||||
|
let mut script_language_doc_count: Vec<(Language, u64)> = Vec::new();
|
||||||
|
let mut total = 0;
|
||||||
|
for sl in self.script_language_docids.iter(rtxn)? {
|
||||||
|
let ((_script, language), docids) = sl?;
|
||||||
|
|
||||||
|
// Keep only languages that contain at least one document; empty entries add 0 to `total`.
|
||||||
|
let remaining_documents_count = docids.len();
|
||||||
|
total += remaining_documents_count;
|
||||||
|
if remaining_documents_count > 0 {
|
||||||
|
script_language_doc_count.push((language, remaining_documents_count));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let threshold = total / 20; // 5% of the summed counts (arbitrary cut-off); rounds down
|
||||||
|
|
||||||
|
Ok(script_language_doc_count
|
||||||
|
.into_iter()
|
||||||
|
.filter(|(_, count)| *count > threshold)
|
||||||
|
.map(|(language, _)| language)
|
||||||
|
.collect())
|
||||||
|
}
|
||||||
|
|
||||||
/// Put the embedding configs:
|
/// Put the embedding configs:
|
||||||
/// 1. The name of the embedder
|
/// 1. The name of the embedder
|
||||||
/// 2. The configuration option for this embedder
|
/// 2. The configuration option for this embedder
|
||||||
|
@ -670,9 +670,9 @@ pub fn execute_search(
|
|||||||
tokbuilder.words_dict(dictionary);
|
tokbuilder.words_dict(dictionary);
|
||||||
}
|
}
|
||||||
|
|
||||||
let script_lang_map = ctx.index.script_language(ctx.txn)?;
|
let languages = ctx.index.languages(ctx.txn)?;
|
||||||
if !script_lang_map.is_empty() {
|
if !languages.is_empty() {
|
||||||
tokbuilder.allow_list(&script_lang_map);
|
tokbuilder.allow_list(&languages);
|
||||||
}
|
}
|
||||||
|
|
||||||
let tokenizer = tokbuilder.build();
|
let tokenizer = tokbuilder.build();
|
||||||
|
@ -1,10 +1,9 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::BufReader;
|
use std::io::BufReader;
|
||||||
use std::{io, mem, str};
|
use std::{io, mem, str};
|
||||||
|
|
||||||
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||||
use obkv::{KvReader, KvWriterU16};
|
use obkv::{KvReader, KvWriterU16};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
@ -12,11 +11,9 @@ use serde_json::Value;
|
|||||||
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
|
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
|
||||||
use crate::error::{InternalError, SerializationError};
|
use crate::error::{InternalError, SerializationError};
|
||||||
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
||||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
use crate::update::settings::InnerIndexSettingsDiff;
|
||||||
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
||||||
|
|
||||||
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
|
|
||||||
|
|
||||||
/// Extracts each word and the positions where it appears and
|
/// Extracts each word and the positions where it appears and
|
||||||
/// prefixes it with the document id.
|
/// prefixes it with the document id.
|
||||||
///
|
///
|
||||||
@ -28,7 +25,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
settings_diff: &InnerIndexSettingsDiff,
|
settings_diff: &InnerIndexSettingsDiff,
|
||||||
max_positions_per_attributes: Option<u32>,
|
max_positions_per_attributes: Option<u32>,
|
||||||
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
|
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||||
let max_positions_per_attributes = max_positions_per_attributes
|
let max_positions_per_attributes = max_positions_per_attributes
|
||||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
@ -36,7 +33,6 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
// initialize destination values.
|
// initialize destination values.
|
||||||
let mut documents_ids = RoaringBitmap::new();
|
let mut documents_ids = RoaringBitmap::new();
|
||||||
let mut script_language_docids = HashMap::new();
|
|
||||||
let mut docid_word_positions_sorter = create_sorter(
|
let mut docid_word_positions_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Stable,
|
grenad::SortAlgorithm::Stable,
|
||||||
keep_latest_obkv,
|
keep_latest_obkv,
|
||||||
@ -109,9 +105,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||||
|| {
|
|| {
|
||||||
// deletions
|
// deletions
|
||||||
lang_safe_tokens_from_document(
|
tokens_from_document(
|
||||||
&obkv,
|
&obkv,
|
||||||
&settings_diff.old,
|
&settings_diff.old.searchable_fields_ids,
|
||||||
&del_tokenizer,
|
&del_tokenizer,
|
||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
DelAdd::Deletion,
|
DelAdd::Deletion,
|
||||||
@ -120,9 +116,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
},
|
},
|
||||||
|| {
|
|| {
|
||||||
// additions
|
// additions
|
||||||
lang_safe_tokens_from_document(
|
tokens_from_document(
|
||||||
&obkv,
|
&obkv,
|
||||||
&settings_diff.new,
|
&settings_diff.new.searchable_fields_ids,
|
||||||
&add_tokenizer,
|
&add_tokenizer,
|
||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
DelAdd::Addition,
|
DelAdd::Addition,
|
||||||
@ -131,8 +127,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
let (del_obkv, del_script_language_word_count) = del?;
|
let del_obkv = del?;
|
||||||
let (add_obkv, add_script_language_word_count) = add?;
|
let add_obkv = add?;
|
||||||
|
|
||||||
// merge deletions and additions.
|
// merge deletions and additions.
|
||||||
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
|
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
|
||||||
@ -150,31 +146,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||||
docid_word_positions_sorter.insert(&key_buffer, value)?;
|
docid_word_positions_sorter.insert(&key_buffer, value)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// update script_language_docids deletions.
|
|
||||||
for (script, languages_frequency) in del_script_language_word_count {
|
|
||||||
for (language, _) in languages_frequency {
|
|
||||||
let entry = script_language_docids
|
|
||||||
.entry((script, language))
|
|
||||||
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
|
||||||
entry.0.push(document_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// update script_language_docids additions.
|
|
||||||
for (script, languages_frequency) in add_script_language_word_count {
|
|
||||||
for (language, _) in languages_frequency {
|
|
||||||
let entry = script_language_docids
|
|
||||||
.entry((script, language))
|
|
||||||
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
|
||||||
entry.1.push(document_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
|
// the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
|
||||||
sorter_into_reader(docid_word_positions_sorter, indexer)
|
sorter_into_reader(docid_word_positions_sorter, indexer)
|
||||||
.map(|reader| (reader, script_language_docids))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Check if any searchable fields of a document changed.
|
/// Check if any searchable fields of a document changed.
|
||||||
@ -205,7 +180,7 @@ fn tokenizer_builder<'a>(
|
|||||||
stop_words: Option<&'a fst::Set<Vec<u8>>>,
|
stop_words: Option<&'a fst::Set<Vec<u8>>>,
|
||||||
allowed_separators: Option<&'a [&str]>,
|
allowed_separators: Option<&'a [&str]>,
|
||||||
dictionary: Option<&'a [&str]>,
|
dictionary: Option<&'a [&str]>,
|
||||||
script_language: Option<&'a HashMap<Script, Vec<Language>>>,
|
languages: Option<&'a Vec<Language>>,
|
||||||
) -> TokenizerBuilder<'a, Vec<u8>> {
|
) -> TokenizerBuilder<'a, Vec<u8>> {
|
||||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||||
if let Some(stop_words) = stop_words {
|
if let Some(stop_words) = stop_words {
|
||||||
@ -218,81 +193,13 @@ fn tokenizer_builder<'a>(
|
|||||||
tokenizer_builder.separators(separators);
|
tokenizer_builder.separators(separators);
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(script_language) = script_language {
|
if let Some(languages) = languages {
|
||||||
tokenizer_builder.allow_list(script_language);
|
tokenizer_builder.allow_list(languages);
|
||||||
}
|
}
|
||||||
|
|
||||||
tokenizer_builder
|
tokenizer_builder
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extract words mapped with their positions of a document,
|
|
||||||
/// ensuring no Language detection mistakes was made.
|
|
||||||
fn lang_safe_tokens_from_document<'a>(
|
|
||||||
obkv: &KvReader<'_, FieldId>,
|
|
||||||
settings: &InnerIndexSettings,
|
|
||||||
tokenizer: &Tokenizer<'_>,
|
|
||||||
max_positions_per_attributes: u32,
|
|
||||||
del_add: DelAdd,
|
|
||||||
buffers: &'a mut Buffers,
|
|
||||||
) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> {
|
|
||||||
let mut script_language_word_count = HashMap::new();
|
|
||||||
|
|
||||||
tokens_from_document(
|
|
||||||
obkv,
|
|
||||||
&settings.searchable_fields_ids,
|
|
||||||
tokenizer,
|
|
||||||
max_positions_per_attributes,
|
|
||||||
del_add,
|
|
||||||
buffers,
|
|
||||||
&mut script_language_word_count,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
// if we detect a potetial mistake in the language detection,
|
|
||||||
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
|
|
||||||
// context: https://github.com/meilisearch/meilisearch/issues/3565
|
|
||||||
if script_language_word_count
|
|
||||||
.values()
|
|
||||||
.map(Vec::as_slice)
|
|
||||||
.any(potential_language_detection_error)
|
|
||||||
{
|
|
||||||
// build an allow list with the most frequent detected languages in the document.
|
|
||||||
let script_language: HashMap<_, _> =
|
|
||||||
script_language_word_count.iter().filter_map(most_frequent_languages).collect();
|
|
||||||
|
|
||||||
// if the allow list is empty, meaning that no Language is considered frequent,
|
|
||||||
// then we don't rerun the extraction.
|
|
||||||
if !script_language.is_empty() {
|
|
||||||
// build a new temporary tokenizer including the allow list.
|
|
||||||
let stop_words = settings.stop_words.as_ref();
|
|
||||||
let separators: Option<Vec<_>> = settings
|
|
||||||
.allowed_separators
|
|
||||||
.as_ref()
|
|
||||||
.map(|s| s.iter().map(String::as_str).collect());
|
|
||||||
let dictionary: Option<Vec<_>> =
|
|
||||||
settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
|
||||||
let mut builder =
|
|
||||||
tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
|
|
||||||
let tokenizer = builder.build();
|
|
||||||
|
|
||||||
script_language_word_count.clear();
|
|
||||||
|
|
||||||
// rerun the extraction.
|
|
||||||
tokens_from_document(
|
|
||||||
obkv,
|
|
||||||
&settings.searchable_fields_ids,
|
|
||||||
&tokenizer,
|
|
||||||
max_positions_per_attributes,
|
|
||||||
del_add,
|
|
||||||
buffers,
|
|
||||||
&mut script_language_word_count,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>)
|
|
||||||
Ok((&buffers.obkv_buffer, script_language_word_count))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Extract words mapped with their positions of a document.
|
/// Extract words mapped with their positions of a document.
|
||||||
fn tokens_from_document<'a>(
|
fn tokens_from_document<'a>(
|
||||||
obkv: &KvReader<'a, FieldId>,
|
obkv: &KvReader<'a, FieldId>,
|
||||||
@ -301,7 +208,6 @@ fn tokens_from_document<'a>(
|
|||||||
max_positions_per_attributes: u32,
|
max_positions_per_attributes: u32,
|
||||||
del_add: DelAdd,
|
del_add: DelAdd,
|
||||||
buffers: &'a mut Buffers,
|
buffers: &'a mut Buffers,
|
||||||
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
|
|
||||||
) -> Result<&'a [u8]> {
|
) -> Result<&'a [u8]> {
|
||||||
buffers.obkv_buffer.clear();
|
buffers.obkv_buffer.clear();
|
||||||
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
|
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
|
||||||
@ -326,16 +232,6 @@ fn tokens_from_document<'a>(
|
|||||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||||
|
|
||||||
for (index, token) in tokens {
|
for (index, token) in tokens {
|
||||||
// if a language has been detected for the token, we update the counter.
|
|
||||||
if let Some(language) = token.language {
|
|
||||||
let script = token.script;
|
|
||||||
let entry = script_language_word_count.entry(script).or_default();
|
|
||||||
match entry.iter_mut().find(|(l, _)| *l == language) {
|
|
||||||
Some((_, n)) => *n += 1,
|
|
||||||
None => entry.push((language, 1)),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// keep a word only if it is not empty and fits in an LMDB key.
|
// keep a word only if it is not empty and fits in an LMDB key.
|
||||||
let token = token.lemma().trim();
|
let token = token.lemma().trim();
|
||||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||||
@ -423,39 +319,6 @@ fn process_tokens<'a>(
|
|||||||
.filter(|(_, t)| t.is_word())
|
.filter(|(_, t)| t.is_word())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn potential_language_detection_error(languages_frequency: &[(Language, usize)]) -> bool {
|
|
||||||
if languages_frequency.len() > 1 {
|
|
||||||
let threshold = compute_language_frequency_threshold(languages_frequency);
|
|
||||||
languages_frequency.iter().any(|(_, c)| *c <= threshold)
|
|
||||||
} else {
|
|
||||||
false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn most_frequent_languages(
|
|
||||||
(script, languages_frequency): (&Script, &Vec<(Language, usize)>),
|
|
||||||
) -> Option<(Script, Vec<Language>)> {
|
|
||||||
if languages_frequency.len() > 1 {
|
|
||||||
let threshold = compute_language_frequency_threshold(languages_frequency);
|
|
||||||
|
|
||||||
let languages: Vec<_> =
|
|
||||||
languages_frequency.iter().filter(|(_, c)| *c > threshold).map(|(l, _)| *l).collect();
|
|
||||||
|
|
||||||
if languages.is_empty() {
|
|
||||||
None
|
|
||||||
} else {
|
|
||||||
Some((*script, languages))
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize {
|
|
||||||
let total: usize = languages_frequency.iter().map(|(_, c)| c).sum();
|
|
||||||
total / 10 // 10% is a completely arbitrary value.
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
struct Buffers {
|
struct Buffers {
|
||||||
// the field buffer used for each field's deserialization; it must be cleared between each field.
|
// the field buffer used for each field's deserialization; it must be cleared between each field.
|
||||||
|
@ -345,8 +345,7 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
|
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||||
rayon::join(
|
rayon::join(
|
||||||
|| {
|
|| {
|
||||||
let (docid_word_positions_chunk, script_language_pair) =
|
let docid_word_positions_chunk = extract_docid_word_positions(
|
||||||
extract_docid_word_positions(
|
|
||||||
flattened_documents_chunk.clone(),
|
flattened_documents_chunk.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
&settings_diff,
|
&settings_diff,
|
||||||
@ -357,9 +356,6 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
let docid_word_positions_chunk =
|
let docid_word_positions_chunk =
|
||||||
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
|
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
|
||||||
|
|
||||||
let _ =
|
|
||||||
lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));
|
|
||||||
|
|
||||||
Ok(docid_word_positions_chunk)
|
Ok(docid_word_positions_chunk)
|
||||||
},
|
},
|
||||||
|| {
|
|| {
|
||||||
|
@ -1,10 +1,9 @@
|
|||||||
use std::collections::{BTreeSet, HashMap};
|
use std::collections::BTreeSet;
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, BufReader};
|
use std::io::{self, BufReader};
|
||||||
|
|
||||||
use bytemuck::allocation::pod_collect_to_vec;
|
use bytemuck::allocation::pod_collect_to_vec;
|
||||||
use charabia::{Language, Script};
|
|
||||||
use grenad::{Merger, MergerBuilder};
|
use grenad::{Merger, MergerBuilder};
|
||||||
use heed::types::Bytes;
|
use heed::types::Bytes;
|
||||||
use heed::{BytesDecode, RwTxn};
|
use heed::{BytesDecode, RwTxn};
|
||||||
@ -94,7 +93,6 @@ pub(crate) enum TypedChunk {
|
|||||||
add_to_user_provided: RoaringBitmap,
|
add_to_user_provided: RoaringBitmap,
|
||||||
remove_from_user_provided: RoaringBitmap,
|
remove_from_user_provided: RoaringBitmap,
|
||||||
},
|
},
|
||||||
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TypedChunk {
|
impl TypedChunk {
|
||||||
@ -113,8 +111,7 @@ impl TypedChunk {
|
|||||||
| (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_))
|
| (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_))
|
||||||
| (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_))
|
| (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_))
|
||||||
| (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_))
|
| (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_))
|
||||||
| (GeoPoints(_), GeoPoints(_))
|
| (GeoPoints(_), GeoPoints(_)) => true,
|
||||||
| (ScriptLanguageDocids(_), ScriptLanguageDocids(_)) => true,
|
|
||||||
(
|
(
|
||||||
VectorPoints { embedder_name: left, expected_dimension: left_dim, .. },
|
VectorPoints { embedder_name: left, expected_dimension: left_dim, .. },
|
||||||
VectorPoints { embedder_name: right, expected_dimension: right_dim, .. },
|
VectorPoints { embedder_name: right, expected_dimension: right_dim, .. },
|
||||||
@ -775,33 +772,6 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
|
|
||||||
tracing::debug!("Finished vector chunk for {}", embedder_name);
|
tracing::debug!("Finished vector chunk for {}", embedder_name);
|
||||||
}
|
}
|
||||||
TypedChunk::ScriptLanguageDocids(_) => {
|
|
||||||
let span = tracing::trace_span!(target: "indexing::write_db", "script_language_docids");
|
|
||||||
let _entered = span.enter();
|
|
||||||
|
|
||||||
for typed_chunk in typed_chunks {
|
|
||||||
let TypedChunk::ScriptLanguageDocids(sl_map) = typed_chunk else { unreachable!() };
|
|
||||||
for (key, (deletion, addition)) in sl_map {
|
|
||||||
let mut db_key_exists = false;
|
|
||||||
let final_value = match index.script_language_docids.get(wtxn, &key)? {
|
|
||||||
Some(db_values) => {
|
|
||||||
db_key_exists = true;
|
|
||||||
(db_values - deletion) | addition
|
|
||||||
}
|
|
||||||
None => addition,
|
|
||||||
};
|
|
||||||
|
|
||||||
if final_value.is_empty() {
|
|
||||||
// If the database entry exists, delete it.
|
|
||||||
if db_key_exists {
|
|
||||||
index.script_language_docids.delete(wtxn, &key)?;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
index.script_language_docids.put(wtxn, &key, &final_value)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok((RoaringBitmap::new(), is_merged_database))
|
Ok((RoaringBitmap::new(), is_merged_database))
|
||||||
|
Loading…
Reference in New Issue
Block a user