mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-18 08:48:32 +08:00
Merge #635
635: Use an unstable algorithm for `grenad::Sorter` when possible r=Kerollmops a=loiclec

# Pull Request

## What does this PR do?

Use an unstable algorithm to sort the internal vector used by `grenad::Sorter` whenever possible, in order to speed up indexing. In practice, every time the merge function creates a `RoaringBitmap`, we use an unstable sort, because the merged result does not depend on the relative order of entries that share the same key. For every other merge function, such as `keep_first`, `keep_last`, etc., the result does depend on that order, so a stable sort is used.

Co-authored-by: Loïc Lecrenier <loic@meilisearch.com>
This commit is contained in:
commit
15d478cf4d
@ -17,7 +17,7 @@ flatten-serde-json = { path = "../flatten-serde-json" }
|
||||
fst = "0.4.7"
|
||||
fxhash = "0.2.1"
|
||||
geoutils = "0.4.1"
|
||||
grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] }
|
||||
grenad = { version = "0.4.3", default-features = false, features = ["tempfile"] }
|
||||
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.3", default-features = false, features = ["lmdb", "sync-read-txn"] }
|
||||
json-depth-checker = { path = "../json-depth-checker" }
|
||||
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
||||
|
@ -32,6 +32,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
let mut docid_word_positions_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
concat_u32s_array,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
|
@ -21,6 +21,7 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_number_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
|
@ -23,6 +23,7 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first_prefix_value_merge_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
|
@ -28,6 +28,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
@ -36,6 +37,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
);
|
||||
|
||||
let mut fid_docid_facet_strings_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
|
@ -25,6 +25,7 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_word_count_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
|
@ -30,6 +30,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
@ -38,6 +39,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
);
|
||||
|
||||
let mut exact_word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
|
@ -24,6 +24,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_pair_proximity_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
|
@ -21,6 +21,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_position_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
|
@ -27,6 +27,7 @@ pub fn create_writer<R: io::Write>(
|
||||
}
|
||||
|
||||
pub fn create_sorter(
|
||||
sort_algorithm: grenad::SortAlgorithm,
|
||||
merge: MergeFn,
|
||||
chunk_compression_type: grenad::CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
@ -45,6 +46,7 @@ pub fn create_sorter(
|
||||
builder.dump_threshold(memory);
|
||||
builder.allow_realloc(false);
|
||||
}
|
||||
builder.sort_algorithm(sort_algorithm);
|
||||
builder.build()
|
||||
}
|
||||
|
||||
|
@ -1488,6 +1488,7 @@ mod tests {
|
||||
assert_eq!(count, 4);
|
||||
}
|
||||
|
||||
#[cfg(feature = "default")]
|
||||
#[test]
|
||||
fn test_meilisearch_1714() {
|
||||
let index = TempIndex::new();
|
||||
|
@ -99,6 +99,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
|
||||
// We initialize the sorter with the user indexing settings.
|
||||
let original_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_function,
|
||||
indexer_settings.chunk_compression_type,
|
||||
indexer_settings.chunk_compression_level,
|
||||
@ -108,6 +109,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
|
||||
// We initialize the sorter with the user indexing settings.
|
||||
let flattened_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_function,
|
||||
indexer_settings.chunk_compression_type,
|
||||
indexer_settings.chunk_compression_level,
|
||||
|
@ -48,6 +48,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
||||
// It is forbidden to keep a mutable reference into the database
|
||||
// and write into it at the same time, therefore we write into another file.
|
||||
let mut prefix_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_roaring_bitmaps,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
|
@ -65,6 +65,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
|
||||
debug!("Computing and writing the word levels positions docids into LMDB on disk...");
|
||||
|
||||
let mut prefix_position_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
self.chunk_compression_type,
|
||||
self.chunk_compression_level,
|
||||
|
Loading…
Reference in New Issue
Block a user