Don't sort in parallel in sorters of the new indexer

2025-02-21 01:55:52 +08:00 · 2024-10-17 09:30:18 +02:00 · 2024-10-17 09:30:18 +02:00 · 0749633618
commit 0749633618
parent 0647f75e6b
15 changed files with 32 additions and 1 deletions
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@ -40,6 +40,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory,
+        true,
    );

    // initialize buffers.
--- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
@ -32,6 +32,7 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory,
+        true,
    );

    let mut buffer = Vec::new();
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@ -61,6 +61,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory.map(|m| m / 2),
+        true,
    );

    let mut normalized_facet_string_docids_sorter = create_sorter(
@ -70,6 +71,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory.map(|m| m / 2),
+        true,
    );

    let mut buffer = Vec::new();
@ -149,6 +151,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory.map(|m| m / 2),
+        true,
    );

    let mut normalized_facet_string_docids_sorter = create_sorter(
@ -158,6 +161,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory.map(|m| m / 2),
+        true,
    );

    let mut buffer = Vec::new();
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@ -53,6 +53,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory.map(|m| m / 2),
+        true,
    );

    let mut fid_docid_facet_strings_sorter = create_sorter(
@ -62,6 +63,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory.map(|m| m / 2),
+        true,
    );

    // The tuples represents the Del and Add side for a bitmap
--- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
@ -35,6 +35,7 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory,
+        true,
    );

    let mut key_buffer = Vec::new();
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@ -44,6 +44,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory.map(|m| m / 3),
+        true,
    );
    let mut key_buffer = Vec::new();
    let mut del_words = BTreeSet::new();
@ -98,6 +99,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory.map(|m| m / 3),
+        true,
    );

    let mut exact_word_docids_sorter = create_sorter(
@ -107,6 +109,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory.map(|m| m / 3),
+        true,
    );

    let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@ -49,6 +49,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
                indexer.chunk_compression_level,
                indexer.max_nb_chunks,
                max_memory.map(|m| m / MAX_DISTANCE as usize),
+                true,
            )
        })
        .collect();
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@ -33,6 +33,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
        max_memory,
+        true,
    );

    let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@ -37,6 +37,7 @@ pub fn create_sorter<MF: MergeFunction>(
    chunk_compression_level: Option<u32>,
    max_nb_chunks: Option<usize>,
    max_memory: Option<usize>,
+    sort_in_parallel: bool,
 ) -> grenad::Sorter<MF> {
    let mut builder = grenad::Sorter::builder(merge);
    builder.chunk_compression_type(chunk_compression_type);
@ -51,7 +52,7 @@ pub fn create_sorter<MF: MergeFunction>(
        builder.allow_realloc(false);
    }
    builder.sort_algorithm(sort_algorithm);
-    builder.sort_in_parallel(true);
+    builder.sort_in_parallel(sort_in_parallel);
    builder.build()
 }

--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@ -127,6 +127,7 @@ impl<'a, 'i> Transform<'a, 'i> {
            indexer_settings.chunk_compression_level,
            indexer_settings.max_nb_chunks,
            indexer_settings.max_memory.map(|mem| mem / 2),
+            true,
        );

        // We initialize the sorter with the user indexing settings.
@ -137,6 +138,7 @@ impl<'a, 'i> Transform<'a, 'i> {
            indexer_settings.chunk_compression_level,
            indexer_settings.max_nb_chunks,
            indexer_settings.max_memory.map(|mem| mem / 2),
+            true,
        );
        let documents_ids = index.documents_ids(wtxn)?;

@ -988,6 +990,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                self.indexer_settings.chunk_compression_level,
                self.indexer_settings.max_nb_chunks,
                self.indexer_settings.max_memory.map(|mem| mem / 2),
+                true,
            ))
        } else {
            None
@ -1030,6 +1033,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                    self.indexer_settings.chunk_compression_level,
                    self.indexer_settings.max_nb_chunks,
                    self.indexer_settings.max_memory.map(|mem| mem / 2),
+                    true,
                ))
            } else {
                None
--- a/milli/src/update/new/extract/faceted/extract_facets.rs
+++ b/milli/src/update/new/extract/faceted/extract_facets.rs
@ -46,6 +46,10 @@ impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'extractor> {
                self.grenad_parameters.chunk_compression_level,
                self.grenad_parameters.max_nb_chunks,
                self.max_memory,
+                // *NOTE*: parallel sorting must stay disabled (`false`) here:
+                // 1. the pool already runs at max parallelism, so it would not help
+                // 2. it can break correctness if it makes a task holding a mutable borrow yield
+                false,
            ),
        ))))
    }
--- a/milli/src/update/new/extract/searchable/extract_word_docids.rs
+++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs
@ -48,6 +48,7 @@ impl WordDocidsCachedSorters {
                indexer.chunk_compression_level,
                indexer.max_nb_chunks,
                max_memory,
+                false,
            ),
        );
        let word_docids = CboCachedSorter::new(
@ -59,6 +60,7 @@ impl WordDocidsCachedSorters {
                indexer.chunk_compression_level,
                indexer.max_nb_chunks,
                max_memory,
+                false,
            ),
        );
        let exact_word_docids = CboCachedSorter::new(
@ -70,6 +72,7 @@ impl WordDocidsCachedSorters {
                indexer.chunk_compression_level,
                indexer.max_nb_chunks,
                max_memory,
+                false,
            ),
        );
        let word_position_docids = CboCachedSorter::new(
@ -81,6 +84,7 @@ impl WordDocidsCachedSorters {
                indexer.chunk_compression_level,
                indexer.max_nb_chunks,
                max_memory,
+                false,
            ),
        );
        let fid_word_count_docids = CboCachedSorter::new(
@ -92,6 +96,7 @@ impl WordDocidsCachedSorters {
                indexer.chunk_compression_level,
                indexer.max_nb_chunks,
                max_memory,
+                false,
            ),
        );

--- a/milli/src/update/new/extract/searchable/mod.rs
+++ b/milli/src/update/new/extract/searchable/mod.rs
@ -50,6 +50,7 @@ impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
                self.grenad_parameters.chunk_compression_level,
                self.grenad_parameters.max_nb_chunks,
                self.max_memory,
+                false,
            ),
        ))))
    }
--- a/milli/src/update/word_prefix_docids.rs
+++ b/milli/src/update/word_prefix_docids.rs
@ -60,6 +60,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
            self.chunk_compression_level,
            self.max_nb_chunks,
            self.max_memory,
+            true,
        );

        if !common_prefix_fst_words.is_empty() {
--- a/milli/src/update/words_prefix_integer_docids.rs
+++ b/milli/src/update/words_prefix_integer_docids.rs
@ -65,6 +65,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
            self.chunk_compression_level,
            self.max_nb_chunks,
            self.max_memory,
+            true,
        );

        if !common_prefix_fst_words.is_empty() {