Fix some tests but not all of them

2024-11-22 10:07:40 +08:00 · 2024-11-18 17:39:55 +01:00 · 2024-11-18 17:39:55 +01:00 · aba8a0e9e0
commit aba8a0e9e0
parent 670aff5553
20 changed files with 1211 additions and 881 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2041,7 +2041,9 @@ name = "fuzzers"
 version = "1.11.0"
 dependencies = [
 "arbitrary",
+ "bumpalo",
 "clap",
+ "either",
 "fastrand",
 "milli",
 "serde",
--- a/crates/benchmarks/benches/utils.rs
+++ b/crates/benchmarks/benches/utils.rs
@ -140,7 +140,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) {
    }
 }

-pub fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
+pub fn documents_from(filename: &str, filetype: &str) -> Mmap {
    let reader = File::open(filename)
        .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
    let reader = BufReader::new(reader);
--- a/crates/fuzzers/Cargo.toml
+++ b/crates/fuzzers/Cargo.toml
@ -12,7 +12,9 @@ license.workspace = true

 [dependencies]
 arbitrary = { version = "1.3.2", features = ["derive"] }
+bumpalo = "3.16.0"
 clap = { version = "4.5.9", features = ["derive"] }
+either = "1.13.0"
 fastrand = "2.1.0"
 milli = { path = "../milli" }
 serde = { version = "1.0.204", features = ["derive"] }
--- a/crates/fuzzers/src/bin/fuzz-indexing.rs
+++ b/crates/fuzzers/src/bin/fuzz-indexing.rs
@ -4,11 +4,17 @@ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::time::Duration;

 use arbitrary::{Arbitrary, Unstructured};
+use bumpalo::Bump;
 use clap::Parser;
+use either::Either;
 use fuzzers::Operation;
+use milli::documents::mmap_from_objects;
 use milli::heed::EnvOpenOptions;
-use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig};
+use milli::update::new::indexer;
+use milli::update::{IndexDocumentsMethod, IndexerConfig};
+use milli::vector::EmbeddingConfigs;
 use milli::Index;
+use serde_json::Value;
 use tempfile::TempDir;

 #[derive(Debug, Arbitrary)]
@ -58,7 +64,6 @@ fn main() {
            };
            let index = Index::new(options, tempdir.path()).unwrap();
            let indexer_config = IndexerConfig::default();
-            let index_documents_config = IndexDocumentsConfig::default();

            std::thread::scope(|s| {
                loop {
@ -75,38 +80,69 @@ fn main() {

                    let handle = s.spawn(|| {
                        let mut wtxn = index.write_txn().unwrap();
+                        let rtxn = index.read_txn().unwrap();

                        for batch in batches {
-                            let mut builder = IndexDocuments::new(
-                                &mut wtxn,
-                                &index,
-                                &indexer_config,
-                                index_documents_config.clone(),
-                                |_| (),
-                                || false,
-                            )
-                            .unwrap();
+                            let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
+                            let mut new_fields_ids_map = db_fields_ids_map.clone();

+                            let indexer_alloc = Bump::new();
+                            let embedders = EmbeddingConfigs::default();
+                            let mut indexer = indexer::DocumentOperation::new(
+                                IndexDocumentsMethod::ReplaceDocuments,
+                            );
+
+                            let mut operations = Vec::new();
                            for op in batch.0 {
                                match op {
                                    Operation::AddDoc(doc) => {
-                                        let documents =
-                                            milli::documents::objects_from_json_value(doc.to_d());
-                                        let documents =
-                                            milli::documents::documents_batch_reader_from_objects(
-                                                documents,
-                                            );
-                                        let (b, _added) = builder.add_documents(documents).unwrap();
-                                        builder = b;
+                                        let object = match doc.to_d() {
+                                            Value::Object(object) => object,
+                                            _ => unreachable!(),
+                                        };
+                                        let documents = mmap_from_objects(vec![object]);
+                                        operations.push(Either::Left(documents));
                                    }
                                    Operation::DeleteDoc(id) => {
-                                        let (b, _removed) =
-                                            builder.remove_documents(vec![id.to_s()]).unwrap();
-                                        builder = b;
+                                        let id = indexer_alloc.alloc_str(&id.to_s());
+                                        let ids = indexer_alloc.alloc_slice_copy(&[&*id]);
+                                        operations.push(Either::Right(ids));
                                    }
                                }
                            }
-                            builder.execute().unwrap();
+
+                            for op in &operations {
+                                match op {
+                                    Either::Left(documents) => {
+                                        indexer.add_documents(documents).unwrap()
+                                    }
+                                    Either::Right(ids) => indexer.delete_documents(ids),
+                                }
+                            }
+
+                            let (document_changes, _operation_stats, primary_key) = indexer
+                                .into_changes(
+                                    &indexer_alloc,
+                                    &index,
+                                    &rtxn,
+                                    None,
+                                    &mut new_fields_ids_map,
+                                )
+                                .unwrap();
+
+                            indexer::index(
+                                &mut wtxn,
+                                &index,
+                                indexer_config.grenad_parameters(),
+                                &db_fields_ids_map,
+                                new_fields_ids_map,
+                                primary_key,
+                                &document_changes,
+                                embedders,
+                                &|| false,
+                                &|_| (),
+                            )
+                            .unwrap();

                            // after executing a batch we check if the database is corrupted
                            let res = index.search(&wtxn).execute().unwrap();
--- a/crates/meilisearch/src/lib.rs
+++ b/crates/meilisearch/src/lib.rs
@ -448,11 +448,12 @@ fn import_dump(

        let builder = builder.with_embedders(embedders);

-        let (builder, user_result) = builder.add_documents(reader)?;
-        let user_result = user_result?;
-        tracing::info!(documents_found = user_result, "{} documents found.", user_result);
-        builder.execute()?;
-        wtxn.commit()?;
+        todo!("please plug the dump load of main");
+        // let (builder, user_result) = builder.add_documents(reader)?;
+        // let user_result = user_result?;
+        // tracing::info!(documents_found = user_result, "{} documents found.", user_result);
+        // builder.execute()?;
+        // wtxn.commit()?;
        tracing::info!("All documents successfully imported.");
    }

--- a/crates/milli/examples/index.rs
+++ b/crates/milli/examples/index.rs
@ -1,114 +0,0 @@
-use std::error::Error;
-use std::fs::File;
-use std::io::{BufRead, BufReader, Cursor, Seek};
-use std::path::Path;
-
-use heed::EnvOpenOptions;
-use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
-use milli::{Index, Object};
-
-fn usage(error: &str, program_name: &str) -> String {
-    format!(
-        "{}. Usage: {} <PATH-TO-INDEX> <PATH-TO-DATASET> [searchable_fields] [filterable_fields]",
-        error, program_name
-    )
-}
-
-fn main() -> Result<(), Box<dyn Error>> {
-    let mut args = std::env::args();
-    let program_name = args.next().expect("No program name");
-    let index_path =
-        args.next().unwrap_or_else(|| panic!("{}", usage("Missing path to index.", &program_name)));
-    let dataset_path = args
-        .next()
-        .unwrap_or_else(|| panic!("{}", usage("Missing path to source dataset.", &program_name)));
-    // let primary_key = args.next().unwrap_or_else(|| "id".into());
-    // "title overview"
-    let searchable_fields: Vec<String> = args
-        .next()
-        .map(|arg| arg.split_whitespace().map(ToString::to_string).collect())
-        .unwrap_or_default();
-
-    println!("{searchable_fields:?}");
-    // "release_date genres"
-    let filterable_fields: Vec<String> = args
-        .next()
-        .map(|arg| arg.split_whitespace().map(ToString::to_string).collect())
-        .unwrap_or_default();
-
-    let mut options = EnvOpenOptions::new();
-    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-    std::fs::create_dir_all(&index_path).unwrap();
-    let index = Index::new(options, index_path).unwrap();
-    let mut wtxn = index.write_txn().unwrap();
-
-    let config = IndexerConfig::default();
-    let mut builder = Settings::new(&mut wtxn, &index, &config);
-    // builder.set_primary_key(primary_key);
-    let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
-    builder.set_searchable_fields(searchable_fields);
-    let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
-    builder.set_filterable_fields(filterable_fields);
-
-    builder.execute(|_| (), || false).unwrap();
-
-    let config = IndexerConfig::default();
-    let indexing_config = IndexDocumentsConfig::default();
-
-    let builder =
-        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
-
-    let documents = documents_from(
-        &dataset_path,
-        Path::new(&dataset_path).extension().unwrap_or_default().to_str().unwrap_or_default(),
-    );
-    let (builder, user_error) = builder.add_documents(documents).unwrap();
-    user_error.unwrap();
-    builder.execute().unwrap();
-    wtxn.commit().unwrap();
-
-    index.prepare_for_closing().wait();
-    Ok(())
-}
-fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
-    let reader = File::open(filename)
-        .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
-    let reader = BufReader::new(reader);
-    let documents = match filetype {
-        "csv" => documents_from_csv(reader).unwrap(),
-        "json" => documents_from_json(reader).unwrap(),
-        "jsonl" => documents_from_jsonl(reader).unwrap(),
-        otherwise => panic!("invalid update format {:?}", otherwise),
-    };
-    DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
-}
-
-fn documents_from_jsonl(reader: impl BufRead) -> milli::Result<Vec<u8>> {
-    let mut documents = DocumentsBatchBuilder::new(Vec::new());
-
-    for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
-        let object = result.unwrap();
-        documents.append_json_object(&object)?;
-    }
-
-    documents.into_inner().map_err(Into::into)
-}
-
-fn documents_from_json(reader: impl BufRead) -> milli::Result<Vec<u8>> {
-    let mut documents = DocumentsBatchBuilder::new(Vec::new());
-
-    documents.append_json_array(reader)?;
-
-    documents.into_inner().map_err(Into::into)
-}
-
-fn documents_from_csv(reader: impl BufRead) -> milli::Result<Vec<u8>> {
-    let csv = csv::Reader::from_reader(reader);
-
-    let mut documents = DocumentsBatchBuilder::new(Vec::new());
-    documents.append_csv(csv)?;
-
-    documents.into_inner().map_err(Into::into)
-}
--- a/crates/milli/examples/search.rs
+++ b/crates/milli/examples/search.rs
@ -1,124 +0,0 @@
-use std::error::Error;
-use std::io::stdin;
-use std::path::Path;
-use std::time::Instant;
-
-use heed::EnvOpenOptions;
-use milli::{
-    execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, Index, SearchContext,
-    SearchLogger, TermsMatchingStrategy, TimeBudget,
-};
-
-#[global_allocator]
-static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
-
-fn main() -> Result<(), Box<dyn Error>> {
-    let mut args = std::env::args();
-    let program_name = args.next().expect("No program name");
-    let dataset = args.next().unwrap_or_else(|| {
-        panic!(
-            "Missing path to index. Usage: {} <PATH-TO-INDEX> [<logger-dir>] [print-documents]",
-            program_name
-        )
-    });
-    let detailed_logger_dir = args.next();
-    let print_documents: bool =
-        if let Some(arg) = args.next() { arg == "print-documents" } else { false };
-
-    let mut options = EnvOpenOptions::new();
-    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-    let index = Index::new(options, dataset)?;
-    let txn = index.read_txn()?;
-    let mut query = String::new();
-    while stdin().read_line(&mut query)? > 0 {
-        for _ in 0..2 {
-            let mut default_logger = DefaultSearchLogger;
-            // FIXME: consider resetting the state of the logger between search executions as otherwise panics are possible.
-            // Workaround'd here by recreating the logger on each iteration of the loop
-            let mut detailed_logger = detailed_logger_dir
-                .as_ref()
-                .map(|logger_dir| (milli::VisualSearchLogger::default(), logger_dir));
-            let logger: &mut dyn SearchLogger<_> =
-                if let Some((detailed_logger, _)) = detailed_logger.as_mut() {
-                    detailed_logger
-                } else {
-                    &mut default_logger
-                };
-
-            let start = Instant::now();
-
-            let mut ctx = SearchContext::new(&index, &txn)?;
-            let universe = filtered_universe(ctx.index, ctx.txn, &None)?;
-
-            let docs = execute_search(
-                &mut ctx,
-                (!query.trim().is_empty()).then(|| query.trim()),
-                TermsMatchingStrategy::Last,
-                milli::score_details::ScoringStrategy::Skip,
-                false,
-                universe,
-                &None,
-                &None,
-                GeoSortStrategy::default(),
-                0,
-                20,
-                None,
-                &mut DefaultSearchLogger,
-                logger,
-                TimeBudget::max(),
-                None,
-                None,
-            )?;
-            if let Some((logger, dir)) = detailed_logger {
-                logger.finish(&mut ctx, Path::new(dir))?;
-            }
-            let elapsed = start.elapsed();
-            println!("new: {}us, docids: {:?}", elapsed.as_micros(), docs.documents_ids);
-            if print_documents {
-                let documents = index
-                    .documents(&txn, docs.documents_ids.iter().copied())
-                    .unwrap()
-                    .into_iter()
-                    .map(|(id, obkv)| {
-                        let mut object = serde_json::Map::default();
-                        for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
-                            let value = obkv.get(fid).unwrap();
-                            let value: serde_json::Value = serde_json::from_slice(value).unwrap();
-                            object.insert(fid_name.to_owned(), value);
-                        }
-                        (id, serde_json::to_string_pretty(&object).unwrap())
-                    })
-                    .collect::<Vec<_>>();
-
-                for (id, document) in documents {
-                    println!("{id}:");
-                    println!("{document}");
-                }
-
-                let documents = index
-                    .documents(&txn, docs.documents_ids.iter().copied())
-                    .unwrap()
-                    .into_iter()
-                    .map(|(id, obkv)| {
-                        let mut object = serde_json::Map::default();
-                        for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
-                            let value = obkv.get(fid).unwrap();
-                            let value: serde_json::Value = serde_json::from_slice(value).unwrap();
-                            object.insert(fid_name.to_owned(), value);
-                        }
-                        (id, serde_json::to_string_pretty(&object).unwrap())
-                    })
-                    .collect::<Vec<_>>();
-                println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
-                for (id, document) in documents {
-                    println!("{id}:");
-                    println!("{document}");
-                }
-            }
-        }
-        query.clear();
-    }
-
-    Ok(())
-}
--- a/crates/milli/examples/settings.rs
+++ b/crates/milli/examples/settings.rs
@ -1,33 +0,0 @@
-// use big_s::S;
-use heed::EnvOpenOptions;
-// use maplit::hashset;
-use milli::{
-    update::{IndexerConfig, Settings},
-    Criterion, Index,
-};
-
-fn main() {
-    let mut options = EnvOpenOptions::new();
-    options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-    let index = Index::new(options, "data_movies.ms").unwrap();
-    let mut wtxn = index.write_txn().unwrap();
-
-    let config = IndexerConfig::default();
-    let mut builder = Settings::new(&mut wtxn, &index, &config);
-
-    // builder.set_min_word_len_one_typo(5);
-    // builder.set_min_word_len_two_typos(7);
-    // builder.set_sortable_fields(hashset! { S("release_date") });
-    builder.set_criteria(vec![
-        Criterion::Words,
-        Criterion::Typo,
-        Criterion::Proximity,
-        Criterion::Attribute,
-        Criterion::Sort,
-        Criterion::Exactness,
-    ]);
-
-    builder.execute(|_| (), || false).unwrap();
-    wtxn.commit().unwrap();
-}
--- a/crates/milli/src/documents/mod.rs
+++ b/crates/milli/src/documents/mod.rs
@ -150,11 +150,24 @@ pub fn objects_from_json_value(json: serde_json::Value) -> Vec<crate::Object> {
 macro_rules! documents {
    ($data:tt) => {{
        let documents = serde_json::json!($data);
-        let documents = $crate::documents::objects_from_json_value(documents);
-        $crate::documents::documents_batch_reader_from_objects(documents)
+        let mut file = tempfile::tempfile().unwrap();
+        for document in documents.as_array().unwrap() {
+            serde_json::to_writer(&mut file, &document).unwrap();
+        }
+        file.sync_all().unwrap();
+        unsafe { memmap2::Mmap::map(&file).unwrap() }
    }};
 }

+pub fn mmap_from_objects(objects: impl IntoIterator<Item = Object>) -> memmap2::Mmap {
+    let mut writer = tempfile::tempfile().map(std::io::BufWriter::new).unwrap();
+    for object in objects {
+        serde_json::to_writer(&mut writer, &object).unwrap();
+    }
+    let file = writer.into_inner().unwrap();
+    unsafe { memmap2::Mmap::map(&file).unwrap() }
+}
+
 pub fn documents_batch_reader_from_objects(
    objects: impl IntoIterator<Item = Object>,
 ) -> DocumentsBatchReader<std::io::Cursor<Vec<u8>>> {
@ -224,20 +237,6 @@ mod test {
        assert!(documents.next_document().unwrap().is_none());
    }

-    #[test]
-    fn test_nested() {
-        let docs_reader = documents!([{
-            "hello": {
-                "toto": ["hello"]
-            }
-        }]);
-
-        let (mut cursor, _) = docs_reader.into_cursor_and_fields_index();
-        let doc = cursor.next_document().unwrap().unwrap();
-        let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
-        assert_eq!(nested, json!({ "toto": ["hello"] }));
-    }
-
    #[test]
    fn out_of_order_json_fields() {
        let _documents = documents!([
--- a/crates/milli/src/index.rs
+++ b/crates/milli/src/index.rs
@ -1680,19 +1680,23 @@ pub(crate) mod tests {
    use std::ops::Deref;

    use big_s::S;
+    use bumpalo::Bump;
    use heed::{EnvOpenOptions, RwTxn};
    use maplit::{btreemap, hashset};
+    use memmap2::Mmap;
    use tempfile::TempDir;

-    use crate::documents::DocumentsBatchReader;
    use crate::error::{Error, InternalError};
    use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
+    use crate::update::new::indexer;
    use crate::update::{
-        self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
-        Settings,
+        self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, Settings,
    };
    use crate::vector::settings::{EmbedderSource, EmbeddingSettings};
-    use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};
+    use crate::vector::EmbeddingConfigs;
+    use crate::{
+        db_snap, obkv_to_json, Filter, Index, Search, SearchResult, ThreadPoolNoAbortBuilder,
+    };

    pub(crate) struct TempIndex {
        pub inner: Index,
@ -1725,35 +1729,60 @@ pub(crate) mod tests {
        pub fn new() -> Self {
            Self::new_with_map_size(4096 * 2000)
        }
-        pub fn add_documents_using_wtxn<'t, R>(
+
+        pub fn add_documents_using_wtxn<'t>(
            &'t self,
            wtxn: &mut RwTxn<'t>,
-            documents: DocumentsBatchReader<R>,
-        ) -> Result<(), crate::error::Error>
-        where
-            R: std::io::Read + std::io::Seek,
-        {
-            let builder = IndexDocuments::new(
+            documents: Mmap,
+        ) -> Result<(), crate::error::Error> {
+            let local_pool;
+            let indexer_config = &self.indexer_config;
+            let pool = match &indexer_config.thread_pool {
+                Some(pool) => pool,
+                None => {
+                    local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap();
+                    &local_pool
+                }
+            };
+
+            let rtxn = self.inner.read_txn()?;
+            let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?;
+            let mut new_fields_ids_map = db_fields_ids_map.clone();
+
+            let embedders = EmbeddingConfigs::default();
+            let mut indexer =
+                indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
+            indexer.add_documents(&documents).unwrap();
+
+            let indexer_alloc = Bump::new();
+            let (document_changes, _operation_stats, primary_key) = indexer.into_changes(
+                &indexer_alloc,
+                &self.inner,
+                &rtxn,
+                None,
+                &mut new_fields_ids_map,
+            )?;
+
+            pool.install(|| {
+                indexer::index(
                    wtxn,
-                self,
-                &self.indexer_config,
-                self.index_documents_config.clone(),
-                |_| (),
-                || false,
+                    &self.inner,
+                    indexer_config.grenad_parameters(),
+                    &db_fields_ids_map,
+                    new_fields_ids_map,
+                    primary_key,
+                    &document_changes,
+                    embedders,
+                    &|| false,
+                    &|_| (),
                )
-            .unwrap();
-            let (builder, user_error) = builder.add_documents(documents).unwrap();
-            user_error?;
-            builder.execute()?;
+            })
+            .unwrap()?;
+
            Ok(())
        }
-        pub fn add_documents<R>(
-            &self,
-            documents: DocumentsBatchReader<R>,
-        ) -> Result<(), crate::error::Error>
-        where
-            R: std::io::Read + std::io::Seek,
-        {
+
+        pub fn add_documents(&self, documents: Mmap) -> Result<(), crate::error::Error> {
            let mut wtxn = self.write_txn().unwrap();
            self.add_documents_using_wtxn(&mut wtxn, documents)?;
            wtxn.commit().unwrap();
@ -1769,6 +1798,7 @@ pub(crate) mod tests {
            wtxn.commit().unwrap();
            Ok(())
        }
+
        pub fn update_settings_using_wtxn<'t>(
            &'t self,
            wtxn: &mut RwTxn<'t>,
@ -1784,19 +1814,54 @@ pub(crate) mod tests {
            &'t self,
            wtxn: &mut RwTxn<'t>,
            external_document_ids: Vec<String>,
-        ) {
-            let builder = IndexDocuments::new(
+        ) -> Result<(), crate::error::Error> {
+            let local_pool;
+            let indexer_config = &self.indexer_config;
+            let pool = match &indexer_config.thread_pool {
+                Some(pool) => pool,
+                None => {
+                    local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap();
+                    &local_pool
+                }
+            };
+
+            let rtxn = self.inner.read_txn()?;
+            let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?;
+            let mut new_fields_ids_map = db_fields_ids_map.clone();
+
+            let embedders = EmbeddingConfigs::default();
+            let mut indexer =
+                indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
+            let external_document_ids: Vec<_> =
+                external_document_ids.iter().map(AsRef::as_ref).collect();
+            indexer.delete_documents(external_document_ids.as_slice());
+
+            let indexer_alloc = Bump::new();
+            let (document_changes, _operation_stats, primary_key) = indexer.into_changes(
+                &indexer_alloc,
+                &self.inner,
+                &rtxn,
+                None,
+                &mut new_fields_ids_map,
+            )?;
+
+            pool.install(|| {
+                indexer::index(
                    wtxn,
-                self,
-                &self.indexer_config,
-                self.index_documents_config.clone(),
-                |_| (),
-                || false,
+                    &self.inner,
+                    indexer_config.grenad_parameters(),
+                    &db_fields_ids_map,
+                    new_fields_ids_map,
+                    primary_key,
+                    &document_changes,
+                    embedders,
+                    &|| false,
+                    &|_| (),
                )
-            .unwrap();
-            let (builder, user_error) = builder.remove_documents(external_document_ids).unwrap();
-            user_error.unwrap();
-            builder.execute().unwrap();
+            })
+            .unwrap()?;
+
+            Ok(())
        }

        pub fn delete_documents(&self, external_document_ids: Vec<String>) {
@ -1819,29 +1884,55 @@ pub(crate) mod tests {

        let index = TempIndex::new();
        let mut wtxn = index.inner.write_txn().unwrap();
-
        let should_abort = AtomicBool::new(false);
-        let builder = IndexDocuments::new(
-            &mut wtxn,
-            &index.inner,
-            &index.indexer_config,
-            index.index_documents_config.clone(),
-            |_| (),
-            || should_abort.load(Relaxed),
-        )
-        .unwrap();

-        let (builder, user_error) = builder
-            .add_documents(documents!([
+        let local_pool;
+        let indexer_config = &index.indexer_config;
+        let pool = match &indexer_config.thread_pool {
+            Some(pool) => pool,
+            None => {
+                local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap();
+                &local_pool
+            }
+        };
+
+        let rtxn = index.inner.read_txn().unwrap();
+        let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
+        let mut new_fields_ids_map = db_fields_ids_map.clone();
+
+        let embedders = EmbeddingConfigs::default();
+        let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
+        let payload = documents!([
            { "id": 1, "name": "kevin" },
            { "id": 2, "name": "bob", "age": 20 },
            { "id": 2, "name": "bob", "age": 20 },
-            ]))
+        ]);
+        indexer.add_documents(&payload);
+
+        let indexer_alloc = Bump::new();
+        let (document_changes, _operation_stats, primary_key) = indexer
+            .into_changes(&indexer_alloc, &index.inner, &rtxn, None, &mut new_fields_ids_map)
            .unwrap();
-        user_error.unwrap();

        should_abort.store(true, Relaxed);
-        let err = builder.execute().unwrap_err();
+
+        let err = pool
+            .install(|| {
+                indexer::index(
+                    &mut wtxn,
+                    &index.inner,
+                    indexer_config.grenad_parameters(),
+                    &db_fields_ids_map,
+                    new_fields_ids_map,
+                    primary_key,
+                    &document_changes,
+                    embedders,
+                    &|| should_abort.load(Relaxed),
+                    &|_| (),
+                )
+            })
+            .unwrap()
+            .unwrap_err();

        assert!(matches!(err, Error::InternalError(InternalError::AbortedIndexation)));
    }
--- a/crates/milli/src/search/facet/facet_distribution.rs
+++ b/crates/milli/src/search/facet/facet_distribution.rs
@ -407,7 +407,7 @@ mod tests {
    use big_s::S;
    use maplit::hashset;

-    use crate::documents::documents_batch_reader_from_objects;
+    use crate::documents::mmap_from_objects;
    use crate::index::tests::TempIndex;
    use crate::{milli_snap, FacetDistribution, OrderBy};

@ -508,8 +508,7 @@ mod tests {
            documents.push(document);
        }

-        let documents = documents_batch_reader_from_objects(documents);
-
+        let documents = mmap_from_objects(documents);
        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();
@ -594,8 +593,7 @@ mod tests {
            documents.push(document);
        }

-        let documents = documents_batch_reader_from_objects(documents);
-
+        let documents = mmap_from_objects(documents);
        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();
@ -654,8 +652,7 @@ mod tests {
            documents.push(document);
        }

-        let documents = documents_batch_reader_from_objects(documents);
-
+        let documents = mmap_from_objects(documents);
        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();
@ -706,8 +703,7 @@ mod tests {
            documents.push(document);
        }

-        let documents = documents_batch_reader_from_objects(documents);
-
+        let documents = mmap_from_objects(documents);
        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();
@ -758,8 +754,7 @@ mod tests {
            documents.push(document);
        }

-        let documents = documents_batch_reader_from_objects(documents);
-
+        let documents = mmap_from_objects(documents);
        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();
@ -814,8 +809,7 @@ mod tests {
            documents.push(document);
        }

-        let documents = documents_batch_reader_from_objects(documents);
-
+        let documents = mmap_from_objects(documents);
        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();
--- a/crates/milli/src/search/new/tests/integration.rs
+++ b/crates/milli/src/search/new/tests/integration.rs
@ -1,11 +1,16 @@
-use std::io::Cursor;
+use std::io::Write;

 use big_s::S;
+use bumpalo::Bump;
 use heed::EnvOpenOptions;
 use maplit::{btreemap, hashset};

 use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
+use crate::update::new::indexer;
+use crate::update::{
+    IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
+};
+use crate::vector::EmbeddingConfigs;
 use crate::{db_snap, Criterion, Index, Object};
 pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson");

@ -16,6 +21,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
    let index = Index::new(options, &path).unwrap();

    let mut wtxn = index.write_txn().unwrap();
+    let rtxn = index.read_txn().unwrap();
    let config = IndexerConfig::default();

    let mut builder = Settings::new(&mut wtxn, &index, &config);
@ -43,27 +49,41 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {

    // index documents
    let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
-    let indexing_config = IndexDocumentsConfig::default();

-    let builder =
-        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
-    let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
-    let reader = Cursor::new(CONTENT.as_bytes());
+    let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
+    let mut new_fields_ids_map = db_fields_ids_map.clone();

-    for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
-        let object = result.unwrap();
-        documents_builder.append_json_object(&object).unwrap();
-    }
+    let embedders = EmbeddingConfigs::default();
+    let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);

-    let vector = documents_builder.into_inner().unwrap();
+    let mut file = tempfile::tempfile().unwrap();
+    file.write_all(CONTENT.as_bytes()).unwrap();
+    file.sync_all().unwrap();
+    let payload = unsafe { memmap2::Mmap::map(&file).unwrap() };

    // index documents
-    let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
-    let (builder, user_error) = builder.add_documents(content).unwrap();
-    user_error.unwrap();
-    builder.execute().unwrap();
+    indexer.add_documents(&payload).unwrap();
+
+    let indexer_alloc = Bump::new();
+    let (document_changes, _operation_stats, primary_key) =
+        indexer.into_changes(&indexer_alloc, &index, &rtxn, None, &mut new_fields_ids_map).unwrap();
+
+    indexer::index(
+        &mut wtxn,
+        &index,
+        config.grenad_parameters(),
+        &db_fields_ids_map,
+        new_fields_ids_map,
+        primary_key,
+        &document_changes,
+        embedders,
+        &|| false,
+        &|_| (),
+    )
+    .unwrap();

    wtxn.commit().unwrap();
+    drop(rtxn);

    index
 }
--- a/crates/milli/src/update/facet/bulk.rs
+++ b/crates/milli/src/update/facet/bulk.rs
@ -369,7 +369,7 @@ mod tests {
    use maplit::hashset;
    use roaring::RoaringBitmap;

-    use crate::documents::documents_batch_reader_from_objects;
+    use crate::documents::{documents_batch_reader_from_objects, mmap_from_objects};
    use crate::heed_codec::facet::OrderedF64Codec;
    use crate::heed_codec::StrRefCodec;
    use crate::index::tests::TempIndex;
@ -492,8 +492,8 @@ mod tests {
            );
        }

-        let documents = documents_batch_reader_from_objects(documents);
-        index.add_documents(documents).unwrap();
+        let documents = mmap_from_objects(documents);
+        index.add_documents(documents);

        db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
    }
--- a/crates/milli/src/update/index_documents/mod.rs
+++ b/crates/milli/src/update/index_documents/mod.rs
--- a/crates/milli/src/update/index_documents/parallel.rs
+++ b/crates/milli/src/update/index_documents/parallel.rs
@ -1,86 +0,0 @@
-use heed::types::Bytes;
-use heed::{Database, RoTxn};
-use obkv::KvReaderU16;
-use roaring::RoaringBitmap;
-
-use crate::{all_obkv_to_json, DocumentId, FieldsIdsMap, Object, ObkvCodec, Result, BEU32};
-
-pub struct ImmutableObkvs<'t> {
-    ids: RoaringBitmap,
-    fields_ids_map: FieldsIdsMap,
-    slices: Vec<&'t [u8]>,
-}
-
-impl<'t> ImmutableObkvs<'t> {
-    /// Creates the structure by fetching all the OBKVs
-    /// and keeping the transaction making the pointers valid.
-    pub fn new(
-        rtxn: &'t RoTxn,
-        documents_database: Database<BEU32, ObkvCodec>,
-        fields_ids_map: FieldsIdsMap,
-        subset: RoaringBitmap,
-    ) -> heed::Result<Self> {
-        let mut slices = Vec::new();
-        let documents_database = documents_database.remap_data_type::<Bytes>();
-        for docid in &subset {
-            let slice = documents_database.get(rtxn, &docid)?.unwrap();
-            slices.push(slice);
-        }
-
-        Ok(ImmutableObkvs { ids: subset, fields_ids_map, slices })
-    }
-
-    /// Returns the OBKVs identified by the given ID.
-    pub fn obkv(&self, docid: DocumentId) -> heed::Result<Option<&'t KvReaderU16>> {
-        match self
-            .ids
-            .rank(docid)
-            .checked_sub(1)
-            .and_then(|offset| self.slices.get(offset as usize))
-        {
-            Some(&bytes) => Ok(Some(bytes.into())),
-            None => Ok(None),
-        }
-    }
-
-    /// Returns the owned rhai::Map identified by the given ID.
-    pub fn rhai_map(&self, docid: DocumentId) -> Result<Option<rhai::Map>> {
-        let obkv = match self.obkv(docid) {
-            Ok(Some(obkv)) => obkv,
-            Ok(None) => return Ok(None),
-            Err(e) => return Err(e.into()),
-        };
-
-        let all_keys = obkv.iter().map(|(k, _v)| k).collect::<Vec<_>>();
-        let map: Result<rhai::Map> = all_keys
-            .iter()
-            .copied()
-            .flat_map(|id| obkv.get(id).map(|value| (id, value)))
-            .map(|(id, value)| {
-                let name = self.fields_ids_map.name(id).ok_or(
-                    crate::error::FieldIdMapMissingEntry::FieldId {
-                        field_id: id,
-                        process: "all_obkv_to_rhaimap",
-                    },
-                )?;
-                let value = serde_json::from_slice(value)
-                    .map_err(crate::error::InternalError::SerdeJson)?;
-                Ok((name.into(), value))
-            })
-            .collect();
-
-        map.map(Some)
-    }
-
-    pub fn json_map(&self, docid: DocumentId) -> Result<Option<Object>> {
-        let obkv = match self.obkv(docid) {
-            Ok(Some(obkv)) => obkv,
-            Ok(None) => return Ok(None),
-            Err(e) => return Err(e.into()),
-        };
-
-        all_obkv_to_json(obkv, &self.fields_ids_map).map(Some)
-    }
-}
-
-unsafe impl Sync for ImmutableObkvs<'_> {}
--- a/crates/milli/src/update/index_documents/transform.rs
+++ b/crates/milli/src/update/index_documents/transform.rs
@ -48,23 +48,8 @@ pub struct TransformOutput {
 /// containing all those documents.
 pub struct Transform<'a, 'i> {
    pub index: &'i Index,
-    fields_ids_map: FieldsIdsMap,
-
    indexer_settings: &'a IndexerConfig,
    pub index_documents_method: IndexDocumentsMethod,
-    available_documents_ids: AvailableIds,
-
-    // Both grenad follows the same format:
-    // key | value
-    // u32 | 1 byte for the Operation byte, the rest is the obkv of the document stored
-    original_sorter: grenad::Sorter<EitherObkvMerge>,
-    flattened_sorter: grenad::Sorter<EitherObkvMerge>,
-
-    replaced_documents_ids: RoaringBitmap,
-    new_documents_ids: RoaringBitmap,
-    // To increase the cache locality and decrease the heap usage we use compact smartstring.
-    new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
-    documents_count: usize,
 }

 /// This enum is specific to the grenad sorter stored in the transform.
@ -75,29 +60,6 @@ pub enum Operation {
    Deletion,
 }

-/// Create a mapping between the field ids found in the document batch and the one that were
-/// already present in the index.
-///
-/// If new fields are present in the addition, they are added to the index field ids map.
-fn create_fields_mapping(
-    index_field_map: &mut FieldsIdsMap,
-    batch_field_map: &DocumentsBatchIndex,
-) -> Result<HashMap<FieldId, FieldId>> {
-    batch_field_map
-        .iter()
-        // we sort by id here to ensure a deterministic mapping of the fields, that preserves
-        // the original ordering.
-        .sorted_by_key(|(&id, _)| id)
-        .map(|(field, name)| match index_field_map.id(name) {
-            Some(id) => Ok((*field, id)),
-            None => index_field_map
-                .insert(name)
-                .ok_or(Error::UserError(UserError::AttributeLimitReached))
-                .map(|id| (*field, id)),
-        })
-        .collect()
-}
-
 impl<'a, 'i> Transform<'a, 'i> {
    pub fn new(
        wtxn: &mut heed::RwTxn<'_>,
@ -138,19 +100,7 @@ impl<'a, 'i> Transform<'a, 'i> {
        );
        let documents_ids = index.documents_ids(wtxn)?;

-        Ok(Transform {
-            index,
-            fields_ids_map: index.fields_ids_map(wtxn)?,
-            indexer_settings,
-            available_documents_ids: AvailableIds::new(&documents_ids),
-            original_sorter,
-            flattened_sorter,
-            index_documents_method,
-            replaced_documents_ids: RoaringBitmap::new(),
-            new_documents_ids: RoaringBitmap::new(),
-            new_external_documents_ids_builder: FxHashMap::default(),
-            documents_count: 0,
-        })
+        Ok(Transform { index, indexer_settings, index_documents_method })
    }

    // Flatten a document from the fields ids map contained in self and insert the new
--- a/crates/milli/tests/search/facet_distribution.rs
+++ b/crates/milli/tests/search/facet_distribution.rs
@ -1,12 +1,17 @@
 use std::io::Cursor;

 use big_s::S;
+use bumpalo::Bump;
 use heed::EnvOpenOptions;
 use maplit::hashset;
-use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
+use milli::documents::{mmap_from_objects, DocumentsBatchBuilder, DocumentsBatchReader};
+use milli::update::new::indexer;
+use milli::update::{
+    IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
+};
+use milli::vector::EmbeddingConfigs;
 use milli::{FacetDistribution, Index, Object, OrderBy};
-use serde_json::Deserializer;
+use serde_json::{from_value, json, Deserializer};

 #[test]
 fn test_facet_distribution_with_no_facet_values() {
@ -27,37 +32,41 @@ fn test_facet_distribution_with_no_facet_values() {

    // index documents
    let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
-    let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
+    let rtxn = index.read_txn().unwrap();
+    let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
+    let mut new_fields_ids_map = db_fields_ids_map.clone();

-    let builder =
-        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
-    let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
-    let reader = Cursor::new(
-        r#"{
-            "id": 123,
-            "title": "What a week, hu...",
-            "genres": [],
-            "tags": ["blue"]
-        }
-        {
-            "id": 345,
-            "title": "I am the pig!",
-            "tags": ["red"]
-        }"#,
-    );
+    let embedders = EmbeddingConfigs::default();
+    let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);

-    for result in Deserializer::from_reader(reader).into_iter::<Object>() {
-        let object = result.unwrap();
-        documents_builder.append_json_object(&object).unwrap();
-    }
-
-    let vector = documents_builder.into_inner().unwrap();
+    let doc1: Object = from_value(
+        json!({ "id": 123, "title": "What a week, hu...", "genres": [], "tags": ["blue"] }),
+    )
+    .unwrap();
+    let doc2: Object =
+        from_value(json!({ "id": 345, "title": "I am the pig!", "tags": ["red"] })).unwrap();
+    let documents = mmap_from_objects(vec![doc1, doc2]);

    // index documents
-    let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
-    let (builder, user_error) = builder.add_documents(content).unwrap();
-    user_error.unwrap();
-    builder.execute().unwrap();
+    indexer.add_documents(&documents).unwrap();
+
+    let indexer_alloc = Bump::new();
+    let (document_changes, _operation_stats, primary_key) =
+        indexer.into_changes(&indexer_alloc, &index, &rtxn, None, &mut new_fields_ids_map).unwrap();
+
+    indexer::index(
+        &mut wtxn,
+        &index,
+        config.grenad_parameters(),
+        &db_fields_ids_map,
+        new_fields_ids_map,
+        primary_key,
+        &document_changes,
+        embedders,
+        &|| false,
+        &|_| (),
+    )
+    .unwrap();

    wtxn.commit().unwrap();

--- a/crates/milli/tests/search/mod.rs
+++ b/crates/milli/tests/search/mod.rs
@ -1,14 +1,16 @@
 use std::cmp::Reverse;
 use std::collections::HashSet;
-use std::io::Cursor;
+use std::io::Write;

 use big_s::S;
+use bumpalo::Bump;
 use either::{Either, Left, Right};
 use heed::EnvOpenOptions;
 use maplit::{btreemap, hashset};
-use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
-use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy};
+use milli::update::new::indexer;
+use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
+use milli::vector::EmbeddingConfigs;
+use milli::{AscDesc, Criterion, DocumentId, Index, Member, TermsMatchingStrategy};
 use serde::{Deserialize, Deserializer};
 use slice_group_by::GroupBy;

@ -34,6 +36,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
    let index = Index::new(options, &path).unwrap();

    let mut wtxn = index.write_txn().unwrap();
+    let rtxn = index.read_txn().unwrap();
    let config = IndexerConfig::default();

    let mut builder = Settings::new(&mut wtxn, &index, &config);
@ -61,27 +64,41 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {

    // index documents
    let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
-    let indexing_config = IndexDocumentsConfig::default();
+    let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
+    let mut new_fields_ids_map = db_fields_ids_map.clone();

-    let builder =
-        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
-    let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
-    let reader = Cursor::new(CONTENT.as_bytes());
+    let embedders = EmbeddingConfigs::default();
+    let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);

-    for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
-        let object = result.unwrap();
-        documents_builder.append_json_object(&object).unwrap();
-    }
+    let mut file = tempfile::tempfile().unwrap();
+    file.write_all(CONTENT.as_bytes()).unwrap();
+    file.sync_all().unwrap();

-    let vector = documents_builder.into_inner().unwrap();
+    let payload = unsafe { memmap2::Mmap::map(&file).unwrap() };

    // index documents
-    let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
-    let (builder, user_error) = builder.add_documents(content).unwrap();
-    user_error.unwrap();
-    builder.execute().unwrap();
+    indexer.add_documents(&payload).unwrap();
+
+    let indexer_alloc = Bump::new();
+    let (document_changes, _operation_stats, primary_key) =
+        indexer.into_changes(&indexer_alloc, &index, &rtxn, None, &mut new_fields_ids_map).unwrap();
+
+    indexer::index(
+        &mut wtxn,
+        &index,
+        config.grenad_parameters(),
+        &db_fields_ids_map,
+        new_fields_ids_map,
+        primary_key,
+        &document_changes,
+        embedders,
+        &|| false,
+        &|_| (),
+    )
+    .unwrap();

    wtxn.commit().unwrap();
+    drop(rtxn);

    index
 }
--- a/crates/milli/tests/search/query_criteria.rs
+++ b/crates/milli/tests/search/query_criteria.rs
@ -2,11 +2,16 @@ use std::cmp::Reverse;
 use std::io::Cursor;

 use big_s::S;
+use bumpalo::Bump;
 use heed::EnvOpenOptions;
 use itertools::Itertools;
 use maplit::hashset;
 use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
+use milli::update::new::indexer;
+use milli::update::{
+    IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
+};
+use milli::vector::EmbeddingConfigs;
 use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy};
 use rand::Rng;
 use Criterion::*;
@ -275,14 +280,20 @@ fn criteria_ascdesc() {
    });
    builder.execute(|_| (), || false).unwrap();

+    wtxn.commit().unwrap();
+    let mut wtxn = index.write_txn().unwrap();
+    let rtxn = index.read_txn().unwrap();
+
    // index documents
    let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
-    let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
-    let builder =
-        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
+    let indexer_alloc = Bump::new();
+    let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
+    let mut new_fields_ids_map = db_fields_ids_map.clone();

-    let mut batch_builder = DocumentsBatchBuilder::new(Vec::new());
+    let embedders = EmbeddingConfigs::default();
+    let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);

+    let mut file = tempfile::tempfile().unwrap();
    (0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| {
        let mut rng = rand::thread_rng();

@ -304,15 +315,29 @@ fn criteria_ascdesc() {
            _ => panic!(),
        };

-        batch_builder.append_json_object(&object).unwrap();
+        serde_json::to_writer(&mut file, &object).unwrap();
    });

-    let vector = batch_builder.into_inner().unwrap();
+    file.sync_all().unwrap();

-    let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
-    let (builder, user_error) = builder.add_documents(reader).unwrap();
-    user_error.unwrap();
-    builder.execute().unwrap();
+    let payload = unsafe { memmap2::Mmap::map(&file).unwrap() };
+    indexer.add_documents(&payload).unwrap();
+    let (document_changes, _operation_stats, primary_key) =
+        indexer.into_changes(&indexer_alloc, &index, &rtxn, None, &mut new_fields_ids_map).unwrap();
+
+    indexer::index(
+        &mut wtxn,
+        &index,
+        config.grenad_parameters(),
+        &db_fields_ids_map,
+        new_fields_ids_map,
+        primary_key,
+        &document_changes,
+        embedders,
+        &|| false,
+        &|_| (),
+    )
+    .unwrap();

    wtxn.commit().unwrap();

--- a/crates/milli/tests/search/typo_tolerance.rs
+++ b/crates/milli/tests/search/typo_tolerance.rs
@ -1,10 +1,15 @@
 use std::collections::BTreeSet;

+use bumpalo::Bump;
 use heed::EnvOpenOptions;
-use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
-use milli::{Criterion, Index, Search, TermsMatchingStrategy};
-use serde_json::json;
+use milli::documents::mmap_from_objects;
+use milli::update::new::indexer;
+use milli::update::{IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings};
+use milli::vector::EmbeddingConfigs;
+use milli::{Criterion, Index, Object, Search, TermsMatchingStrategy};
+use serde_json::from_value;
 use tempfile::tempdir;
+use ureq::json;
 use Criterion::*;

 #[test]
@ -106,34 +111,40 @@ fn test_typo_disabled_on_word() {
    options.map_size(4096 * 100);
    let index = Index::new(options, tmp.path()).unwrap();

-    let mut builder = milli::documents::DocumentsBatchBuilder::new(Vec::new());
-    let doc1 = json!({
-        "id": 1usize,
-        "data": "zealand",
-    });
+    let doc1: Object = from_value(json!({ "id": 1usize, "data": "zealand" })).unwrap();
+    let doc2: Object = from_value(json!({ "id": 2usize, "data": "zearand" })).unwrap();
+    let documents = mmap_from_objects(vec![doc1, doc2]);

-    let doc2 = json!({
-        "id": 2usize,
-        "data": "zearand",
-    });
-
-    builder.append_json_object(doc1.as_object().unwrap()).unwrap();
-    builder.append_json_object(doc2.as_object().unwrap()).unwrap();
-    let vector = builder.into_inner().unwrap();
-
-    let documents =
-        milli::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap();
-
-    let mut txn = index.write_txn().unwrap();
+    let mut wtxn = index.write_txn().unwrap();
+    let rtxn = index.read_txn().unwrap();
    let config = IndexerConfig::default();
-    let indexing_config = IndexDocumentsConfig::default();
-    let builder =
-        IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| (), || false).unwrap();

-    let (builder, user_error) = builder.add_documents(documents).unwrap();
-    user_error.unwrap();
-    builder.execute().unwrap();
-    txn.commit().unwrap();
+    let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
+    let mut new_fields_ids_map = db_fields_ids_map.clone();
+    let embedders = EmbeddingConfigs::default();
+    let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments);
+
+    indexer.add_documents(&documents).unwrap();
+
+    let indexer_alloc = Bump::new();
+    let (document_changes, _operation_stats, primary_key) =
+        indexer.into_changes(&indexer_alloc, &index, &rtxn, None, &mut new_fields_ids_map).unwrap();
+
+    indexer::index(
+        &mut wtxn,
+        &index,
+        config.grenad_parameters(),
+        &db_fields_ids_map,
+        new_fields_ids_map,
+        primary_key,
+        &document_changes,
+        embedders,
+        &|| false,
+        &|_| (),
+    )
+    .unwrap();
+
+    wtxn.commit().unwrap();

    // basic typo search with default typo settings
    {