2024-11-23 02:27:40 +08:00
28 changed files with 647 additions and 66 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -4483,7 +4483,7 @@ dependencies = [
 [[package]]
 name = "raw-collections"
 version = "0.1.0"
-source = "git+https://github.com/meilisearch/raw-collections.git#15e5d7bdebc0c149b2a28b2454f307c717d07f8a"
+source = "git+https://github.com/dureuill/raw-collections.git#15e5d7bdebc0c149b2a28b2454f307c717d07f8a"
 dependencies = [
 "allocator-api2",
 "bitpacking",
--- a/crates/index-scheduler/Cargo.toml
+++ b/crates/index-scheduler/Cargo.toml
@ -22,7 +22,7 @@ flate2 = "1.0.30"
 meilisearch-auth = { path = "../meilisearch-auth" }
 meilisearch-types = { path = "../meilisearch-types" }
 page_size = "0.6.0"
-raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
+raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
 rayon = "1.10.0"
 roaring = { version = "0.10.6", features = ["serde"] }
 serde = { version = "1.0.204", features = ["derive"] }
--- a/crates/index-scheduler/src/batch.rs
+++ b/crates/index-scheduler/src/batch.rs
@ -1239,7 +1239,7 @@ impl IndexScheduler {

        let started_processing_at = std::time::Instant::now();
        let secs_since_started_processing_at = AtomicU64::new(0);
-        const PRINT_SECS_DELTA: u64 = 5;
+        const PRINT_SECS_DELTA: u64 = 1;

        let processing_tasks = self.processing_tasks.clone();
        let must_stop_processing = self.must_stop_processing.clone();
@ -1329,6 +1329,7 @@ impl IndexScheduler {
                        DocumentOperation::Add(_content_uuid) => {
                            let mmap = content_files_iter.next().unwrap();
                            indexer.add_documents(mmap)?;
+                            // builder = builder.with_embedders(embedders.clone());
                        }
                        DocumentOperation::Delete(document_ids) => {
                            let document_ids: bumpalo::collections::vec::Vec<_> = document_ids
@ -1411,6 +1412,17 @@ impl IndexScheduler {

                    tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
                }
+                // else if primary_key_has_been_set {
+                //     // Everything failed but we've set a primary key.
+                //     // We need to remove it.
+                //     let mut builder =
+                //         milli::update::Settings::new(index_wtxn, index, indexer_config);
+                //     builder.reset_primary_key();
+                //     builder.execute(
+                //         |indexing_step| tracing::trace!(update = ?indexing_step),
+                //         || must_stop_processing.clone().get(),
+                //     )?;
+                // }

                Ok(tasks)
            }
--- a/crates/index-scheduler/src/lib.rs
+++ b/crates/index-scheduler/src/lib.rs
@ -1365,7 +1365,6 @@ impl IndexScheduler {
        let ProcessingTasks { batch, processing, progress } =
            self.processing_tasks.read().map_err(|_| Error::CorruptedTaskQueue)?.clone();

-        // ignored for now, might be added to batch details later
        let _ = progress;

        let ret = tasks.into_iter();
@ -5199,9 +5198,11 @@ mod tests {
        handle.advance_one_successful_batch();
        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "only_first_task_succeed");

+        // The second batch should fail.
        handle.advance_one_successful_batch();
        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_task_fails");

+        // The second batch should fail.
        handle.advance_one_successful_batch();
        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "third_task_fails");

@ -5262,6 +5263,7 @@ mod tests {
        handle.advance_one_successful_batch();
        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "only_first_task_succeed");

+        // The second batch should fail and contains two tasks.
        handle.advance_one_successful_batch();
        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "second_and_third_tasks_fails");

--- a/crates/meili-snap/Cargo.toml
+++ b/crates/meili-snap/Cargo.toml
@ -11,7 +11,6 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
-# fixed version due to format breakages in v1.40
 insta = { version = "=1.39.0", features = ["json", "redactions"] }
 md5 = "0.7.0"
 once_cell = "1.19"
--- a/crates/meilisearch-types/Cargo.toml
+++ b/crates/meilisearch-types/Cargo.toml
@ -24,7 +24,7 @@ flate2 = "1.0.30"
 fst = "0.4.7"
 memmap2 = "0.9.4"
 milli = { path = "../milli" }
-raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
+raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
 roaring = { version = "0.10.6", features = ["serde"] }
 serde = { version = "1.0.204", features = ["derive"] }
 serde-cs = "0.2.4"
--- a/crates/meilisearch-types/src/document_formats.rs
+++ b/crates/meilisearch-types/src/document_formats.rs
@ -6,6 +6,7 @@ use std::marker::PhantomData;
 use bumpalo::Bump;
 use memmap2::Mmap;
 use milli::documents::Error;
+use milli::update::new::TopLevelMap;
 use milli::Object;
 use raw_collections::RawMap;
 use serde::de::{SeqAccess, Visitor};
@ -127,6 +128,7 @@ impl ErrorCode for DocumentFormatError {
    }
 }

+// TODO remove that from the place I've borrowed it
 #[derive(Debug)]
 enum AllowedType {
    String,
@ -211,7 +213,7 @@ pub fn read_csv(input: &File, output: impl io::Write, delimiter: u8) -> Result<u

 /// Reads JSON from file and write it in NDJSON in a file checking it along the way.
 pub fn read_json(input: &File, output: impl io::Write) -> Result<u64> {
-    // We memory map to be able to deserialize into a RawMap that
+    // We memory map to be able to deserailize into a TopLevelMap<'pl> that
    // does not allocate when possible and only materialize the first/top level.
    let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? };
    let mut doc_alloc = Bump::with_capacity(1024 * 1024 * 1024); // 1MiB
@ -252,23 +254,16 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result<u64> {

 /// Reads NDJSON from file and write it in NDJSON in a file checking it along the way.
 pub fn read_ndjson(input: &File, output: impl io::Write) -> Result<u64> {
-    // We memory map to be able to deserialize into a RawMap that
+    // We memory map to be able to deserailize into a TopLevelMap<'pl> that
    // does not allocate when possible and only materialize the first/top level.
    let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? };
    let mut output = BufWriter::new(output);

-    let mut bump = Bump::with_capacity(1024 * 1024);
-
    let mut count = 0;
    for result in serde_json::Deserializer::from_slice(&input).into_iter() {
-        bump.reset();
        count += 1;
        result
-            .and_then(|raw: &RawValue| {
-                // try to deserialize as a map
-                let map = RawMap::from_raw_value(raw, &bump)?;
-                to_writer(&mut output, &map)
-            })
+            .and_then(|map: TopLevelMap| to_writer(&mut output, &map))
            .map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?;
    }

--- a/crates/meilisearch/tests/vector/openai.rs
+++ b/crates/meilisearch/tests/vector/openai.rs
@ -172,7 +172,7 @@ async fn create_mock_with_template(
                }))
             }
            }
-            // 2. check API key
+            // 3. check API key
            match req.headers.get("Authorization") {
                Some(api_key) if api_key == API_KEY_BEARER => {
                    {}
--- a/crates/milli/Cargo.toml
+++ b/crates/milli/Cargo.toml
@ -95,7 +95,7 @@ ureq = { version = "2.10.0", features = ["json"] }
 url = "2.5.2"
 rayon-par-bridge = "0.1.0"
 hashbrown = "0.15.0"
-raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
+raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
 bumpalo = "3.16.0"
 thread_local = "1.1.8"
 allocator-api2 = "0.2.18"
--- a/crates/milli/src/documents/builder.rs
+++ b/crates/milli/src/documents/builder.rs
@ -2,7 +2,7 @@ use std::io::{self, Write};

 use grenad::{CompressionType, WriterBuilder};
 use serde::de::Deserializer;
-use serde_json::to_writer;
+use serde_json::{to_writer, Value};

 use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
 use crate::documents::serde_impl::DocumentVisitor;
@ -87,6 +87,95 @@ impl<W: Write> DocumentsBatchBuilder<W> {
        de.deserialize_any(&mut visitor)?
    }

+    /// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly.
+    pub fn append_csv<R: io::Read>(&mut self, mut reader: csv::Reader<R>) -> Result<(), Error> {
+        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
+        let mut typed_fields_ids: Vec<_> = reader
+            .headers()?
+            .into_iter()
+            .map(parse_csv_header)
+            .map(|(k, t)| (self.fields_index.insert(k), t))
+            .enumerate()
+            .collect();
+        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
+        typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid);
+
+        let mut record = csv::StringRecord::new();
+        let mut line = 0;
+        while reader.read_record(&mut record)? {
+            // We increment here and not at the end of the while loop to take
+            // the header offset into account.
+            line += 1;
+
+            self.obkv_buffer.clear();
+            let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer);
+
+            for (i, (field_id, type_)) in typed_fields_ids.iter() {
+                self.value_buffer.clear();
+
+                let value = &record[*i];
+                let trimmed_value = value.trim();
+                match type_ {
+                    AllowedType::Number => {
+                        if trimmed_value.is_empty() {
+                            to_writer(&mut self.value_buffer, &Value::Null)?;
+                        } else if let Ok(integer) = trimmed_value.parse::<i64>() {
+                            to_writer(&mut self.value_buffer, &integer)?;
+                        } else {
+                            match trimmed_value.parse::<f64>() {
+                                Ok(float) => {
+                                    to_writer(&mut self.value_buffer, &float)?;
+                                }
+                                Err(error) => {
+                                    return Err(Error::ParseFloat {
+                                        error,
+                                        line,
+                                        value: value.to_string(),
+                                    });
+                                }
+                            }
+                        }
+                    }
+                    AllowedType::Boolean => {
+                        if trimmed_value.is_empty() {
+                            to_writer(&mut self.value_buffer, &Value::Null)?;
+                        } else {
+                            match trimmed_value.parse::<bool>() {
+                                Ok(bool) => {
+                                    to_writer(&mut self.value_buffer, &bool)?;
+                                }
+                                Err(error) => {
+                                    return Err(Error::ParseBool {
+                                        error,
+                                        line,
+                                        value: value.to_string(),
+                                    });
+                                }
+                            }
+                        }
+                    }
+                    AllowedType::String => {
+                        if value.is_empty() {
+                            to_writer(&mut self.value_buffer, &Value::Null)?;
+                        } else {
+                            to_writer(&mut self.value_buffer, value)?;
+                        }
+                    }
+                }
+
+                // We insert into the obkv writer the value buffer that has been filled just above.
+                writer.insert(*field_id, &self.value_buffer)?;
+            }
+
+            let internal_id = self.documents_count.to_be_bytes();
+            let document_bytes = writer.into_inner()?;
+            self.writer.insert(internal_id, &document_bytes)?;
+            self.documents_count += 1;
+        }
+
+        Ok(())
+    }
+
    /// Flushes the content on disk and stores the final version of the `DocumentsBatchIndex`.
    pub fn into_inner(mut self) -> io::Result<W> {
        let DocumentsBatchBuilder { mut writer, fields_index, .. } = self;
@ -100,12 +189,35 @@ impl<W: Write> DocumentsBatchBuilder<W> {
    }
 }

+#[derive(Debug)]
+enum AllowedType {
+    String,
+    Boolean,
+    Number,
+}
+
+fn parse_csv_header(header: &str) -> (&str, AllowedType) {
+    // if there are several separators we only split on the last one.
+    match header.rsplit_once(':') {
+        Some((field_name, field_type)) => match field_type {
+            "string" => (field_name, AllowedType::String),
+            "boolean" => (field_name, AllowedType::Boolean),
+            "number" => (field_name, AllowedType::Number),
+            // if the pattern isn't recognized, we keep the whole field.
+            _otherwise => (header, AllowedType::String),
+        },
+        None => (header, AllowedType::String),
+    }
+}
+
 #[cfg(test)]
 mod test {
    use std::io::Cursor;

+    use serde_json::json;
+
    use super::*;
-    use crate::documents::DocumentsBatchReader;
+    use crate::documents::{obkv_to_object, DocumentsBatchReader};

    #[test]
    fn add_single_documents_json() {
@ -141,4 +253,348 @@ mod test {

        assert!(cursor.next_document().unwrap().is_none());
    }
+
+    #[test]
+    fn add_documents_csv() {
+        let csv_content = "id:number,field:string\n1,hello!\n2,blabla";
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv).unwrap();
+        assert_eq!(builder.documents_count(), 2);
+        let vector = builder.into_inner().unwrap();
+
+        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+        assert_eq!(index.len(), 2);
+
+        let document = cursor.next_document().unwrap().unwrap();
+        assert_eq!(document.iter().count(), 2);
+
+        let document = cursor.next_document().unwrap().unwrap();
+        assert_eq!(document.iter().count(), 2);
+
+        assert!(cursor.next_document().unwrap().is_none());
+    }
+
+    #[test]
+    fn simple_csv_document() {
+        let csv_content = r#"city,country,pop
+"Boston","United States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+        let doc = cursor.next_document().unwrap().unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+
+        assert!(cursor.next_document().unwrap().is_none());
+    }
+
+    #[test]
+    fn coma_in_field() {
+        let csv_content = r#"city,country,pop
+"Boston","United, States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+
+        let doc = cursor.next_document().unwrap().unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United, States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn quote_in_field() {
+        let csv_content = r#"city,country,pop
+"Boston","United"" States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+
+        let doc = cursor.next_document().unwrap().unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United\" States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn integer_in_field() {
+        let csv_content = r#"city,country,pop:number
+"Boston","United States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+
+        let doc = cursor.next_document().unwrap().unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United States",
+                "pop": 4628910,
+            })
+        );
+    }
+
+    #[test]
+    fn integer_as_id() {
+        let csv_content = r#""id:number","title:string","comment:string"
+"1239","Pride and Prejudice","A great book""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+
+        let doc = cursor.next_document().unwrap().unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
+
+        assert_eq!(
+            val,
+            json!({
+                "id": 1239,
+                "title": "Pride and Prejudice",
+                "comment": "A great book",
+            })
+        );
+    }
+
+    #[test]
+    fn float_in_field() {
+        let csv_content = r#"city,country,pop:number
+"Boston","United States","4628910.01""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+
+        let doc = cursor.next_document().unwrap().unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United States",
+                "pop": 4628910.01,
+            })
+        );
+    }
+
+    #[test]
+    fn several_colon_in_header() {
+        let csv_content = r#"city:love:string,country:state,pop
+"Boston","United States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+
+        let doc = cursor.next_document().unwrap().unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
+
+        assert_eq!(
+            val,
+            json!({
+                "city:love": "Boston",
+                "country:state": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn ending_by_colon_in_header() {
+        let csv_content = r#"city:,country,pop
+"Boston","United States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+
+        let doc = cursor.next_document().unwrap().unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
+
+        assert_eq!(
+            val,
+            json!({
+                "city:": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn starting_by_colon_in_header() {
+        let csv_content = r#":city,country,pop
+"Boston","United States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+
+        let doc = cursor.next_document().unwrap().unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
+
+        assert_eq!(
+            val,
+            json!({
+                ":city": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[ignore]
+    #[test]
+    fn starting_by_colon_in_header2() {
+        let csv_content = r#":string,country,pop
+"Boston","United States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let (mut cursor, _) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+
+        assert!(cursor.next_document().is_err());
+    }
+
+    #[test]
+    fn double_colon_in_header() {
+        let csv_content = r#"city::string,country,pop
+"Boston","United States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
+            .unwrap()
+            .into_cursor_and_fields_index();
+
+        let doc = cursor.next_document().unwrap().unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
+
+        assert_eq!(
+            val,
+            json!({
+                "city:": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn bad_type_in_header() {
+        let csv_content = r#"city,country:number,pop
+"Boston","United States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        assert!(builder.append_csv(csv).is_err());
+    }
+
+    #[test]
+    fn bad_column_count1() {
+        let csv_content = r#"city,country,pop
+"Boston","United States","4628910", "too much
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content"#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        assert!(builder.append_csv(csv).is_err());
+    }
+
+    #[test]
+    fn bad_column_count2() {
+        let csv_content = r#"city,country,pop
+"Boston","United States""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        assert!(builder.append_csv(csv).is_err());
+    }
 }
--- a/crates/milli/src/documents/mod.rs
+++ b/crates/milli/src/documents/mod.rs
@ -253,4 +253,33 @@ mod test {
            {"id": 2,"a": 0,"b": 0},
        ]);
    }
+
+    #[test]
+    fn csv_types_dont_panic() {
+        let csv1_content =
+            "id:number,b:boolean,c,d:number\n1,,,\n2,true,doggo,2\n3,false,the best doggo,-2\n4,,\"Hello, World!\",2.5";
+        let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv1).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
+    }
+
+    #[test]
+    fn out_of_order_csv_fields() {
+        let csv1_content = "id:number,b\n1,0";
+        let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content));
+
+        let csv2_content = "id:number,a,b\n2,0,0";
+        let csv2 = csv::Reader::from_reader(Cursor::new(csv2_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv1).unwrap();
+        builder.append_csv(csv2).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
+    }
 }
--- a/crates/milli/src/fields_ids_map/global.rs
+++ b/crates/milli/src/fields_ids_map/global.rs
@ -3,6 +3,7 @@ use std::sync::RwLock;

 use super::metadata::{FieldIdMapWithMetadata, Metadata};
 use super::MutFieldIdMapper;
+use crate::documents::FieldIdMapper;
 use crate::FieldId;

 /// A fields ids map that can be globally updated to add fields
@ -13,12 +14,22 @@ pub struct GlobalFieldsIdsMap<'indexing> {
 }

 #[derive(Debug, Clone)]
-struct LocalFieldsIdsMap {
+pub struct LocalFieldsIdsMap {
    names_ids: BTreeMap<String, FieldId>,
    ids_names: BTreeMap<FieldId, String>,
    metadata: BTreeMap<FieldId, Metadata>,
 }

+impl FieldIdMapper for LocalFieldsIdsMap {
+    fn id(&self, name: &str) -> Option<FieldId> {
+        self.id(name)
+    }
+
+    fn name(&self, id: FieldId) -> Option<&str> {
+        self.name(id)
+    }
+}
+
 impl LocalFieldsIdsMap {
    fn new(global: &RwLock<FieldIdMapWithMetadata>) -> Self {
        let global = global.read().unwrap();
@ -105,6 +116,10 @@ impl<'indexing> GlobalFieldsIdsMap<'indexing> {

        self.local.name(id)
    }
+
+    pub fn local_map(&self) -> &LocalFieldsIdsMap {
+        &self.local
+    }
 }

 impl<'indexing> MutFieldIdMapper for GlobalFieldsIdsMap<'indexing> {
--- a/crates/milli/src/fields_ids_map/metadata.rs
+++ b/crates/milli/src/fields_ids_map/metadata.rs
@ -102,7 +102,6 @@ impl Metadata {
        rules: &'rules [LocalizedAttributesRule],
    ) -> Option<&'rules [Language]> {
        let localized_attributes_rule_id = self.localized_attributes_rule_id?.get();
-        // - 1: `localized_attributes_rule_id` is NonZero
        let rule = rules.get((localized_attributes_rule_id - 1) as usize).unwrap();
        Some(rule.locales())
    }
@ -161,7 +160,6 @@ impl MetadataBuilder {
            .iter()
            .flat_map(|v| v.iter())
            .position(|rule| rule.match_str(field))
-            // saturating_add(1): make `id` `NonZero`
            .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap());

        Metadata { searchable, filterable, sortable, localized_attributes_rule_id }
--- a/crates/milli/src/heed_codec/facet/ordered_f64_codec.rs
+++ b/crates/milli/src/heed_codec/facet/ordered_f64_codec.rs
@ -46,7 +46,6 @@ fn encode_f64_into_ordered_bytes(
    f: f64,
    buffer: &mut [u8; 16],
 ) -> Result<(), InvalidGloballyOrderedFloatError> {
-    // write the globally ordered float
    let bytes = f64_into_bytes(f).ok_or(InvalidGloballyOrderedFloatError { float: f })?;
    buffer[..8].copy_from_slice(&bytes[..]);
    // Then the f64 value just to be able to read it back
--- a/crates/milli/src/prompt/document.rs
+++ b/crates/milli/src/prompt/document.rs
@ -160,7 +160,7 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc
    }

    fn size(&self) -> i64 {
-        self.document.top_level_fields_count() as i64
+        self.document.len() as i64
    }

    fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
--- a/crates/milli/src/update/new/channel.rs
+++ b/crates/milli/src/update/new/channel.rs
@ -87,10 +87,23 @@ pub enum WriterOperation {
 }

 pub enum ArroyOperation {
-    DeleteVectors { docid: DocumentId },
-    SetVectors { docid: DocumentId, embedder_id: u8, embeddings: Vec<Embedding> },
-    SetVector { docid: DocumentId, embedder_id: u8, embedding: Embedding },
-    Finish { configs: Vec<IndexEmbeddingConfig> },
+    /// TODO: call when deleting regular documents
+    DeleteVectors {
+        docid: DocumentId,
+    },
+    SetVectors {
+        docid: DocumentId,
+        embedder_id: u8,
+        embeddings: Vec<Embedding>,
+    },
+    SetVector {
+        docid: DocumentId,
+        embedder_id: u8,
+        embedding: Embedding,
+    },
+    Finish {
+        configs: Vec<IndexEmbeddingConfig>,
+    },
 }

 pub struct DbOperation {
@ -321,6 +334,7 @@ impl DocidsSender for FacetDocidsSender<'_> {
    fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
        let (facet_kind, key) = FacetKind::extract_from_key(key);
        let database = Database::from(facet_kind);
+        // let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
        let entry = match facet_kind {
            // skip level group size
            FacetKind::String | FacetKind::Number => {
--- a/crates/milli/src/update/new/document.rs
+++ b/crates/milli/src/update/new/document.rs
@ -21,8 +21,11 @@ pub trait Document<'doc> {
    /// - The `_vectors` and `_geo` fields are **ignored** by this method, meaning  they are **not returned** by this method.
    fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>>;

-    /// Number of top level fields, **excluding** `_vectors` and `_geo`
-    fn top_level_fields_count(&self) -> usize;
+    fn len(&self) -> usize;
+
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }

    /// Get the **top-level** with the specified name, if exists.
    ///
@ -102,15 +105,8 @@ impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> {
        self.field("_geo")
    }

-    fn top_level_fields_count(&self) -> usize {
-        let has_vectors_field = self.vectors_field().unwrap_or(None).is_some();
-        let has_geo_field = self.geo_field().unwrap_or(None).is_some();
-        let count = self.content.iter().count();
-        match (has_vectors_field, has_geo_field) {
-            (true, true) => count - 2,
-            (true, false) | (false, true) => count - 1,
-            (false, false) => count,
-        }
+    fn len(&self) -> usize {
+        self.content.iter().count()
    }

    fn top_level_field(&self, k: &str) -> Result<Option<&'t RawValue>> {
@ -166,15 +162,8 @@ impl<'a, 'doc> Document<'doc> for DocumentFromVersions<'a, 'doc> {
        Ok(self.versions.geo_field())
    }

-    fn top_level_fields_count(&self) -> usize {
-        let has_vectors_field = self.vectors_field().unwrap_or(None).is_some();
-        let has_geo_field = self.geo_field().unwrap_or(None).is_some();
-        let count = self.versions.len();
-        match (has_vectors_field, has_geo_field) {
-            (true, true) => count - 2,
-            (true, false) | (false, true) => count - 1,
-            (false, false) => count,
-        }
+    fn len(&self) -> usize {
+        self.versions.len()
    }

    fn top_level_field(&self, k: &str) -> Result<Option<&'doc RawValue>> {
@ -254,7 +243,7 @@ impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d>
        db.geo_field()
    }

-    fn top_level_fields_count(&self) -> usize {
+    fn len(&self) -> usize {
        self.iter_top_level_fields().count()
    }

@ -285,8 +274,8 @@ where
        D::geo_field(self)
    }

-    fn top_level_fields_count(&self) -> usize {
-        D::top_level_fields_count(self)
+    fn len(&self) -> usize {
+        D::len(self)
    }

    fn top_level_field(&self, k: &str) -> Result<Option<&'doc RawValue>> {
--- a/crates/milli/src/update/new/extract/cache.rs
+++ b/crates/milli/src/update/new/extract/cache.rs
@ -241,7 +241,6 @@ impl<'extractor> BalancedCaches<'extractor> {
    }
 }

-/// SAFETY: No Thread-Local inside
 unsafe impl MostlySend for BalancedCaches<'_> {}

 struct NormalCaches<'extractor> {
--- a/crates/milli/src/update/new/extract/documents.rs
+++ b/crates/milli/src/update/new/extract/documents.rs
@ -140,6 +140,7 @@ impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> {
                    )?;
                    document_extractor_data.docids_delta.insert_add_u32(docid);
                    self.document_sender.uncompressed(docid, external_docid, content).unwrap();
+                    // extracted_dictionary_sender.send(self, dictionary: &[u8]);
                }
            }
        }
--- a/crates/milli/src/update/new/extract/geo/mod.rs
+++ b/crates/milli/src/update/new/extract/geo/mod.rs
@ -60,11 +60,9 @@ pub struct GeoExtractorData<'extractor> {
    /// point being updated, we first put it in the deleted and then in the inserted.
    removed: bumpalo::collections::Vec<'extractor, ExtractedGeoPoint>,
    inserted: bumpalo::collections::Vec<'extractor, ExtractedGeoPoint>,
-    /// Contains a packed list of `ExtractedGeoPoint` of the inserted geo points
-    /// data structures if we have spilled to disk.
+    /// TODO Do the doc
    spilled_removed: Option<BufWriter<File>>,
-    /// Contains a packed list of `ExtractedGeoPoint` of the inserted geo points
-    /// data structures if we have spilled to disk.
+    /// TODO Do the doc
    spilled_inserted: Option<BufWriter<File>>,
 }

@ -139,6 +137,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
    fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
        Ok(RefCell::new(GeoExtractorData {
            removed: bumpalo::collections::Vec::new_in(extractor_alloc),
+            // inserted: Uell::new_in(extractor_alloc),
            inserted: bumpalo::collections::Vec::new_in(extractor_alloc),
            spilled_inserted: None,
            spilled_removed: None,
@ -243,7 +242,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
    }
 }

-/// Extracts and validates the latitude and latitude from a document geo field.
+/// Extracts and validate the latitude and latitude from a document geo field.
 ///
 /// It can be of the form `{ "lat": 0.0, "lng": "1.0" }`.
 pub fn extract_geo_coordinates(
--- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
+++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
@ -35,6 +35,7 @@ pub struct WordDocidsBalancedCaches<'extractor> {
 unsafe impl<'extractor> MostlySend for WordDocidsBalancedCaches<'extractor> {}

 impl<'extractor> WordDocidsBalancedCaches<'extractor> {
+    /// TODO Make sure to give the same max_memory to all of them, without splitting it
    pub fn new_in(buckets: usize, max_memory: Option<usize>, alloc: &'extractor Bump) -> Self {
        Self {
            word_fid_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
@ -415,6 +416,6 @@ impl WordDocidsExtractors {
    }

    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
-        Ok(Vec::new())
+        Ok(vec!["_geo"])
    }
 }
--- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
+++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
@ -25,7 +25,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
    }

    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
-        Ok(Vec::new())
+        Ok(vec!["_geo"])
    }

    // This method is reimplemented to count the number of words in the document in each field
--- a/crates/milli/src/update/new/indexer/document_operation.rs
+++ b/crates/milli/src/update/new/indexer/document_operation.rs
@ -43,7 +43,6 @@ impl<'pl> DocumentOperation<'pl> {
    }

    #[allow(clippy::too_many_arguments)]
-    #[tracing::instrument(level = "trace", skip_all, target = "indexing::document_operation")]
    pub fn into_changes<MSP, SP>(
        self,
        indexer: &'pl Bump,
--- a/crates/milli/src/update/new/indexer/mod.rs
+++ b/crates/milli/src/update/new/indexer/mod.rs
@ -103,6 +103,7 @@ where
        // prevent moving the field_distribution and document_ids in the inner closure...
        let field_distribution = &mut field_distribution;
        let document_ids = &mut document_ids;
+        // TODO manage the errors correctly
        let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
            let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract");
            let _entered = span.enter();
@ -175,6 +176,7 @@ where
                )?;

                // TODO Word Docids Merger
+                // extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
                {
                    let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
                    let _entered = span.enter();
@ -188,6 +190,7 @@ where
                }

                // Word Fid Docids Merging
+                // extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
                {
                    let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
                    let _entered = span.enter();
@ -201,6 +204,7 @@ where
                }

                // Exact Word Docids Merging
+                // extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
                {
                    let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
                    let _entered = span.enter();
@ -214,6 +218,7 @@ where
                }

                // Word Position Docids Merging
+                // extractor_sender.send_searchable::<WordPositionDocids>(word_position_docids).unwrap();
                {
                    let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
                    let _entered = span.enter();
@ -227,6 +232,7 @@ where
                }

                // Fid Word Count Docids Merging
+                // extractor_sender.send_searchable::<FidWordCountDocids>(fid_word_count_docids).unwrap();
                {
                    let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
                    let _entered = span.enter();
@ -515,6 +521,9 @@ fn compute_word_fst(index: &Index, wtxn: &mut RwTxn) -> Result<Option<PrefixDelt
        }
    }

+    let span = tracing::trace_span!(target: "indexing::documents::merge", "words_fst");
+    let _entered = span.enter();
+
    let (word_fst_mmap, prefix_data) = word_fst_builder.build(index, &rtxn)?;
    index.main.remap_types::<Str, Bytes>().put(wtxn, WORDS_FST_KEY, &word_fst_mmap)?;
    if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data {
--- a/crates/milli/src/update/new/mod.rs
+++ b/crates/milli/src/update/new/mod.rs
@ -2,6 +2,7 @@ pub use document_change::{Deletion, DocumentChange, Insertion, Update};
 pub use merger::{
    merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases, FacetFieldIdsDelta,
 };
+pub use top_level_map::{CowStr, TopLevelMap};

 use super::del_add::DelAdd;
 use crate::FieldId;
@ -18,6 +19,7 @@ mod parallel_iterator_ext;
 mod ref_cell_ext;
 pub(crate) mod steps;
 pub(crate) mod thread_local;
+mod top_level_map;
 pub mod vector_document;
 mod word_fst_builder;
 mod words_prefix_docids;
--- a/crates/milli/src/update/new/top_level_map.rs
+++ b/crates/milli/src/update/new/top_level_map.rs
@ -0,0 +1,66 @@
+use std::borrow::{Borrow, Cow};
+use std::collections::BTreeMap;
+use std::{fmt, ops};
+
+use serde::{Deserialize, Serialize};
+use serde_json::value::RawValue;
+use serde_json::{Map, Value};
+
+#[derive(Deserialize, Serialize)]
+pub struct TopLevelMap<'p>(#[serde(borrow)] pub BTreeMap<CowStr<'p>, &'p RawValue>);
+
+impl TryFrom<&'_ TopLevelMap<'_>> for Map<String, Value> {
+    type Error = serde_json::Error;
+
+    fn try_from(tlmap: &TopLevelMap<'_>) -> Result<Self, Self::Error> {
+        let mut object = Map::new();
+        for (k, v) in &tlmap.0 {
+            let value = serde_json::from_str(v.get())?;
+            object.insert(k.to_string(), value);
+        }
+        Ok(object)
+    }
+}
+
+impl TryFrom<TopLevelMap<'_>> for Map<String, Value> {
+    type Error = serde_json::Error;
+
+    fn try_from(tlmap: TopLevelMap<'_>) -> Result<Self, Self::Error> {
+        TryFrom::try_from(&tlmap)
+    }
+}
+
+impl<'p> ops::Deref for TopLevelMap<'p> {
+    type Target = BTreeMap<CowStr<'p>, &'p RawValue>;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl ops::DerefMut for TopLevelMap<'_> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+#[derive(Deserialize, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)]
+pub struct CowStr<'p>(#[serde(borrow)] pub Cow<'p, str>);
+
+impl fmt::Display for CowStr<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt::Display::fmt(&self.0, f)
+    }
+}
+
+impl AsRef<str> for CowStr<'_> {
+    fn as_ref(&self) -> &str {
+        self.0.as_ref()
+    }
+}
+
+impl<'doc> Borrow<str> for CowStr<'doc> {
+    fn borrow(&self) -> &str {
+        self.0.borrow()
+    }
+}
--- a/crates/tracing-trace/src/processor/span_stats.rs
+++ b/crates/tracing-trace/src/processor/span_stats.rs
@ -113,10 +113,7 @@ pub fn to_call_stats<R: std::io::Read>(

                let span = *span;
                if let Some(parent_id) = span.parent_id {
-                    let Some((_, _, parent_self_time)) = spans.get_mut(&parent_id) else {
-                        let (c, _) = calls.get_mut(&span.call_id).unwrap();
-                        panic!("parent span not found for span: module_path: {:?}, name: {:?}, target: {:?}", c.module_path.as_deref().unwrap_or_default(), c.name, c.target);
-                    };
+                    let (_, _, parent_self_time) = spans.get_mut(&parent_id).unwrap();
                    parent_self_time.add_child_range(self_range.clone())
                }
                total_self_time.add_child_range(self_range);
--- a/workloads/movies.json
+++ b/workloads/movies.json
@ -1,6 +1,6 @@
 {
  "name": "movies.json",
-  "run_count": 10,
+  "run_count": 1,
  "extra_cli_args": [],
  "assets": {
    "movies.json": {