From e8297ad27e4977f4f43c43f181dc9d7c9ea041dd Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:04:27 +0200 Subject: [PATCH] Fix the tests for the new DocumentsBatchBuilder/Reader --- milli/src/documents/builder.rs | 332 +++++++++--------- milli/src/documents/mod.rs | 133 ++----- milli/src/search/distinct/mod.rs | 31 +- milli/src/update/index_documents/mod.rs | 63 ++-- milli/src/update/index_documents/transform.rs | 12 +- milli/tests/search/facet_distribution.rs | 26 +- milli/tests/search/mod.rs | 18 +- milli/tests/search/query_criteria.rs | 20 +- milli/tests/search/typo_tolerance.rs | 31 +- 9 files changed, 292 insertions(+), 374 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 159afb8d9..19cc1ce53 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -183,7 +183,8 @@ mod test { use serde_json::{json, Map}; use super::*; - use crate::documents::DocumentBatchReader; + use crate::documents::DocumentsBatchReader; + use crate::FieldId; fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { let mut map = Map::new(); @@ -192,7 +193,7 @@ mod test { let field_name = index.name(fid).unwrap().clone(); let value: Value = serde_json::from_slice(value).unwrap(); - map.insert(field_name, value); + map.insert(field_name.to_string(), value); } Value::Object(map) @@ -200,15 +201,13 @@ mod test { #[test] fn add_single_documents_json() { - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - let json = serde_json::json!({ "id": 1, "field": "hello!", }); - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(json.as_object().unwrap()).unwrap(); let json = serde_json::json!({ "blabla": false, @@ -216,100 +215,64 @@ mod test { "id": 1, }); - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + builder.append_json_object(json.as_object().unwrap()).unwrap(); - assert_eq!(builder.len(), 2); + assert_eq!(builder.documents_count(), 2); + let vector = builder.into_inner().unwrap(); - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); assert_eq!(index.len(), 3); + + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 3); - assert!(reader.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_documents_seq_json() { - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - let json = serde_json::json!([{ - "id": 1, - "field": "hello!", - },{ - "blabla": false, - "field": "hello!", - "id": 1, - } - ]); - - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); - - assert_eq!(builder.len(), 2); - - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, 
document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); - assert_eq!(document.iter().count(), 2); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); - assert_eq!(document.iter().count(), 3); - - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn add_documents_csv() { - let mut cursor = Cursor::new(Vec::new()); + let csv_content = "id:number,field:string\n1,hello!\n2,blabla"; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let csv = "id:number,field:string\n1,hello!\n2,blabla"; + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + assert_eq!(builder.documents_count(), 2); + let vector = builder.into_inner().unwrap(); - let builder = - DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); assert_eq!(index.len(), 2); + + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - let (_index, document) = reader.next_document_with_index().unwrap().unwrap(); + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn simple_csv_document() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -320,22 +283,25 @@ mod test { }) ); - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn coma_in_field() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United, States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = 
builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -349,17 +315,20 @@ mod test { #[test] fn quote_in_field() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United"" States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -373,17 +342,20 @@ mod test { #[test] fn integer_in_field() { - let documents = r#"city,country,pop:number + let csv_content = r#"city,country,pop:number "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -397,17 +369,20 @@ mod test { #[test] fn float_in_field() { - let documents = r#"city,country,pop:number + let csv_content = r#"city,country,pop:number "Boston","United States","4628910.01""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -421,17 +396,20 @@ mod test { #[test] fn several_colon_in_header() { - let documents = r#"city:love:string,country:state,pop + let csv_content = r#"city:love:string,country:state,pop "Boston","United States","4628910""#; + let csv = 
csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -445,17 +423,20 @@ mod test { #[test] fn ending_by_colon_in_header() { - let documents = r#"city:,country,pop + let csv_content = r#"city:,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -469,17 +450,20 @@ mod test { #[test] fn starting_by_colon_in_header() { - let documents = r#":city,country,pop + let csv_content = r#":city,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -494,32 +478,36 @@ mod test { #[ignore] #[test] fn starting_by_colon_in_header2() { - let documents = r#":string,country,pop + let csv_content = r#":string,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); - assert!(reader.next_document_with_index().is_err()); + let mut cursor = + 
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
+
+        assert!(cursor.next_document().is_err());
     }
 
     #[test]
     fn double_colon_in_header() {
-        let documents = r#"city::string,country,pop
+        let csv_content = r#"city::string,country,pop
 "Boston","United States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
 
-        let mut buf = Vec::new();
-        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
-            .unwrap()
-            .finish()
-            .unwrap();
-        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
-        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
-        let val = obkv_to_value(&doc, index);
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        let mut cursor =
+            DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
+        let index = cursor.documents_batch_index().clone();
+
+        let doc = cursor.next_document().unwrap().unwrap();
+        let val = obkv_to_value(&doc, &index);
 
         assert_eq!(
             val,
@@ -533,34 +521,31 @@ mod test {
 
     #[test]
     fn bad_type_in_header() {
-        let documents = r#"city,country:number,pop
+        let csv_content = r#"city,country:number,pop
 "Boston","United States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
 
-        let mut buf = Vec::new();
-        assert!(
-            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
-        );
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        assert!(builder.append_csv(csv).is_err());
    }
 
     #[test]
     fn bad_column_count1() {
-        let documents = r#"city,country,pop
-"Boston","United States","4628910", "too much""#;
+        let csv_content = r#"city,country,pop
+"Boston","United States","4628910", "too much""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
 
-        let mut buf = Vec::new();
-        assert!(
-            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
-        );
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        assert!(builder.append_csv(csv).is_err());
     }
 
     #[test]
     fn bad_column_count2() {
-        let documents = r#"city,country,pop
+        let csv_content = r#"city,country,pop
 "Boston","United States""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
 
-        let mut buf = Vec::new();
-        assert!(
-            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
-        );
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        assert!(builder.append_csv(csv).is_err());
     }
 }
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index bd0afc6e4..7a34ae13b 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -159,7 +159,7 @@ mod test {
 
     #[test]
     fn create_documents_no_errors() {
-        let json = json!({
+        let value = json!({
             "number": 1,
             "string": "this is a field",
             "array": ["an", "array"],
@@ -169,26 +169,17 @@
             "bool": true
         });
 
-        let json = serde_json::to_vec(&json).unwrap();
-
-        let mut v = Vec::new();
-        let mut cursor = io::Cursor::new(&mut v);
-
-        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
-
-        builder.extend_from_json(Cursor::new(json)).unwrap();
-
-        builder.finish().unwrap();
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_json_object(value.as_object().unwrap()).unwrap();
+        let vector = builder.into_inner().unwrap();
 
         let mut documents =
-            
DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - assert_eq!(documents.index().iter().count(), 5); - - let reader = documents.next_document_with_index().unwrap().unwrap(); - - assert_eq!(reader.1.iter().count(), 5); - assert!(documents.next_document_with_index().unwrap().is_none()); + assert_eq!(documents.documents_batch_index().iter().count(), 5); + let reader = documents.next_document().unwrap().unwrap(); + assert_eq!(reader.iter().count(), 5); + assert!(documents.next_document().unwrap().is_none()); } #[test] @@ -200,101 +191,55 @@ mod test { "toto": false, }); - let doc1 = serde_json::to_vec(&doc1).unwrap(); - let doc2 = serde_json::to_vec(&doc2).unwrap(); - - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - builder.extend_from_json(Cursor::new(doc1)).unwrap(); - builder.extend_from_json(Cursor::new(doc2)).unwrap(); - - builder.finish().unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(doc1.as_object().unwrap()).unwrap(); + builder.append_json_object(doc2.as_object().unwrap()).unwrap(); + let vector = builder.into_inner().unwrap(); let mut documents = - DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); - - assert_eq!(documents.index().iter().count(), 2); - - let reader = documents.next_document_with_index().unwrap().unwrap(); - - assert_eq!(reader.1.iter().count(), 1); - assert!(documents.next_document_with_index().unwrap().is_some()); - assert!(documents.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_documents_array() { - let docs = json!([ - { "toto": false }, - { "tata": "hello" }, - ]); - - let docs = serde_json::to_vec(&docs).unwrap(); - - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - builder.extend_from_json(Cursor::new(docs)).unwrap(); - - builder.finish().unwrap(); - - let mut documents = - DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); - - assert_eq!(documents.index().iter().count(), 2); - - let reader = documents.next_document_with_index().unwrap().unwrap(); - - assert_eq!(reader.1.iter().count(), 1); - assert!(documents.next_document_with_index().unwrap().is_some()); - assert!(documents.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_invalid_document_format() { - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - let docs = json!([[ - { "toto": false }, - { "tata": "hello" }, - ]]); - - let docs = serde_json::to_vec(&docs).unwrap(); - assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); - - let docs = json!("hello"); - let docs = serde_json::to_vec(&docs).unwrap(); - - assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); + DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor(); + assert_eq!(documents.documents_batch_index().iter().count(), 2); + let reader = documents.next_document().unwrap().unwrap(); + assert_eq!(reader.iter().count(), 1); + assert!(documents.next_document().unwrap().is_some()); + assert!(documents.next_document().unwrap().is_none()); } #[test] fn test_nested() { - let mut docs = documents!([{ + let docs_reader = documents!([{ "hello": { "toto": ["hello"] } 
}]);
-        let (_index, doc) = docs.next_document_with_index().unwrap().unwrap();
-
+        let mut cursor = docs_reader.into_cursor();
+        let doc = cursor.next_document().unwrap().unwrap();
         let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap();
         assert_eq!(nested, json!({ "toto": ["hello"] }));
     }
 
     #[test]
-    fn out_of_order_fields() {
+    fn out_of_order_json_fields() {
         let _documents = documents!([
             {"id": 1,"b": 0},
             {"id": 2,"a": 0,"b": 0},
         ]);
     }
+
+    #[test]
+    fn out_of_order_csv_fields() {
+        let csv1_content = "id:number,b\n1,0";
+        let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content));
+
+        let csv2_content = "id:number,a,b\n2,0,0";
+        let csv2 = csv::Reader::from_reader(Cursor::new(csv2_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv1).unwrap();
+        builder.append_csv(csv2).unwrap();
+        let vector = builder.into_inner().unwrap();
+
+        DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
+    }
 }
diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs
index 237fd718a..670fa01ac 100644
--- a/milli/src/search/distinct/mod.rs
+++ b/milli/src/search/distinct/mod.rs
@@ -35,7 +35,7 @@ mod test {
     use roaring::RoaringBitmap;
     use serde_json::{json, Value};
 
-    use crate::documents::{DocumentBatchBuilder, DocumentBatchReader};
+    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
     use crate::index::tests::TempIndex;
     use crate::index::Index;
     use crate::update::{
@@ -43,14 +43,11 @@ mod test {
     };
     use crate::{DocumentId, FieldId, BEU32};
 
-    static JSON: Lazy<Vec<u8>> = Lazy::new(generate_documents);
-
-    fn generate_documents() -> Vec<u8> {
+    static JSON: Lazy<Vec<u8>> = Lazy::new(|| {
         let mut rng = rand::thread_rng();
         let num_docs = rng.gen_range(10..30);
 
-        let mut cursor = Cursor::new(Vec::new());
-        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
         let txts = ["Toto", "Titi", "Tata"];
         let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
         let cat_ints = (1..10).collect::<Vec<_>>();
@@ -63,7 +60,7 @@ mod test {
             let mut sample_ints = cat_ints.clone();
             sample_ints.shuffle(&mut rng);
 
-            let doc = json!({
+            let json = json!({
                 "id": i,
                 "txt": txt,
                 "cat-int": rng.gen_range(0..3),
@@ -71,13 +68,16 @@ mod test {
                 "cat-ints": sample_ints[..(rng.gen_range(0..3))],
             });
 
-            let doc = Cursor::new(serde_json::to_vec(&doc).unwrap());
-            builder.extend_from_json(doc).unwrap();
+            let object = match json {
+                Value::Object(object) => object,
+                _ => panic!(),
+            };
+
+            builder.append_json_object(&object).unwrap();
         }
 
-        builder.finish().unwrap();
-        cursor.into_inner()
-    }
+        builder.into_inner().unwrap()
+    });
 
     /// Returns a temporary index populated with random test documents, the FieldId for the
     /// distinct attribute, and the RoaringBitmap with the document ids. 
@@ -101,7 +101,8 @@ mod test {
             IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap();
 
         let reader =
-            crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap();
+            crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice()))
+                .unwrap();
         addition.add_documents(reader).unwrap();
         addition.execute().unwrap();
 
@@ -109,8 +110,8 @@ mod test {
         let fields_map = index.fields_ids_map(&txn).unwrap();
         let fid = fields_map.id(&distinct).unwrap();
 
-        let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap();
-        let map = (0..documents.len() as u32).collect();
+        let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap();
+        let map = (0..documents.documents_count() as u32).collect();
 
         txn.commit().unwrap();
 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index ba428f078..7f6e00b11 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -25,7 +25,7 @@ pub use self::helpers::{
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
-use crate::documents::DocumentBatchReader;
+use crate::documents::DocumentsBatchReader;
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
@@ -121,7 +121,7 @@ where
     /// builder, and the builder must be discarded.
     ///
     /// Returns the number of documents added to the builder.
-    pub fn add_documents<R>(&mut self, reader: DocumentBatchReader<R>) -> Result<u64>
+    pub fn add_documents<R>(&mut self, reader: DocumentsBatchReader<R>) -> Result<u64>
     where
         R: Read + Seek,
     {
@@ -590,9 +590,8 @@ mod tests {
     use maplit::hashset;
 
     use super::*;
-    use crate::documents::DocumentBatchBuilder;
+    use crate::documents::DocumentsBatchBuilder;
     use crate::update::DeleteDocuments;
-    use crate::HashMap;
 
     #[test]
     fn simple_document_replacement() {
@@ -1252,21 +1251,17 @@ mod tests {
 
         let mut wtxn = index.write_txn().unwrap();
 
-        let mut big_object = HashMap::new();
-        big_object.insert(S("id"), "wow");
+        let mut big_object = serde_json::Map::new();
+        big_object.insert(S("id"), serde_json::Value::from("wow"));
         for i in 0..1000 {
             let key = i.to_string();
-            big_object.insert(key, "I am a text!");
+            big_object.insert(key, serde_json::Value::from("I am a text!"));
         }
 
-        let mut cursor = Cursor::new(Vec::new());
-
-        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
-        let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap());
-        builder.extend_from_json(big_object).unwrap();
-        builder.finish().unwrap();
-        cursor.set_position(0);
-        let content = DocumentBatchReader::from_reader(cursor).unwrap();
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_json_object(&big_object).unwrap();
+        let vector = builder.into_inner().unwrap();
+        let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
 
         let config = IndexerConfig::default();
         let indexing_config = IndexDocumentsConfig::default();
@@ -1288,23 +1283,19 @@ mod tests {
 
         let mut wtxn = index.write_txn().unwrap();
 
-        let mut big_object = HashMap::new();
-        big_object.insert(S("id"), "wow");
+        let mut big_object = serde_json::Map::new();
+        big_object.insert(S("id"), serde_json::Value::from("wow"));
         let content: String = (0..=u16::MAX)
             .into_iter()
             .map(|p| p.to_string())
             .reduce(|a, b| a + " " + b.as_ref())
             .unwrap();
-        
big_object.insert("content".to_string(), &content);
+        big_object.insert("content".to_string(), serde_json::Value::from(content));
 
-        let mut cursor = Cursor::new(Vec::new());
-
-        let big_object = serde_json::to_string(&big_object).unwrap();
-        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
-        builder.extend_from_json(&mut big_object.as_bytes()).unwrap();
-        builder.finish().unwrap();
-        cursor.set_position(0);
-        let content = DocumentBatchReader::from_reader(cursor).unwrap();
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_json_object(&big_object).unwrap();
+        let vector = builder.into_inner().unwrap();
+        let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
 
         let config = IndexerConfig::default();
         let indexing_config = IndexDocumentsConfig::default();
@@ -1843,18 +1834,20 @@ mod tests {
 
         // Create 200 documents with a long text
        let content = {
-            let documents: Vec<_> = (0..200i32)
+            let documents_iter = (0..200i32)
                 .into_iter()
                 .map(|i| serde_json::json!({ "id": i, "script": script }))
-                .collect();
+                .filter_map(|json| match json {
+                    serde_json::Value::Object(object) => Some(object),
+                    _ => None,
+                });
 
-            let mut writer = std::io::Cursor::new(Vec::new());
-            let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
-            let documents = serde_json::to_vec(&documents).unwrap();
-            builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
-            builder.finish().unwrap();
-            writer.set_position(0);
-            crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
+            let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new());
+            for object in documents_iter {
+                builder.append_json_object(&object).unwrap();
+            }
+            let vector = builder.into_inner().unwrap();
+            crate::documents::DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap()
         };
 
         // Index those 200 long documents
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 12a858024..129357075 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -14,7 +14,7 @@ use smartstring::SmartString;
 
 use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
 use super::{IndexDocumentsMethod, IndexerConfig};
-use crate::documents::{DocumentBatchReader, DocumentsBatchIndex};
+use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::db_name;
 use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
@@ -152,7 +152,7 @@ impl<'a, 'i> Transform<'a, 'i> {
 
     pub fn read_documents<R, F>(
         &mut self,
-        mut reader: DocumentBatchReader<R>,
+        reader: DocumentsBatchReader<R>,
         wtxn: &mut heed::RwTxn,
         progress_callback: F,
     ) -> Result<usize>
     where
         R: Read + Seek,
         F: Fn(UpdateIndexingStep) + Sync,
     {
-        let fields_index = reader.index();
+        let mut cursor = reader.into_cursor();
+        let fields_index = cursor.documents_batch_index();
         let external_documents_ids = self.index.external_documents_ids(wtxn)?;
         let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?;
@@ -186,7 +187,8 @@ impl<'a, 'i> Transform<'a, 'i> {
         let mut documents_count = 0;
         let mut external_id_buffer = Vec::new();
         let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
-        while let Some((addition_index, document)) = reader.next_document_with_index()? 
{
+        let addition_index = cursor.documents_batch_index().clone();
+        while let Some(document) = cursor.next_document()? {
             let mut field_buffer_cache = drop_and_reuse(field_buffer);
             if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) {
                 progress_callback(UpdateIndexingStep::RemapDocumentAddition {
@@ -840,7 +842,7 @@ fn update_primary_key<'a>(
         None => {
             let mut json = Map::new();
             for (key, value) in document.iter() {
-                let key = addition_index.name(key).cloned();
+                let key = addition_index.name(key).map(ToString::to_string);
                 let value = serde_json::from_slice::<Value>(&value).ok();
 
                 if let Some((k, v)) = key.zip(value) {
diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs
index d3aece2ab..66713de1e 100644
--- a/milli/tests/search/facet_distribution.rs
+++ b/milli/tests/search/facet_distribution.rs
@@ -3,9 +3,10 @@ use std::io::Cursor;
 use big_s::S;
 use heed::EnvOpenOptions;
 use maplit::hashset;
-use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
+use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
 use milli::{FacetDistribution, Index};
+use serde_json::{Deserializer, Map, Value};
 
 #[test]
 fn test_facet_distribution_with_no_facet_values() {
@@ -30,35 +31,30 @@ fn test_facet_distribution_with_no_facet_values() {
     let mut builder =
         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
 
-    let mut cursor = Cursor::new(Vec::new());
-    let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
+    let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
     let reader = Cursor::new(
-        r#"[
-        {
+        r#"{
            "id": 123,
            "title": "What a week, hu...",
            "genres": [],
            "tags": ["blue"]
-        },
+        }
         {
            "id": 345,
            "title": "I am the pig!",
            "tags": ["red"]
-        }
-        ]"#,
+        }"#,
    );
 
-    for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() {
-        let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap());
-        documents_builder.extend_from_json(doc).unwrap();
+    for result in Deserializer::from_reader(reader).into_iter::<Map<String, Value>>() {
+        let object = result.unwrap();
+        documents_builder.append_json_object(&object).unwrap();
     }
 
-    documents_builder.finish().unwrap();
-
-    cursor.set_position(0);
+    let vector = documents_builder.into_inner().unwrap();
 
     // index documents
-    let content = DocumentBatchReader::from_reader(cursor).unwrap();
+    let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
     builder.add_documents(content).unwrap();
 
     builder.execute().unwrap();
diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs
index 472fbafe0..4cf117dc7 100644
--- a/milli/tests/search/mod.rs
+++ b/milli/tests/search/mod.rs
@@ -6,10 +6,11 @@ use big_s::S;
 use either::{Either, Left, Right};
 use heed::EnvOpenOptions;
 use maplit::{hashmap, hashset};
-use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
+use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
 use milli::{AscDesc, Criterion, DocumentId, Index, Member};
 use serde::Deserialize;
+use serde_json::{Deserializer, Map, Value};
 use slice_group_by::GroupBy;
 
 mod distinct;
@@ -62,21 +63,18 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
     let mut builder =
         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
 
-    let mut cursor = Cursor::new(Vec::new());
-    let 
mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
+    let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
     let reader = Cursor::new(CONTENT.as_bytes());
 
-    for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() {
-        let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap());
-        documents_builder.extend_from_json(doc).unwrap();
+    for result in Deserializer::from_reader(reader).into_iter::<Map<String, Value>>() {
+        let object = result.unwrap();
+        documents_builder.append_json_object(&object).unwrap();
     }
 
-    documents_builder.finish().unwrap();
-
-    cursor.set_position(0);
+    let vector = documents_builder.into_inner().unwrap();
 
     // index documents
-    let content = DocumentBatchReader::from_reader(cursor).unwrap();
+    let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
     builder.add_documents(content).unwrap();
 
     builder.execute().unwrap();
diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs
index 893d7c30a..89a6a6eec 100644
--- a/milli/tests/search/query_criteria.rs
+++ b/milli/tests/search/query_criteria.rs
@@ -5,7 +5,7 @@ use big_s::S;
 use heed::EnvOpenOptions;
 use itertools::Itertools;
 use maplit::hashset;
-use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
+use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
 use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult};
 use rand::Rng;
@@ -393,8 +393,7 @@ fn criteria_ascdesc() {
     let mut builder =
         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap();
 
-    let mut cursor = Cursor::new(Vec::new());
-    let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
+    let mut batch_builder = DocumentsBatchBuilder::new(Vec::new());
 
     (0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| {
         let mut rng = rand::thread_rng();
@@ -412,16 +411,17 @@ fn criteria_ascdesc() {
             "age": age,
         });
 
-        let json = Cursor::new(serde_json::to_vec(&json).unwrap());
-        batch_builder.extend_from_json(json).unwrap();
+        let object = match json {
+            serde_json::Value::Object(object) => object,
+            _ => panic!(),
+        };
+
+        batch_builder.append_json_object(&object).unwrap();
     });
 
-    batch_builder.finish().unwrap();
-
-    cursor.set_position(0);
-
-    let reader = DocumentBatchReader::from_reader(cursor).unwrap();
+    let vector = batch_builder.into_inner().unwrap();
+    let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
 
     builder.add_documents(reader).unwrap();
     builder.execute().unwrap();
diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs
index 9a7986c5e..63bf22579 100644
--- a/milli/tests/search/typo_tolerance.rs
+++ b/milli/tests/search/typo_tolerance.rs
@@ -106,26 +106,23 @@ fn test_typo_disabled_on_word() {
     options.map_size(4096 * 100);
     let index = Index::new(options, tmp.path()).unwrap();
 
-    let documents = json!([
-        {
-            "id": 1usize,
-            "data": "zealand",
-        },
-        {
-            "id": 2usize,
-            "data": "zearand",
-        },
-    ]);
+    let mut builder = milli::documents::DocumentsBatchBuilder::new(Vec::new());
+    let doc1 = json!({
+        "id": 1usize,
+        "data": "zealand",
+    });
 
-    let mut writer = std::io::Cursor::new(Vec::new());
-    let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
-    let documents = serde_json::to_vec(&documents).unwrap();
-    builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
-    builder.finish().unwrap();
+    let doc2 = json!({
+        "id": 2usize,
+        
"data": "zearand", + }); - writer.set_position(0); + builder.append_json_object(doc1.as_object().unwrap()).unwrap(); + builder.append_json_object(doc2.as_object().unwrap()).unwrap(); + let vector = builder.into_inner().unwrap(); - let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap(); + let documents = + milli::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap(); let mut txn = index.write_txn().unwrap(); let config = IndexerConfig::default();