From 9f8095c06985bafb58272bc26374df7e81188cb9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 21 Jul 2021 10:35:35 +0200 Subject: [PATCH 1/3] Make sure that we don't keep a reference on the LMDB key when using put_current --- milli/src/update/delete_documents.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 222f3b2d3..e6f847d01 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -366,6 +366,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; } else if docids.len() != previous_len { + let key = key.to_owned(); // safety: we don't keep references from inside the LMDB database. unsafe { iter.put_current(&key, &docids)? }; } From 92c0a2cdc1d5c6afeef84c409422af2152787cee Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 22 Jul 2021 17:14:44 +0200 Subject: [PATCH 2/3] Add a test that triggers a panic when indexing zeroes --- milli/src/update/index_documents/mod.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 9ac05fe1a..cd48175f7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1380,4 +1380,26 @@ mod tests { wtxn.commit().unwrap(); } + + #[test] + fn index_documents_with_zeroes() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + let mut wtxn = index.write_txn().unwrap(); + let content = r#"#id,title,au{hor,genre,price$ +2,"Prideand Prejudice","Jane Austin","romance",3.5$ +456,"Le Petit Prince","Antoine de Saint-Exupéry","adventure",10.0$ +1,Wonderland","Lewis Carroll","fantasy",25.99$ +4,"Harry Potter ing","fantasy\0lood Prince","J. K. Rowling","fantasy\0, +"#; + + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Csv); + builder.execute(content.as_bytes(), |_, _| ()).unwrap(); + + wtxn.commit().unwrap(); + } } From 0353fbb5df04146bd9297a5886a351e1d502be6f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 22 Jul 2021 16:23:36 +0200 Subject: [PATCH 3/3] Bump the tokenizer version to v0.2.4 --- Cargo.lock | 31 ++++++++++++++++++++++++++++--- milli/Cargo.toml | 2 +- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 18d42029f..cd1913ec8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -990,7 +990,7 @@ dependencies = [ "jemallocator", "log", "maplit", - "meilisearch-tokenizer", + "meilisearch-tokenizer 0.2.3", "memmap", "milli", "once_cell", @@ -1353,7 +1353,23 @@ dependencies = [ "once_cell", "slice-group-by", "unicode-segmentation", - "whatlang", + "whatlang 0.9.0", +] + +[[package]] +name = "meilisearch-tokenizer" +version = "0.2.4" +source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.4#135d08dce465a756abaf6a1bcad70f315bda99b9" +dependencies = [ + "character_converter", + "cow-utils", + "deunicode", + "fst", + "jieba-rs", + "once_cell", + "slice-group-by", + "unicode-segmentation", + "whatlang 0.12.0", ] [[package]] @@ -1404,7 +1420,7 @@ dependencies = [ "log", "logging_timer", "maplit", - "meilisearch-tokenizer", + "meilisearch-tokenizer 0.2.4", "memmap", "obkv", "once_cell", @@ -3087,6 +3103,15 @@ dependencies = [ "hashbrown 0.7.2", ] +[[package]] +name = "whatlang" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a346d2eb29c03618693ed24a29d1acd0c3f2cb08ae58b9669d7461e033cf703" +dependencies = [ + "hashbrown 0.7.2", +] + [[package]] name = "winapi" version = "0.2.8" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 6af928041..dc839a209 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -19,7 +19,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-fe human_format = "1.0.3" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.4" -meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" } +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.4" } memmap = "0.7.0" obkv = "0.2.0" once_cell = "1.5.2"