mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-30 09:04:59 +08:00
Merge #291
291: Fix a bug about zero bytes in the inputs r=irevoire a=Kerollmops Ok, good news, after a little session of debugging with `@irevoire` we found out that the bug seems to be related to zeroes in the input update. The engine wasn't designed to accept those. The chosen solution is to update the tokenizer to remove those zeroes. We are waiting on https://github.com/meilisearch/tokenizer/pull/52 to be merged and a new version to be released. It is not an undefined behavior, I repeat: it is a "normal" bug 🎉 👏 ---- This PR tries to fix a bug where we use LMDB in the wrong way, leading to panic due to an undefined behavior on the Rust side. I thought [we fixed it in a previous PR](https://github.com/meilisearch/milli/pull/264) but we found out that _a similar_ bug was still present. `@bb` found a way to trigger this bug and helped us find the origin of it. As I don't have a minimal reproducible example of this bug I bet on the unsafe `put_current` calls when we index new documents as the bug was trigger after a big indexation on a clean database, thus not triggering a deletion update. I only replaced the unsafe `put_current` with two safe calls to `get`/`put`. I hope it helps and fixes the bug, only `@bb` can help us check that. I am not even sure how I can create a custom Docker image and expose it for testing purposes. <details> <summary>The backtrace leading us to a panic in grenad.</summary> ``` meilisearch_1 | thread 'tokio-runtime-worker' panicked at 'assertion failed: key > &last_key', /root/.cargo/git/checkouts/grenad-e2cb77f65d31bb02/3adcb26/src/block_builder.rs:38:17 meilisearch_1 | stack backtrace: meilisearch_1 | 0: rust_begin_unwind meilisearch_1 | at ./rustc/53cb7b09b00cbea8754ffb78e7e3cb521cb8af4b/library/std/src/panicking.rs:493:5 meilisearch_1 | 1: core::panicking::panic_fmt meilisearch_1 | at ./rustc/53cb7b09b00cbea8754ffb78e7e3cb521cb8af4b/library/core/src/panicking.rs:92:14 meilisearch_1 | 2: core::panicking::panic meilisearch_1 | at ./rustc/53cb7b09b00cbea8754ffb78e7e3cb521cb8af4b/library/core/src/panicking.rs:50:5 meilisearch_1 | 3: grenad::block_builder::BlockBuilder::insert meilisearch_1 | at ./root/.cargo/git/checkouts/grenad-e2cb77f65d31bb02/3adcb26/src/block_builder.rs:38:17 meilisearch_1 | 4: grenad::writer::Writer<W>::insert meilisearch_1 | at ./root/.cargo/git/checkouts/grenad-e2cb77f65d31bb02/3adcb26/src/writer.rs:92:12 meilisearch_1 | 5: milli::update::words_level_positions::write_level_entry meilisearch_1 | at ./root/.cargo/git/checkouts/milli-00376cd5db949a15/007fec2/milli/src/update/words_level_positions.rs:262:5 meilisearch_1 | 6: milli::update::words_level_positions::compute_positions_levels meilisearch_1 | at ./root/.cargo/git/checkouts/milli-00376cd5db949a15/007fec2/milli/src/update/words_level_positions.rs:211:13 meilisearch_1 | 7: milli::update::words_level_positions::WordsLevelPositions::execute meilisearch_1 | at ./root/.cargo/git/checkouts/milli-00376cd5db949a15/007fec2/milli/src/update/words_level_positions.rs:65:23 meilisearch_1 | 8: milli::update::index_documents::IndexDocuments::execute_raw meilisearch_1 | at ./root/.cargo/git/checkouts/milli-00376cd5db949a15/007fec2/milli/src/update/index_documents/mod.rs:831:9 meilisearch_1 | 9: milli::update::index_documents::IndexDocuments::execute meilisearch_1 | at ./root/.cargo/git/checkouts/milli-00376cd5db949a15/007fec2/milli/src/update/index_documents/mod.rs:372:9 meilisearch_1 | 10: meilisearch_http::index::updates::<impl meilisearch_http::index::Index>::update_documents_txn meilisearch_1 | at ./meilisearch/meilisearch-http/src/index/updates.rs:225:30 meilisearch_1 | 11: meilisearch_http::index::updates::<impl meilisearch_http::index::Index>::update_documents meilisearch_1 | at ./meilisearch/meilisearch-http/src/index/updates.rs:183:22 meilisearch_1 | 12: meilisearch_http::index::update_handler::UpdateHandler::handle_update meilisearch_1 | at ./meilisearch/meilisearch-http/src/index/update_handler.rs:75:18 meilisearch_1 | 13: meilisearch_http::index_controller::index_actor::actor::IndexActor<S>::handle_update::{{closure}}::{{closure}} meilisearch_1 | at ./meilisearch/meilisearch-http/src/index_controller/index_actor/actor.rs:174:35 meilisearch_1 | 14: <tokio::runtime::blocking::task::BlockingTask<T> as core::future::future::Future>::poll meilisearch_1 | at ./root/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.7.1/src/runtime/blocking/task.rs:42:21 meilisearch_1 | 15: tokio::runtime::task::core::CoreStage<T>::poll::{{closure}} meilisearch_1 | at ./root/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.7.1/src/runtime/task/core.rs:243:17 meilisearch_1 | 16: tokio::loom::std::unsafe_cell::UnsafeCell<T>::with_mut meilisearch_1 | at ./root/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.7.1/src/loom/std/unsafe_cell.rs:14:9 meilisearch_1 | 17: tokio::runtime::task::core::CoreStage<T>::poll meilisearch_1 | at ./root/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.7.1/src/runtime/task/core.rs:233:13 meilisearch_1 | 18: tokio::runtime::task::harness::poll_future::{{closure}} meilisearch_1 | at ./root/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.7.1/src/runtime/task/harness.rs:427:23 meilisearch_1 | 19: <std::panic::AssertUnwindSafe<F> as core::ops::function::FnOnce<()>>::call_once meilisearch_1 | at ./rustc/53cb7b09b00cbea8754ffb78e7e3cb521cb8af4b/library/std/src/panic.rs:344:9 meilisearch_1 | 20: std::panicking::try::do_call meilisearch_1 | at ./rustc/53cb7b09b00cbea8754ffb78e7e3cb521cb8af4b/library/std/src/panicking.rs:379:40 meilisearch_1 | 21: std::panicking::try meilisearch_1 | at ./rustc/53cb7b09b00cbea8754ffb78e7e3cb521cb8af4b/library/std/src/panicking.rs:343:19 meilisearch_1 | 22: std::panic::catch_unwind meilisearch_1 | at ./rustc/53cb7b09b00cbea8754ffb78e7e3cb521cb8af4b/library/std/src/panic.rs:431:14 meilisearch_1 | 23: tokio::runtime::task::harness::poll_future meilisearch_1 | at ./root/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.7.1/src/runtime/task/harness.rs:414:19 meilisearch_1 | 24: tokio::runtime::task::harness::Harness<T,S>::poll_inner meilisearch_1 | at ./root/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.7.1/src/runtime/task/harness.rs:89:9 meilisearch_1 | 25: tokio::runtime::task::harness::Harness<T,S>::poll meilisearch_1 | at ./root/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.7.1/src/runtime/task/harness.rs:59:15 meilisearch_1 | 26: tokio::runtime::task::raw::RawTask::poll meilisearch_1 | at ./root/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.7.1/src/runtime/task/raw.rs:66:18 meilisearch_1 | 27: tokio::runtime::task::Notified<S>::run meilisearch_1 | at ./root/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.7.1/src/runtime/task/mod.rs:171:9 meilisearch_1 | 28: tokio::runtime::blocking::pool::Inner::run meilisearch_1 | at ./root/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.7.1/src/runtime/blocking/pool.rs:265:17 meilisearch_1 | 29: tokio::runtime::blocking::pool::Spawner::spawn_thread::{{closure}} meilisearch_1 | at ./root/.cargo/registry/src/github.com-1ecc6299db9ec823/tokio-1.7.1/src/runtime/blocking/pool.rs:245:17 meilisearch_1 | note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose backtrace. ``` </details> Co-authored-by: Kerollmops <clement@meilisearch.com>
This commit is contained in:
commit
ee3a49cfba
31
Cargo.lock
generated
31
Cargo.lock
generated
@ -990,7 +990,7 @@ dependencies = [
|
|||||||
"jemallocator",
|
"jemallocator",
|
||||||
"log",
|
"log",
|
||||||
"maplit",
|
"maplit",
|
||||||
"meilisearch-tokenizer",
|
"meilisearch-tokenizer 0.2.3",
|
||||||
"memmap",
|
"memmap",
|
||||||
"milli",
|
"milli",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
@ -1353,7 +1353,23 @@ dependencies = [
|
|||||||
"once_cell",
|
"once_cell",
|
||||||
"slice-group-by",
|
"slice-group-by",
|
||||||
"unicode-segmentation",
|
"unicode-segmentation",
|
||||||
"whatlang",
|
"whatlang 0.9.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "meilisearch-tokenizer"
|
||||||
|
version = "0.2.4"
|
||||||
|
source = "git+https://github.com/meilisearch/Tokenizer.git?tag=v0.2.4#135d08dce465a756abaf6a1bcad70f315bda99b9"
|
||||||
|
dependencies = [
|
||||||
|
"character_converter",
|
||||||
|
"cow-utils",
|
||||||
|
"deunicode",
|
||||||
|
"fst",
|
||||||
|
"jieba-rs",
|
||||||
|
"once_cell",
|
||||||
|
"slice-group-by",
|
||||||
|
"unicode-segmentation",
|
||||||
|
"whatlang 0.12.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -1404,7 +1420,7 @@ dependencies = [
|
|||||||
"log",
|
"log",
|
||||||
"logging_timer",
|
"logging_timer",
|
||||||
"maplit",
|
"maplit",
|
||||||
"meilisearch-tokenizer",
|
"meilisearch-tokenizer 0.2.4",
|
||||||
"memmap",
|
"memmap",
|
||||||
"obkv",
|
"obkv",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
@ -3087,6 +3103,15 @@ dependencies = [
|
|||||||
"hashbrown 0.7.2",
|
"hashbrown 0.7.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "whatlang"
|
||||||
|
version = "0.12.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7a346d2eb29c03618693ed24a29d1acd0c3f2cb08ae58b9669d7461e033cf703"
|
||||||
|
dependencies = [
|
||||||
|
"hashbrown 0.7.2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "winapi"
|
name = "winapi"
|
||||||
version = "0.2.8"
|
version = "0.2.8"
|
||||||
|
@ -19,7 +19,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-fe
|
|||||||
human_format = "1.0.3"
|
human_format = "1.0.3"
|
||||||
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
||||||
linked-hash-map = "0.5.4"
|
linked-hash-map = "0.5.4"
|
||||||
meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.3" }
|
meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", tag = "v0.2.4" }
|
||||||
memmap = "0.7.0"
|
memmap = "0.7.0"
|
||||||
obkv = "0.2.0"
|
obkv = "0.2.0"
|
||||||
once_cell = "1.5.2"
|
once_cell = "1.5.2"
|
||||||
|
@ -367,6 +367,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
// safety: we don't keep references from inside the LMDB database.
|
// safety: we don't keep references from inside the LMDB database.
|
||||||
unsafe { iter.del_current()? };
|
unsafe { iter.del_current()? };
|
||||||
} else if docids.len() != previous_len {
|
} else if docids.len() != previous_len {
|
||||||
|
let key = key.to_owned();
|
||||||
// safety: we don't keep references from inside the LMDB database.
|
// safety: we don't keep references from inside the LMDB database.
|
||||||
unsafe { iter.put_current(&key, &docids)? };
|
unsafe { iter.put_current(&key, &docids)? };
|
||||||
}
|
}
|
||||||
|
@ -1381,4 +1381,26 @@ mod tests {
|
|||||||
|
|
||||||
wtxn.commit().unwrap();
|
wtxn.commit().unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn index_documents_with_zeroes() {
|
||||||
|
let path = tempfile::tempdir().unwrap();
|
||||||
|
let mut options = EnvOpenOptions::new();
|
||||||
|
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||||
|
let index = Index::new(options, &path).unwrap();
|
||||||
|
|
||||||
|
let mut wtxn = index.write_txn().unwrap();
|
||||||
|
let content = r#"#id,title,au{hor,genre,price$
|
||||||
|
2,"Prideand Prejudice","Jane Austin","romance",3.5$
|
||||||
|
456,"Le Petit Prince","Antoine de Saint-Exupéry","adventure",10.0$
|
||||||
|
1,Wonderland","Lewis Carroll","fantasy",25.99$
|
||||||
|
4,"Harry Potter ing","fantasy\0lood Prince","J. K. Rowling","fantasy\0,
|
||||||
|
"#;
|
||||||
|
|
||||||
|
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
|
||||||
|
builder.update_format(UpdateFormat::Csv);
|
||||||
|
builder.execute(content.as_bytes(), |_, _| ()).unwrap();
|
||||||
|
|
||||||
|
wtxn.commit().unwrap();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user