From d18e775becec43c1c674680729615649f74bcc80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 10 Nov 2019 17:41:32 +0100 Subject: [PATCH] Ignore words that are too long --- meilidb-core/src/database.rs | 50 +++++++++++++++++++++++++++++++++ meilidb-core/src/raw_indexer.rs | 35 ++++++++++++++--------- 2 files changed, 71 insertions(+), 14 deletions(-) diff --git a/meilidb-core/src/database.rs b/meilidb-core/src/database.rs index 0e40b695d..07d015632 100644 --- a/meilidb-core/src/database.rs +++ b/meilidb-core/src/database.rs @@ -387,6 +387,56 @@ mod tests { assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_err()); } + #[test] + fn ignored_words_to_long() { + let dir = tempfile::tempdir().unwrap(); + + let database = Database::open_or_create(dir.path()).unwrap(); + let env = &database.env; + + let (sender, receiver) = mpsc::sync_channel(100); + let update_fn = move |update: ProcessedUpdateResult| sender.send(update.update_id).unwrap(); + let index = database.create_index("test").unwrap(); + + let done = database.set_update_callback("test", Box::new(update_fn)); + assert!(done, "could not set the index update function"); + + let schema = { + let data = r#" + identifier = "id" + + [attributes."name"] + displayed = true + indexed = true + "#; + toml::from_str(data).unwrap() + }; + + let mut writer = env.write_txn().unwrap(); + let _update_id = index.schema_update(&mut writer, schema).unwrap(); + writer.commit().unwrap(); + + let mut additions = index.documents_addition(); + + let doc1 = serde_json::json!({ + "id": 123, + "name": "s̷̡̢̡̧̺̜̞͕͉͉͕̜͔̟̼̥̝͍̟̖͔͔̪͉̲̹̝̣̖͎̞̤̥͓͎̭̩͕̙̩̿̀̋̅̈́̌́̏̍̄̽͂̆̾̀̿̕̚̚͜͠͠ͅͅļ̵̨̨̨̰̦̻̳̖̳͚̬̫͚̦͖͈̲̫̣̩̥̻̙̦̱̼̠̖̻̼̘̖͉̪̜̠̙͖̙̩͔̖̯̩̲̿̽͋̔̿̍̓͂̍̿͊͆̃͗̔̎͐͌̾̆͗́̆̒̔̾̅̚̚͜͜ͅͅī̵̛̦̅̔̓͂͌̾́͂͛̎̋͐͆̽̂̋̋́̾̀̉̓̏̽́̑̀͒̇͋͛̈́̃̉̏͊̌̄̽̿̏̇͘̕̚̕p̶̧̛̛̖̯̗͕̝̗̭̱͙̖̗̟̟̐͆̊̂͐̋̓̂̈́̓͊̆͌̾̾͐͋͗͌̆̿̅͆̈́̈́̉͋̍͊͗̌̓̅̈̎̇̃̎̈́̉̐̋͑̃͘̕͘d̴̢̨̛͕̘̯͖̭̮̝̝̐̊̈̅̐̀͒̀́̈́̀͌̽͛͆͑̀̽̿͛̃̋̇̎̀́̂́͘͠͝ǫ̵̨̛̮̩̘͚̬̯̖̱͍̼͑͑̓̐́̑̿̈́̔͌̂̄͐͝ģ̶̧̜͇̣̭̺̪̺̖̻͖̮̭̣̙̻͒͊͗̓̓͒̀̀ͅ", + }); + + additions.update_document(doc1); + + let mut writer = env.write_txn().unwrap(); + let update_id = additions.finalize(&mut writer).unwrap(); + writer.commit().unwrap(); + + // block until the transaction is processed + let _ = receiver.into_iter().find(|id| *id == update_id); + + let reader = env.read_txn().unwrap(); + let result = index.update_status(&reader, update_id).unwrap(); + assert_matches!(result, UpdateStatus::Processed(status) if status.result.is_ok()); + } + #[test] fn add_schema_attributes_at_end() { let dir = tempfile::tempdir().unwrap(); diff --git a/meilidb-core/src/raw_indexer.rs b/meilidb-core/src/raw_indexer.rs index f4304a33a..b642ec64c 100644 --- a/meilidb-core/src/raw_indexer.rs +++ b/meilidb-core/src/raw_indexer.rs @@ -7,6 +7,8 @@ use meilidb_schema::SchemaAttr; use meilidb_tokenizer::{is_cjk, SeqTokenizer, Token, Tokenizer}; use sdset::SetBuf; +const WORD_LENGTH_LIMIT: usize = 80; + type Word = Vec; // TODO make it be a SmallVec pub struct RawIndexer { @@ -128,21 +130,26 @@ fn index_token( match token_to_docindex(id, attr, token) { Some(docindex) => { let word = Vec::from(token.word); - words_doc_indexes - .entry(word.clone()) - .or_insert_with(Vec::new) - .push(docindex); - docs_words.entry(id).or_insert_with(Vec::new).push(word); - if !lower.contains(is_cjk) { - let unidecoded = deunicode_with_tofu(&lower, ""); - if unidecoded != lower && !unidecoded.is_empty() { - let word = Vec::from(unidecoded); - words_doc_indexes - .entry(word.clone()) - .or_insert_with(Vec::new) - .push(docindex); - docs_words.entry(id).or_insert_with(Vec::new).push(word); + if word.len() <= WORD_LENGTH_LIMIT { + words_doc_indexes + .entry(word.clone()) + .or_insert_with(Vec::new) + .push(docindex); + docs_words.entry(id).or_insert_with(Vec::new).push(word); + + if !lower.contains(is_cjk) { + let unidecoded = deunicode_with_tofu(&lower, ""); + if unidecoded != lower && !unidecoded.is_empty() { + let word = Vec::from(unidecoded); + if word.len() <= WORD_LENGTH_LIMIT { + words_doc_indexes + .entry(word.clone()) + .or_insert_with(Vec::new) + .push(docindex); + docs_words.entry(id).or_insert_with(Vec::new).push(word); + } + } } } }