From 1ae761311ee593a051056880929151c080ea7f9f Mon Sep 17 00:00:00 2001 From: mpostma Date: Wed, 23 Dec 2020 19:09:01 +0100 Subject: [PATCH] integrate with meilisearch tokenizer --- Cargo.lock | 147 ++++++++++++++++++- Cargo.toml | 5 +- http-ui/Cargo.lock | 145 ++++++++++++++++++- http-ui/Cargo.toml | 1 + src/lib.rs | 1 - src/query_tokens.rs | 212 ++++++++++++++++++---------- src/search/mod.rs | 16 ++- src/tokenizer.rs | 174 ----------------------- src/update/index_documents/mod.rs | 2 + src/update/index_documents/store.rs | 26 +++- 10 files changed, 460 insertions(+), 269 deletions(-) delete mode 100644 src/tokenizer.rs diff --git a/Cargo.lock b/Cargo.lock index e8eefa09b..843bba4e4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6,6 +6,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10" +[[package]] +name = "ahash" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" + [[package]] name = "aho-corasick" version = "0.7.15" @@ -132,6 +138,15 @@ dependencies = [ "jobserver", ] +[[package]] +name = "cedarwood" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "963e82c7b94163808ca3a452608d260b64ba5bc7b5653b4af1af59887899f48d" +dependencies = [ + "smallvec", +] + [[package]] name = "cfg-if" version = "0.1.10" @@ -144,6 +159,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "character_converter" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c" +dependencies = [ + "bincode", +] + [[package]] name = "chrono" version = "0.4.19" @@ -175,6 +199,12 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ce90df4c658c62f12d78f7508cf92f9173e5184a539c10bfe54a3107b3ffd0f2" +[[package]] +name = "cow-utils" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" + [[package]] name = "crc32fast" version = "1.2.0" @@ -330,6 +360,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "deunicode" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80115a2dfde04491e181c2440a39e4be26e52d9ca4e92bed213f65b94e0b8db1" + [[package]] name = "digest" version = "0.8.1" @@ -381,9 +417,9 @@ checksum = "5f2a4a2034423744d2cc7ca2068453168dcdb82c438419e639a26bd87839c674" [[package]] name = "fst" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7293de202dbfe786c0b3fe6110a027836c5438ed06db7b715c9955ff4bfea51" +checksum = "d79238883cf0307100b90aba4a755d8051a3182305dfe7f649a1e9dc0517006f" [[package]] name = "fxhash" @@ -440,6 +476,16 @@ version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d36fab90f82edc3c747f9d438e06cf0a491055896f2a279638bb5beed6c40177" +[[package]] +name = "hashbrown" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96282e96bfcd3da0d3aa9938bedf1e50df3269b6db08b4876d2da0bb1a0841cf" +dependencies = [ + "ahash", + "autocfg", +] + [[package]] name = "hashbrown" version = "0.9.1" @@ -525,7 +571,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2" dependencies = [ "autocfg", - "hashbrown", + "hashbrown 0.9.1", ] [[package]] @@ -564,6 +610,21 @@ dependencies = [ "libc", ] +[[package]] +name = "jieba-rs" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34fbdeee8786790f4a99fa30ff5c5f88aa5183f7583693e3788d17fc8a48f33a" +dependencies = [ + "cedarwood", + "fxhash", + "hashbrown 0.9.1", + "lazy_static", + "phf", + "phf_codegen", + "regex", +] + [[package]] name = "jobserver" version = "0.1.21" @@ -647,6 +708,22 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00" +[[package]] +name = "meilisearch-tokenizer" +version = "0.1.1" +source = "git+https://github.com/meilisearch/Tokenizer.git?branch=token-eq#daeb4a4ac91081f1c592e3ebb3ec5d8dcb4e6976" +dependencies = [ + "character_converter", + "cow-utils", + "deunicode", + "fst", + "jieba-rs", + "once_cell", + "slice-group-by", + "unicode-segmentation", + "whatlang", +] + [[package]] name = "memchr" version = "2.3.3" @@ -696,6 +773,7 @@ dependencies = [ "linked-hash-map", "log", "maplit", + "meilisearch-tokenizer", "memmap", "near-proximity", "num-traits", @@ -883,6 +961,44 @@ dependencies = [ "sha-1", ] +[[package]] +name = "phf" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" +dependencies = [ + "siphasher", +] + [[package]] name = "pkg-config" version = "0.3.19" @@ -962,6 +1078,7 @@ dependencies = [ "rand_chacha", "rand_core", "rand_hc", + "rand_pcg", ] [[package]] @@ -992,6 +1109,15 @@ dependencies = [ "rand_core", ] +[[package]] +name = "rand_pcg" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" +dependencies = [ + "rand_core", +] + [[package]] name = "rayon" version = "1.3.1" @@ -1182,6 +1308,12 @@ dependencies = [ "opaque-debug", ] +[[package]] +name = "siphasher" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" + [[package]] name = "slice-group-by" version = "0.2.6" @@ -1558,6 +1690,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "whatlang" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc0289c1d1548414a5645e6583e118e9c569c579ec2a0c32417cc3dbf7a89075" +dependencies = [ + "hashbrown 0.7.2", +] + [[package]] name = "winapi" version = "0.3.8" diff --git a/Cargo.toml b/Cargo.toml index a2e2aa0df..52b25cfde 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ crossbeam-channel = "0.5.0" csv = "1.1.3" either = "1.6.1" flate2 = "1.0.17" -fst = "0.4.4" +fst = "0.4.5" fxhash = "0.2.1" grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } heed = { version = "0.10.5", default-features = false, features = ["lmdb", "sync-read-txn"] } @@ -21,6 +21,7 @@ human_format = "1.0.3" jemallocator = "0.3.2" levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } linked-hash-map = "0.5.3" +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" } memmap = "0.7.0" near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } num-traits = "0.2.14" @@ -56,7 +57,7 @@ criterion = "0.3.3" maplit = "1.0.2" [build-dependencies] -fst = "0.4.4" +fst = "0.4.5" [features] default = [] diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock index 4b909a6eb..cc2e2f852 100644 --- a/http-ui/Cargo.lock +++ b/http-ui/Cargo.lock @@ -6,6 +6,12 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ee2a4ec343196209d6594e19543ae87a39f96d5534d7174822a3ad825dd6ed7e" +[[package]] +name = "ahash" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8fd72866655d1904d6b0997d0b07ba561047d070fbe29de039031c641b61217" + [[package]] name = "aho-corasick" version = "0.7.15" @@ -213,6 +219,15 @@ dependencies = [ "jobserver", ] +[[package]] +name = "cedarwood" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "963e82c7b94163808ca3a452608d260b64ba5bc7b5653b4af1af59887899f48d" +dependencies = [ + "smallvec", +] + [[package]] name = "cfg-if" version = "0.1.10" @@ -225,6 +240,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "character_converter" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e48477ece09d6a21c033cb604968524a37782532727055d6f6faafac1781e5c" +dependencies = [ + "bincode", +] + [[package]] name = "chrono" version = "0.4.19" @@ -265,6 +289,12 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c478836e029dcef17fb47c89023448c64f781a046e0300e257ad8225ae59afab" +[[package]] +name = "cow-utils" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" + [[package]] name = "cpuid-bool" version = "0.1.2" @@ -368,6 +398,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "deunicode" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80115a2dfde04491e181c2440a39e4be26e52d9ca4e92bed213f65b94e0b8db1" + [[package]] name = "digest" version = "0.8.1" @@ -640,6 +676,16 @@ dependencies = [ "tracing-futures", ] +[[package]] +name = "hashbrown" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96282e96bfcd3da0d3aa9938bedf1e50df3269b6db08b4876d2da0bb1a0841cf" +dependencies = [ + "ahash", + "autocfg 1.0.1", +] + [[package]] name = "hashbrown" version = "0.9.1" @@ -840,7 +886,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55e2e4c765aa53a0424761bf9f41aa7a6ac1efa87238f59560640e27fca028f2" dependencies = [ "autocfg 1.0.1", - "hashbrown", + "hashbrown 0.9.1", ] [[package]] @@ -897,6 +943,21 @@ dependencies = [ "libc", ] +[[package]] +name = "jieba-rs" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34fbdeee8786790f4a99fa30ff5c5f88aa5183f7583693e3788d17fc8a48f33a" +dependencies = [ + "cedarwood", + "fxhash", + "hashbrown 0.9.1", + "lazy_static", + "phf", + "phf_codegen", + "regex", +] + [[package]] name = "jobserver" version = "0.1.21" @@ -975,6 +1036,22 @@ version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" +[[package]] +name = "meilisearch-tokenizer" +version = "0.1.1" +source = "git+https://github.com/meilisearch/Tokenizer.git?branch=main#8d91cd52f30aa4b651a085c15056938f7b599646" +dependencies = [ + "character_converter", + "cow-utils", + "deunicode", + "fst", + "jieba-rs", + "once_cell", + "slice-group-by", + "unicode-segmentation", + "whatlang", +] + [[package]] name = "memchr" version = "2.3.4" @@ -1022,6 +1099,7 @@ dependencies = [ "levenshtein_automata", "linked-hash-map", "log", + "meilisearch-tokenizer", "memmap", "near-proximity", "num-traits", @@ -1323,6 +1401,44 @@ dependencies = [ "sha-1 0.8.2", ] +[[package]] +name = "phf" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" +dependencies = [ + "phf_shared", + "rand 0.7.3", +] + +[[package]] +name = "phf_shared" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "0.4.27" @@ -1461,7 +1577,7 @@ dependencies = [ "rand_isaac", "rand_jitter", "rand_os", - "rand_pcg", + "rand_pcg 0.1.2", "rand_xorshift", "winapi 0.3.9", ] @@ -1477,6 +1593,7 @@ dependencies = [ "rand_chacha 0.2.2", "rand_core 0.5.1", "rand_hc 0.2.0", + "rand_pcg 0.2.1", ] [[package]] @@ -1585,6 +1702,15 @@ dependencies = [ "rand_core 0.4.2", ] +[[package]] +name = "rand_pcg" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rand_xorshift" version = "0.1.1" @@ -1787,6 +1913,12 @@ dependencies = [ "libc", ] +[[package]] +name = "siphasher" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7" + [[package]] name = "slab" version = "0.4.2" @@ -2280,6 +2412,15 @@ version = "0.10.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" +[[package]] +name = "whatlang" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc0289c1d1548414a5645e6583e118e9c569c579ec2a0c32417cc3dbf7a89075" +dependencies = [ + "hashbrown 0.7.2", +] + [[package]] name = "winapi" version = "0.2.8" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 73470f7f4..ba094c79e 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -10,6 +10,7 @@ anyhow = "1.0.28" byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3adcb26" } heed = "0.10.5" +meilisearch-tokenizer = { git = "https://github.com/meilisearch/Tokenizer.git", branch = "main" } memmap = "0.7.0" milli = { path = ".." } once_cell = "1.4.1" diff --git a/src/lib.rs b/src/lib.rs index 9fa19c68c..435c3be91 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,6 @@ pub mod facet; pub mod heed_codec; pub mod proximity; pub mod subcommand; -pub mod tokenizer; pub mod update; use std::borrow::Cow; diff --git a/src/query_tokens.rs b/src/query_tokens.rs index defc3f5e7..ee15b15ea 100644 --- a/src/query_tokens.rs +++ b/src/query_tokens.rs @@ -1,5 +1,4 @@ -use std::str; -use crate::tokenizer::{simple_tokenizer, TokenType}; +use meilisearch_tokenizer::{Token, TokenKind}; #[derive(Debug)] enum State { @@ -18,138 +17,201 @@ impl State { #[derive(Debug, PartialEq, Eq)] pub enum QueryToken<'a> { - Free(&'a str), - Quoted(&'a str), + Free(Token<'a>), + Quoted(Token<'a>), } -pub struct QueryTokens<'a> { - state: State, - iter: Box + 'a>, -} - -impl QueryTokens<'_> { - pub fn new(query: &str) -> QueryTokens { - QueryTokens { - state: State::Free, - iter: Box::new(simple_tokenizer(query)), - } - } -} - -impl<'a> Iterator for QueryTokens<'a> { - type Item = QueryToken<'a>; - - fn next(&mut self) -> Option { +pub fn query_tokens<'a>(mut tokens: impl Iterator>) -> impl Iterator> { + let mut state = State::Free; + let f = move || { loop { - match self.iter.next()? { - (TokenType::Other, "\"") => self.state.swap(), - (TokenType::Word, token) => { - let token = match self.state { + let token = tokens.next()?; + match token.kind() { + _ if token.text().trim() == "\"" => state.swap(), + TokenKind::Word => { + let token = match state { State::Quoted => QueryToken::Quoted(token), State::Free => QueryToken::Free(token), }; return Some(token); }, - (_, _) => (), + _ => (), } } - } + }; + std::iter::from_fn(f) } #[cfg(test)] mod tests { use super::*; use QueryToken::{Quoted, Free}; + use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; + use fst::Set; + + macro_rules! assert_eq_query_token { + ($test:expr, Quoted($val:literal)) => { + match $test { + Quoted(val) => assert_eq!(val.text(), $val), + Free(val) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, val.text()), + } + }; + + ($test:expr, Free($val:literal)) => { + match $test { + Quoted(val) => panic!("expected Free(\"{}\"), found Quoted(\"{}\")", $val, val.text()), + Free(val) => assert_eq!(val.text(), $val), + } + }; + } #[test] fn empty() { - let mut iter = QueryTokens::new(""); - assert_eq!(iter.next(), None); + let stop_words = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); + let query = ""; + let analyzed = analyzer.analyze(query); + let tokens = analyzed.tokens(); + let mut iter = query_tokens(tokens); + assert!(iter.next().is_none()); - let mut iter = QueryTokens::new(" "); - assert_eq!(iter.next(), None); + let query = " "; + let analyzed = analyzer.analyze(query); + let tokens = analyzed.tokens(); + let mut iter = query_tokens(tokens); + assert!(iter.next().is_none()); } #[test] fn one_quoted_string() { - let mut iter = QueryTokens::new("\"hello\""); - assert_eq!(iter.next(), Some(Quoted("hello"))); - assert_eq!(iter.next(), None); + let stop_words = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); + let query = "\"hello\""; + let analyzed = analyzer.analyze(query); + let tokens = analyzed.tokens(); + let mut iter = query_tokens(tokens); + assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); + assert!(iter.next().is_none()); } #[test] fn one_pending_quoted_string() { - let mut iter = QueryTokens::new("\"hello"); - assert_eq!(iter.next(), Some(Quoted("hello"))); - assert_eq!(iter.next(), None); + let stop_words = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); + let query = "\"hello"; + let analyzed = analyzer.analyze(query); + let tokens = analyzed.tokens(); + let mut iter = query_tokens(tokens); + assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); + assert!(iter.next().is_none()); } #[test] fn one_non_quoted_string() { - let mut iter = QueryTokens::new("hello"); - assert_eq!(iter.next(), Some(Free("hello"))); - assert_eq!(iter.next(), None); + let stop_words = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); + let query = "hello"; + let analyzed = analyzer.analyze(query); + let tokens = analyzed.tokens(); + let mut iter = query_tokens(tokens); + assert_eq_query_token!(iter.next().unwrap(), Free("hello")); + assert!(iter.next().is_none()); } #[test] fn quoted_directly_followed_by_free_strings() { - let mut iter = QueryTokens::new("\"hello\"world"); - assert_eq!(iter.next(), Some(Quoted("hello"))); - assert_eq!(iter.next(), Some(Free("world"))); - assert_eq!(iter.next(), None); + let stop_words = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); + let query = "\"hello\"world"; + let analyzed = analyzer.analyze(query); + let tokens = analyzed.tokens(); + let mut iter = query_tokens(tokens); + assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); + assert_eq_query_token!(iter.next().unwrap(), Free("world")); + assert!(iter.next().is_none()); } #[test] fn free_directly_followed_by_quoted_strings() { - let mut iter = QueryTokens::new("hello\"world\""); - assert_eq!(iter.next(), Some(Free("hello"))); - assert_eq!(iter.next(), Some(Quoted("world"))); - assert_eq!(iter.next(), None); + let stop_words = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); + let query = "hello\"world\""; + let analyzed = analyzer.analyze(query); + let tokens = analyzed.tokens(); + let mut iter = query_tokens(tokens); + assert_eq_query_token!(iter.next().unwrap(), Free("hello")); + assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); + assert!(iter.next().is_none()); } #[test] fn free_followed_by_quoted_strings() { - let mut iter = QueryTokens::new("hello \"world\""); - assert_eq!(iter.next(), Some(Free("hello"))); - assert_eq!(iter.next(), Some(Quoted("world"))); - assert_eq!(iter.next(), None); + let stop_words = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); + let query = "hello \"world\""; + let analyzed = analyzer.analyze(query); + let tokens = analyzed.tokens(); + let mut iter = query_tokens(tokens); + assert_eq_query_token!(iter.next().unwrap(), Free("hello")); + assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); + assert!(iter.next().is_none()); } #[test] fn multiple_spaces_separated_strings() { - let mut iter = QueryTokens::new("hello world "); - assert_eq!(iter.next(), Some(Free("hello"))); - assert_eq!(iter.next(), Some(Free("world"))); - assert_eq!(iter.next(), None); + let stop_words = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); + let query = "hello world "; + let analyzed = analyzer.analyze(query); + let tokens = analyzed.tokens(); + let mut iter = query_tokens(tokens); + assert_eq_query_token!(iter.next().unwrap(), Free("hello")); + assert_eq_query_token!(iter.next().unwrap(), Free("world")); + assert!(iter.next().is_none()); } #[test] fn multi_interleaved_quoted_free_strings() { - let mut iter = QueryTokens::new("hello \"world\" coucou \"monde\""); - assert_eq!(iter.next(), Some(Free("hello"))); - assert_eq!(iter.next(), Some(Quoted("world"))); - assert_eq!(iter.next(), Some(Free("coucou"))); - assert_eq!(iter.next(), Some(Quoted("monde"))); - assert_eq!(iter.next(), None); + let stop_words = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); + let query = "hello \"world\" coucou \"monde\""; + let analyzed = analyzer.analyze(query); + let tokens = analyzed.tokens(); + let mut iter = query_tokens(tokens); + assert_eq_query_token!(iter.next().unwrap(), Free("hello")); + assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); + assert_eq_query_token!(iter.next().unwrap(), Free("coucou")); + assert_eq_query_token!(iter.next().unwrap(), Quoted("monde")); + assert!(iter.next().is_none()); } #[test] fn multi_quoted_strings() { - let mut iter = QueryTokens::new("\"hello world\" coucou \"monde est beau\""); - assert_eq!(iter.next(), Some(Quoted("hello"))); - assert_eq!(iter.next(), Some(Quoted("world"))); - assert_eq!(iter.next(), Some(Free("coucou"))); - assert_eq!(iter.next(), Some(Quoted("monde"))); - assert_eq!(iter.next(), Some(Quoted("est"))); - assert_eq!(iter.next(), Some(Quoted("beau"))); - assert_eq!(iter.next(), None); + let stop_words = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); + let query = "\"hello world\" coucou \"monde est beau\""; + let analyzed = analyzer.analyze(query); + let tokens = analyzed.tokens(); + let mut iter = query_tokens(tokens); + assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); + assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); + assert_eq_query_token!(iter.next().unwrap(), Free("coucou")); + assert_eq_query_token!(iter.next().unwrap(), Quoted("monde")); + assert_eq_query_token!(iter.next().unwrap(), Quoted("est")); + assert_eq_query_token!(iter.next().unwrap(), Quoted("beau")); + assert!(iter.next().is_none()); } #[test] fn chinese() { - let mut iter = QueryTokens::new("汽车男生"); - assert_eq!(iter.next(), Some(Free("汽车"))); - assert_eq!(iter.next(), Some(Free("男生"))); - assert_eq!(iter.next(), None); + let stop_words = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); + let query = "汽车男生"; + let analyzed = analyzer.analyze(query); + let tokens = analyzed.tokens(); + let mut iter = query_tokens(tokens); + assert_eq_query_token!(iter.next().unwrap(), Free("汽车")); + assert_eq_query_token!(iter.next().unwrap(), Free("男生")); + assert!(iter.next().is_none()); } } diff --git a/src/search/mod.rs b/src/search/mod.rs index a7c83e79b..415a4cfed 100644 --- a/src/search/mod.rs +++ b/src/search/mod.rs @@ -4,10 +4,11 @@ use std::fmt; use std::time::Instant; use anyhow::{bail, Context}; -use fst::{IntoStreamer, Streamer}; +use fst::{IntoStreamer, Streamer, Set}; use levenshtein_automata::DFA; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use log::debug; +use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use once_cell::sync::Lazy; use ordered_float::OrderedFloat; use roaring::bitmap::RoaringBitmap; @@ -16,7 +17,7 @@ use crate::facet::FacetType; use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; use crate::mdfs::Mdfs; -use crate::query_tokens::{QueryTokens, QueryToken}; +use crate::query_tokens::{query_tokens, QueryToken}; use crate::{Index, FieldId, DocumentId, Criterion}; pub use self::facet::{FacetCondition, FacetNumberOperator, FacetStringOperator}; @@ -68,14 +69,19 @@ impl<'a> Search<'a> { fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> { let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2); - let words: Vec<_> = QueryTokens::new(query).collect(); + let stop_words = Set::default(); + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); + let analyzed = analyzer.analyze(query); + let tokens = analyzed.tokens(); + let words: Vec<_> = query_tokens(tokens).collect(); + let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace); let number_of_words = words.len(); words.into_iter().enumerate().map(|(i, word)| { let (word, quoted) = match word { - QueryToken::Free(word) => (word.to_lowercase(), word.len() <= 3), - QueryToken::Quoted(word) => (word.to_lowercase(), true), + QueryToken::Free(token) => (token.text().to_string(), token.text().len() <= 3), + QueryToken::Quoted(token) => (token.text().to_string(), true), }; let is_last = i + 1 == number_of_words; let is_prefix = is_last && !ends_with_whitespace && !quoted; diff --git a/src/tokenizer.rs b/src/tokenizer.rs deleted file mode 100644 index c64f5c360..000000000 --- a/src/tokenizer.rs +++ /dev/null @@ -1,174 +0,0 @@ -use std::{str, iter, mem}; - -use fst::raw::{Fst, Output}; -use once_cell::sync::Lazy; -use slice_group_by::StrGroupBy; - -use CharCategory::*; - -const CHINESE_FST_BYTES: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/chinese-words.fst")); -static CHINESE_WORDS_FST: Lazy> = Lazy::new(|| Fst::new(CHINESE_FST_BYTES).unwrap()); - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum TokenType { - Word, - Space, - Other, -} - -pub fn simple_tokenizer(text: &str) -> impl Iterator { - text - .linear_group_by_key(CharCategory::new) - .flat_map(|mut string| { - let first = string.chars().next().unwrap(); - let category = CharCategory::new(first); - iter::from_fn(move || { - if string.is_empty() { return None } - match category { - Chinese => { - let fst = &CHINESE_WORDS_FST; - match find_longest_prefix(fst, string.as_bytes()) { - Some((_, l)) => { - let s = &string[..l]; - string = &string[l..]; - Some((TokenType::Word, s)) - }, - None => { - let first = string.chars().next().unwrap(); - let len = first.len_utf8(); - let (head, tail) = string.split_at(len); - string = tail; - Some((TokenType::Word, head)) - }, - } - }, - Alphanumeric => Some((TokenType::Word, mem::take(&mut string))), - Space => Some((TokenType::Space, mem::take(&mut string))), - Other => Some((TokenType::Other, mem::take(&mut string))), - } - }) - }) -} - -pub fn only_token((t, w): (TokenType, &str)) -> Option<&str> { - if t == TokenType::Word { Some(w) } else { None } -} - -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -enum CharCategory { - Chinese, - Alphanumeric, - Space, - Other, -} - -impl CharCategory { - fn new(c: char) -> Self { - if c.is_alphanumeric() { - if is_chinese(c) { Chinese } else { Alphanumeric } - } else if c.is_whitespace() { Space } else { Other } - } -} - -fn is_chinese(c: char) -> bool { - matches!( - u32::from(c), - 0x4E00..=0x9FEF - | 0x3400..=0x4DBF - | 0x20000..=0x2A6DF - | 0x2A700..=0x2B73F - | 0x2B740..=0x2B81F - | 0x2B820..=0x2CEAF - | 0x2CEB0..=0x2EBEF - | 0x3007..=0x3007 - ) -} - -/// Find the longest key that is prefix of the given value. -/// -/// If the key exists, then `Some((value, key_len))` is returned, where -/// `value` is the value associated with the key, and `key_len` is the -/// length of the found key. Otherwise `None` is returned. -/// -/// This can be used to e.g. build tokenizing functions. -// Copyright @llogiq -// https://github.com/BurntSushi/fst/pull/104 -#[inline] -fn find_longest_prefix(fst: &Fst<&[u8]>, value: &[u8]) -> Option<(u64, usize)> { - let mut node = fst.root(); - let mut out = Output::zero(); - let mut last_match = None; - for (i, &b) in value.iter().enumerate() { - if let Some(trans_index) = node.find_input(b) { - let t = node.transition(trans_index); - node = fst.node(t.addr); - out = out.cat(t.out); - if node.is_final() { - last_match = Some((out.cat(node.final_output()).value(), i + 1)); - } - } else { - return last_match; - } - } - last_match -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn without_chinese() { - let mut iter = simple_tokenizer("hello world!"); - assert_eq!(iter.next(), Some((TokenType::Word, "hello"))); - assert_eq!(iter.next(), Some((TokenType::Space, " "))); - assert_eq!(iter.next(), Some((TokenType::Word, "world"))); - assert_eq!(iter.next(), Some((TokenType::Other, "!"))); - assert_eq!(iter.next(), None); - } - - #[test] - fn only_chinese() { - let mut iter = simple_tokenizer("今天的天气真好"); - assert_eq!(iter.next(), Some((TokenType::Word, "今天"))); - assert_eq!(iter.next(), Some((TokenType::Word, "的"))); - assert_eq!(iter.next(), Some((TokenType::Word, "天气"))); - assert_eq!(iter.next(), Some((TokenType::Word, "真好"))); - assert_eq!(iter.next(), None); - } - - #[test] - fn mixup_chinese_with_alphabet() { - let mut iter = simple_tokenizer("今天的天气真好Apple is good今天的天气真好"); - assert_eq!(iter.next(), Some((TokenType::Word, "今天"))); - assert_eq!(iter.next(), Some((TokenType::Word, "的"))); - assert_eq!(iter.next(), Some((TokenType::Word, "天气"))); - assert_eq!(iter.next(), Some((TokenType::Word, "真好"))); - assert_eq!(iter.next(), Some((TokenType::Word, "Apple"))); - assert_eq!(iter.next(), Some((TokenType::Space, " "))); - assert_eq!(iter.next(), Some((TokenType::Word, "is"))); - assert_eq!(iter.next(), Some((TokenType::Space, " "))); - assert_eq!(iter.next(), Some((TokenType::Word, "good"))); - assert_eq!(iter.next(), Some((TokenType::Word, "今天"))); - assert_eq!(iter.next(), Some((TokenType::Word, "的"))); - assert_eq!(iter.next(), Some((TokenType::Word, "天气"))); - assert_eq!(iter.next(), Some((TokenType::Word, "真好"))); - assert_eq!(iter.next(), None); - } - - #[test] - fn unknown_chinese() { - let mut iter = simple_tokenizer("被虾头大讚好识𠱁女仔"); - assert_eq!(iter.next(), Some((TokenType::Word, "被"))); - assert_eq!(iter.next(), Some((TokenType::Word, "虾"))); - assert_eq!(iter.next(), Some((TokenType::Word, "头"))); - assert_eq!(iter.next(), Some((TokenType::Word, "大"))); - assert_eq!(iter.next(), Some((TokenType::Word, "讚"))); - assert_eq!(iter.next(), Some((TokenType::Word, "好"))); - assert_eq!(iter.next(), Some((TokenType::Word, "识"))); - assert_eq!(iter.next(), Some((TokenType::Word, "𠱁"))); - assert_eq!(iter.next(), Some((TokenType::Word, "女"))); - assert_eq!(iter.next(), Some((TokenType::Word, "仔"))); - assert_eq!(iter.next(), None); - } -} diff --git a/src/update/index_documents/mod.rs b/src/update/index_documents/mod.rs index 8b538b03d..4af6301c2 100644 --- a/src/update/index_documents/mod.rs +++ b/src/update/index_documents/mod.rs @@ -370,6 +370,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let readers = rayon::iter::repeatn(documents, num_threads) .enumerate() .map(|(i, documents)| { + let stop_words = fst::Set::default(); let store = Store::new( searchable_fields.clone(), faceted_fields.clone(), @@ -379,6 +380,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, + &stop_words, )?; store.index( documents, diff --git a/src/update/index_documents/store.rs b/src/update/index_documents/store.rs index b107d4be6..2b57d3b8d 100644 --- a/src/update/index_documents/store.rs +++ b/src/update/index_documents/store.rs @@ -16,12 +16,13 @@ use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::Value; use tempfile::tempfile; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, TokenKind}; +use fst::Set; use crate::facet::FacetType; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec}; -use crate::tokenizer::{simple_tokenizer, only_token}; use crate::update::UpdateIndexingStep; use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId, FieldId}; @@ -47,7 +48,7 @@ pub struct Readers { pub documents: Reader, } -pub struct Store { +pub struct Store<'s, A> { // Indexing parameters searchable_fields: HashSet, faceted_fields: HashMap, @@ -71,9 +72,11 @@ pub struct Store { // MTBL writers docid_word_positions_writer: Writer, documents_writer: Writer, + // tokenizer + analyzer: Analyzer<'s, A>, } -impl Store { +impl<'s, A: AsRef<[u8]>> Store<'s, A> { pub fn new( searchable_fields: HashSet, faceted_fields: HashMap, @@ -83,7 +86,8 @@ impl Store { chunk_compression_type: CompressionType, chunk_compression_level: Option, chunk_fusing_shrink_size: Option, - ) -> anyhow::Result + stop_words: &'s Set, + ) -> anyhow::Result { // We divide the max memory by the number of sorter the Store have. let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4)); @@ -137,6 +141,8 @@ impl Store { create_writer(chunk_compression_type, chunk_compression_level, f) })?; + let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); + Ok(Store { // Indexing parameters. searchable_fields, @@ -161,6 +167,8 @@ impl Store { // MTBL writers docid_word_positions_writer, documents_writer, + //tokenizer + analyzer, }) } @@ -462,9 +470,13 @@ impl Store { None => continue, }; - let tokens = simple_tokenizer(&content).filter_map(only_token); - for (pos, token) in tokens.enumerate().take(MAX_POSITION) { - let word = token.to_lowercase(); + let analyzed = self.analyzer.analyze(&content); + let tokens = analyzed + .tokens() + .filter(|t| t.is_word()) + .map(|t| t.text().to_string()); + + for (pos, word) in tokens.enumerate().take(MAX_POSITION) { let position = (attr as usize * MAX_POSITION + pos) as u32; words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); }