From 10414791a2d42b124617e984222d13589b484c5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 22 Feb 2019 22:34:37 +0100 Subject: [PATCH 1/3] fix: Remove debug println from the tokenizer --- src/tokenizer/mod.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index a2fd96311..bdca8c4a4 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -105,8 +105,6 @@ impl<'a> Iterator for Tokenizer<'a> { char_index: self.char_index, }; - println!("no-cjk with start_word returns: {:?}", token); - self.char_index += word.chars().count(); return Some(token) } @@ -143,8 +141,6 @@ impl<'a> Iterator for Tokenizer<'a> { char_index: self.char_index, }; - println!("cjk with start_word returns: {:?}", token); - self.word_index += 1; self.char_index += word.chars().count(); @@ -164,8 +160,6 @@ impl<'a> Iterator for Tokenizer<'a> { char_index: self.char_index, }; - println!("cjk without start_word returns: {:?}", token); - if tail.chars().next().and_then(detect_separator).is_none() { self.word_index += 1; } From a7994709978a4002f4f56a29da18679627a0f3cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 22 Feb 2019 23:06:42 +0100 Subject: [PATCH 2/3] fix: Change the tokenizer to measure cjk chars positions --- src/tokenizer/mod.rs | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index bdca8c4a4..f4c42b7d4 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -124,8 +124,6 @@ impl<'a> Iterator for Tokenizer<'a> { (c >= '\u{4e00}' && c <= '\u{9fff}') || (c >= '\u{f900}' && c <= '\u{faff}') { - let char_len = c.len_utf8(); - match start_word { Some(start_word) => { let (prefix, tail) = self.inner.split_at(i); @@ -147,7 +145,7 @@ impl<'a> Iterator for Tokenizer<'a> { return Some(token) }, None => { - let (prefix, tail) = self.inner.split_at(i + char_len); + let (prefix, tail) = 
self.inner.split_at(i + c.len_utf8()); let (spaces, word) = prefix.split_at(i); self.inner = tail; @@ -163,7 +161,7 @@ impl<'a> Iterator for Tokenizer<'a> { if tail.chars().next().and_then(detect_separator).is_none() { self.word_index += 1; } - self.char_index += char_len; + self.char_index += 1; return Some(token) } @@ -252,18 +250,18 @@ mod tests { let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}"); assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 3 })); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 10 })); + assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 })); + assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 })); assert_eq!(tokenizer.next(), None); let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}"); assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 })); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 3 })); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 6 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 10 })); - assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 20 })); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 29 })); + assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 })); + assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 })); + assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 })); + assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 })); + 
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 })); assert_eq!(tokenizer.next(), None); } } From a960c325f30f38be6a63634b3bd621daf82912a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 23 Feb 2019 14:57:13 +0100 Subject: [PATCH 3/3] feat: Make query strings support cjk kanjis --- Cargo.toml | 5 ++++- src/lib.rs | 12 ++++++++++++ src/rank/query_builder.rs | 40 ++++++++++++++++++++++++++++++++------- src/tokenizer/mod.rs | 12 ++---------- 4 files changed, 51 insertions(+), 18 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cffc51348..37e7ea680 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,9 +21,12 @@ serde = "1.0" serde_derive = "1.0" serde_json = { version = "1.0", features = ["preserve_order"] } size_format = "1.0" -slice-group-by = "0.2" unidecode = "0.3" +[dependencies.slice-group-by] +git = "https://github.com/Kerollmops/slice-group-by.git" +tag = "v0.2.3-alpha.1" + [dependencies.toml] git = "https://github.com/Kerollmops/toml-rs.git" features = ["preserve_order"] diff --git a/src/lib.rs b/src/lib.rs index a111b5049..e77e03ecb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,6 +16,18 @@ pub use rocksdb; pub use self::tokenizer::Tokenizer; pub use self::common_words::CommonWords; +pub fn is_cjk(c: char) -> bool { + (c >= '\u{2e80}' && c <= '\u{2eff}') || + (c >= '\u{2f00}' && c <= '\u{2fdf}') || + (c >= '\u{3040}' && c <= '\u{309f}') || + (c >= '\u{30a0}' && c <= '\u{30ff}') || + (c >= '\u{3100}' && c <= '\u{312f}') || + (c >= '\u{3200}' && c <= '\u{32ff}') || + (c >= '\u{3400}' && c <= '\u{4dbf}') || + (c >= '\u{4e00}' && c <= '\u{9fff}') || + (c >= '\u{f900}' && c <= '\u{faff}') +} + /// Represent an internally generated document unique identifier. /// /// It is used to inform the database the document you want to deserialize. 
diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs index f9415b638..0f3643554 100644 --- a/src/rank/query_builder.rs +++ b/src/rank/query_builder.rs @@ -6,7 +6,7 @@ use std::hash::Hash; use std::rc::Rc; use rayon::slice::ParallelSliceMut; -use slice_group_by::GroupByMut; +use slice_group_by::{GroupByMut, LinearStrGroupBy}; use hashbrown::HashMap; use fst::Streamer; use rocksdb::DB; @@ -16,17 +16,43 @@ use crate::automaton::{self, DfaExt, AutomatonExt}; use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::rank::criterion::Criteria; use crate::database::DatabaseView; -use crate::{Match, DocumentId}; use crate::rank::{raw_documents_from_matches, RawDocument, Document}; +use crate::{is_cjk, Match, DocumentId}; + +#[derive(Debug, PartialEq, Eq)] +enum CharCategory { + Space, + Cjk, + Other, +} + +fn classify_char(c: char) -> CharCategory { + if c.is_whitespace() { CharCategory::Space } + else if is_cjk(c) { CharCategory::Cjk } + else { CharCategory::Other } +} + +fn is_word(s: &&str) -> bool { + !s.chars().any(char::is_whitespace) +} + +fn same_group_category(a: char, b: char) -> bool { + let ca = classify_char(a); + let cb = classify_char(b); + if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb } +} fn split_whitespace_automatons(query: &str) -> Vec { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); - let mut automatons = Vec::new(); - let mut words = query.split_whitespace().map(str::to_lowercase).peekable(); + let mut groups = LinearStrGroupBy::new(query, same_group_category) + .filter(is_word) + .map(str::to_lowercase) + .peekable(); - while let Some(word) = words.next() { - let has_following_word = words.peek().is_some(); - let lev = if has_following_word || has_end_whitespace { + let mut automatons = Vec::new(); + while let Some(word) = groups.next() { + let has_following_word = groups.peek().is_some(); + let lev = if has_following_word || has_end_whitespace || 
word.chars().all(is_cjk) { automaton::build_dfa(&word) } else { automaton::build_prefix_dfa(&word) diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index f4c42b7d4..ed146c06f 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -1,4 +1,5 @@ use std::mem; +use crate::is_cjk; use self::Separator::*; pub trait TokenizerBuilder { @@ -114,16 +115,7 @@ impl<'a> Iterator for Tokenizer<'a> { None => { // if this is a Chinese, a Japanese or a Korean character // See - if (c >= '\u{2e80}' && c <= '\u{2eff}') || - (c >= '\u{2f00}' && c <= '\u{2fdf}') || - (c >= '\u{3040}' && c <= '\u{309f}') || - (c >= '\u{30a0}' && c <= '\u{30ff}') || - (c >= '\u{3100}' && c <= '\u{312f}') || - (c >= '\u{3200}' && c <= '\u{32ff}') || - (c >= '\u{3400}' && c <= '\u{4dbf}') || - (c >= '\u{4e00}' && c <= '\u{9fff}') || - (c >= '\u{f900}' && c <= '\u{faff}') - { + if is_cjk(c) { match start_word { Some(start_word) => { let (prefix, tail) = self.inner.split_at(i);