From 10414791a2d42b124617e984222d13589b484c5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <renault.cle@gmail.com>
Date: Fri, 22 Feb 2019 22:34:37 +0100
Subject: [PATCH 1/3] fix: Remove debug println from the tokenizer

---
 src/tokenizer/mod.rs | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index a2fd96311..bdca8c4a4 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -105,8 +105,6 @@ impl<'a> Iterator for Tokenizer<'a> {
                             char_index: self.char_index,
                         };
 
-                        println!("no-cjk with start_word returns: {:?}", token);
-
                         self.char_index += word.chars().count();
                         return Some(token)
                     }
@@ -143,8 +141,6 @@ impl<'a> Iterator for Tokenizer<'a> {
                                     char_index: self.char_index,
                                 };
 
-                                println!("cjk with start_word returns: {:?}", token);
-
                                 self.word_index += 1;
                                 self.char_index += word.chars().count();
 
@@ -164,8 +160,6 @@ impl<'a> Iterator for Tokenizer<'a> {
                                     char_index: self.char_index,
                                 };
 
-                                println!("cjk without start_word returns: {:?}", token);
-
                                 if tail.chars().next().and_then(detect_separator).is_none() {
                                     self.word_index += 1;
                                 }

From a7994709978a4002f4f56a29da18679627a0f3cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <renault.cle@gmail.com>
Date: Fri, 22 Feb 2019 23:06:42 +0100
Subject: [PATCH 2/3] fix: Change the tokenizer to measure cjk char positions

---
 src/tokenizer/mod.rs | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index bdca8c4a4..f4c42b7d4 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -124,8 +124,6 @@ impl<'a> Iterator for Tokenizer<'a> {
                        (c >= '\u{4e00}' && c <= '\u{9fff}') ||
                        (c >= '\u{f900}' && c <= '\u{faff}')
                     {
-                        let char_len = c.len_utf8();
-
                         match start_word {
                             Some(start_word) => {
                                 let (prefix, tail) = self.inner.split_at(i);
@@ -147,7 +145,7 @@ impl<'a> Iterator for Tokenizer<'a> {
                                 return Some(token)
                             },
                             None => {
-                                let (prefix, tail) = self.inner.split_at(i + char_len);
+                                let (prefix, tail) = self.inner.split_at(i + c.len_utf8());
                                 let (spaces, word) = prefix.split_at(i);
 
                                 self.inner = tail;
@@ -163,7 +161,7 @@ impl<'a> Iterator for Tokenizer<'a> {
                                 if tail.chars().next().and_then(detect_separator).is_none() {
                                     self.word_index += 1;
                                 }
-                                self.char_index += char_len;
+                                self.char_index += 1;
 
                                 return Some(token)
                             }
@@ -252,18 +250,18 @@ mod tests {
         let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
 
         assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 3 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 10 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
         assert_eq!(tokenizer.next(), None);
 
         let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello    \u{2ec7}");
 
         assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 3 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 6 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 10 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 20 }));
-        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 29 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
         assert_eq!(tokenizer.next(), None);
     }
 }

From a960c325f30f38be6a63634b3bd621daf82912a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <renault.cle@gmail.com>
Date: Sat, 23 Feb 2019 14:57:13 +0100
Subject: [PATCH 3/3] feat: Make query strings support cjk characters

---
 Cargo.toml                |  5 ++++-
 src/lib.rs                | 12 ++++++++++++
 src/rank/query_builder.rs | 40 ++++++++++++++++++++++++++++++++-------
 src/tokenizer/mod.rs      | 12 ++----------
 4 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index cffc51348..37e7ea680 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -21,9 +21,12 @@ serde = "1.0"
 serde_derive = "1.0"
 serde_json = { version = "1.0", features = ["preserve_order"] }
 size_format = "1.0"
-slice-group-by = "0.2"
 unidecode = "0.3"
 
+[dependencies.slice-group-by]
+git = "https://github.com/Kerollmops/slice-group-by.git"
+tag = "v0.2.3-alpha.1"
+
 [dependencies.toml]
 git = "https://github.com/Kerollmops/toml-rs.git"
 features = ["preserve_order"]
diff --git a/src/lib.rs b/src/lib.rs
index a111b5049..e77e03ecb 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -16,6 +16,18 @@ pub use rocksdb;
 pub use self::tokenizer::Tokenizer;
 pub use self::common_words::CommonWords;
 
+pub fn is_cjk(c: char) -> bool {
+    (c >= '\u{2e80}' && c <= '\u{2eff}') ||
+    (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
+    (c >= '\u{3040}' && c <= '\u{309f}') ||
+    (c >= '\u{30a0}' && c <= '\u{30ff}') ||
+    (c >= '\u{3100}' && c <= '\u{312f}') ||
+    (c >= '\u{3200}' && c <= '\u{32ff}') ||
+    (c >= '\u{3400}' && c <= '\u{4dbf}') ||
+    (c >= '\u{4e00}' && c <= '\u{9fff}') ||
+    (c >= '\u{f900}' && c <= '\u{faff}')
+}
+
 /// Represent an internally generated document unique identifier.
 ///
 /// It is used to inform the database the document you want to deserialize.
diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs
index f9415b638..0f3643554 100644
--- a/src/rank/query_builder.rs
+++ b/src/rank/query_builder.rs
@@ -6,7 +6,7 @@ use std::hash::Hash;
 use std::rc::Rc;
 
 use rayon::slice::ParallelSliceMut;
-use slice_group_by::GroupByMut;
+use slice_group_by::{GroupByMut, LinearStrGroupBy};
 use hashbrown::HashMap;
 use fst::Streamer;
 use rocksdb::DB;
@@ -16,17 +16,43 @@ use crate::automaton::{self, DfaExt, AutomatonExt};
 use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
 use crate::rank::criterion::Criteria;
 use crate::database::DatabaseView;
-use crate::{Match, DocumentId};
 use crate::rank::{raw_documents_from_matches, RawDocument, Document};
+use crate::{is_cjk, Match, DocumentId};
+
+#[derive(Debug, PartialEq, Eq)]
+enum CharCategory {
+    Space,
+    Cjk,
+    Other,
+}
+
+fn classify_char(c: char) -> CharCategory {
+    if c.is_whitespace() { CharCategory::Space }
+    else if is_cjk(c) { CharCategory::Cjk }
+    else { CharCategory::Other }
+}
+
+fn is_word(s: &&str) -> bool {
+    !s.chars().any(char::is_whitespace)
+}
+
+fn same_group_category(a: char, b: char) -> bool {
+    let ca = classify_char(a);
+    let cb = classify_char(b);
+    if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb }
+}
 
 fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
-    let mut automatons = Vec::new();
-    let mut words = query.split_whitespace().map(str::to_lowercase).peekable();
+    let mut groups = LinearStrGroupBy::new(query, same_group_category)
+                        .filter(is_word)
+                        .map(str::to_lowercase)
+                        .peekable();
 
-    while let Some(word) = words.next() {
-        let has_following_word = words.peek().is_some();
-        let lev = if has_following_word || has_end_whitespace {
+    let mut automatons = Vec::new();
+    while let Some(word) = groups.next() {
+        let has_following_word = groups.peek().is_some();
+        let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
             automaton::build_dfa(&word)
         } else {
             automaton::build_prefix_dfa(&word)
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index f4c42b7d4..ed146c06f 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -1,4 +1,5 @@
 use std::mem;
+use crate::is_cjk;
 use self::Separator::*;
 
 pub trait TokenizerBuilder {
@@ -114,16 +115,7 @@ impl<'a> Iterator for Tokenizer<'a> {
                 None => {
                     // if this is a Chinese, a Japanese or a Korean character
                     // See <http://unicode-table.com>
-                    if (c >= '\u{2e80}' && c <= '\u{2eff}') ||
-                       (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
-                       (c >= '\u{3040}' && c <= '\u{309f}') ||
-                       (c >= '\u{30a0}' && c <= '\u{30ff}') ||
-                       (c >= '\u{3100}' && c <= '\u{312f}') ||
-                       (c >= '\u{3200}' && c <= '\u{32ff}') ||
-                       (c >= '\u{3400}' && c <= '\u{4dbf}') ||
-                       (c >= '\u{4e00}' && c <= '\u{9fff}') ||
-                       (c >= '\u{f900}' && c <= '\u{faff}')
-                    {
+                    if is_cjk(c) {
                         match start_word {
                             Some(start_word) => {
                                 let (prefix, tail) = self.inner.split_at(i);