From bad06631382a3de7cecdb6da5c12f1c6f9839484 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 30 Aug 2020 21:50:30 +0200
Subject: [PATCH] Come back to the old tokenizer

---
 Cargo.lock          | 33 +--------------------------------
 Cargo.toml          |  1 -
 src/bin/indexer.rs  | 20 +++++++++++---------
 src/bin/serve.rs    | 18 +++++++++++-------
 src/lexer.rs        | 44 --------------------------------------------
 src/lib.rs          |  2 +-
 src/query_tokens.rs |  7 -------
 src/tokenizer.rs    | 21 +++++++++++++++++++++
 8 files changed, 45 insertions(+), 101 deletions(-)
 delete mode 100644 src/lexer.rs
 create mode 100644 src/tokenizer.rs

diff --git a/Cargo.lock b/Cargo.lock
index 447318d74..759e3ec77 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6,15 +6,6 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10"
 
-[[package]]
-name = "aho-corasick"
-version = "0.7.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86"
-dependencies = [
- "memchr",
-]
-
 [[package]]
 name = "anyhow"
 version = "1.0.31"
@@ -1029,7 +1020,6 @@ dependencies = [
  "structopt",
  "tempfile",
  "tokio",
- "unicode-linebreak",
  "warp",
 ]
 
@@ -1624,10 +1614,7 @@ version = "1.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
 dependencies = [
- "aho-corasick",
- "memchr",
  "regex-syntax",
- "thread_local 1.0.1",
 ]
 
 [[package]]
@@ -1851,7 +1838,7 @@ dependencies = [
  "chrono",
  "log 0.4.8",
  "termcolor",
- "thread_local 0.3.4",
+ "thread_local",
 ]
 
 [[package]]
@@ -1966,15 +1953,6 @@ dependencies = [
  "unreachable",
 ]
 
-[[package]]
-name = "thread_local"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
-dependencies = [
- "lazy_static 1.4.0",
-]
-
 [[package]]
 name = "time"
 version = "0.1.43"
@@ -2139,15 +2117,6 @@ dependencies = [
  "matches",
 ]
 
-[[package]]
-name = "unicode-linebreak"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e30c7c3c3fa01e2c0da7008b57c2e5414b132a27fdf797e49e5ecbfe4f4b150"
-dependencies = [
- "regex",
-]
-
 [[package]]
 name = "unicode-normalization"
 version = "0.1.12"
diff --git a/Cargo.toml b/Cargo.toml
index 50683e7cb..50818ef87 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -31,7 +31,6 @@ smallstr = "0.2.0"
 smallvec = "1.4.0"
 structopt = { version = "0.3.14", default-features = false }
 tempfile = "3.1.0"
-unicode-linebreak = "0.1.0"
 
 # logging
 log = "0.4.8"
diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index d71efca6e..9104fc22c 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -21,7 +21,8 @@ use rayon::prelude::*;
 use roaring::RoaringBitmap;
 use structopt::StructOpt;
 
-use milli::{lexer, SmallVec32, Index, DocumentId, Position, Attribute, BEU32};
+use milli::{SmallVec32, Index, DocumentId, Position, Attribute, BEU32};
+use milli::tokenizer::{simple_tokenizer, only_words};
 
 const LMDB_MAX_KEY_LENGTH: usize = 511;
 const ONE_MILLION: usize = 1_000_000;
@@ -367,7 +368,7 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
         WORDS_FST_KEY => {
             let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect();
 
-            // Union of the two FSTs
+            // Union of the FSTs
             let mut op = fst::set::OpBuilder::new();
             fsts.iter().for_each(|fst| op.push(fst.into_stream()));
             let op = op.r#union();
@@ -387,15 +388,16 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
         | WORD_FOUR_POSITIONS_DOCIDS_BYTE
         | WORD_ATTRIBUTE_DOCIDS_BYTE =>
         {
-            let mut first = RoaringBitmap::deserialize_from(values[0].as_slice()).unwrap();
+            let (head, tail) = values.split_first().unwrap();
 
-            for value in &values[1..] {
+            let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
+            for value in tail {
                 let bitmap = RoaringBitmap::deserialize_from(value.as_slice()).unwrap();
-                first.union_with(&bitmap);
+                head.union_with(&bitmap);
             }
 
-            let mut vec = Vec::new();
-            first.serialize_into(&mut vec).unwrap();
+            let mut vec = Vec::with_capacity(head.serialized_size());
+            head.serialize_into(&mut vec).unwrap();
             Ok(vec)
         },
         otherwise => panic!("wut {:?}", otherwise),
@@ -505,8 +507,8 @@ fn index_csv(
         let document_id = DocumentId::try_from(document_id).context("generated id is too big")?;
 
         for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
-            for (pos, word) in lexer::break_string(&content).enumerate().take(MAX_POSITION) {
-                let word = word.cow_to_lowercase();
+            for (pos, (_, token)) in simple_tokenizer(&content).filter(only_words).enumerate().take(MAX_POSITION) {
+                let word = token.cow_to_lowercase();
                 let position = (attr * MAX_POSITION + pos) as u32;
                 store.insert_word_position_docid(&word, position, document_id)?;
             }
diff --git a/src/bin/serve.rs b/src/bin/serve.rs
index 67252f233..3a18023a0 100644
--- a/src/bin/serve.rs
+++ b/src/bin/serve.rs
@@ -9,10 +9,10 @@ use std::time::Instant;
 use askama_warp::Template;
 use heed::EnvOpenOptions;
 use serde::Deserialize;
-use slice_group_by::StrGroupBy;
 use structopt::StructOpt;
 use warp::{Filter, http::Response};
 
+use milli::tokenizer::{simple_tokenizer, TokenType};
 use milli::{Index, SearchResult};
 
 #[cfg(target_os = "linux")]
@@ -47,12 +47,16 @@ struct Opt {
 
 fn highlight_string(string: &str, words: &HashSet<String>) -> String {
     let mut output = String::new();
-    for token in string.linear_group_by_key(|c| c.is_alphanumeric()) {
-        let lowercase_token = token.to_lowercase();
-        let to_highlight = words.contains(&lowercase_token);
-        if to_highlight { output.push_str("<mark>") }
-        output.push_str(token);
-        if to_highlight { output.push_str("</mark>") }
+    for (token_type, token) in simple_tokenizer(string) {
+        if token_type == TokenType::Word {
+            let lowercase_token = token.to_lowercase();
+            let to_highlight = words.contains(&lowercase_token);
+            if to_highlight { output.push_str("<mark>") }
+            output.push_str(token);
+            if to_highlight { output.push_str("</mark>") }
+        } else {
+            output.push_str(token);
+        }
     }
     output
 }
diff --git a/src/lexer.rs b/src/lexer.rs
deleted file mode 100644
index c0910b231..000000000
--- a/src/lexer.rs
+++ /dev/null
@@ -1,44 +0,0 @@
-use unicode_linebreak::{linebreaks, BreakClass, break_property};
-
-fn can_be_broken(c: char) -> bool {
-    use BreakClass::*;
-
-    match break_property(c as u32) {
-        Ideographic
-        | Alphabetic
-        | Numeric
-        | CombiningMark
-        | WordJoiner
-        | NonBreakingGlue
-        | OpenPunctuation
-        | Symbol
-        | EmojiBase
-        | EmojiModifier
-        | HangulLJamo
-        | HangulVJamo
-        | HangulTJamo
-        | RegionalIndicator
-        | Quotation => false,
-        _ => true,
-    }
-}
-
-fn extract_token(s: &str) -> &str {
-    let end = s.char_indices().rev()
-        .take_while(|(_, c)| can_be_broken(*c))
-        .last()
-        .map(|(i, _)| i)
-        .unwrap_or(s.len());
-
-    &s[..end]
-}
-
-pub fn break_string(s: &str) -> impl Iterator<Item = &str> {
-    let mut prev = 0;
-    linebreaks(&s).map(move |(i, _)| {
-        let s = &s[prev..i];
-        prev = i;
-        extract_token(s)
-    })
-    .filter(|s| !s.is_empty())
-}
diff --git a/src/lib.rs b/src/lib.rs
index 5100ec411..8d231e154 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,7 +3,7 @@ mod node;
 mod query_tokens;
 mod search;
 pub mod heed_codec;
-pub mod lexer;
+pub mod tokenizer;
 
 use std::collections::HashMap;
 use std::hash::BuildHasherDefault;
diff --git a/src/query_tokens.rs b/src/query_tokens.rs
index 5ea606ac6..339a638c0 100644
--- a/src/query_tokens.rs
+++ b/src/query_tokens.rs
@@ -1,5 +1,4 @@
 use std::{mem, str};
-use unicode_linebreak::{break_property, BreakClass};
 
 use QueryToken::{Quoted, Free};
 
@@ -69,12 +68,6 @@ impl<'a> Iterator for QueryTokens<'a> {
                 },
                 State::Fused => return None,
             }
-        } else if break_property(c as u32) == BreakClass::Ideographic {
-            match self.state.replace_by(State::Free(afteri)) {
-                State::Quoted(s) => return Some(Quoted(&self.string[s..afteri])),
-                State::Free(s) => return Some(Free(&self.string[s..afteri])),
-                _ => self.state = State::Free(afteri),
-            }
         } else if !self.state.is_quoted() && !c.is_alphanumeric() {
             match self.state.replace_by(State::Free(afteri)) {
                 State::Free(s) if i > s => return Some(Free(&self.string[s..i])),
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
new file mode 100644
index 000000000..782fbdcc5
--- /dev/null
+++ b/src/tokenizer.rs
@@ -0,0 +1,21 @@
+use slice_group_by::StrGroupBy;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TokenType {
+    Word,
+    Space,
+}
+
+pub fn simple_tokenizer(text: &str) -> impl Iterator<Item = (TokenType, &str)> {
+    text
+        .linear_group_by_key(|c| c.is_alphanumeric())
+        .map(|s| {
+            let first = s.chars().next().unwrap();
+            let type_ = if first.is_alphanumeric() { TokenType::Word } else { TokenType::Space };
+            (type_, s)
+        })
+}
+
+pub fn only_words((t, _): &(TokenType, &str)) -> bool {
+    *t == TokenType::Word
+}
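
For reference, a minimal usage sketch of the restored tokenizer API, not part of the patch itself: it shows how the `simple_tokenizer` and `only_words` helpers added above fit together, the same way indexer.rs filters word tokens and serve.rs walks every token to rebuild the highlighted string. The `main` wrapper, the sample text, and the expected results are illustrative assumptions only.

use milli::tokenizer::{simple_tokenizer, only_words, TokenType};

fn main() {
    let text = "The quick brown fox";

    // The tokenizer yields every group, words and separators alike,
    // so the original string can be reassembled from the tokens.
    for (token_type, token) in simple_tokenizer(text) {
        assert!(token_type == TokenType::Word || token_type == TokenType::Space);
        print!("{}", token);
    }
    println!();

    // The indexing side keeps only word tokens and lowercases them.
    let words: Vec<String> = simple_tokenizer(text)
        .filter(only_words)
        .map(|(_, word)| word.to_lowercase())
        .collect();
    assert_eq!(words, vec!["the", "quick", "brown", "fox"]);
}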