From bad06631382a3de7cecdb6da5c12f1c6f9839484 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Sun, 30 Aug 2020 21:50:30 +0200
Subject: [PATCH] Come back to the old tokenizer

---
 Cargo.lock          | 33 +--------------------------------
 Cargo.toml          |  1 -
 src/bin/indexer.rs  | 20 +++++++++++---------
 src/bin/serve.rs    | 18 +++++++++++-------
 src/lexer.rs        | 44 --------------------------------------------
 src/lib.rs          |  2 +-
 src/query_tokens.rs |  7 -------
 src/tokenizer.rs    | 21 +++++++++++++++++++++
 8 files changed, 45 insertions(+), 101 deletions(-)
 delete mode 100644 src/lexer.rs
 create mode 100644 src/tokenizer.rs

diff --git a/Cargo.lock b/Cargo.lock
index 447318d74..759e3ec77 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6,15 +6,6 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10"
 
-[[package]]
-name = "aho-corasick"
-version = "0.7.13"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86"
-dependencies = [
- "memchr",
-]
-
 [[package]]
 name = "anyhow"
 version = "1.0.31"
@@ -1029,7 +1020,6 @@ dependencies = [
  "structopt",
  "tempfile",
  "tokio",
- "unicode-linebreak",
  "warp",
 ]
 
@@ -1624,10 +1614,7 @@ version = "1.3.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
 dependencies = [
- "aho-corasick",
- "memchr",
  "regex-syntax",
- "thread_local 1.0.1",
 ]
 
 [[package]]
@@ -1851,7 +1838,7 @@ dependencies = [
  "chrono",
  "log 0.4.8",
  "termcolor",
- "thread_local 0.3.4",
+ "thread_local",
 ]
 
 [[package]]
@@ -1966,15 +1953,6 @@ dependencies = [
  "unreachable",
 ]
 
-[[package]]
-name = "thread_local"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
-dependencies = [
- "lazy_static 1.4.0",
-]
-
 [[package]]
 name = "time"
 version = "0.1.43"
@@ -2139,15 +2117,6 @@ dependencies = [
  "matches",
 ]
 
-[[package]]
-name = "unicode-linebreak"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e30c7c3c3fa01e2c0da7008b57c2e5414b132a27fdf797e49e5ecbfe4f4b150"
-dependencies = [
- "regex",
-]
-
 [[package]]
 name = "unicode-normalization"
 version = "0.1.12"
diff --git a/Cargo.toml b/Cargo.toml
index 50683e7cb..50818ef87 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -31,7 +31,6 @@ smallstr = "0.2.0"
 smallvec = "1.4.0"
 structopt = { version = "0.3.14", default-features = false }
 tempfile = "3.1.0"
-unicode-linebreak = "0.1.0"
 
 # logging
 log = "0.4.8"
diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index d71efca6e..9104fc22c 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -21,7 +21,8 @@ use rayon::prelude::*;
 use roaring::RoaringBitmap;
 use structopt::StructOpt;
 
-use milli::{lexer, SmallVec32, Index, DocumentId, Position, Attribute, BEU32};
+use milli::{SmallVec32, Index, DocumentId, Position, Attribute, BEU32};
+use milli::tokenizer::{simple_tokenizer, only_words};
 
 const LMDB_MAX_KEY_LENGTH: usize = 511;
 const ONE_MILLION: usize = 1_000_000;
@@ -367,7 +368,7 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
         WORDS_FST_KEY => {
             let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect();
 
-            // Union of the two FSTs
+            // Union of the FSTs
             let mut op = fst::set::OpBuilder::new();
             fsts.iter().for_each(|fst| op.push(fst.into_stream()));
             let op = op.r#union();
@@ -387,15 +388,16 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
         | WORD_FOUR_POSITIONS_DOCIDS_BYTE
         | WORD_ATTRIBUTE_DOCIDS_BYTE =>
         {
-            let mut first = RoaringBitmap::deserialize_from(values[0].as_slice()).unwrap();
+            let (head, tail) = values.split_first().unwrap();
 
-            for value in &values[1..] {
+            let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
+            for value in tail {
                 let bitmap = RoaringBitmap::deserialize_from(value.as_slice()).unwrap();
-                first.union_with(&bitmap);
+                head.union_with(&bitmap);
             }
 
-            let mut vec = Vec::new();
-            first.serialize_into(&mut vec).unwrap();
+            let mut vec = Vec::with_capacity(head.serialized_size());
+            head.serialize_into(&mut vec).unwrap();
             Ok(vec)
         },
         otherwise => panic!("wut {:?}", otherwise),
@@ -505,8 +507,8 @@ fn index_csv(
         let document_id = DocumentId::try_from(document_id).context("generated id is too big")?;
 
         for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
-            for (pos, word) in lexer::break_string(&content).enumerate().take(MAX_POSITION) {
-                let word = word.cow_to_lowercase();
+            for (pos, (_, token)) in simple_tokenizer(&content).filter(only_words).enumerate().take(MAX_POSITION) {
+                let word = token.cow_to_lowercase();
                 let position = (attr * MAX_POSITION + pos) as u32;
                 store.insert_word_position_docid(&word, position, document_id)?;
             }
diff --git a/src/bin/serve.rs b/src/bin/serve.rs
index 67252f233..3a18023a0 100644
--- a/src/bin/serve.rs
+++ b/src/bin/serve.rs
@@ -9,10 +9,10 @@ use std::time::Instant;
 use askama_warp::Template;
 use heed::EnvOpenOptions;
 use serde::Deserialize;
-use slice_group_by::StrGroupBy;
 use structopt::StructOpt;
 use warp::{Filter, http::Response};
 
+use milli::tokenizer::{simple_tokenizer, TokenType};
 use milli::{Index, SearchResult};
 
 #[cfg(target_os = "linux")]
@@ -47,12 +47,16 @@ struct Opt {
 
 fn highlight_string(string: &str, words: &HashSet<String>) -> String {
     let mut output = String::new();
-    for token in string.linear_group_by_key(|c| c.is_alphanumeric()) {
-        let lowercase_token = token.to_lowercase();
-        let to_highlight = words.contains(&lowercase_token);
-        if to_highlight { output.push_str("<mark>") }
-        output.push_str(token);
-        if to_highlight { output.push_str("</mark>") }
+    for (token_type, token) in simple_tokenizer(string) {
+        if token_type == TokenType::Word {
+            let lowercase_token = token.to_lowercase();
+            let to_highlight = words.contains(&lowercase_token);
+            if to_highlight { output.push_str("<mark>") }
+            output.push_str(token);
+            if to_highlight { output.push_str("</mark>") }
+        } else {
+            output.push_str(token);
+        }
     }
     output
 }
diff --git a/src/lexer.rs b/src/lexer.rs
deleted file mode 100644
index c0910b231..000000000
--- a/src/lexer.rs
+++ /dev/null
@@ -1,44 +0,0 @@
-use unicode_linebreak::{linebreaks, BreakClass, break_property};
-
-fn can_be_broken(c: char) -> bool {
-    use BreakClass::*;
-
-    match break_property(c as u32) {
-        Ideographic
-        | Alphabetic
-        | Numeric
-        | CombiningMark
-        | WordJoiner
-        | NonBreakingGlue
-        | OpenPunctuation
-        | Symbol
-        | EmojiBase
-        | EmojiModifier
-        | HangulLJamo
-        | HangulVJamo
-        | HangulTJamo
-        | RegionalIndicator
-        | Quotation => false,
-        _ => true,
-    }
-}
-
-fn extract_token(s: &str) -> &str {
-    let end = s.char_indices().rev()
-        .take_while(|(_, c)| can_be_broken(*c))
-        .last()
-        .map(|(i, _)| i)
-        .unwrap_or(s.len());
-
-    &s[..end]
-}
-
-pub fn break_string(s: &str) -> impl Iterator<Item = &str> {
-    let mut prev = 0;
-    linebreaks(&s).map(move |(i, _)| {
-        let s = &s[prev..i];
-        prev = i;
-        extract_token(s)
-    })
-    .filter(|s| !s.is_empty())
-}
diff --git a/src/lib.rs b/src/lib.rs
index 5100ec411..8d231e154 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,7 +3,7 @@ mod node;
 mod query_tokens;
 mod search;
 pub mod heed_codec;
-pub mod lexer;
+pub mod tokenizer;
 
 use std::collections::HashMap;
 use std::hash::BuildHasherDefault;
diff --git a/src/query_tokens.rs b/src/query_tokens.rs
index 5ea606ac6..339a638c0 100644
--- a/src/query_tokens.rs
+++ b/src/query_tokens.rs
@@ -1,5 +1,4 @@
 use std::{mem, str};
-use unicode_linebreak::{break_property, BreakClass};
 
 use QueryToken::{Quoted, Free};
 
@@ -69,12 +68,6 @@ impl<'a> Iterator for QueryTokens<'a> {
                 },
                 State::Fused => return None,
             }
-        } else if break_property(c as u32) == BreakClass::Ideographic {
-            match self.state.replace_by(State::Free(afteri)) {
-                State::Quoted(s) => return Some(Quoted(&self.string[s..afteri])),
-                State::Free(s) => return Some(Free(&self.string[s..afteri])),
-                _ => self.state = State::Free(afteri),
-            }
         } else if !self.state.is_quoted() && !c.is_alphanumeric() {
             match self.state.replace_by(State::Free(afteri)) {
                 State::Free(s) if i > s => return Some(Free(&self.string[s..i])),
diff --git a/src/tokenizer.rs b/src/tokenizer.rs
new file mode 100644
index 000000000..782fbdcc5
--- /dev/null
+++ b/src/tokenizer.rs
@@ -0,0 +1,21 @@
+use slice_group_by::StrGroupBy;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum TokenType {
+    Word,
+    Space,
+}
+
+pub fn simple_tokenizer(text: &str) -> impl Iterator<Item = (TokenType, &str)> {
+    text
+        .linear_group_by_key(|c| c.is_alphanumeric())
+        .map(|s| {
+            let first = s.chars().next().unwrap();
+            let type_ = if first.is_alphanumeric() { TokenType::Word } else { TokenType::Space };
+            (type_, s)
+        })
+}
+
+pub fn only_words((t, _): &(TokenType, &str)) -> bool {
+    *t == TokenType::Word
+}
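
For reference, a minimal usage sketch of the restored tokenizer API, not part of the patch itself: it shows how the `simple_tokenizer` and `only_words` helpers added above fit together, the same way indexer.rs filters word tokens and serve.rs walks every token to rebuild the highlighted string. The `main` wrapper, the sample text, and the expected results are illustrative assumptions only.

use milli::tokenizer::{simple_tokenizer, only_words, TokenType};

fn main() {
    let text = "The quick brown fox";

    // The tokenizer yields every group, words and separators alike,
    // so the original string can be reassembled from the tokens.
    for (token_type, token) in simple_tokenizer(text) {
        assert!(token_type == TokenType::Word || token_type == TokenType::Space);
        print!("{}", token);
    }
    println!();

    // The indexing side keeps only word tokens and lowercases them.
    let words: Vec<String> = simple_tokenizer(text)
        .filter(only_words)
        .map(|(_, word)| word.to_lowercase())
        .collect();
    assert_eq!(words, vec!["the", "quick", "brown", "fox"]);
}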