Come back to the old tokenizer

This commit is contained in:
Clément Renault 2020-08-30 21:50:30 +02:00 committed by Kerollmops
parent 220ba0785c
commit bad0663138
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
8 changed files with 45 additions and 101 deletions

33
Cargo.lock generated
View File

@ -6,15 +6,6 @@ version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10" checksum = "ccc9a9dd069569f212bc4330af9f17c4afb5e8ce185e83dbb14f1349dda18b10"
[[package]]
name = "aho-corasick"
version = "0.7.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "043164d8ba5c4c3035fec9bbee8647c0261d788f3474306f93bb65901cae0e86"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "anyhow" name = "anyhow"
version = "1.0.31" version = "1.0.31"
@ -1029,7 +1020,6 @@ dependencies = [
"structopt", "structopt",
"tempfile", "tempfile",
"tokio", "tokio",
"unicode-linebreak",
"warp", "warp",
] ]
@ -1624,10 +1614,7 @@ version = "1.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6" checksum = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
dependencies = [ dependencies = [
"aho-corasick",
"memchr",
"regex-syntax", "regex-syntax",
"thread_local 1.0.1",
] ]
[[package]] [[package]]
@ -1851,7 +1838,7 @@ dependencies = [
"chrono", "chrono",
"log 0.4.8", "log 0.4.8",
"termcolor", "termcolor",
"thread_local 0.3.4", "thread_local",
] ]
[[package]] [[package]]
@ -1966,15 +1953,6 @@ dependencies = [
"unreachable", "unreachable",
] ]
[[package]]
name = "thread_local"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
dependencies = [
"lazy_static 1.4.0",
]
[[package]] [[package]]
name = "time" name = "time"
version = "0.1.43" version = "0.1.43"
@ -2139,15 +2117,6 @@ dependencies = [
"matches", "matches",
] ]
[[package]]
name = "unicode-linebreak"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4e30c7c3c3fa01e2c0da7008b57c2e5414b132a27fdf797e49e5ecbfe4f4b150"
dependencies = [
"regex",
]
[[package]] [[package]]
name = "unicode-normalization" name = "unicode-normalization"
version = "0.1.12" version = "0.1.12"

View File

@ -31,7 +31,6 @@ smallstr = "0.2.0"
smallvec = "1.4.0" smallvec = "1.4.0"
structopt = { version = "0.3.14", default-features = false } structopt = { version = "0.3.14", default-features = false }
tempfile = "3.1.0" tempfile = "3.1.0"
unicode-linebreak = "0.1.0"
# logging # logging
log = "0.4.8" log = "0.4.8"

View File

@ -21,7 +21,8 @@ use rayon::prelude::*;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use structopt::StructOpt; use structopt::StructOpt;
use milli::{lexer, SmallVec32, Index, DocumentId, Position, Attribute, BEU32}; use milli::{SmallVec32, Index, DocumentId, Position, Attribute, BEU32};
use milli::tokenizer::{simple_tokenizer, only_words};
const LMDB_MAX_KEY_LENGTH: usize = 511; const LMDB_MAX_KEY_LENGTH: usize = 511;
const ONE_MILLION: usize = 1_000_000; const ONE_MILLION: usize = 1_000_000;
@ -367,7 +368,7 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
WORDS_FST_KEY => { WORDS_FST_KEY => {
let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect(); let fsts: Vec<_> = values.iter().map(|v| fst::Set::new(v).unwrap()).collect();
// Union of the two FSTs // Union of the FSTs
let mut op = fst::set::OpBuilder::new(); let mut op = fst::set::OpBuilder::new();
fsts.iter().for_each(|fst| op.push(fst.into_stream())); fsts.iter().for_each(|fst| op.push(fst.into_stream()));
let op = op.r#union(); let op = op.r#union();
@ -387,15 +388,16 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> {
| WORD_FOUR_POSITIONS_DOCIDS_BYTE | WORD_FOUR_POSITIONS_DOCIDS_BYTE
| WORD_ATTRIBUTE_DOCIDS_BYTE => | WORD_ATTRIBUTE_DOCIDS_BYTE =>
{ {
let mut first = RoaringBitmap::deserialize_from(values[0].as_slice()).unwrap(); let (head, tail) = values.split_first().unwrap();
for value in &values[1..] { let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap();
for value in tail {
let bitmap = RoaringBitmap::deserialize_from(value.as_slice()).unwrap(); let bitmap = RoaringBitmap::deserialize_from(value.as_slice()).unwrap();
first.union_with(&bitmap); head.union_with(&bitmap);
} }
let mut vec = Vec::new(); let mut vec = Vec::with_capacity(head.serialized_size());
first.serialize_into(&mut vec).unwrap(); head.serialize_into(&mut vec).unwrap();
Ok(vec) Ok(vec)
}, },
otherwise => panic!("wut {:?}", otherwise), otherwise => panic!("wut {:?}", otherwise),
@ -505,8 +507,8 @@ fn index_csv(
let document_id = DocumentId::try_from(document_id).context("generated id is too big")?; let document_id = DocumentId::try_from(document_id).context("generated id is too big")?;
for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) { for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
for (pos, word) in lexer::break_string(&content).enumerate().take(MAX_POSITION) { for (pos, (_, token)) in simple_tokenizer(&content).filter(only_words).enumerate().take(MAX_POSITION) {
let word = word.cow_to_lowercase(); let word = token.cow_to_lowercase();
let position = (attr * MAX_POSITION + pos) as u32; let position = (attr * MAX_POSITION + pos) as u32;
store.insert_word_position_docid(&word, position, document_id)?; store.insert_word_position_docid(&word, position, document_id)?;
} }

View File

@ -9,10 +9,10 @@ use std::time::Instant;
use askama_warp::Template; use askama_warp::Template;
use heed::EnvOpenOptions; use heed::EnvOpenOptions;
use serde::Deserialize; use serde::Deserialize;
use slice_group_by::StrGroupBy;
use structopt::StructOpt; use structopt::StructOpt;
use warp::{Filter, http::Response}; use warp::{Filter, http::Response};
use milli::tokenizer::{simple_tokenizer, TokenType};
use milli::{Index, SearchResult}; use milli::{Index, SearchResult};
#[cfg(target_os = "linux")] #[cfg(target_os = "linux")]
@ -47,12 +47,16 @@ struct Opt {
fn highlight_string(string: &str, words: &HashSet<String>) -> String { fn highlight_string(string: &str, words: &HashSet<String>) -> String {
let mut output = String::new(); let mut output = String::new();
for token in string.linear_group_by_key(|c| c.is_alphanumeric()) { for (token_type, token) in simple_tokenizer(string) {
let lowercase_token = token.to_lowercase(); if token_type == TokenType::Word {
let to_highlight = words.contains(&lowercase_token); let lowercase_token = token.to_lowercase();
if to_highlight { output.push_str("<mark>") } let to_highlight = words.contains(&lowercase_token);
output.push_str(token); if to_highlight { output.push_str("<mark>") }
if to_highlight { output.push_str("</mark>") } output.push_str(token);
if to_highlight { output.push_str("</mark>") }
} else {
output.push_str(token);
}
} }
output output
} }

View File

@ -1,44 +0,0 @@
use unicode_linebreak::{linebreaks, BreakClass, break_property};
fn can_be_broken(c: char) -> bool {
use BreakClass::*;
match break_property(c as u32) {
Ideographic
| Alphabetic
| Numeric
| CombiningMark
| WordJoiner
| NonBreakingGlue
| OpenPunctuation
| Symbol
| EmojiBase
| EmojiModifier
| HangulLJamo
| HangulVJamo
| HangulTJamo
| RegionalIndicator
| Quotation => false,
_ => true,
}
}
fn extract_token(s: &str) -> &str {
let end = s.char_indices().rev()
.take_while(|(_, c)| can_be_broken(*c))
.last()
.map(|(i, _)| i)
.unwrap_or(s.len());
&s[..end]
}
/// Cuts `s` at every offset reported by `linebreaks` and yields the resulting
/// pieces as tokens.
///
/// Each piece spans from the previous offset (initially `0`) up to the
/// current one, is passed through `extract_token`, and is dropped when the
/// outcome is empty.
pub fn break_string(s: &str) -> impl Iterator<Item = &str> {
    linebreaks(&s)
        // The scan state is the byte offset where the next piece starts.
        .scan(0usize, move |start, (end, _)| {
            let piece = &s[*start..end];
            *start = end;
            Some(extract_token(piece))
        })
        .filter(|token| !token.is_empty())
}

View File

@ -3,7 +3,7 @@ mod node;
mod query_tokens; mod query_tokens;
mod search; mod search;
pub mod heed_codec; pub mod heed_codec;
pub mod lexer; pub mod tokenizer;
use std::collections::HashMap; use std::collections::HashMap;
use std::hash::BuildHasherDefault; use std::hash::BuildHasherDefault;

View File

@ -1,5 +1,4 @@
use std::{mem, str}; use std::{mem, str};
use unicode_linebreak::{break_property, BreakClass};
use QueryToken::{Quoted, Free}; use QueryToken::{Quoted, Free};
@ -69,12 +68,6 @@ impl<'a> Iterator for QueryTokens<'a> {
}, },
State::Fused => return None, State::Fused => return None,
} }
} else if break_property(c as u32) == BreakClass::Ideographic {
match self.state.replace_by(State::Free(afteri)) {
State::Quoted(s) => return Some(Quoted(&self.string[s..afteri])),
State::Free(s) => return Some(Free(&self.string[s..afteri])),
_ => self.state = State::Free(afteri),
}
} else if !self.state.is_quoted() && !c.is_alphanumeric() { } else if !self.state.is_quoted() && !c.is_alphanumeric() {
match self.state.replace_by(State::Free(afteri)) { match self.state.replace_by(State::Free(afteri)) {
State::Free(s) if i > s => return Some(Free(&self.string[s..i])), State::Free(s) if i > s => return Some(Free(&self.string[s..i])),

21
src/tokenizer.rs Normal file
View File

@ -0,0 +1,21 @@
use slice_group_by::StrGroupBy;
/// Kind attached to every chunk produced by [`simple_tokenizer`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TokenType {
    /// A chunk whose first character satisfies `char::is_alphanumeric`.
    Word,
    /// Any other chunk. Despite the name, this covers every non-alphanumeric
    /// character (punctuation, symbols, ...), not only whitespace.
    Space,
}
/// Splits `text` into consecutive chunks of alphanumeric and non-alphanumeric
/// characters and labels each chunk.
///
/// Chunks are obtained with `linear_group_by_key`, keyed on
/// `char::is_alphanumeric`. A chunk is tagged [`TokenType::Word`] when its
/// first character is alphanumeric and [`TokenType::Space`] otherwise. The
/// yielded string slices borrow from `text`.
pub fn simple_tokenizer(text: &str) -> impl Iterator<Item=(TokenType, &str)> {
    let chunks = text.linear_group_by_key(|c| c.is_alphanumeric());

    chunks.map(|chunk| {
        // NOTE(review): the `unwrap` presumes the grouping never yields an
        // empty chunk — confirm against the `slice_group_by` documentation.
        let kind = match chunk.chars().next().unwrap() {
            c if c.is_alphanumeric() => TokenType::Word,
            _ => TokenType::Space,
        };
        (kind, chunk)
    })
}
/// Filter predicate that accepts only the tokens tagged [`TokenType::Word`].
///
/// It takes a reference to a `(TokenType, &str)` pair so it can be handed
/// directly to `Iterator::filter` on the output of `simple_tokenizer`.
pub fn only_words(token: &(TokenType, &str)) -> bool {
    matches!(token.0, TokenType::Word)
}