From 19e67dcf0b3c80ea156521813f562b2077ec6928 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 25 Feb 2019 18:34:51 +0100 Subject: [PATCH] feat: Move query splitting into the tokenizer workspace --- meilidb-core/src/query_builder.rs | 50 ++++--------------------------- meilidb-tokenizer/Cargo.toml | 2 +- meilidb-tokenizer/src/lib.rs | 28 +++++++++++++++++ 3 files changed, 35 insertions(+), 45 deletions(-) diff --git a/meilidb-core/src/query_builder.rs b/meilidb-core/src/query_builder.rs index f462a52e6..6d76cfb48 100644 --- a/meilidb-core/src/query_builder.rs +++ b/meilidb-core/src/query_builder.rs @@ -5,7 +5,8 @@ use std::hash::Hash; use std::rc::Rc; use rayon::slice::ParallelSliceMut; -use slice_group_by::{GroupByMut, LinearStrGroupBy}; +use slice_group_by::GroupByMut; +use meilidb_tokenizer::{is_cjk, split_query_string}; use hashbrown::{HashMap, HashSet}; use fst::Streamer; use log::info; @@ -16,50 +17,11 @@ use crate::criterion::Criteria; use crate::{raw_documents_from_matches, RawDocument, Document}; use crate::{Index, Match, DocumentId}; -// query splitting must move out of this crate -pub fn is_cjk(c: char) -> bool { - (c >= '\u{2e80}' && c <= '\u{2eff}') || - (c >= '\u{2f00}' && c <= '\u{2fdf}') || - (c >= '\u{3040}' && c <= '\u{309f}') || - (c >= '\u{30a0}' && c <= '\u{30ff}') || - (c >= '\u{3100}' && c <= '\u{312f}') || - (c >= '\u{3200}' && c <= '\u{32ff}') || - (c >= '\u{3400}' && c <= '\u{4dbf}') || - (c >= '\u{4e00}' && c <= '\u{9fff}') || - (c >= '\u{f900}' && c <= '\u{faff}') -} - -#[derive(Debug, PartialEq, Eq)] -enum CharCategory { - Space, - Cjk, - Other, -} - -fn classify_char(c: char) -> CharCategory { - if c.is_whitespace() { CharCategory::Space } - else if is_cjk(c) { CharCategory::Cjk } - else { CharCategory::Other } -} - -fn is_word(s: &&str) -> bool { - !s.chars().any(char::is_whitespace) -} - -fn same_group_category(a: char, b: char) -> bool { - let ca = classify_char(a); - let cb = classify_char(b); - if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb } -} - -fn split_whitespace_automatons(query: &str) -> Vec { +fn generate_automatons(query: &str) -> Vec { let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); - let mut groups = LinearStrGroupBy::new(query, same_group_category) - .filter(is_word) - .map(str::to_lowercase) - .peekable(); - + let mut groups = split_query_string(query).map(str::to_lowercase).peekable(); let mut automatons = Vec::new(); + while let Some(word) = groups.next() { let has_following_word = groups.peek().is_some(); let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) { @@ -122,7 +84,7 @@ impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI> } fn query_all(&self, query: &str) -> Vec { - let automatons = split_whitespace_automatons(query); + let automatons = generate_automatons(query); let mut stream = { let mut op_builder = fst::map::OpBuilder::new(); diff --git a/meilidb-tokenizer/Cargo.toml b/meilidb-tokenizer/Cargo.toml index c2077533e..c8b643d09 100644 --- a/meilidb-tokenizer/Cargo.toml +++ b/meilidb-tokenizer/Cargo.toml @@ -5,4 +5,4 @@ authors = ["Clément Renault "] edition = "2018" [dependencies] - +slice-group-by = "0.2.4" diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs index 7c4c8f915..8cdb32dc3 100644 --- a/meilidb-tokenizer/src/lib.rs +++ b/meilidb-tokenizer/src/lib.rs @@ -1,4 +1,5 @@ use std::mem; +use slice_group_by::LinearStrGroupBy; use self::Separator::*; pub fn is_cjk(c: char) -> bool { @@ -13,6 +14,33 @@ pub fn is_cjk(c: char) -> bool { (c >= '\u{f900}' && c <= '\u{faff}') } +#[derive(Debug, PartialEq, Eq)] +enum CharCategory { + Space, + Cjk, + Other, +} + +fn classify_char(c: char) -> CharCategory { + if c.is_whitespace() { CharCategory::Space } + else if is_cjk(c) { CharCategory::Cjk } + else { CharCategory::Other } +} + +fn is_word(s: &&str) -> bool { + !s.chars().any(char::is_whitespace) +} + +fn same_group_category(a: char, b: char) -> bool { + let ca = classify_char(a); + let cb = classify_char(b); + if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb } +} + +pub fn split_query_string(query: &str) -> impl Iterator { + LinearStrGroupBy::new(query, same_group_category).filter(is_word) +} + pub trait TokenizerBuilder { fn build<'a>(&self, text: &'a str) -> Box> + 'a>; }