feat: Make query strings support CJK kanji

This commit is contained in:
Clément Renault 2019-02-23 14:57:13 +01:00
parent a799470997
commit a960c325f3
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
4 changed files with 51 additions and 18 deletions

View File

@ -21,9 +21,12 @@ serde = "1.0"
serde_derive = "1.0" serde_derive = "1.0"
serde_json = { version = "1.0", features = ["preserve_order"] } serde_json = { version = "1.0", features = ["preserve_order"] }
size_format = "1.0" size_format = "1.0"
slice-group-by = "0.2"
unidecode = "0.3" unidecode = "0.3"
[dependencies.slice-group-by]
git = "https://github.com/Kerollmops/slice-group-by.git"
tag = "v0.2.3-alpha.1"
[dependencies.toml] [dependencies.toml]
git = "https://github.com/Kerollmops/toml-rs.git" git = "https://github.com/Kerollmops/toml-rs.git"
features = ["preserve_order"] features = ["preserve_order"]

View File

@ -16,6 +16,18 @@ pub use rocksdb;
pub use self::tokenizer::Tokenizer; pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords; pub use self::common_words::CommonWords;
/// Tells whether `c` belongs to one of the CJK-related Unicode ranges
/// recognised by the crate.
///
/// The accepted code point ranges (all bounds inclusive) are:
///
/// - `U+2E80..=U+2EFF` (CJK Radicals Supplement)
/// - `U+2F00..=U+2FDF` (Kangxi Radicals)
/// - `U+3040..=U+309F` (Hiragana)
/// - `U+30A0..=U+30FF` (Katakana)
/// - `U+3100..=U+312F` (Bopomofo)
/// - `U+3200..=U+32FF` (Enclosed CJK Letters and Months)
/// - `U+3400..=U+4DBF` (CJK Unified Ideographs Extension A)
/// - `U+4E00..=U+9FFF` (CJK Unified Ideographs)
/// - `U+F900..=U+FAFF` (CJK Compatibility Ideographs)
///
/// Any character outside of these ranges yields `false`; note that the
/// Hangul syllables block (`U+AC00..=U+D7AF`) is not part of the list.
pub fn is_cjk(c: char) -> bool {
    match c {
        '\u{2e80}'..='\u{2eff}'
        | '\u{2f00}'..='\u{2fdf}'
        | '\u{3040}'..='\u{309f}'
        | '\u{30a0}'..='\u{30ff}'
        | '\u{3100}'..='\u{312f}'
        | '\u{3200}'..='\u{32ff}'
        | '\u{3400}'..='\u{4dbf}'
        | '\u{4e00}'..='\u{9fff}'
        | '\u{f900}'..='\u{faff}' => true,
        _ => false,
    }
}
/// Represent an internally generated document unique identifier. /// Represent an internally generated document unique identifier.
/// ///
/// It is used to inform the database the document you want to deserialize. /// It is used to inform the database the document you want to deserialize.

View File

@ -6,7 +6,7 @@ use std::hash::Hash;
use std::rc::Rc; use std::rc::Rc;
use rayon::slice::ParallelSliceMut; use rayon::slice::ParallelSliceMut;
use slice_group_by::GroupByMut; use slice_group_by::{GroupByMut, LinearStrGroupBy};
use hashbrown::HashMap; use hashbrown::HashMap;
use fst::Streamer; use fst::Streamer;
use rocksdb::DB; use rocksdb::DB;
@ -16,17 +16,43 @@ use crate::automaton::{self, DfaExt, AutomatonExt};
use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap}; use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
use crate::rank::criterion::Criteria; use crate::rank::criterion::Criteria;
use crate::database::DatabaseView; use crate::database::DatabaseView;
use crate::{Match, DocumentId};
use crate::rank::{raw_documents_from_matches, RawDocument, Document}; use crate::rank::{raw_documents_from_matches, RawDocument, Document};
use crate::{is_cjk, Match, DocumentId};
/// Coarse class assigned to a query character by `classify_char`.
///
/// The type is a plain fieldless tag, so it derives `Clone` and `Copy`
/// in addition to the comparison traits: values can be compared and reused
/// freely without being moved.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CharCategory {
    /// The character satisfies `char::is_whitespace`.
    Space,
    /// The character is not whitespace and satisfies `is_cjk`.
    Cjk,
    /// Every remaining character.
    Other,
}
/// Maps a character to its `CharCategory`.
///
/// The checks run in a fixed order: whitespace is tested first, then CJK
/// membership through `is_cjk`; anything matching neither is `Other`.
fn classify_char(c: char) -> CharCategory {
    match c {
        blank if blank.is_whitespace() => CharCategory::Space,
        ideograph if is_cjk(ideograph) => CharCategory::Cjk,
        _ => CharCategory::Other,
    }
}
/// Returns `true` when `s` contains no whitespace character at all.
///
/// An empty string is therefore accepted. The `&&str` parameter lets the
/// function be handed directly to `Iterator::filter` on an iterator of
/// `&str` items.
fn is_word(s: &&str) -> bool {
    s.chars().all(|ch| !ch.is_whitespace())
}
/// Grouping predicate: tells whether two characters may stay in the same
/// group.
///
/// As soon as either character is classified as CJK the answer is `false`,
/// which also holds for two CJK characters; otherwise the characters belong
/// together exactly when `classify_char` gives them the same category.
fn same_group_category(a: char, b: char) -> bool {
    match (classify_char(a), classify_char(b)) {
        (CharCategory::Cjk, _) | (_, CharCategory::Cjk) => false,
        (left, right) => left == right,
    }
}
fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> { fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
let mut automatons = Vec::new(); let mut groups = LinearStrGroupBy::new(query, same_group_category)
let mut words = query.split_whitespace().map(str::to_lowercase).peekable(); .filter(is_word)
.map(str::to_lowercase)
.peekable();
while let Some(word) = words.next() { let mut automatons = Vec::new();
let has_following_word = words.peek().is_some(); while let Some(word) = groups.next() {
let lev = if has_following_word || has_end_whitespace { let has_following_word = groups.peek().is_some();
let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
automaton::build_dfa(&word) automaton::build_dfa(&word)
} else { } else {
automaton::build_prefix_dfa(&word) automaton::build_prefix_dfa(&word)

View File

@ -1,4 +1,5 @@
use std::mem; use std::mem;
use crate::is_cjk;
use self::Separator::*; use self::Separator::*;
pub trait TokenizerBuilder { pub trait TokenizerBuilder {
@ -114,16 +115,7 @@ impl<'a> Iterator for Tokenizer<'a> {
None => { None => {
// if this is a Chinese, a Japanese or a Korean character // if this is a Chinese, a Japanese or a Korean character
// See <http://unicode-table.com> // See <http://unicode-table.com>
if (c >= '\u{2e80}' && c <= '\u{2eff}') || if is_cjk(c) {
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
(c >= '\u{3040}' && c <= '\u{309f}') ||
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
(c >= '\u{3100}' && c <= '\u{312f}') ||
(c >= '\u{3200}' && c <= '\u{32ff}') ||
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
(c >= '\u{f900}' && c <= '\u{faff}')
{
match start_word { match start_word {
Some(start_word) => { Some(start_word) => {
let (prefix, tail) = self.inner.split_at(i); let (prefix, tail) = self.inner.split_at(i);