feat: Move the tokenizer code into the new meilidb-tokenizer workspace crate

This commit is contained in:
Clément Renault 2019-02-25 18:24:46 +01:00
parent d8cbb03c42
commit 1897da5348
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
10 changed files with 28 additions and 22 deletions

View File

@ -2,4 +2,5 @@
members = [ members = [
"meilidb", "meilidb",
"meilidb-core", "meilidb-core",
"meilidb-tokenizer",
] ]

View File

@ -11,6 +11,7 @@ hashbrown = "0.1.8"
lazy_static = "1.2.0" lazy_static = "1.2.0"
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] } levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
log = "0.4.6" log = "0.4.6"
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
rayon = "1.0.3" rayon = "1.0.3"
sdset = "0.3.1" sdset = "0.3.1"
serde = "1.0.88" serde = "1.0.88"

View File

@ -0,0 +1,8 @@
[package]
name = "meilidb-tokenizer"
version = "0.1.0"
authors = ["Clément Renault <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]

View File

@ -1,7 +1,18 @@
use std::mem; use std::mem;
use crate::is_cjk;
use self::Separator::*; use self::Separator::*;
/// Tells whether `c` falls inside one of the CJK-related Unicode ranges
/// recognized by the tokenizer.
///
/// Every range is inclusive on both ends. Any character outside the listed
/// ranges yields `false`.
pub fn is_cjk(c: char) -> bool {
    match c {
        '\u{2e80}'..='\u{2eff}'     // CJK Radicals Supplement
        | '\u{2f00}'..='\u{2fdf}'   // Kangxi Radicals
        | '\u{3040}'..='\u{309f}'   // Hiragana
        | '\u{30a0}'..='\u{30ff}'   // Katakana
        | '\u{3100}'..='\u{312f}'   // Bopomofo
        | '\u{3200}'..='\u{32ff}'   // Enclosed CJK Letters and Months
        | '\u{3400}'..='\u{4dbf}'   // CJK Unified Ideographs Extension A
        | '\u{4e00}'..='\u{9fff}'   // CJK Unified Ideographs
        | '\u{f900}'..='\u{faff}' => true, // CJK Compatibility Ideographs
        _ => false,
    }
}
pub trait TokenizerBuilder { pub trait TokenizerBuilder {
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>; fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
} }

View File

@ -13,6 +13,8 @@ hashbrown = { version = "0.1.8", features = ["serde"] }
linked-hash-map = { version = "0.5.1", features = ["serde_impl"] } linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
lockfree = "0.5.1" lockfree = "0.5.1"
log = "0.4.6" log = "0.4.6"
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
sdset = "0.3.1" sdset = "0.3.1"
serde = "1.0.88" serde = "1.0.88"
serde_derive = "1.0.88" serde_derive = "1.0.88"
@ -20,7 +22,6 @@ serde_json = { version = "1.0.38", features = ["preserve_order"] }
size_format = "1.0.2" size_format = "1.0.2"
slice-group-by = "0.2.4" slice-group-by = "0.2.4"
unidecode = "0.3.0" unidecode = "0.3.0"
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
[dependencies.toml] [dependencies.toml]
git = "https://github.com/Kerollmops/toml-rs.git" git = "https://github.com/Kerollmops/toml-rs.git"

View File

@ -430,9 +430,9 @@ mod tests {
use std::error::Error; use std::error::Error;
use serde_derive::{Serialize, Deserialize}; use serde_derive::{Serialize, Deserialize};
use meilidb_tokenizer::DefaultBuilder;
use crate::database::schema::{SchemaBuilder, STORED, INDEXED}; use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
use crate::tokenizer::DefaultBuilder;
use super::*; use super::*;

View File

@ -3,13 +3,11 @@ use std::collections::HashSet;
use serde::Serialize; use serde::Serialize;
use serde::ser; use serde::ser;
use meilidb_core::{DocumentId, DocIndex}; use meilidb_core::{DocumentId, DocIndex};
use meilidb_tokenizer::{TokenizerBuilder, Token, is_cjk};
use crate::database::update::DocumentUpdate; use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError; use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr; use crate::database::schema::SchemaAttr;
use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::Token;
use crate::is_cjk;
pub struct IndexerSerializer<'a, 'b, B> { pub struct IndexerSerializer<'a, 'b, B> {
pub tokenizer_builder: &'a B, pub tokenizer_builder: &'a B,

View File

@ -2,13 +2,13 @@ use std::collections::HashSet;
use serde::Serialize; use serde::Serialize;
use serde::ser; use serde::ser;
use meilidb_tokenizer::TokenizerBuilder;
use crate::database::serde::indexer_serializer::IndexerSerializer; use crate::database::serde::indexer_serializer::IndexerSerializer;
use crate::database::serde::key_to_string::KeyToStringSerializer; use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::serde::value_to_number::ValueToNumberSerializer; use crate::database::serde::value_to_number::ValueToNumberSerializer;
use crate::database::update::DocumentUpdate; use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError; use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::schema::Schema; use crate::database::schema::Schema;
use meilidb_core::DocumentId; use meilidb_core::DocumentId;

View File

@ -8,6 +8,7 @@ use serde::Serialize;
use meilidb_core::write_to_bytes::WriteToBytes; use meilidb_core::write_to_bytes::WriteToBytes;
use meilidb_core::data::DocIds; use meilidb_core::data::DocIds;
use meilidb_core::{IndexBuilder, DocumentId, DocIndex}; use meilidb_core::{IndexBuilder, DocumentId, DocIndex};
use meilidb_tokenizer::TokenizerBuilder;
use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::database::serde::serializer::Serializer; use crate::database::serde::serializer::Serializer;
@ -16,7 +17,6 @@ use crate::database::schema::SchemaAttr;
use crate::database::schema::Schema; use crate::database::schema::Schema;
use crate::database::{DATA_INDEX, DATA_RANKED_MAP}; use crate::database::{DATA_INDEX, DATA_RANKED_MAP};
use crate::database::{RankedMap, Number}; use crate::database::{RankedMap, Number};
use crate::tokenizer::TokenizerBuilder;
pub use self::index_event::{ReadIndexEvent, WriteIndexEvent}; pub use self::index_event::{ReadIndexEvent, WriteIndexEvent};
pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent}; pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent};

View File

@ -1,24 +1,10 @@
#![cfg_attr(feature = "nightly", feature(test))] #![cfg_attr(feature = "nightly", feature(test))]
pub mod database; pub mod database;
pub mod tokenizer;
mod common_words; mod common_words;
mod sort_by_attr; mod sort_by_attr;
pub use rocksdb; pub use rocksdb;
pub use self::sort_by_attr::SortByAttr; pub use self::sort_by_attr::SortByAttr;
pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords; pub use self::common_words::CommonWords;
/// Reports whether `c` belongs to one of the CJK-related code point ranges
/// listed in the table below.
///
/// Both bounds of every range are inclusive.
pub fn is_cjk(c: char) -> bool {
    // (first, last) code point of each accepted range.
    const CJK_RANGES: [(char, char); 9] = [
        ('\u{2e80}', '\u{2eff}'),
        ('\u{2f00}', '\u{2fdf}'),
        ('\u{3040}', '\u{309f}'),
        ('\u{30a0}', '\u{30ff}'),
        ('\u{3100}', '\u{312f}'),
        ('\u{3200}', '\u{32ff}'),
        ('\u{3400}', '\u{4dbf}'),
        ('\u{4e00}', '\u{9fff}'),
        ('\u{f900}', '\u{faff}'),
    ];

    CJK_RANGES
        .iter()
        .any(|&(first, last)| first <= c && c <= last)
}