mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-30 09:04:59 +08:00
feat: Move tokenizer things into the meilidb-tokenizer workspace
This commit is contained in:
parent
d8cbb03c42
commit
1897da5348
@ -2,4 +2,5 @@
|
|||||||
members = [
|
members = [
|
||||||
"meilidb",
|
"meilidb",
|
||||||
"meilidb-core",
|
"meilidb-core",
|
||||||
|
"meilidb-tokenizer",
|
||||||
]
|
]
|
||||||
|
@ -11,6 +11,7 @@ hashbrown = "0.1.8"
|
|||||||
lazy_static = "1.2.0"
|
lazy_static = "1.2.0"
|
||||||
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
|
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
|
||||||
log = "0.4.6"
|
log = "0.4.6"
|
||||||
|
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
|
||||||
rayon = "1.0.3"
|
rayon = "1.0.3"
|
||||||
sdset = "0.3.1"
|
sdset = "0.3.1"
|
||||||
serde = "1.0.88"
|
serde = "1.0.88"
|
||||||
|
8
meilidb-tokenizer/Cargo.toml
Normal file
8
meilidb-tokenizer/Cargo.toml
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
[package]
|
||||||
|
name = "meilidb-tokenizer"
|
||||||
|
version = "0.1.0"
|
||||||
|
authors = ["Clément Renault <renault.cle@gmail.com>"]
|
||||||
|
edition = "2018"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
|
@ -1,7 +1,18 @@
|
|||||||
use std::mem;
|
use std::mem;
|
||||||
use crate::is_cjk;
|
|
||||||
use self::Separator::*;
|
use self::Separator::*;
|
||||||
|
|
||||||
|
pub fn is_cjk(c: char) -> bool {
|
||||||
|
(c >= '\u{2e80}' && c <= '\u{2eff}') ||
|
||||||
|
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
|
||||||
|
(c >= '\u{3040}' && c <= '\u{309f}') ||
|
||||||
|
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
|
||||||
|
(c >= '\u{3100}' && c <= '\u{312f}') ||
|
||||||
|
(c >= '\u{3200}' && c <= '\u{32ff}') ||
|
||||||
|
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
|
||||||
|
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
|
||||||
|
(c >= '\u{f900}' && c <= '\u{faff}')
|
||||||
|
}
|
||||||
|
|
||||||
pub trait TokenizerBuilder {
|
pub trait TokenizerBuilder {
|
||||||
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
|
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=Token<'a>> + 'a>;
|
||||||
}
|
}
|
@ -13,6 +13,8 @@ hashbrown = { version = "0.1.8", features = ["serde"] }
|
|||||||
linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
|
linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
|
||||||
lockfree = "0.5.1"
|
lockfree = "0.5.1"
|
||||||
log = "0.4.6"
|
log = "0.4.6"
|
||||||
|
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
|
||||||
|
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
|
||||||
sdset = "0.3.1"
|
sdset = "0.3.1"
|
||||||
serde = "1.0.88"
|
serde = "1.0.88"
|
||||||
serde_derive = "1.0.88"
|
serde_derive = "1.0.88"
|
||||||
@ -20,7 +22,6 @@ serde_json = { version = "1.0.38", features = ["preserve_order"] }
|
|||||||
size_format = "1.0.2"
|
size_format = "1.0.2"
|
||||||
slice-group-by = "0.2.4"
|
slice-group-by = "0.2.4"
|
||||||
unidecode = "0.3.0"
|
unidecode = "0.3.0"
|
||||||
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
|
|
||||||
|
|
||||||
[dependencies.toml]
|
[dependencies.toml]
|
||||||
git = "https://github.com/Kerollmops/toml-rs.git"
|
git = "https://github.com/Kerollmops/toml-rs.git"
|
||||||
|
@ -430,9 +430,9 @@ mod tests {
|
|||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
|
|
||||||
use serde_derive::{Serialize, Deserialize};
|
use serde_derive::{Serialize, Deserialize};
|
||||||
|
use meilidb_tokenizer::DefaultBuilder;
|
||||||
|
|
||||||
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
|
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
|
||||||
use crate::tokenizer::DefaultBuilder;
|
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
|
@ -3,13 +3,11 @@ use std::collections::HashSet;
|
|||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde::ser;
|
use serde::ser;
|
||||||
use meilidb_core::{DocumentId, DocIndex};
|
use meilidb_core::{DocumentId, DocIndex};
|
||||||
|
use meilidb_tokenizer::{TokenizerBuilder, Token, is_cjk};
|
||||||
|
|
||||||
use crate::database::update::DocumentUpdate;
|
use crate::database::update::DocumentUpdate;
|
||||||
use crate::database::serde::SerializerError;
|
use crate::database::serde::SerializerError;
|
||||||
use crate::database::schema::SchemaAttr;
|
use crate::database::schema::SchemaAttr;
|
||||||
use crate::tokenizer::TokenizerBuilder;
|
|
||||||
use crate::tokenizer::Token;
|
|
||||||
use crate::is_cjk;
|
|
||||||
|
|
||||||
pub struct IndexerSerializer<'a, 'b, B> {
|
pub struct IndexerSerializer<'a, 'b, B> {
|
||||||
pub tokenizer_builder: &'a B,
|
pub tokenizer_builder: &'a B,
|
||||||
|
@ -2,13 +2,13 @@ use std::collections::HashSet;
|
|||||||
|
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use serde::ser;
|
use serde::ser;
|
||||||
|
use meilidb_tokenizer::TokenizerBuilder;
|
||||||
|
|
||||||
use crate::database::serde::indexer_serializer::IndexerSerializer;
|
use crate::database::serde::indexer_serializer::IndexerSerializer;
|
||||||
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
||||||
use crate::database::serde::value_to_number::ValueToNumberSerializer;
|
use crate::database::serde::value_to_number::ValueToNumberSerializer;
|
||||||
use crate::database::update::DocumentUpdate;
|
use crate::database::update::DocumentUpdate;
|
||||||
use crate::database::serde::SerializerError;
|
use crate::database::serde::SerializerError;
|
||||||
use crate::tokenizer::TokenizerBuilder;
|
|
||||||
use crate::database::schema::Schema;
|
use crate::database::schema::Schema;
|
||||||
use meilidb_core::DocumentId;
|
use meilidb_core::DocumentId;
|
||||||
|
|
||||||
|
@ -8,6 +8,7 @@ use serde::Serialize;
|
|||||||
use meilidb_core::write_to_bytes::WriteToBytes;
|
use meilidb_core::write_to_bytes::WriteToBytes;
|
||||||
use meilidb_core::data::DocIds;
|
use meilidb_core::data::DocIds;
|
||||||
use meilidb_core::{IndexBuilder, DocumentId, DocIndex};
|
use meilidb_core::{IndexBuilder, DocumentId, DocIndex};
|
||||||
|
use meilidb_tokenizer::TokenizerBuilder;
|
||||||
|
|
||||||
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
||||||
use crate::database::serde::serializer::Serializer;
|
use crate::database::serde::serializer::Serializer;
|
||||||
@ -16,7 +17,6 @@ use crate::database::schema::SchemaAttr;
|
|||||||
use crate::database::schema::Schema;
|
use crate::database::schema::Schema;
|
||||||
use crate::database::{DATA_INDEX, DATA_RANKED_MAP};
|
use crate::database::{DATA_INDEX, DATA_RANKED_MAP};
|
||||||
use crate::database::{RankedMap, Number};
|
use crate::database::{RankedMap, Number};
|
||||||
use crate::tokenizer::TokenizerBuilder;
|
|
||||||
|
|
||||||
pub use self::index_event::{ReadIndexEvent, WriteIndexEvent};
|
pub use self::index_event::{ReadIndexEvent, WriteIndexEvent};
|
||||||
pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent};
|
pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent};
|
||||||
|
@ -1,24 +1,10 @@
|
|||||||
#![cfg_attr(feature = "nightly", feature(test))]
|
#![cfg_attr(feature = "nightly", feature(test))]
|
||||||
|
|
||||||
pub mod database;
|
pub mod database;
|
||||||
pub mod tokenizer;
|
|
||||||
mod common_words;
|
mod common_words;
|
||||||
mod sort_by_attr;
|
mod sort_by_attr;
|
||||||
|
|
||||||
pub use rocksdb;
|
pub use rocksdb;
|
||||||
|
|
||||||
pub use self::sort_by_attr::SortByAttr;
|
pub use self::sort_by_attr::SortByAttr;
|
||||||
pub use self::tokenizer::Tokenizer;
|
|
||||||
pub use self::common_words::CommonWords;
|
pub use self::common_words::CommonWords;
|
||||||
|
|
||||||
pub fn is_cjk(c: char) -> bool {
|
|
||||||
(c >= '\u{2e80}' && c <= '\u{2eff}') ||
|
|
||||||
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
|
|
||||||
(c >= '\u{3040}' && c <= '\u{309f}') ||
|
|
||||||
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
|
|
||||||
(c >= '\u{3100}' && c <= '\u{312f}') ||
|
|
||||||
(c >= '\u{3200}' && c <= '\u{32ff}') ||
|
|
||||||
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
|
|
||||||
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
|
|
||||||
(c >= '\u{f900}' && c <= '\u{faff}')
|
|
||||||
}
|
|
||||||
|
Loading…
Reference in New Issue
Block a user