Mirror of https://github.com/meilisearch/meilisearch.git (synced 2024-12-02 01:55:03 +08:00)

Add the meilidb-schema/tokenizer projects

parent 62a0aefe44
commit 08e3f23408

@@ -1,6 +1,8 @@
 [workspace]
 members = [
     "meilidb-core",
+    "meilidb-schema",
+    "meilidb-tokenizer",
 ]
 
 [profile.release]

@@ -19,8 +19,8 @@ siphasher = "0.3.0"
 slice-group-by = "0.2.6"
 zerocopy = "0.2.8"
 
-meilidb-schema = { path = "../../MeiliDB/meilidb-schema", version = "0.1.0" }
-meilidb-tokenizer = { path = "../../MeiliDB/meilidb-tokenizer", version = "0.1.0" }
+meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
+meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
 
 [dependencies.rmp-serde]
 git = "https://github.com/3Hren/msgpack-rust.git"
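
The second hunk patches a crate manifest rather than the workspace file: its context lines (siphasher, slice-group-by, zerocopy, rmp-serde) are dependency entries, presumably meilidb-core's. The path fix is the point of the commit: the two new crates now live as siblings inside the same workspace instead of in an external MeiliDB checkout. The layout implied by the relative paths looks roughly like this (an inferred sketch, not part of the diff):

    MeiliDB/
        Cargo.toml            root manifest with the [workspace] hunk above
        meilidb-core/         depends on ../meilidb-schema and ../meilidb-tokenizer
        meilidb-schema/       added by this commit
        meilidb-tokenizer/    added by this commit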

meilidb-schema/Cargo.toml (new file, 12 lines)
@@ -0,0 +1,12 @@
+[package]
+name = "meilidb-schema"
+version = "0.1.0"
+authors = ["Kerollmops <renault.cle@gmail.com>"]
+edition = "2018"
+
+[dependencies]
+bincode = "1.1.2"
+indexmap = { version = "1.1.0", features = ["serde-1"] }
+serde = { version = "1.0.91", features = ["derive"] }
+serde_json = { version = "1.0.39", features = ["preserve_order"] }
+toml = { version = "0.5.0", features = ["preserve_order"] }

meilidb-schema/src/lib.rs (new file, 285 lines)
@@ -0,0 +1,285 @@
+use std::collections::{HashMap, BTreeMap};
+use std::{fmt, u16};
+use std::ops::BitOr;
+use std::sync::Arc;
+
+use serde::{Serialize, Deserialize};
+use indexmap::IndexMap;
+
+pub const DISPLAYED: SchemaProps = SchemaProps { displayed: true, indexed: false, ranked: false };
+pub const INDEXED: SchemaProps = SchemaProps { displayed: false, indexed: true, ranked: false };
+pub const RANKED: SchemaProps = SchemaProps { displayed: false, indexed: false, ranked: true };
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct SchemaProps {
+    #[serde(default)]
+    pub displayed: bool,
+
+    #[serde(default)]
+    pub indexed: bool,
+
+    #[serde(default)]
+    pub ranked: bool,
+}
+
+impl SchemaProps {
+    pub fn is_displayed(self) -> bool {
+        self.displayed
+    }
+
+    pub fn is_indexed(self) -> bool {
+        self.indexed
+    }
+
+    pub fn is_ranked(self) -> bool {
+        self.ranked
+    }
+}
+
+impl BitOr for SchemaProps {
+    type Output = Self;
+
+    fn bitor(self, other: Self) -> Self::Output {
+        SchemaProps {
+            displayed: self.displayed | other.displayed,
+            indexed: self.indexed | other.indexed,
+            ranked: self.ranked | other.ranked,
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize)]
+pub struct SchemaBuilder {
+    identifier: String,
+    attributes: IndexMap<String, SchemaProps>,
+}
+
+impl SchemaBuilder {
+    pub fn with_identifier<S: Into<String>>(name: S) -> SchemaBuilder {
+        SchemaBuilder {
+            identifier: name.into(),
+            attributes: IndexMap::new(),
+        }
+    }
+
+    pub fn new_attribute<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr {
+        let len = self.attributes.len();
+        if self.attributes.insert(name.into(), props).is_some() {
+            panic!("Field already inserted.")
+        }
+        SchemaAttr(len as u16)
+    }
+
+    pub fn build(self) -> Schema {
+        let mut attrs = HashMap::new();
+        let mut props = Vec::new();
+
+        for (i, (name, prop)) in self.attributes.into_iter().enumerate() {
+            attrs.insert(name.clone(), SchemaAttr(i as u16));
+            props.push((name, prop));
+        }
+
+        let identifier = self.identifier;
+        Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) }
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Schema {
+    inner: Arc<InnerSchema>,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+struct InnerSchema {
+    identifier: String,
+    attrs: HashMap<String, SchemaAttr>,
+    props: Vec<(String, SchemaProps)>,
+}
+
+impl Schema {
+    fn to_builder(&self) -> SchemaBuilder {
+        let identifier = self.inner.identifier.clone();
+        let attributes = self.attributes_ordered();
+        SchemaBuilder { identifier, attributes }
+    }
+
+    fn attributes_ordered(&self) -> IndexMap<String, SchemaProps> {
+        let mut ordered = BTreeMap::new();
+        for (name, attr) in &self.inner.attrs {
+            let (_, props) = self.inner.props[attr.0 as usize];
+            ordered.insert(attr.0, (name, props));
+        }
+
+        let mut attributes = IndexMap::with_capacity(ordered.len());
+        for (_, (name, props)) in ordered {
+            attributes.insert(name.clone(), props);
+        }
+
+        attributes
+    }
+
+    pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
+        let (_, props) = self.inner.props[attr.0 as usize];
+        props
+    }
+
+    pub fn identifier_name(&self) -> &str {
+        &self.inner.identifier
+    }
+
+    pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> {
+        self.inner.attrs.get(name.as_ref()).cloned()
+    }
+
+    pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
+        let (name, _) = &self.inner.props[attr.0 as usize];
+        name
+    }
+
+    pub fn iter<'a>(&'a self) -> impl Iterator<Item=(&str, SchemaAttr, SchemaProps)> + 'a {
+        self.inner.props.iter()
+            .map(move |(name, prop)| {
+                let attr = self.inner.attrs.get(name).unwrap();
+                (name.as_str(), *attr, *prop)
+            })
+    }
+}
+
+impl Serialize for Schema {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where S: serde::ser::Serializer,
+    {
+        self.to_builder().serialize(serializer)
+    }
+}
+
+impl<'de> Deserialize<'de> for Schema {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where D: serde::de::Deserializer<'de>,
+    {
+        let builder = SchemaBuilder::deserialize(deserializer)?;
+        Ok(builder.build())
+    }
+}
+
+#[derive(Serialize, Deserialize)]
+#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
+pub struct SchemaAttr(pub u16);
+
+impl SchemaAttr {
+    pub const fn new(value: u16) -> SchemaAttr {
+        SchemaAttr(value)
+    }
+
+    pub const fn min() -> SchemaAttr {
+        SchemaAttr(u16::min_value())
+    }
+
+    pub const fn max() -> SchemaAttr {
+        SchemaAttr(u16::max_value())
+    }
+
+    pub fn next(self) -> Option<SchemaAttr> {
+        self.0.checked_add(1).map(SchemaAttr)
+    }
+
+    pub fn prev(self) -> Option<SchemaAttr> {
+        self.0.checked_sub(1).map(SchemaAttr)
+    }
+}
+
+impl fmt::Display for SchemaAttr {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::error::Error;
+
+    #[test]
+    fn serialize_deserialize() -> bincode::Result<()> {
+        let mut builder = SchemaBuilder::with_identifier("id");
+        builder.new_attribute("alpha", DISPLAYED);
+        builder.new_attribute("beta", DISPLAYED | INDEXED);
+        builder.new_attribute("gamma", INDEXED);
+        let schema = builder.build();
+
+        let mut buffer = Vec::new();
+        bincode::serialize_into(&mut buffer, &schema)?;
+        let schema2 = bincode::deserialize_from(buffer.as_slice())?;
+
+        assert_eq!(schema, schema2);
+
+        Ok(())
+    }
+
+    #[test]
+    fn serialize_deserialize_toml() -> Result<(), Box<dyn Error>> {
+        let mut builder = SchemaBuilder::with_identifier("id");
+        builder.new_attribute("alpha", DISPLAYED);
+        builder.new_attribute("beta", DISPLAYED | INDEXED);
+        builder.new_attribute("gamma", INDEXED);
+        let schema = builder.build();
+
+        let buffer = toml::to_vec(&schema)?;
+        let schema2 = toml::from_slice(buffer.as_slice())?;
+
+        assert_eq!(schema, schema2);
+
+        let data = r#"
+            identifier = "id"
+
+            [attributes."alpha"]
+            displayed = true
+
+            [attributes."beta"]
+            displayed = true
+            indexed = true
+
+            [attributes."gamma"]
+            indexed = true
+        "#;
+        let schema2 = toml::from_str(data)?;
+        assert_eq!(schema, schema2);
+
+        Ok(())
+    }
+
+    #[test]
+    fn serialize_deserialize_json() -> Result<(), Box<dyn Error>> {
+        let mut builder = SchemaBuilder::with_identifier("id");
+        builder.new_attribute("alpha", DISPLAYED);
+        builder.new_attribute("beta", DISPLAYED | INDEXED);
+        builder.new_attribute("gamma", INDEXED);
+        let schema = builder.build();
+
+        let buffer = serde_json::to_vec(&schema)?;
+        let schema2 = serde_json::from_slice(buffer.as_slice())?;
+
+        assert_eq!(schema, schema2);
+
+        let data = r#"
+            {
+                "identifier": "id",
+                "attributes": {
+                    "alpha": {
+                        "displayed": true
+                    },
+                    "beta": {
+                        "displayed": true,
+                        "indexed": true
+                    },
+                    "gamma": {
+                        "indexed": true
+                    }
+                }
+            }"#;
+        let schema2 = serde_json::from_str(data)?;
+        assert_eq!(schema, schema2);
+
+        Ok(())
+    }
+}
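
For orientation, the schema API added above is used roughly like this (a minimal sketch built only from the public items in this file; the attribute names are invented):

    use meilidb_schema::{SchemaBuilder, DISPLAYED, INDEXED, RANKED};

    fn main() {
        // attributes are numbered in insertion order, starting at 0
        let mut builder = SchemaBuilder::with_identifier("id");
        let title = builder.new_attribute("title", DISPLAYED | INDEXED);
        builder.new_attribute("popularity", RANKED);
        let schema = builder.build();

        // lookups work by name or by attribute number
        assert_eq!(schema.attribute("title"), Some(title));
        assert_eq!(schema.attribute_name(title), "title");
        assert!(schema.props(title).is_displayed() && schema.props(title).is_indexed());

        // iteration yields every (name, attr, props) triple in attribute order
        for (name, attr, props) in schema.iter() {
            println!("{}: attr {} (ranked: {})", name, attr, props.is_ranked());
        }
    }

Note that Schema serializes through SchemaBuilder, which is why the TOML and JSON fixtures in the tests spell out identifier and attributes rather than the internal Arc'd representation.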

meilidb-tokenizer/Cargo.toml (new file, 8 lines)
@@ -0,0 +1,8 @@
+[package]
+name = "meilidb-tokenizer"
+version = "0.1.0"
+authors = ["Kerollmops <renault.cle@gmail.com>"]
+edition = "2018"
+
+[dependencies]
+slice-group-by = "0.2.4"
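
slice-group-by, the crate's only dependency, supplies the grouping primitive the whole tokenizer rests on. A quick illustration of the linear_group_by call used in src/lib.rs below (my own example string; the predicate mirrors same_group_category in spirit):

    use slice_group_by::StrGroupBy;

    fn main() {
        // adjacent chars stay in one group while the predicate holds;
        // the tokenizer uses this to split words from separator runs
        let groups: Vec<&str> = "abc, def"
            .linear_group_by(|a, b| a.is_alphabetic() == b.is_alphabetic())
            .collect();
        assert_eq!(groups, ["abc", ", ", "def"]);
    }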

meilidb-tokenizer/src/lib.rs (new file, 295 lines)
@@ -0,0 +1,295 @@
+use std::iter::Peekable;
+use slice_group_by::StrGroupBy;
+use self::SeparatorCategory::*;
+
+pub fn is_cjk(c: char) -> bool {
+    (c >= '\u{2e80}' && c <= '\u{2eff}') ||
+    (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
+    (c >= '\u{3040}' && c <= '\u{309f}') ||
+    (c >= '\u{30a0}' && c <= '\u{30ff}') ||
+    (c >= '\u{3100}' && c <= '\u{312f}') ||
+    (c >= '\u{3200}' && c <= '\u{32ff}') ||
+    (c >= '\u{3400}' && c <= '\u{4dbf}') ||
+    (c >= '\u{4e00}' && c <= '\u{9fff}') ||
+    (c >= '\u{f900}' && c <= '\u{faff}')
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum SeparatorCategory {
+    Soft,
+    Hard,
+}
+
+impl SeparatorCategory {
+    fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
+        if let (Soft, Soft) = (self, other) { Soft } else { Hard }
+    }
+
+    fn to_usize(self) -> usize {
+        match self {
+            Soft => 1,
+            Hard => 8,
+        }
+    }
+}
+
+fn is_separator(c: char) -> bool {
+    classify_separator(c).is_some()
+}
+
+fn classify_separator(c: char) -> Option<SeparatorCategory> {
+    match c {
+        ' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
+        '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
+        _ => None,
+    }
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum CharCategory {
+    Separator(SeparatorCategory),
+    Cjk,
+    Other,
+}
+
+fn classify_char(c: char) -> CharCategory {
+    if let Some(category) = classify_separator(c) {
+        CharCategory::Separator(category)
+    } else if is_cjk(c) {
+        CharCategory::Cjk
+    } else {
+        CharCategory::Other
+    }
+}
+
+fn is_str_word(s: &str) -> bool {
+    !s.chars().any(is_separator)
+}
+
+fn same_group_category(a: char, b: char) -> bool {
+    match (classify_char(a), classify_char(b)) {
+        (CharCategory::Cjk, _) | (_, CharCategory::Cjk) => false,
+        (CharCategory::Separator(_), CharCategory::Separator(_)) => true,
+        (a, b) => a == b,
+    }
+}
+
+// fold the number of chars along with the index position
+fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, usize) {
+    (n + 1, i + c.len_utf8())
+}
+
+pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
+    Tokenizer::new(query).map(|t| t.word)
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub struct Token<'a> {
+    pub word: &'a str,
+    pub word_index: usize,
+    pub char_index: usize,
+}
+
+pub struct Tokenizer<'a> {
+    inner: &'a str,
+    word_index: usize,
+    char_index: usize,
+}
+
+impl<'a> Tokenizer<'a> {
+    pub fn new(string: &str) -> Tokenizer {
+        // skip every separator and set `char_index`
+        // to the number of char trimmed
+        let (count, index) = string.char_indices()
+            .take_while(|(_, c)| is_separator(*c))
+            .fold((0, 0), chars_count_index);
+
+        Tokenizer {
+            inner: &string[index..],
+            word_index: 0,
+            char_index: count,
+        }
+    }
+}
+
+impl<'a> Iterator for Tokenizer<'a> {
+    type Item = Token<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut iter = self.inner.linear_group_by(same_group_category).peekable();
+
+        while let (Some(string), next_string) = (iter.next(), iter.peek()) {
+            let (count, index) = string.char_indices().fold((0, 0), chars_count_index);
+
+            if !is_str_word(string) {
+                self.word_index += string.chars()
+                    .filter_map(classify_separator)
+                    .fold(Soft, |a, x| a.merge(x))
+                    .to_usize();
+                self.char_index += count;
+                self.inner = &self.inner[index..];
+                continue;
+            }
+
+            let token = Token {
+                word: string,
+                word_index: self.word_index,
+                char_index: self.char_index,
+            };
+
+            if next_string.filter(|s| is_str_word(s)).is_some() {
+                self.word_index += 1;
+            }
+
+            self.char_index += count;
+            self.inner = &self.inner[index..];
+
+            return Some(token);
+        }
+
+        self.inner = "";
+        None
+    }
+}
+
+pub struct SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    inner: I,
+    current: Option<Peekable<Tokenizer<'a>>>,
+    word_offset: usize,
+    char_offset: usize,
+}
+
+impl<'a, I> SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
+        let current = iter.next().map(|s| Tokenizer::new(s).peekable());
+        SeqTokenizer {
+            inner: iter,
+            current: current,
+            word_offset: 0,
+            char_offset: 0,
+        }
+    }
+}
+
+impl<'a, I> Iterator for SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    type Item = Token<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.current {
+            Some(current) => {
+                match current.next() {
+                    Some(token) => {
+                        // we must apply the word and char offsets
+                        // to the token before returning it
+                        let token = Token {
+                            word: token.word,
+                            word_index: token.word_index + self.word_offset,
+                            char_index: token.char_index + self.char_offset,
+                        };
+
+                        // if this is the last iteration on this text
+                        // we must save the offsets for next texts
+                        if current.peek().is_none() {
+                            let hard_space = SeparatorCategory::Hard.to_usize();
+                            self.word_offset = token.word_index + hard_space;
+                            self.char_offset = token.char_index + hard_space;
+                        }
+
+                        Some(token)
+                    },
+                    None => {
+                        // no more words in this text we must
+                        // start tokenizing the next text
+                        self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
+                        self.next()
+                    },
+                }
+            },
+            // no more texts available
+            None => None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn easy() {
+        let mut tokenizer = Tokenizer::new("salut");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), None);
+
+        let mut tokenizer = Tokenizer::new("yo ");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), None);
+    }
+
+    #[test]
+    fn hard() {
+        let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
+        assert_eq!(tokenizer.next(), None);
+
+        let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 18 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 25, char_index: 24 }));
+        assert_eq!(tokenizer.next(), None);
+    }
+
+    #[test]
+    fn hard_long_chars() {
+        let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
+        assert_eq!(tokenizer.next(), None);
+
+        let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 17, char_index: 16 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 25, char_index: 22 }));
+        assert_eq!(tokenizer.next(), None);
+    }
+
+    #[test]
+    fn hard_kanjis() {
+        let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
+        assert_eq!(tokenizer.next(), None);
+
+        let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 4, char_index: 14 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 5, char_index: 23 }));
+        assert_eq!(tokenizer.next(), None);
+    }
+}
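
The tests above only exercise Tokenizer directly; for completeness, here is a sketch of all three public entry points (invented input strings; the expected indices follow from the Soft = 1 / Hard = 8 weights in to_usize):

    use meilidb_tokenizer::{SeqTokenizer, Token, Tokenizer, split_query_string};

    fn main() {
        // a hard separator (".") advances the word index by 8, a soft one (" ") by 1
        let mut tokenizer = Tokenizer::new("hello world. bye");
        assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 0, char_index: 0 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "world", word_index: 1, char_index: 6 }));
        assert_eq!(tokenizer.next(), Some(Token { word: "bye", word_index: 9, char_index: 13 }));
        assert_eq!(tokenizer.next(), None);

        // SeqTokenizer chains several texts, keeping a hard gap between them
        let texts = vec!["hello", "world"];
        let words: Vec<&str> = SeqTokenizer::new(texts.into_iter()).map(|t| t.word).collect();
        assert_eq!(words, ["hello", "world"]);

        // split_query_string yields only the words of a query
        let words: Vec<&str> = split_query_string("hello world").collect();
        assert_eq!(words, ["hello", "world"]);
    }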