Add the meilidb-schema/tokenizer projects

2024-11-26 12:05:05 +08:00 · 2019-10-04 10:29:44 +02:00 · 2019-10-04 10:29:44 +02:00 · 08e3f23408
commit 08e3f23408
parent 62a0aefe44
6 changed files with 604 additions and 2 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -1,6 +1,8 @@
 [workspace]
 members = [
    "meilidb-core",
+    "meilidb-schema",
+    "meilidb-tokenizer",
 ]

 [profile.release]
--- a/meilidb-core/Cargo.toml
+++ b/meilidb-core/Cargo.toml
@ -19,8 +19,8 @@ siphasher = "0.3.0"
 slice-group-by = "0.2.6"
 zerocopy = "0.2.8"

-meilidb-schema = { path = "../../MeiliDB/meilidb-schema", version = "0.1.0" }
-meilidb-tokenizer = { path = "../../MeiliDB/meilidb-tokenizer", version = "0.1.0" }
+meilidb-schema = { path = "../meilidb-schema", version = "0.1.0" }
+meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }

 [dependencies.rmp-serde]
 git = "https://github.com/3Hren/msgpack-rust.git"
--- a/meilidb-schema/Cargo.toml
+++ b/meilidb-schema/Cargo.toml
@ -0,0 +1,12 @@
+[package]
+name = "meilidb-schema"
+version = "0.1.0"
+authors = ["Kerollmops <renault.cle@gmail.com>"]
+edition = "2018"
+
+[dependencies]
+bincode = "1.1.2"
+indexmap = { version = "1.1.0", features = ["serde-1"] }
+serde = { version = "1.0.91", features = ["derive"] }
+serde_json = { version = "1.0.39", features = ["preserve_order"] }
+toml = { version = "0.5.0", features = ["preserve_order"] }
--- a/meilidb-schema/src/lib.rs
+++ b/meilidb-schema/src/lib.rs
@ -0,0 +1,285 @@
+use std::collections::{HashMap, BTreeMap};
+use std::{fmt, u16};
+use std::ops::BitOr;
+use std::sync::Arc;
+
+use serde::{Serialize, Deserialize};
+use indexmap::IndexMap;
+
+/// Property set with only the `displayed` flag raised.
+pub const DISPLAYED: SchemaProps = SchemaProps { displayed: true,  indexed: false, ranked: false };
+/// Property set with only the `indexed` flag raised.
+pub const INDEXED: SchemaProps   = SchemaProps { displayed: false, indexed: true,  ranked: false };
+/// Property set with only the `ranked` flag raised.
+pub const RANKED: SchemaProps    = SchemaProps { displayed: false, indexed: false, ranked: true  };
+
+/// Three independent boolean flags attached to a schema attribute.
+///
+/// Every field is marked `#[serde(default)]`, so a flag that is absent
+/// from deserialized input is read as `false`.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct SchemaProps {
+    /// The `displayed` flag.
+    #[serde(default)]
+    pub displayed: bool,
+
+    /// The `indexed` flag.
+    #[serde(default)]
+    pub indexed: bool,
+
+    /// The `ranked` flag.
+    #[serde(default)]
+    pub ranked: bool,
+}
+
+impl SchemaProps {
+    /// Returns the value of the `displayed` flag.
+    pub fn is_displayed(self) -> bool {
+        let SchemaProps { displayed, .. } = self;
+        displayed
+    }
+
+    /// Returns the value of the `indexed` flag.
+    pub fn is_indexed(self) -> bool {
+        let SchemaProps { indexed, .. } = self;
+        indexed
+    }
+
+    /// Returns the value of the `ranked` flag.
+    pub fn is_ranked(self) -> bool {
+        let SchemaProps { ranked, .. } = self;
+        ranked
+    }
+}
+
+impl BitOr for SchemaProps {
+    type Output = Self;
+
+    /// Flag-wise union: a flag is set in the result when it is set in
+    /// `self` or in `other`.
+    fn bitor(self, other: Self) -> Self::Output {
+        let displayed = self.displayed || other.displayed;
+        let indexed = self.indexed || other.indexed;
+        let ranked = self.ranked || other.ranked;
+
+        SchemaProps { displayed, indexed, ranked }
+    }
+}
+
+/// Mutable description of a schema: an identifier name plus the attributes
+/// in insertion order.
+///
+/// This is also the serialized form of `Schema`, whose `Serialize` and
+/// `Deserialize` implementations go through this type.
+#[derive(Serialize, Deserialize)]
+pub struct SchemaBuilder {
+    // name of the identifier, returned later by `Schema::identifier_name`
+    identifier: String,
+    // attribute name -> properties, kept in insertion order by the `IndexMap`
+    attributes: IndexMap<String, SchemaProps>,
+}
+
+impl SchemaBuilder {
+    /// Creates an empty builder whose identifier is `name`.
+    pub fn with_identifier<S: Into<String>>(name: S) -> SchemaBuilder {
+        SchemaBuilder {
+            identifier: name.into(),
+            attributes: IndexMap::new(),
+        }
+    }
+
+    /// Registers the attribute `name` with the given `props` and returns its
+    /// `SchemaAttr`, which is the insertion position of the attribute.
+    ///
+    /// # Panics
+    ///
+    /// Panics if an attribute with the same name was already inserted, or if
+    /// the insertion position does not fit in a `u16`.
+    pub fn new_attribute<S: Into<String>>(&mut self, name: S, props: SchemaProps) -> SchemaAttr {
+        let attr = Self::attr_at(self.attributes.len());
+        if self.attributes.insert(name.into(), props).is_some() {
+            panic!("Field already inserted.")
+        }
+        attr
+    }
+
+    /// Consumes the builder and produces the immutable `Schema`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the builder holds more attributes than a `u16` can index.
+    /// This can only happen for a builder obtained through deserialization,
+    /// because `new_attribute` already refuses to go past that limit.
+    pub fn build(self) -> Schema {
+        let SchemaBuilder { identifier, attributes } = self;
+
+        let mut attrs = HashMap::with_capacity(attributes.len());
+        let mut props = Vec::with_capacity(attributes.len());
+
+        for (i, (name, prop)) in attributes.into_iter().enumerate() {
+            attrs.insert(name.clone(), Self::attr_at(i));
+            props.push((name, prop));
+        }
+
+        Schema { inner: Arc::new(InnerSchema { identifier, attrs, props }) }
+    }
+
+    /// Converts an insertion position into a `SchemaAttr`.
+    ///
+    /// A plain `index as u16` would wrap around silently and make two
+    /// different attributes share the same `SchemaAttr`, so the range is
+    /// checked first.
+    fn attr_at(index: usize) -> SchemaAttr {
+        assert!(
+            index <= u16::max_value() as usize,
+            "Too many attributes: a schema cannot index more than 65536 attributes."
+        );
+        SchemaAttr(index as u16)
+    }
+}
+
+/// Immutable schema produced by `SchemaBuilder::build`.
+///
+/// The data lives behind an `Arc`, so cloning a `Schema` clones the pointer
+/// and shares the same `InnerSchema`.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Schema {
+    inner: Arc<InnerSchema>,
+}
+
+/// Data shared by every clone of a `Schema`.
+#[derive(Debug, Clone, PartialEq, Eq)]
+struct InnerSchema {
+    // name of the identifier given to the builder
+    identifier: String,
+    // attribute name -> attribute id, filled by `SchemaBuilder::build`
+    attrs: HashMap<String, SchemaAttr>,
+    // (name, properties) of each attribute, indexed by the attribute id
+    props: Vec<(String, SchemaProps)>,
+}
+
+impl Schema {
+    /// Rebuilds the `SchemaBuilder` equivalent to this schema; used as the
+    /// serialization form.
+    fn to_builder(&self) -> SchemaBuilder {
+        SchemaBuilder {
+            identifier: self.inner.identifier.clone(),
+            attributes: self.attributes_ordered(),
+        }
+    }
+
+    /// Returns the attributes and their properties sorted by attribute id.
+    fn attributes_ordered(&self) -> IndexMap<String, SchemaProps> {
+        // the `BTreeMap` keyed by the attribute id does the sorting
+        let by_id: BTreeMap<u16, (&String, SchemaProps)> = self.inner.attrs.iter()
+            .map(|(name, attr)| {
+                let props = self.inner.props[attr.0 as usize].1;
+                (attr.0, (name, props))
+            })
+            .collect();
+
+        let mut attributes = IndexMap::with_capacity(by_id.len());
+        for (name, props) in by_id.values() {
+            attributes.insert((*name).clone(), *props);
+        }
+
+        attributes
+    }
+
+    /// Returns the properties of the attribute `attr`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `attr` is not an attribute id of this schema (the lookup
+    /// is a plain index into the properties vector).
+    pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
+        self.inner.props[attr.0 as usize].1
+    }
+
+    /// Returns the identifier name the schema was built with.
+    pub fn identifier_name(&self) -> &str {
+        self.inner.identifier.as_str()
+    }
+
+    /// Looks up the attribute id registered under `name`, if any.
+    pub fn attribute<S: AsRef<str>>(&self, name: S) -> Option<SchemaAttr> {
+        let name = name.as_ref();
+        self.inner.attrs.get(name).cloned()
+    }
+
+    /// Returns the name of the attribute `attr`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `attr` is not an attribute id of this schema.
+    pub fn attribute_name(&self, attr: SchemaAttr) -> &str {
+        &self.inner.props[attr.0 as usize].0
+    }
+
+    /// Iterates over `(name, attribute id, properties)` in attribute id order.
+    pub fn iter<'a>(&'a self) -> impl Iterator<Item=(&str, SchemaAttr, SchemaProps)> + 'a {
+        let inner = &self.inner;
+        inner.props.iter().map(move |(name, prop)| {
+            // every name stored in `props` is also a key of `attrs`
+            let attr = inner.attrs.get(name).unwrap();
+            (name.as_str(), *attr, *prop)
+        })
+    }
+}
+
+impl Serialize for Schema {
+    /// Serializes the schema through its `SchemaBuilder` representation.
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where S: serde::ser::Serializer,
+    {
+        let builder = self.to_builder();
+        builder.serialize(serializer)
+    }
+}
+
+impl<'de> Deserialize<'de> for Schema {
+    /// Deserializes a `SchemaBuilder` and builds the schema out of it.
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where D: serde::de::Deserializer<'de>,
+    {
+        SchemaBuilder::deserialize(deserializer).map(SchemaBuilder::build)
+    }
+}
+
+/// Identifier of an attribute inside a `Schema`.
+///
+/// `SchemaBuilder` hands these out as the insertion position of the
+/// attribute, and `Schema` uses the wrapped value as an index.
+#[derive(Serialize, Deserialize)]
+#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
+pub struct SchemaAttr(pub u16);
+
+impl SchemaAttr {
+    /// Wraps `value` into a `SchemaAttr`.
+    pub const fn new(value: u16) -> SchemaAttr {
+        Self(value)
+    }
+
+    /// The smallest representable attribute id.
+    pub const fn min() -> SchemaAttr {
+        Self::new(u16::min_value())
+    }
+
+    /// The largest representable attribute id.
+    pub const fn max() -> SchemaAttr {
+        Self::new(u16::max_value())
+    }
+
+    /// Returns the following attribute id, or `None` on overflow.
+    pub fn next(self) -> Option<SchemaAttr> {
+        let SchemaAttr(value) = self;
+        value.checked_add(1).map(SchemaAttr::new)
+    }
+
+    /// Returns the preceding attribute id, or `None` on underflow.
+    pub fn prev(self) -> Option<SchemaAttr> {
+        let SchemaAttr(value) = self;
+        value.checked_sub(1).map(SchemaAttr::new)
+    }
+}
+
+impl fmt::Display for SchemaAttr {
+    /// Formats the attribute id exactly like the wrapped `u16`.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let SchemaAttr(value) = self;
+        fmt::Display::fmt(value, f)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::error::Error;
+
+    /// Builds the three-attribute schema every test round-trips.
+    fn sample_schema() -> Schema {
+        let mut builder = SchemaBuilder::with_identifier("id");
+        builder.new_attribute("alpha", DISPLAYED);
+        builder.new_attribute("beta", DISPLAYED | INDEXED);
+        builder.new_attribute("gamma", INDEXED);
+        builder.build()
+    }
+
+    // bincode round trip
+    #[test]
+    fn serialize_deserialize() -> bincode::Result<()> {
+        let schema = sample_schema();
+
+        let mut buffer = Vec::new();
+        bincode::serialize_into(&mut buffer, &schema)?;
+        let decoded: Schema = bincode::deserialize_from(buffer.as_slice())?;
+
+        assert_eq!(schema, decoded);
+
+        Ok(())
+    }
+
+    // TOML round trip, then parsing of a hand-written document
+    #[test]
+    fn serialize_deserialize_toml() -> Result<(), Box<dyn Error>> {
+        let schema = sample_schema();
+
+        let buffer = toml::to_vec(&schema)?;
+        let decoded: Schema = toml::from_slice(buffer.as_slice())?;
+
+        assert_eq!(schema, decoded);
+
+        let data = r#"
+            identifier = "id"
+
+            [attributes."alpha"]
+            displayed = true
+
+            [attributes."beta"]
+            displayed = true
+            indexed = true
+
+            [attributes."gamma"]
+            indexed = true
+        "#;
+        let parsed: Schema = toml::from_str(data)?;
+        assert_eq!(schema, parsed);
+
+        Ok(())
+    }
+
+    // JSON round trip, then parsing of a hand-written document
+    #[test]
+    fn serialize_deserialize_json() -> Result<(), Box<dyn Error>> {
+        let schema = sample_schema();
+
+        let buffer = serde_json::to_vec(&schema)?;
+        let decoded: Schema = serde_json::from_slice(buffer.as_slice())?;
+
+        assert_eq!(schema, decoded);
+
+        let data = r#"
+            {
+                "identifier": "id",
+                "attributes": {
+                    "alpha": {
+                        "displayed": true
+                    },
+                    "beta": {
+                        "displayed": true,
+                        "indexed": true
+                    },
+                    "gamma": {
+                        "indexed": true
+                    }
+                }
+            }"#;
+        let parsed: Schema = serde_json::from_str(data)?;
+        assert_eq!(schema, parsed);
+
+        Ok(())
+    }
+}
--- a/meilidb-tokenizer/Cargo.toml
+++ b/meilidb-tokenizer/Cargo.toml
@ -0,0 +1,8 @@
+[package]
+name = "meilidb-tokenizer"
+version = "0.1.0"
+authors = ["Kerollmops <renault.cle@gmail.com>"]
+edition = "2018"
+
+[dependencies]
+slice-group-by = "0.2.4"
--- a/meilidb-tokenizer/src/lib.rs
+++ b/meilidb-tokenizer/src/lib.rs
@ -0,0 +1,295 @@
+use std::iter::Peekable;
+use slice_group_by::StrGroupBy;
+use self::SeparatorCategory::*;
+
+/// Tells whether `c` belongs to one of the CJK ranges handled by the
+/// tokenizer.
+pub fn is_cjk(c: char) -> bool {
+    match c {
+        '\u{2e80}'..='\u{2eff}'   // CJK Radicals Supplement
+        | '\u{2f00}'..='\u{2fdf}' // Kangxi Radicals
+        | '\u{3040}'..='\u{309f}' // Hiragana
+        | '\u{30a0}'..='\u{30ff}' // Katakana
+        | '\u{3100}'..='\u{312f}' // Bopomofo
+        | '\u{3200}'..='\u{32ff}' // Enclosed CJK Letters and Months
+        | '\u{3400}'..='\u{4dbf}' // CJK Unified Ideographs Extension A
+        | '\u{4e00}'..='\u{9fff}' // CJK Unified Ideographs
+        | '\u{f900}'..='\u{faff}' // CJK Compatibility Ideographs
+        => true,
+        _ => false,
+    }
+}
+
+/// Strength of a separator, as returned by `classify_separator`.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum SeparatorCategory {
+    Soft,
+    Hard,
+}
+
+impl SeparatorCategory {
+    /// Combines two categories: the result is `Soft` only when both
+    /// sides are `Soft`.
+    fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
+        match (self, other) {
+            (Soft, Soft) => Soft,
+            _ => Hard,
+        }
+    }
+
+    /// Amount added to the word index when a separator group of this
+    /// category is crossed.
+    fn to_usize(self) -> usize {
+        match self {
+            Hard => 8,
+            Soft => 1,
+        }
+    }
+}
+
+/// Tells whether `c` is a separator of any category.
+fn is_separator(c: char) -> bool {
+    let category = classify_separator(c);
+    category.is_some()
+}
+
+/// Returns the separator category of `c`, or `None` when `c` is not a
+/// separator.
+fn classify_separator(c: char) -> Option<SeparatorCategory> {
+    const SOFT: [char; 6] = [' ', '-', '_', '\'', ':', '"'];
+    const HARD: [char; 7] = ['.', ';', ',', '!', '?', '(', ')'];
+
+    if SOFT.contains(&c) {
+        Some(Soft)
+    } else if HARD.contains(&c) {
+        Some(Hard)
+    } else {
+        None
+    }
+}
+
+/// Category of a character, as computed by `classify_char`.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum CharCategory {
+    // the character is a separator of the given category
+    Separator(SeparatorCategory),
+    // the character is not a separator and `is_cjk` accepts it
+    Cjk,
+    // everything else
+    Other,
+}
+
+/// Classifies `c`; the separator check takes precedence over the CJK one.
+fn classify_char(c: char) -> CharCategory {
+    match classify_separator(c) {
+        Some(category) => CharCategory::Separator(category),
+        None if is_cjk(c) => CharCategory::Cjk,
+        None => CharCategory::Other,
+    }
+}
+
+/// A string is a word when none of its characters is a separator
+/// (the empty string therefore counts as a word).
+fn is_str_word(s: &str) -> bool {
+    s.chars().all(|c| !is_separator(c))
+}
+
+/// Grouping predicate: tells whether two neighbouring characters belong to
+/// the same group.
+///
+/// A CJK character never groups with anything, all separators group
+/// together whatever their category, and the remaining characters group
+/// with characters of the same category.
+fn same_group_category(a: char, b: char) -> bool {
+    let left = classify_char(a);
+    let right = classify_char(b);
+
+    if left == CharCategory::Cjk || right == CharCategory::Cjk {
+        return false;
+    }
+
+    match (left, right) {
+        (CharCategory::Separator(_), CharCategory::Separator(_)) => true,
+        _ => left == right,
+    }
+}
+
+// fold step for `char_indices()`: counts the chars seen so far and keeps
+// the byte index located just after the last char seen
+fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, usize) {
+    let seen = n + 1;
+    let end = i + c.len_utf8();
+    (seen, end)
+}
+
+/// Splits `query` into its words, dropping the positions computed by
+/// the `Tokenizer`.
+pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
+    let tokens = Tokenizer::new(query);
+    tokens.map(|token| token.word)
+}
+
+/// A word extracted from a text along with its positions.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub struct Token<'a> {
+    /// The word itself, borrowed from the tokenized text.
+    pub word: &'a str,
+    /// Position of the word; separators crossed before it add their
+    /// `SeparatorCategory::to_usize` weight.
+    pub word_index: usize,
+    /// Number of chars (not bytes) preceding the word in the text.
+    pub char_index: usize,
+}
+
+/// Iterator yielding the `Token`s of a single text.
+pub struct Tokenizer<'a> {
+    // part of the text that has not been consumed yet
+    inner: &'a str,
+    // word index given to the next token
+    word_index: usize,
+    // number of chars consumed so far
+    char_index: usize,
+}
+
+impl<'a> Tokenizer<'a> {
+    /// Creates a tokenizer over `string`.
+    ///
+    /// Leading separators are skipped right away and the number of chars
+    /// skipped becomes the initial `char_index`.
+    pub fn new(string: &str) -> Tokenizer {
+        let trimmed = string.trim_start_matches(is_separator);
+        let skipped = &string[..string.len() - trimmed.len()];
+
+        Tokenizer {
+            inner: trimmed,
+            word_index: 0,
+            char_index: skipped.chars().count(),
+        }
+    }
+}
+
+impl<'a> Iterator for Tokenizer<'a> {
+    type Item = Token<'a>;
+
+    /// Returns the next word of the text, skipping the separator groups
+    /// found before it.
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut groups = self.inner.linear_group_by(same_group_category).peekable();
+
+        while let Some(group) = groups.next() {
+            // chars held by the group, and what is left of the text after it
+            let char_len = group.chars().count();
+            let rest = &self.inner[group.len()..];
+
+            if is_str_word(group) {
+                let token = Token {
+                    word: group,
+                    word_index: self.word_index,
+                    char_index: self.char_index,
+                };
+
+                // two adjacent words are not separated by anything that
+                // would advance the word index, so do it here
+                if groups.peek().map_or(false, |next| is_str_word(next)) {
+                    self.word_index += 1;
+                }
+
+                self.char_index += char_len;
+                self.inner = rest;
+
+                return Some(token);
+            }
+
+            // a group of separators: it weighs as much as the hardest
+            // separator it contains
+            let category = group.chars()
+                                .filter_map(classify_separator)
+                                .fold(Soft, SeparatorCategory::merge);
+
+            self.word_index += category.to_usize();
+            self.char_index += char_len;
+            self.inner = rest;
+        }
+
+        self.inner = "";
+        None
+    }
+}
+
+/// Iterator yielding the `Token`s of a sequence of texts, shifting the
+/// positions of each text by offsets saved from the previous ones.
+pub struct SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    // texts that have not been tokenized yet
+    inner: I,
+    // tokenizer of the text being processed, `None` once `inner` is exhausted
+    current: Option<Peekable<Tokenizer<'a>>>,
+    // offsets added to the positions of the tokens of the current text
+    word_offset: usize,
+    char_offset: usize,
+}
+
+impl<'a, I> SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    /// Creates a tokenizer over the texts yielded by `iter`; the first text
+    /// is pulled immediately.
+    pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
+        let first = iter.next();
+
+        SeqTokenizer {
+            current: first.map(|text| Tokenizer::new(text).peekable()),
+            inner: iter,
+            word_offset: 0,
+            char_offset: 0,
+        }
+    }
+}
+
+impl<'a, I> Iterator for SeqTokenizer<'a, I>
+where I: Iterator<Item=&'a str>,
+{
+    type Item = Token<'a>;
+
+    /// Returns the next token of the sequence, with the word and char
+    /// offsets of the previous texts applied, or `None` once every text
+    /// has been consumed.
+    fn next(&mut self) -> Option<Self::Item> {
+        // This is a loop rather than a recursive call so that a long run of
+        // texts yielding no token (empty or made of separators only)
+        // cannot grow the call stack.
+        loop {
+            // no more texts available
+            let current = self.current.as_mut()?;
+
+            match current.next() {
+                Some(token) => {
+                    // we must apply the word and char offsets
+                    // to the token before returning it
+                    let token = Token {
+                        word: token.word,
+                        word_index: token.word_index + self.word_offset,
+                        char_index: token.char_index + self.char_offset,
+                    };
+
+                    // if this is the last iteration on this text
+                    // we must save the offsets for next texts
+                    if current.peek().is_none() {
+                        let hard_space = SeparatorCategory::Hard.to_usize();
+                        self.word_offset = token.word_index + hard_space;
+                        self.char_offset = token.char_index + hard_space;
+                    }
+
+                    return Some(token);
+                },
+                None => {
+                    // no more words in this text we must
+                    // start tokenizing the next text
+                    self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
+                },
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Tokenizes `text` and checks that it yields exactly the `expected`
+    /// `(word, word_index, char_index)` triples, in order.
+    fn assert_tokens(text: &str, expected: &[(&str, usize, usize)]) {
+        let mut tokenizer = Tokenizer::new(text);
+
+        for &(word, word_index, char_index) in expected {
+            assert_eq!(tokenizer.next(), Some(Token { word, word_index, char_index }));
+        }
+        assert_eq!(tokenizer.next(), None);
+    }
+
+    #[test]
+    fn easy() {
+        assert_tokens("salut", &[("salut", 0, 0)]);
+        assert_tokens("yo    ", &[("yo", 0, 0)]);
+    }
+
+    #[test]
+    fn hard() {
+        assert_tokens(" .? yo lolo. aïe (ouch)", &[
+            ("yo", 0, 4),
+            ("lolo", 1, 7),
+            ("aïe", 9, 13),
+            ("ouch", 17, 18),
+        ]);
+
+        assert_tokens("yo ! lolo ? wtf - lol . aïe ,", &[
+            ("yo", 0, 0),
+            ("lolo", 8, 5),
+            ("wtf", 16, 12),
+            ("lol", 17, 18),
+            ("aïe", 25, 24),
+        ]);
+    }
+
+    #[test]
+    fn hard_long_chars() {
+        assert_tokens(" .? yo 😂. aïe", &[
+            ("yo", 0, 4),
+            ("😂", 1, 7),
+            ("aïe", 9, 10),
+        ]);
+
+        assert_tokens("yo ! lolo ? 😱 - lol . 😣 ,", &[
+            ("yo", 0, 0),
+            ("lolo", 8, 5),
+            ("😱", 16, 12),
+            ("lol", 17, 16),
+            ("😣", 25, 22),
+        ]);
+    }
+
+    #[test]
+    fn hard_kanjis() {
+        assert_tokens("\u{2ec4}lolilol\u{2ec7}", &[
+            ("\u{2ec4}", 0, 0),
+            ("lolilol", 1, 1),
+            ("\u{2ec7}", 2, 8),
+        ]);
+
+        assert_tokens("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello    \u{2ec7}", &[
+            ("\u{2ec4}", 0, 0),
+            ("\u{2ed3}", 1, 1),
+            ("\u{2ef2}", 2, 2),
+            ("lolilol", 3, 4),
+            ("hello", 4, 14),
+            ("\u{2ec7}", 5, 23),
+        ]);
+    }
+}