Mirror of https://github.com/meilisearch/meilisearch.git, synced 2024-11-26 12:05:05 +08:00
feat: Introduce the Indexer struct
commit 25a4961453 (parent 7338e522bd)
meilidb-data/Cargo.toml
@@ -9,7 +9,9 @@ bincode = "1.1.2"
 hashbrown = { version = "0.1.8", features = ["serde"] }
 linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
 meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
+meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
 ordered-float = { version = "1.0.2", features = ["serde"] }
+sdset = "0.3.1"
 serde = { version = "1.0.88", features = ["derive"] }
 serde_json = { version = "1.0.39", features = ["preserve_order"] }
 sled = "0.22.1"
meilidb-data/src/indexer.rs (new file, 84 lines)
@@ -0,0 +1,84 @@
use std::collections::BTreeMap;
use std::convert::TryFrom;

use meilidb_core::{DocumentId, DocIndex};
use meilidb_core::{Index as WordIndex, IndexBuilder as WordIndexBuilder};
use meilidb_tokenizer::{Tokenizer, SeqTokenizer, Token};
use crate::SchemaAttr;

use sdset::Set;

type Word = Vec<u8>; // TODO make it be a SmallVec

pub struct Indexer {
    word_limit: usize, // the maximum number of indexed words
    indexed: BTreeMap<Word, Vec<DocIndex>>,
}

impl Indexer {
    pub fn new() -> Indexer {
        Indexer {
            word_limit: 1000,
            indexed: BTreeMap::new(),
        }
    }

    pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
        for token in Tokenizer::new(text) {
            if token.word_index >= self.word_limit { break }
            let docindex = match token_to_docindex(id, attr, token) {
                Some(docindex) => docindex,
                None => break,
            };

            let word = Vec::from(token.word);
            self.indexed.entry(word).or_insert_with(Vec::new).push(docindex);
        }
    }

    pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
    where I: IntoIterator<Item=&'a str>,
    {
        let iter = iter.into_iter();
        for token in SeqTokenizer::new(iter) {
            if token.word_index >= self.word_limit { break }
            let docindex = match token_to_docindex(id, attr, token) {
                Some(docindex) => docindex,
                None => break,
            };

            let word = Vec::from(token.word);
            self.indexed.entry(word).or_insert_with(Vec::new).push(docindex);
        }
    }

    pub fn build(self) -> WordIndex {
        let mut builder = WordIndexBuilder::new();

        for (key, mut indexes) in self.indexed {
            indexes.sort_unstable();
            indexes.dedup();

            let indexes = Set::new_unchecked(&indexes);
            builder.insert(key, indexes).unwrap();
        }

        builder.build()
    }
}

fn token_to_docindex<'a>(id: DocumentId, attr: SchemaAttr, token: Token<'a>) -> Option<DocIndex> {
    let word_index = u16::try_from(token.word_index).ok()?;
    let char_index = u16::try_from(token.char_index).ok()?;
    let char_length = u16::try_from(token.word.chars().count()).ok()?;

    let docindex = DocIndex {
        document_id: id,
        attribute: attr.0,
        word_index: word_index,
        char_index: char_index,
        char_length: char_length,
    };

    Some(docindex)
}
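The new Indexer follows a write-then-freeze pattern: text is fed in per (document, attribute) pair, tokens past the 1000-word limit are dropped, and build() consumes the accumulated BTreeMap into an immutable WordIndex. A minimal usage sketch follows; the DocumentId(0) and SchemaAttr(0) tuple-struct constructors are assumptions about meilidb-core and meilidb-data at this point in history, not something shown in the diff itself.

use meilidb_core::DocumentId;
use meilidb_data::{Indexer, SchemaAttr};

fn main() {
    let mut indexer = Indexer::new();

    // Hypothetical ids: both types are assumed to be public tuple
    // structs, as suggested by the `attr.0` access in indexer.rs.
    let id = DocumentId(0);
    let title = SchemaAttr(0);
    let description = SchemaAttr(1);

    // A single string is tokenized with Tokenizer,
    // producing one DocIndex per indexed word.
    indexer.index_text(id, title, "the quick brown fox");

    // A sequence of strings is tokenized as one stream
    // with SeqTokenizer.
    indexer.index_text_seq(id, description, vec!["jumps over", "the lazy dog"]);

    // Consumes the Indexer; postings are sorted, deduplicated,
    // and frozen into the WordIndex.
    let _word_index = indexer.build();
}

Note that build() can stream keys straight into the WordIndexBuilder because a BTreeMap iterates its words in sorted order; only the per-word DocIndex vectors still need the sort_unstable/dedup pass.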
meilidb-data/src/lib.rs
@@ -1,9 +1,11 @@
 mod database;
-pub mod schema;
-mod ranked_map;
+mod indexer;
 mod number;
+mod ranked_map;
+pub mod schema;
 
 pub use self::database::{Database, Index};
-pub use self::schema::{Schema, SchemaAttr};
-pub use self::ranked_map::RankedMap;
 pub use self::number::Number;
+pub use self::ranked_map::RankedMap;
+pub use self::schema::{Schema, SchemaAttr};
+pub use self::indexer::Indexer;