From 3a194bfcc7f2c8bcb7a05c6a252f6fa395d9a13d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 27 May 2018 11:15:05 +0200 Subject: [PATCH] feat: Define a `DocIndex` struct --- raptor-bin/src/main.rs | 20 ++++++------- raptor-http/src/main.rs | 6 ++-- src/lib.rs | 64 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 69 insertions(+), 21 deletions(-) diff --git a/raptor-bin/src/main.rs b/raptor-bin/src/main.rs index dda7d808d..63b236a98 100644 --- a/raptor-bin/src/main.rs +++ b/raptor-bin/src/main.rs @@ -9,7 +9,7 @@ use std::fs::File; use std::io::{BufReader, BufRead}; use std::iter; -use raptor::{MapBuilder, Map, Value, AttrIndex}; +use raptor::{DocIndexMapBuilder, DocIndexMap, DocIndex}; use serde_json::from_str; fn main() { @@ -35,7 +35,7 @@ fn main() { } }; - let mut builder = MapBuilder::new(); + let mut builder = DocIndexMapBuilder::new(); for line in data.lines() { let line = line.unwrap(); @@ -51,14 +51,12 @@ fn main() { let words = title.chain(description); for (i, (attr, word)) in words { - let value = Value { - id: product["product_id"].as_u64().expect("invalid `product_id`"), - attr_index: AttrIndex { - attribute: attr, - index: i as u64, - }, + let doc_index = DocIndex { + document: product["product_id"].as_u64().expect("invalid `product_id`"), + attribute: attr, + attribute_index: i as u32, }; - builder.insert(word, value); + builder.insert(word, doc_index); } } @@ -66,6 +64,6 @@ fn main() { let values = File::create("values.vecs").unwrap(); let (map, values) = builder.build(map, values).unwrap(); - eprintln!("Checking the dump consistency..."); - unsafe { Map::<Value>::from_paths("map.fst", "values.vecs").unwrap() }; + println!("Checking the dump consistency..."); + unsafe { DocIndexMap::from_paths("map.fst", "values.vecs").unwrap() }; } diff --git a/raptor-http/src/main.rs b/raptor-http/src/main.rs index 897f7043b..74c6d39fc 100644 --- a/raptor-http/src/main.rs +++ b/raptor-http/src/main.rs @@ -16,10 +16,10 @@ use 
tokio_minihttp::{Request, Response, Http}; use tokio_proto::TcpServer; use tokio_service::Service; -use raptor::{Map, OpWithStateBuilder, LevBuilder, Value}; +use raptor::{DocIndexMap, OpWithStateBuilder, LevBuilder}; struct MainService { - map: Arc<Map<Value>>, + map: Arc<DocIndexMap>, lev_builder: Arc<LevBuilder>, } @@ -92,7 +92,7 @@ fn main() { let map = { let fst = fs::read("map.fst").unwrap(); let values = fs::read("values.vecs").unwrap(); - let map = Map::from_bytes(fst, &values).unwrap(); + let map = DocIndexMap::from_bytes(fst, &values).unwrap(); Arc::new(map) }; diff --git a/src/lib.rs b/src/lib.rs index 3f436a907..c29d79747 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,14 +16,64 @@ pub use self::map::{ pub use self::capped_btree_map::{CappedBTreeMap, Insertion}; pub use self::levenshtein::LevBuilder; -#[derive(Debug, Serialize, Deserialize)] -pub struct Value { - pub id: u64, - pub attr_index: AttrIndex, +pub type DocIndexMap = Map<DocIndex>; +pub type DocIndexMapBuilder = MapBuilder<DocIndex>; + +/// This structure represents the position of a word +/// in a document and its attributes. +/// +/// This is stored in the map, generated at index time, +/// extracted and interpreted at search time. +#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash, Serialize, Deserialize)] +pub struct DocIndex { + + /// The document identifier where the word was found. + pub document: u64, + + /// The attribute identifier in the document + /// where the word was found. + /// + /// This is an `u8` therefore a document + /// can not have more than `2^8` attributes. + pub attribute: u8, + + /// The index where the word was found in the attribute. + /// + /// Only the first 1000 words are indexed. + pub attribute_index: u32, } -#[derive(Debug, Serialize, Deserialize)] -pub struct AttrIndex { +/// This structure represents a matching word with information +/// on the location of the word in the document. 
+/// +/// The order of the fields is important because it defines +/// the way these structures are ordered between themselves. +/// +/// The word in itself is not important. +#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] +pub struct Match { + + /// The distance the word has with the query word + /// (i.e. the Levenshtein distance). + pub distance: u8, + + /// The attribute in which the word is located + /// (i.e. Title is 0, Description is 1). + /// + /// This is an `u8` therefore a document + /// can not have more than `2^8` attributes. pub attribute: u8, - pub index: u64, + + /// The word index in the query sentence. + /// Same as the `attribute_index` but for the query words. + /// + /// Used to retrieve the automaton that matches this word. + pub query_index: u32, + + /// Where this word is located in the attribute string + /// (i.e. at the start or the end of the attribute). + /// + /// The index in the attribute is limited to a maximum of `2^32`; + /// this is because we index only the first 1000 words in an attribute. + pub attribute_index: u32, }