// meilisearch/raptor/src/lib.rs

#[macro_use] extern crate serde_derive;
extern crate bincode;
extern crate fst;
extern crate group_by;
extern crate levenshtein_automata;
extern crate serde;

pub mod map;
pub mod rank;
mod levenshtein;

use std::path::Path;
use std::fs;

pub use self::map::{Map, MapBuilder, Values};
pub use self::map::{
    OpBuilder, IndexedValues,
    OpWithStateBuilder, IndexedValuesWithState,
};
pub use self::rank::{RankedStream};
pub use self::levenshtein::LevBuilder;

/// A `Map` specialized for `DocIndex` values.
pub type DocIndexMap = Map<DocIndex>;
/// A `MapBuilder` specialized for `DocIndex` values.
pub type DocIndexMapBuilder = MapBuilder<DocIndex>;

/// Identifier of a document, as stored in `DocIndex::document`.
pub type DocumentId = u64;

/// Represents the position of a word in a document,
/// together with the attribute it was found in.
///
/// Values of this type are stored in the map: they are generated
/// at index time, then extracted and interpreted at search time.
///
/// The derived `PartialOrd`/`Ord` compare the fields in declaration order
/// (`document`, then `attribute`, then `attribute_index`), so reordering
/// the fields changes how values sort.
// NOTE(review): the serialized form produced by the derived
// `Serialize`/`Deserialize` presumably depends on the field order
// too; confirm before changing the layout.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub struct DocIndex {

    /// The identifier of the document in which the word was found.
    pub document: DocumentId,

    /// The identifier of the attribute, inside the document,
    /// in which the word was found.
    ///
    /// This is a `u8`, therefore a document
    /// can not have more than `2^8` attributes.
    pub attribute: u8,

    /// The index at which the word was found in the attribute.
    ///
    /// Only the first 1000 words are indexed.
    // NOTE(review): the `u32` type does not enforce this limit;
    // presumably the indexing code does. Confirm against the indexer.
    pub attribute_index: u32,
}

/// Represents a matching word, with information
/// on the location of the word in the document.
///
/// The order of the fields is important: the derived `PartialOrd`/`Ord`
/// compare fields in declaration order, so it defines the way
/// these structures are ordered between themselves.
///
/// The word itself is not stored, it is not important here.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct Match {

    /// The index of the word in the query sentence.
    /// Same as the `attribute_index` but for the query words.
    ///
    /// Used to retrieve the automaton that matches this word.
    pub query_index: u32,

    /// The distance between the word and the query word
    /// (i.e. the Levenshtein distance).
    pub distance: u8,

    /// The attribute in which the word is located
    /// (e.g. Title is 0, Description is 1).
    ///
    /// This is a `u8`, therefore a document
    /// can not have more than `2^8` attributes.
    pub attribute: u8,

    /// Where this word is located in the attribute string
    /// (i.e. at the start or the end of the attribute).
    ///
    /// Stored as a `u32`, so the index can not exceed `2^32 - 1`;
    /// in practice only the first 1000 words of an attribute are indexed.
    pub attribute_index: u32,

    /// Whether the matching word is an exact match (`true`)
    /// or only a prefix match (`false`).
    pub is_exact: bool,
}

impl Match {
    pub fn zero() -> Self {
        Match {
            query_index: 0,
            distance: 0,
            attribute: 0,
            attribute_index: 0,
            is_exact: false,
        }
    }

    pub fn max() -> Self {
        Match {
            query_index: u32::max_value(),
            distance: u8::max_value(),
            attribute: u8::max_value(),
            attribute_index: u32::max_value(),
            is_exact: true,
        }
    }
}


/// Loads a `DocIndexMap` from two files on disk.
///
/// The file at `map` is read entirely into memory first, then the file
/// at `values`; both byte buffers are handed to `DocIndexMap::from_bytes`.
///
/// # Errors
///
/// Returns an error if either file can not be read (the `map` file is
/// tried first), or any error reported by `DocIndexMap::from_bytes`.
pub fn load_map<P, Q>(map: P, values: Q) -> fst::Result<DocIndexMap>
where
    P: AsRef<Path>,
    Q: AsRef<Path>,
{
    let fst_bytes = fs::read(map)?;
    let values_bytes = fs::read(values)?;

    DocIndexMap::from_bytes(fst_bytes, &values_bytes)
}
dump: Make the data less prone of memory indirections 2018-05-05 16:59:03 +08:00			`#[macro_use] extern crate serde_derive;`
cli: Make work to index json lines 2018-04-22 23:34:41 +08:00			`extern crate bincode;`
chore: Initial commit 2018-04-22 21:54:34 +08:00			`extern crate fst;`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`extern crate group_by;`
clean: Remove statics and use `Arc`s 2018-05-13 18:38:24 +08:00			`extern crate levenshtein_automata;`
dump: Make the data less prone of memory indirections 2018-05-05 16:59:03 +08:00			`extern crate serde;`
chore: Initial commit 2018-04-22 21:54:34 +08:00
chore: Move Streams to map file 2018-05-12 19:28:43 +08:00			`pub mod map;`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`pub mod rank;`
clean: Remove statics and use `Arc`s 2018-05-13 18:38:24 +08:00			`mod levenshtein;`
chore: Initial commit 2018-04-22 21:54:34 +08:00
test: Add a raptor-search bench 2018-06-24 21:10:13 +08:00			`use std::path::Path;`
			`use std::fs;`

chore: Remove useless `Fst` prefixes 2018-05-12 19:22:07 +08:00			`pub use self::map::{Map, MapBuilder, Values};`
			`pub use self::map::{`
feat(search): Accept multiple words and do a simple union 2018-05-06 18:23:42 +08:00			`OpBuilder, IndexedValues,`
			`OpWithStateBuilder, IndexedValuesWithState,`
			`};`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`pub use self::rank::{RankedStream};`
clean: Remove statics and use `Arc`s 2018-05-13 18:38:24 +08:00			`pub use self::levenshtein::LevBuilder;`
feat: Make the parsing more generic over json 2018-05-13 21:12:15 +08:00
feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00			`pub type DocIndexMap = Map<DocIndex>;`
			`pub type DocIndexMapBuilder = MapBuilder<DocIndex>;`

feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`pub type DocumentId = u64;`

feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00			`/// This structure represent the position of a word`
			`/// in a document and its attributes.`
			`///`
			`/// This is stored in the map, generated at index time,`
			`/// extracted and interpreted at search time.`
			`#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash, Serialize, Deserialize)]`
			`pub struct DocIndex {`

			`/// The document identifier where the word was found.`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`pub document: DocumentId,`
feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00
			`/// The attribute identifier in the document`
			`/// where the word was found.`
			`///`
			/// This is an `u8` therefore a document
			/// can not have more than `2^8` attributes.
			`pub attribute: u8,`

			`/// The index where the word was found in the attribute.`
			`///`
			`/// Only the first 1000 words are indexed.`
			`pub attribute_index: u32,`
feat: Make the parsing more generic over json 2018-05-13 21:12:15 +08:00			`}`

feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00			`/// This structure represent a matching word with informations`
			`/// on the location of the word in the document.`
			`///`
			`/// The order of the field is important because it defines`
			`/// the way these structures are ordered between themselves.`
			`///`
			`/// The word in itself is not important.`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`// TODO do data oriented programming ? very arrays ?`
feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00			`#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]`
			`pub struct Match {`

feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`/// The word index in the query sentence.`
			/// Same as the `attribute_index` but for the query words.
			`///`
			`/// Used to retrieve the automaton that match this word.`
			`pub query_index: u32,`

feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00			`/// The distance the word has with the query word`
			`/// (i.e. the Levenshtein distance).`
			`pub distance: u8,`

			`/// The attribute in which the word is located`
			`/// (i.e. Title is 0, Description is 1).`
			`///`
			/// This is an `u8` therefore a document
			/// can not have more than `2^8` attributes.
feat: Make the parsing more generic over json 2018-05-13 21:12:15 +08:00			`pub attribute: u8,`
feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00
			`/// Where does this word is located in the attribute string`
			`/// (i.e. at the start or the end of the attribute).`
			`///`
			/// The index in the attribute is limited to a maximum of `2^32`
			`/// this is because we index only the first 1000 words in an attribute.`
			`pub attribute_index: u32,`
feat: Implement the excat match ranking rule 2018-07-07 02:58:06 +08:00
			`/// Whether the word that match is an exact match or a prefix.`
			`pub is_exact: bool,`
feat: Make the parsing more generic over json 2018-05-13 21:12:15 +08:00			`}`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00
			`impl Match {`
			`pub fn zero() -> Self {`
			`Match {`
			`query_index: 0,`
			`distance: 0,`
			`attribute: 0,`
			`attribute_index: 0,`
feat: Implement the excat match ranking rule 2018-07-07 02:58:06 +08:00			`is_exact: false,`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`}`
			`}`

			`pub fn max() -> Self {`
			`Match {`
			`query_index: u32::max_value(),`
			`distance: u8::max_value(),`
			`attribute: u8::max_value(),`
			`attribute_index: u32::max_value(),`
feat: Implement the excat match ranking rule 2018-07-07 02:58:06 +08:00			`is_exact: true,`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`}`
			`}`
			`}`
test: Add a raptor-search bench 2018-06-24 21:10:13 +08:00

			`pub fn load_map<P, Q>(map: P, values: Q) -> fst::Result<DocIndexMap>`
			`where P: AsRef<Path>, Q: AsRef<Path>,`
			`{`
			`let fst = fs::read(map)?;`
			`let values = fs::read(values)?;`
			`DocIndexMap::from_bytes(fst, &values)`
			`}`