meilisearch/src/lib.rs

pub mod automaton;
pub mod database;
pub mod data;
pub mod rank;
pub mod tokenizer;
pub mod vec_read_only;
mod common_words;

use std::fmt;

pub use rocksdb;

pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;

/// An internally generated identifier that uniquely designates a document.
///
/// Hand it to the database to tell it which document you want
/// to deserialize. It is also helpful when writing custom ranking.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64);

/// Represents an attribute number along with the word index
/// according to the tokenizer used.
///
/// Both values are packed into a single `u32`: the attribute number
/// occupies the 10 most significant bits and the word index
/// the 22 least significant bits.
///
/// It can therefore address 1024 attributes (`0..=1023`) and
/// word indexes from `0` to `2^22 - 1`.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Attribute(u32);

impl Attribute {
    /// Number of bits reserved for the attribute number.
    const ATTRIBUTE_BITS: u32 = 10;

    /// Number of low bits reserved for the word index.
    const INDEX_BITS: u32 = 22;

    /// Mask selecting the word index bits of the packed value.
    const INDEX_MASK: u32 = (1 << Self::INDEX_BITS) - 1;

    /// Construct an `Attribute` from an attribute number and
    /// the word position of a match according to the tokenizer used.
    ///
    /// # Panics
    ///
    /// The attribute must be strictly lower than 1024
    /// and the word index strictly lower than 2^22.
    fn new(attribute: u16, index: u32) -> Attribute {
        // Range comparisons are used instead of hand-written bit masks:
        // a mask literal that is not exactly 32 bits wide silently checks
        // the wrong bits.
        assert!(
            u32::from(attribute) < (1 << Self::ATTRIBUTE_BITS),
            "attribute number must be strictly lower than 1024"
        );
        assert!(
            index <= Self::INDEX_MASK,
            "word index must be strictly lower than 2^22"
        );

        Attribute((u32::from(attribute) << Self::INDEX_BITS) | index)
    }

    /// Returns the attribute number stored in the 10 most significant bits.
    pub fn attribute(&self) -> u16 {
        (self.0 >> Self::INDEX_BITS) as u16
    }

    /// Returns the word index stored in the 22 least significant bits.
    pub fn word_index(&self) -> u32 {
        self.0 & Self::INDEX_MASK
    }
}

/// Shows the two unpacked components rather than the raw packed integer.
impl fmt::Debug for Attribute {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("Attribute")
            .field("attribute", &self.attribute())
            .field("word_index", &self.word_index())
            .finish()
    }
}

/// Represents a word position in bytes along with the length of it.
///
/// Both values are packed into a single `u32`: the byte index
/// occupies the 22 most significant bits and the length
/// the 10 least significant bits.
///
/// It can therefore represent byte indexes from `0` to `2^22 - 1`
/// and word lengths from `0` to `1023`.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct WordArea(u32);

impl WordArea {
    /// Number of bits reserved for the byte index.
    const BYTE_INDEX_BITS: u32 = 22;

    /// Number of low bits reserved for the word length.
    const LENGTH_BITS: u32 = 10;

    /// Mask selecting the length bits of the packed value.
    const LENGTH_MASK: u32 = (1 << Self::LENGTH_BITS) - 1;

    /// Construct a `WordArea` from a word position in bytes
    /// and the length of it.
    ///
    /// # Panics
    ///
    /// The byte index must be strictly lower than 2^22
    /// and the length strictly lower than 1024.
    fn new(byte_index: u32, length: u16) -> WordArea {
        // A byte index of 2^22 or more would lose its high bits in the
        // shift below, so it is rejected with a plain range comparison.
        assert!(
            byte_index < (1 << Self::BYTE_INDEX_BITS),
            "byte index must be strictly lower than 2^22"
        );
        assert!(
            u32::from(length) <= Self::LENGTH_MASK,
            "length must be strictly lower than 1024"
        );

        WordArea((byte_index << Self::LENGTH_BITS) | u32::from(length))
    }

    /// Returns the byte index stored in the 22 most significant bits.
    pub fn byte_index(&self) -> u32 {
        self.0 >> Self::LENGTH_BITS
    }

    /// Returns the word length stored in the 10 least significant bits.
    pub fn length(&self) -> u16 {
        (self.0 & Self::LENGTH_MASK) as u16
    }
}

/// Shows the two unpacked components rather than the raw packed integer.
impl fmt::Debug for WordArea {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("WordArea")
            .field("byte_index", &self.byte_index())
            .field("length", &self.length())
            .finish()
    }
}

/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
///
/// The struct is `#[repr(C)]`, so its fields are laid out
/// in declaration order; do not reorder them.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
    /// The identifier of the document where the word was found.
    pub document_id: DocumentId,

    /// The attribute in the document where the word was found
    /// along with the word index in it.
    pub attribute: Attribute,

    /// The position in bytes where the word was found
    /// along with the length of it.
    ///
    /// It describes the original word area in the indexed text
    /// without needing to run the tokenizer again.
    pub word_area: WordArea,
}

/// This structure represents a matching word with information
/// on the location of the word in the document.
///
/// The order of the fields is important because it defines
/// the way these structures are ordered between themselves
/// (the `PartialOrd`/`Ord` implementations are derived).
///
/// The word in itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
    /// The word index in the query sentence.
    /// Same as the `attribute_index` but for the query words.
    ///
    /// Used to retrieve the automaton that matches this word.
    pub query_index: u32,

    /// The distance the word has with the query word
    /// (i.e. the Levenshtein distance).
    pub distance: u8,

    /// The attribute in the document where the word was found
    /// along with the word index in it.
    pub attribute: Attribute,

    /// Whether the word that matches is an exact match or a prefix.
    pub is_exact: bool,

    /// The position in bytes where the word was found
    /// along with the length of it.
    ///
    /// It describes the original word area in the indexed text
    /// without needing to run the tokenizer again.
    pub word_area: WordArea,
}

impl Match {
    /// Builds a `Match` in which every field holds its lowest value.
    pub fn zero() -> Self {
        let attribute = Attribute::new(0, 0);
        let word_area = WordArea::new(0, 0);

        Self {
            query_index: 0,
            distance: 0,
            attribute,
            is_exact: false,
            word_area,
        }
    }

    /// Builds a `Match` in which every field holds its highest value.
    ///
    /// The packed `attribute` and `word_area` are filled with all bits set,
    /// written directly as raw values instead of going through `new`.
    pub fn max() -> Self {
        let attribute = Attribute(u32::max_value());
        let word_area = WordArea(u32::max_value());

        Self {
            query_index: u32::max_value(),
            distance: u8::max_value(),
            attribute,
            is_exact: true,
            word_area,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use quickcheck::{quickcheck, TestResult};
    use std::mem;

    // `DocIndex` is expected to stay 16 bytes wide.
    #[test]
    fn docindex_mem_size() {
        assert_eq!(mem::size_of::<DocIndex>(), 16);
    }

    quickcheck! {
        // Packing then unpacking an `Attribute` must give back the inputs.
        fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult {
            // `Attribute::new` panics on 1024 and on 2^22 themselves,
            // so the bounds are exclusive.
            if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) {
                return TestResult::discard()
            }

            let attribute = Attribute::new(gen_attr, gen_index);

            let valid_attribute = attribute.attribute() == gen_attr;
            let valid_index = attribute.word_index() == gen_index;

            TestResult::from_bool(valid_attribute && valid_index)
        }

        // An `Attribute` built from strictly greater parts must compare greater.
        fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult {
            // Both values are incremented below, so the incremented
            // values must still be accepted by `Attribute::new`.
            if gen_attr >= 2_u16.pow(10) - 1 || gen_index >= 2_u32.pow(22) - 1 {
                return TestResult::discard()
            }

            let a = Attribute::new(gen_attr, gen_index);
            let b = Attribute::new(gen_attr + 1, gen_index + 1);

            TestResult::from_bool(a < b)
        }

        // Packing then unpacking a `WordArea` must give back the inputs.
        fn qc_word_area(gen_byte_index: u32, gen_length: u16) -> TestResult {
            // `WordArea::new` panics on 2^22 and on 1024 themselves,
            // so the bounds are exclusive.
            if gen_byte_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) {
                return TestResult::discard()
            }

            let word_area = WordArea::new(gen_byte_index, gen_length);

            let valid_char_index = word_area.byte_index() == gen_byte_index;
            let valid_length = word_area.length() == gen_length;

            TestResult::from_bool(valid_char_index && valid_length)
        }

        // A `WordArea` built from strictly greater parts must compare greater.
        fn qc_word_area_ord(gen_byte_index: u32, gen_length: u16) -> TestResult {
            // Both values are incremented below, so the incremented
            // values must still be accepted by `WordArea::new`.
            if gen_byte_index >= 2_u32.pow(22) - 1 || gen_length >= 2_u16.pow(10) - 1 {
                return TestResult::discard()
            }

            let a = WordArea::new(gen_byte_index, gen_length);
            let b = WordArea::new(gen_byte_index + 1, gen_length + 1);

            TestResult::from_bool(a < b)
        }
    }
}
feat: Introduce the QueryBuilder struct 2018-11-28 02:11:33 +08:00			`pub mod automaton;`
feat: Introduce the Database and DatabaseView 2018-12-02 23:45:17 +08:00			`pub mod database;`
feat: Working on ops for Positive and Negative blobs 2018-11-08 19:05:59 +08:00			`pub mod data;`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`pub mod rank;`
feat: introduce a better simple word lexer 2018-09-27 22:32:17 +08:00			`pub mod tokenizer;`
feat: Introduce the QueryBuilder struct 2018-11-28 02:11:33 +08:00			`pub mod vec_read_only;`
chore: Make the repo use examples and keep the library 2018-10-10 00:23:35 +08:00			`mod common_words;`
feat: Improve the indexing time a little bit ...by a factor of 17.6x. 2018-07-11 03:29:17 +08:00
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`use std::fmt;`

feat: Reexport the internal rocksdb 2018-12-13 18:52:34 +08:00			`pub use rocksdb;`

feat: Use the new Tokenizer in the csv-indexer 2018-09-27 22:59:41 +08:00			`pub use self::tokenizer::Tokenizer;`
chore: Make the repo use examples and keep the library 2018-10-10 00:23:35 +08:00			`pub use self::common_words::CommonWords;`
feat: Make the parsing more generic over json 2018-05-13 21:12:15 +08:00
feat: Create a strong DocumentId type Forcing it to be something internal will permit to avoid possible miss comparisons to be done with other types. 2018-12-22 19:00:24 +08:00			`/// Represent an internally generated document unique identifier.`
			`///`
			`/// It is used to inform the database the document you want to deserialize.`
			`/// Helpful for custom ranking.`
			`#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]`
feat: Make the schema consider document ids 2018-12-25 19:26:38 +08:00			`pub struct DocumentId(u64);`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`/// Represent an attribute number along with the word index`
			`/// according to the tokenizer used.`
			`///`
			`/// It can accept up to 1024 attributes and word positions`
			`/// can be maximum 2^22.`
			`#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]`
			`pub struct Attribute(u32);`

			`impl Attribute {`
			/// Construct an `Attribute` from an attribute number and
			`/// the word position of a match according to the tokenizer used.`
			`///`
			`/// # Panics`
			`///`
			`/// The attribute must not be greater than 1024`
			`/// and the word index not greater than 2^22.`
			`fn new(attribute: u16, index: u32) -> Attribute {`
			`assert!(attribute & 0b1111_1100_0000_0000 == 0);`
			`assert!(index & 0b1111_1111_1100_0000_0000_0000_0000 == 0);`

			`let attribute = (attribute as u32) << 22;`
			`Attribute(attribute \| index)`
			`}`

			`pub fn attribute(&self) -> u16 {`
			`(self.0 >> 22) as u16`
			`}`

			`pub fn word_index(&self) -> u32 {`
			`self.0 & 0b0000_0000_0011_1111_1111_1111_1111`
			`}`
			`}`

			`impl fmt::Debug for Attribute {`
			`fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {`
			`f.debug_struct("Attribute")`
			`.field("attribute", &self.attribute())`
			`.field("word_index", &self.word_index())`
			`.finish()`
			`}`
			`}`

			`/// Represent a word position in bytes along with the length of it.`
			`///`
			`/// It can represent words byte index to maximum 2^22 and`
			`/// up to words of length 1024.`
			`#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]`
			`pub struct WordArea(u32);`

			`impl WordArea {`
			/// Construct a `WordArea` from a word position in bytes
			`/// and the length of it.`
			`///`
			`/// # Panics`
			`///`
			`/// The byte index must not be greater than 2^22`
			`/// and the length not greater than 1024.`
			`fn new(byte_index: u32, length: u16) -> WordArea {`
			`assert!(byte_index & 0b1111_1111_1100_0000_0000_0000_0000 == 0);`
			`assert!(length & 0b1111_1100_0000_0000 == 0);`

			`let byte_index = byte_index << 10;`
			`WordArea(byte_index \| (length as u32))`
			`}`

			`pub fn byte_index(&self) -> u32 {`
			`self.0 >> 10`
			`}`

			`pub fn length(&self) -> u16 {`
			`(self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16`
			`}`
			`}`

			`impl fmt::Debug for WordArea {`
			`fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {`
			`f.debug_struct("WordArea")`
			`.field("byte_index", &self.byte_index())`
			`.field("length", &self.length())`
			`.finish()`
			`}`
			`}`

feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00			`/// This structure represent the position of a word`
			`/// in a document and its attributes.`
			`///`
			`/// This is stored in the map, generated at index time,`
			`/// extracted and interpreted at search time.`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]`
feat: Improve the indexing time a little bit ...by a factor of 17.6x. 2018-07-11 03:29:17 +08:00			`#[repr(C)]`
feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00			`pub struct DocIndex {`
			`/// The document identifier where the word was found.`
feat: Introduce a way to distinct documents 2018-10-17 19:35:34 +08:00			`pub document_id: DocumentId,`
feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`/// The attribute in the document where the word was found`
			`/// along with the index in it.`
			`pub attribute: Attribute,`
feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`/// The position in bytes where the word was found`
			`/// along with the length of it.`
feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00			`///`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`/// It informs on the original word area in the text indexed`
			`/// without needing to run the tokenizer again.`
			`pub word_area: WordArea,`
feat: Make the parsing more generic over json 2018-05-13 21:12:15 +08:00			`}`

feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00			`/// This structure represent a matching word with informations`
			`/// on the location of the word in the document.`
			`///`
			`/// The order of the field is important because it defines`
			`/// the way these structures are ordered between themselves.`
			`///`
			`/// The word in itself is not important.`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`// TODO do data oriented programming ? very arrays ?`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]`
feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00			`pub struct Match {`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`/// The word index in the query sentence.`
			/// Same as the `attribute_index` but for the query words.
			`///`
			`/// Used to retrieve the automaton that match this word.`
			`pub query_index: u32,`

feat: Define a `DocIndex` struct 2018-05-27 17:15:05 +08:00			`/// The distance the word has with the query word`
			`/// (i.e. the Levenshtein distance).`
			`pub distance: u8,`

feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`/// The attribute in the document where the word was found`
			`/// along with the index in it.`
			`pub attribute: Attribute,`
feat: Implement the excat match ranking rule 2018-07-07 02:58:06 +08:00
			`/// Whether the word that match is an exact match or a prefix.`
			`pub is_exact: bool,`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00
			`/// The position in bytes where the word was found`
			`/// along with the length of it.`
			`///`
			`/// It informs on the original word area in the text indexed`
			`/// without needing to run the tokenizer again.`
			`pub word_area: WordArea,`
feat: Make the parsing more generic over json 2018-05-13 21:12:15 +08:00			`}`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00
			`impl Match {`
			`pub fn zero() -> Self {`
			`Match {`
			`query_index: 0,`
			`distance: 0,`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`attribute: Attribute::new(0, 0),`
feat: Implement the excat match ranking rule 2018-07-07 02:58:06 +08:00			`is_exact: false,`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`word_area: WordArea::new(0, 0),`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`}`
			`}`

			`pub fn max() -> Self {`
			`Match {`
			`query_index: u32::max_value(),`
			`distance: u8::max_value(),`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`attribute: Attribute(u32::max_value()),`
feat: Implement the excat match ranking rule 2018-07-07 02:58:06 +08:00			`is_exact: true,`
feat: Introduce a WordArea struct Useful to highlight matching areas in the original text. 2018-12-23 23:46:49 +08:00			`word_area: WordArea(u32::max_value()),`
			`}`
			`}`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`
			`use quickcheck::{quickcheck, TestResult};`
			`use std::mem;`

			`#[test]`
			`fn docindex_mem_size() {`
			`assert_eq!(mem::size_of::<DocIndex>(), 16);`
			`}`

			`quickcheck! {`
			`fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult {`
			`if gen_attr > 2_u16.pow(10) \|\| gen_index > 2_u32.pow(22) {`
			`return TestResult::discard()`
			`}`

			`let attribute = Attribute::new(gen_attr, gen_index);`

			`let valid_attribute = attribute.attribute() == gen_attr;`
			`let valid_index = attribute.word_index() == gen_index;`

			`TestResult::from_bool(valid_attribute && valid_index)`
			`}`

			`fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult {`
			`if gen_attr >= 2_u16.pow(10) \|\| gen_index >= 2_u32.pow(22) {`
			`return TestResult::discard()`
			`}`

			`let a = Attribute::new(gen_attr, gen_index);`
			`let b = Attribute::new(gen_attr + 1, gen_index + 1);`

			`TestResult::from_bool(a < b)`
			`}`

			`fn qc_word_area(gen_byte_index: u32, gen_length: u16) -> TestResult {`
			`if gen_byte_index > 2_u32.pow(22) \|\| gen_length > 2_u16.pow(10) {`
			`return TestResult::discard()`
			`}`

			`let word_area = WordArea::new(gen_byte_index, gen_length);`

			`let valid_char_index = word_area.byte_index() == gen_byte_index;`
			`let valid_length = word_area.length() == gen_length;`

			`TestResult::from_bool(valid_char_index && valid_length)`
			`}`

			`fn qc_word_area_ord(gen_byte_index: u32, gen_length: u16) -> TestResult {`
			`if gen_byte_index >= 2_u32.pow(22) \|\| gen_length >= 2_u16.pow(10) {`
			`return TestResult::discard()`
			`}`

			`let a = WordArea::new(gen_byte_index, gen_length);`
			`let b = WordArea::new(gen_byte_index + 1, gen_length + 1);`

			`TestResult::from_bool(a < b)`
feat: Introduce basic ranking rules 2018-05-27 21:23:43 +08:00			`}`
			`}`
			`}`