diff --git a/src/attribute.rs b/src/attribute.rs new file mode 100644 index 000000000..4c075e475 --- /dev/null +++ b/src/attribute.rs @@ -0,0 +1,105 @@ +use std::fmt; + +/// Represent an attribute number along with the word index +/// according to the tokenizer used. +/// +/// It can accept up to 1024 attributes and word positions +/// can be maximum 2^22. +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Attribute(u32); + +impl Attribute { + /// Construct an `Attribute` from an attribute number and + /// the word position of a match according to the tokenizer used. + pub(crate) fn new(attribute: u16, index: u32) -> Result { + if attribute & 0b1111_1100_0000_0000 != 0 { + return Err(AttributeError::AttributeTooBig) + } + + if index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { + return Err(AttributeError::IndexTooBig) + } + + let attribute = u32::from(attribute) << 22; + Ok(Attribute(attribute | index)) + } + + /// Construct an `Attribute` from an attribute number and + /// the word position of a match according to the tokenizer used. + /// + /// # Panics + /// + /// The attribute must not be greater than 1024 + /// and the word index not greater than 2^22. + pub(crate) fn new_faillible(attribute: u16, index: u32) -> Attribute { + match Attribute::new(attribute, index) { + Ok(attribute) => attribute, + Err(AttributeError::AttributeTooBig) => { + panic!("attribute must not be greater than 1024") + }, + Err(AttributeError::IndexTooBig) => { + panic!("attribute word index must not be greater than 2^22") + }, + } + } + + pub(crate) fn max_value() -> Attribute { + Attribute(u32::max_value()) + } + + #[inline] + pub fn attribute(self) -> u16 { + (self.0 >> 22) as u16 + } + + #[inline] + pub fn word_index(self) -> u32 { + self.0 & 0b0000_0000_0011_1111_1111_1111_1111 + } +} + +impl fmt::Debug for Attribute { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Attribute") + .field("attribute", &self.attribute()) + .field("word_index", &self.word_index()) + .finish() + } +} + +pub enum AttributeError { + AttributeTooBig, + IndexTooBig, +} + +#[cfg(test)] +mod tests { + use super::*; + use quickcheck::{quickcheck, TestResult}; + + quickcheck! { + fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult { + if gen_attr > 2_u16.pow(10) || gen_index > 2_u32.pow(22) { + return TestResult::discard() + } + + let attribute = Attribute::new_faillible(gen_attr, gen_index); + + let valid_attribute = attribute.attribute() == gen_attr; + let valid_index = attribute.word_index() == gen_index; + + TestResult::from_bool(valid_attribute && valid_index) + } + + fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult { + if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) { + return TestResult::discard() + } + + let a = Attribute::new_faillible(gen_attr, gen_index); + let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1); + + TestResult::from_bool(a < b) + } + } +} diff --git a/src/lib.rs b/src/lib.rs index 03c2a200d..19e451f63 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,14 +5,16 @@ pub mod database; pub mod data; pub mod rank; pub mod tokenizer; +mod attribute; +mod word_area; mod common_words; -use std::fmt; - pub use rocksdb; pub use self::tokenizer::Tokenizer; pub use self::common_words::CommonWords; +pub use self::attribute::{Attribute, AttributeError}; +pub use self::word_area::{WordArea, WordAreaError}; /// Represent an internally generated document unique identifier. /// @@ -21,139 +23,6 @@ pub use self::common_words::CommonWords; #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] pub struct DocumentId(u64); -/// Represent an attribute number along with the word index -/// according to the tokenizer used. -/// -/// It can accept up to 1024 attributes and word positions -/// can be maximum 2^22. -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct Attribute(u32); - -impl Attribute { - /// Construct an `Attribute` from an attribute number and - /// the word position of a match according to the tokenizer used. - fn new(attribute: u16, index: u32) -> Result { - if attribute & 0b1111_1100_0000_0000 != 0 { - return Err(AttributeError::AttributeTooBig) - } - - if index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { - return Err(AttributeError::IndexTooBig) - } - - let attribute = u32::from(attribute) << 22; - Ok(Attribute(attribute | index)) - } - - /// Construct an `Attribute` from an attribute number and - /// the word position of a match according to the tokenizer used. - /// - /// # Panics - /// - /// The attribute must not be greater than 1024 - /// and the word index not greater than 2^22. - fn new_faillible(attribute: u16, index: u32) -> Attribute { - match Attribute::new(attribute, index) { - Ok(attribute) => attribute, - Err(AttributeError::AttributeTooBig) => { - panic!("attribute must not be greater than 1024") - }, - Err(AttributeError::IndexTooBig) => { - panic!("attribute word index must not be greater than 2^22") - }, - } - } - - #[inline] - pub fn attribute(self) -> u16 { - (self.0 >> 22) as u16 - } - - #[inline] - pub fn word_index(self) -> u32 { - self.0 & 0b0000_0000_0011_1111_1111_1111_1111 - } -} - -impl fmt::Debug for Attribute { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("Attribute") - .field("attribute", &self.attribute()) - .field("word_index", &self.word_index()) - .finish() - } -} - -enum AttributeError { - AttributeTooBig, - IndexTooBig, -} - -/// Represent a word position in bytes along with the length of it. -/// -/// It can represent words byte index to maximum 2^22 and -/// up to words of length 1024. -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct WordArea(u32); - -impl WordArea { - /// Construct a `WordArea` from a word position in expresed as - /// a number of characters and the length of it. - /// - /// # Panics - /// - /// The char index must not be greater than 2^22 - /// and the length not greater than 1024. - fn new(char_index: u32, length: u16) -> Result { - if char_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { - return Err(WordAreaError::ByteIndexTooBig) - } - - if length & 0b1111_1100_0000_0000 != 0 { - return Err(WordAreaError::LengthTooBig) - } - - let char_index = char_index << 10; - Ok(WordArea(char_index | u32::from(length))) - } - - fn new_faillible(char_index: u32, length: u16) -> WordArea { - match WordArea::new(char_index, length) { - Ok(word_area) => word_area, - Err(WordAreaError::ByteIndexTooBig) => { - panic!("word area byte index must not be greater than 2^22") - }, - Err(WordAreaError::LengthTooBig) => { - panic!("word area length must not be greater than 1024") - }, - } - } - - #[inline] - pub fn char_index(self) -> u32 { - self.0 >> 10 - } - - #[inline] - pub fn length(self) -> u16 { - (self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16 - } -} - -impl fmt::Debug for WordArea { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("WordArea") - .field("char_index", &self.char_index()) - .field("length", &self.length()) - .finish() - } -} - -enum WordAreaError { - ByteIndexTooBig, - LengthTooBig, -} - /// This structure represent the position of a word /// in a document and its attributes. /// @@ -227,9 +96,9 @@ impl Match { Match { query_index: u32::max_value(), distance: u8::max_value(), - attribute: Attribute(u32::max_value()), + attribute: Attribute::max_value(), is_exact: true, - word_area: WordArea(u32::max_value()), + word_area: WordArea::max_value(), } } } @@ -237,61 +106,10 @@ impl Match { #[cfg(test)] mod tests { use super::*; - use quickcheck::{quickcheck, TestResult}; use std::mem; #[test] fn docindex_mem_size() { assert_eq!(mem::size_of::(), 16); } - - quickcheck! { - fn qc_attribute(gen_attr: u16, gen_index: u32) -> TestResult { - if gen_attr > 2_u16.pow(10) || gen_index > 2_u32.pow(22) { - return TestResult::discard() - } - - let attribute = Attribute::new_faillible(gen_attr, gen_index); - - let valid_attribute = attribute.attribute() == gen_attr; - let valid_index = attribute.word_index() == gen_index; - - TestResult::from_bool(valid_attribute && valid_index) - } - - fn qc_attribute_ord(gen_attr: u16, gen_index: u32) -> TestResult { - if gen_attr >= 2_u16.pow(10) || gen_index >= 2_u32.pow(22) { - return TestResult::discard() - } - - let a = Attribute::new_faillible(gen_attr, gen_index); - let b = Attribute::new_faillible(gen_attr + 1, gen_index + 1); - - TestResult::from_bool(a < b) - } - - fn qc_word_area(gen_char_index: u32, gen_length: u16) -> TestResult { - if gen_char_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) { - return TestResult::discard() - } - - let word_area = WordArea::new_faillible(gen_char_index, gen_length); - - let valid_char_index = word_area.char_index() == gen_char_index; - let valid_length = word_area.length() == gen_length; - - TestResult::from_bool(valid_char_index && valid_length) - } - - fn qc_word_area_ord(gen_char_index: u32, gen_length: u16) -> TestResult { - if gen_char_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) { - return TestResult::discard() - } - - let a = WordArea::new_faillible(gen_char_index, gen_length); - let b = WordArea::new_faillible(gen_char_index + 1, gen_length + 1); - - TestResult::from_bool(a < b) - } - } } diff --git a/src/word_area.rs b/src/word_area.rs new file mode 100644 index 000000000..593b462a6 --- /dev/null +++ b/src/word_area.rs @@ -0,0 +1,102 @@ +use std::fmt; + +/// Represent a word position in bytes along with the length of it. +/// +/// It can represent words byte index to maximum 2^22 and +/// up to words of length 1024. +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct WordArea(u32); + +impl WordArea { + /// Construct a `WordArea` from a word position in expresed as + /// a number of characters and the length of it. + /// + /// # Panics + /// + /// The char index must not be greater than 2^22 + /// and the length not greater than 1024. + pub(crate) fn new(char_index: u32, length: u16) -> Result { + if char_index & 0b1111_1111_1100_0000_0000_0000_0000 != 0 { + return Err(WordAreaError::ByteIndexTooBig) + } + + if length & 0b1111_1100_0000_0000 != 0 { + return Err(WordAreaError::LengthTooBig) + } + + let char_index = char_index << 10; + Ok(WordArea(char_index | u32::from(length))) + } + + pub(crate) fn new_faillible(char_index: u32, length: u16) -> WordArea { + match WordArea::new(char_index, length) { + Ok(word_area) => word_area, + Err(WordAreaError::ByteIndexTooBig) => { + panic!("word area byte index must not be greater than 2^22") + }, + Err(WordAreaError::LengthTooBig) => { + panic!("word area length must not be greater than 1024") + }, + } + } + + pub(crate) fn max_value() -> WordArea { + WordArea(u32::max_value()) + } + + #[inline] + pub fn char_index(self) -> u32 { + self.0 >> 10 + } + + #[inline] + pub fn length(self) -> u16 { + (self.0 & 0b0000_0000_0000_0000_0011_1111_1111) as u16 + } +} + +impl fmt::Debug for WordArea { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("WordArea") + .field("char_index", &self.char_index()) + .field("length", &self.length()) + .finish() + } +} + +pub enum WordAreaError { + ByteIndexTooBig, + LengthTooBig, +} + +#[cfg(test)] +mod tests { + use super::*; + use quickcheck::{quickcheck, TestResult}; + + quickcheck! { + fn qc_word_area(gen_char_index: u32, gen_length: u16) -> TestResult { + if gen_char_index > 2_u32.pow(22) || gen_length > 2_u16.pow(10) { + return TestResult::discard() + } + + let word_area = WordArea::new_faillible(gen_char_index, gen_length); + + let valid_char_index = word_area.char_index() == gen_char_index; + let valid_length = word_area.length() == gen_length; + + TestResult::from_bool(valid_char_index && valid_length) + } + + fn qc_word_area_ord(gen_char_index: u32, gen_length: u16) -> TestResult { + if gen_char_index >= 2_u32.pow(22) || gen_length >= 2_u16.pow(10) { + return TestResult::discard() + } + + let a = WordArea::new_faillible(gen_char_index, gen_length); + let b = WordArea::new_faillible(gen_char_index + 1, gen_length + 1); + + TestResult::from_bool(a < b) + } + } +}