// `WordArea` module (`src/word_area.rs`), split out of `src/lib.rs`.
//
// Wiring in `src/lib.rs` that belongs to the same change:
//   - `mod word_area;` is declared and the types are re-exported with
//     `pub use self::word_area::{WordArea, WordAreaError};`
//   - the inline `WordArea` definition, its tests and the `use std::fmt;`
//     import are removed from `lib.rs`
//   - `Match` fills its `word_area` field with `WordArea::max_value()`
//     instead of `WordArea(u32::max_value())`

use std::fmt;

/// Number of low bits of the packed value that store the word length.
const LENGTH_BITS: u32 = 10;

/// Largest char index that fits in the 22 remaining high bits (`2^22 - 1`).
const MAX_CHAR_INDEX: u32 = (1 << (32 - LENGTH_BITS)) - 1;

/// Largest length that fits in the 10 low bits (`1023`).
const MAX_LENGTH: u16 = (1 << LENGTH_BITS) - 1;

/// A word position and the length of that word, packed into one `u32`.
///
/// The 22 high bits hold the index and the 10 low bits hold the length, so
/// the index ranges over `0..2^22` and the length over `0..1024`.
///
/// The derived ordering compares the packed value: areas are ordered by
/// index first and by length second.
///
/// NOTE(review): the accessors call the position a `char_index` while the
/// error variant is named `ByteIndexTooBig`; confirm which unit callers pass.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct WordArea(u32);

impl WordArea {
    /// Packs `char_index` and `length` into a `WordArea`.
    ///
    /// # Errors
    ///
    /// * [`WordAreaError::ByteIndexTooBig`] when `char_index` is `2^22` or more.
    /// * [`WordAreaError::LengthTooBig`] when `length` is `1024` or more.
    pub(crate) fn new(char_index: u32, length: u16) -> Result<WordArea, WordAreaError> {
        // Compare against the real limits rather than a hand-written mask:
        // anything above them would lose its high bits in the shift below.
        if char_index > MAX_CHAR_INDEX {
            return Err(WordAreaError::ByteIndexTooBig);
        }

        if length > MAX_LENGTH {
            return Err(WordAreaError::LengthTooBig);
        }

        Ok(WordArea((char_index << LENGTH_BITS) | u32::from(length)))
    }

    /// Same as [`WordArea::new`] but panics instead of returning an error.
    ///
    /// # Panics
    ///
    /// Panics when `char_index` is `2^22` or more, or when `length` is
    /// `1024` or more.
    pub(crate) fn new_faillible(char_index: u32, length: u16) -> WordArea {
        match WordArea::new(char_index, length) {
            Ok(word_area) => word_area,
            Err(WordAreaError::ByteIndexTooBig) => {
                panic!("word area byte index must not be greater than 2^22")
            }
            Err(WordAreaError::LengthTooBig) => {
                panic!("word area length must not be greater than 1024")
            }
        }
    }

    /// Returns the area with every bit set, i.e. the greatest index paired
    /// with the greatest length.
    pub(crate) fn max_value() -> WordArea {
        WordArea(u32::max_value())
    }

    /// Returns the index stored in the 22 high bits.
    #[inline]
    pub fn char_index(self) -> u32 {
        self.0 >> LENGTH_BITS
    }

    /// Returns the length stored in the 10 low bits.
    #[inline]
    pub fn length(self) -> u16 {
        // The mask keeps at most 10 bits, so the narrowing cast is lossless.
        (self.0 & u32::from(MAX_LENGTH)) as u16
    }
}

impl fmt::Debug for WordArea {
    /// Prints the unpacked fields rather than the raw packed integer.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("WordArea")
            .field("char_index", &self.char_index())
            .field("length", &self.length())
            .finish()
    }
}

/// Reasons why a [`WordArea`] cannot be built from its two parts.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum WordAreaError {
    /// The index does not fit in 22 bits.
    ByteIndexTooBig,
    /// The length does not fit in 10 bits.
    LengthTooBig,
}

#[cfg(test)]
mod tests {
    // Deterministic sweeps over the whole valid range. The earlier property
    // tests only discarded inputs *above* `2^22` / `1024`, so they admitted
    // the two values that `new` rejects.
    use super::*;
    use std::iter::once;

    /// Indices spread over the valid range, both ends included.
    fn char_indexes() -> impl Iterator<Item = u32> {
        (0..MAX_CHAR_INDEX).step_by(65_521).chain(once(MAX_CHAR_INDEX))
    }

    /// Lengths spread over the valid range, both ends included.
    fn lengths() -> impl Iterator<Item = u16> {
        (0..MAX_LENGTH).step_by(31).chain(once(MAX_LENGTH))
    }

    #[test]
    fn word_area_round_trips() {
        for char_index in char_indexes() {
            for length in lengths() {
                let word_area = WordArea::new_faillible(char_index, length);

                assert_eq!(word_area.char_index(), char_index);
                assert_eq!(word_area.length(), length);
            }
        }
    }

    #[test]
    fn word_area_ord() {
        // Stay one below each maximum so that the `+ 1` remains valid.
        for char_index in char_indexes().filter(|&index| index < MAX_CHAR_INDEX) {
            for length in lengths().filter(|&length| length < MAX_LENGTH) {
                let a = WordArea::new_faillible(char_index, length);
                let b = WordArea::new_faillible(char_index + 1, length + 1);

                assert!(a < b);
            }
        }
    }

    #[test]
    fn word_area_rejects_out_of_range_parts() {
        assert_eq!(
            WordArea::new(MAX_CHAR_INDEX + 1, 0),
            Err(WordAreaError::ByteIndexTooBig)
        );
        assert_eq!(
            WordArea::new(u32::max_value(), 0),
            Err(WordAreaError::ByteIndexTooBig)
        );
        assert_eq!(
            WordArea::new(0, MAX_LENGTH + 1),
            Err(WordAreaError::LengthTooBig)
        );
    }
}