feat: Replace compressed Match fields by uncompressed ones

Clément Renault 2019-02-02 14:17:50 +01:00
parent b0b3175641
commit a3a28c56fa
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
5 changed files with 56 additions and 42 deletions
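
For context, a minimal sketch of what the change amounts to on DocIndex (Match follows the same pattern), assuming the former Attribute and WordArea types each packed their two values into one compressed u32 behind fallible constructors; that assumption is consistent with the old 16-byte size asserted in the test further down (an 8-byte id plus two 4-byte wrappers), but the packing itself is not visible in this diff:

    // Hypothetical stand-ins, only for illustration; the real DocumentId,
    // Attribute and WordArea definitions are not part of this diff.
    struct DocumentId(u64);
    struct Attribute(u32); // assumed: attribute id + word index packed together
    struct WordArea(u32);  // assumed: char index + char length packed together

    // Before this commit (reconstruction): compressed wrapper fields.
    struct CompressedDocIndex {
        document_id: DocumentId,
        attribute: Attribute,
        word_area: WordArea,
    }

    // After this commit: plain, uncompressed fields, as shown in the hunks below.
    struct UncompressedDocIndex {
        document_id: DocumentId,
        attribute: u16,
        word_index: u32,
        char_index: u32,
        char_length: u16,
    }

The trade-off is a slightly larger in-memory representation in exchange for direct field access and no fallible construction during indexing.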

View File

@@ -70,12 +70,10 @@ fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr)
     let mut byte_indexes = BTreeMap::new();

     for match_ in matches {
-        let match_attribute = match_.attribute.attribute();
+        let match_attribute = match_.attribute;
         if SchemaAttr::new(match_attribute) == attribute {
-            let word_area = match_.word_area;
-            let char_index = word_area.char_index() as usize;
-            let char_length = word_area.length() as usize;
+            let char_index = match_.char_index as usize;
+            let char_length = match_.char_length as usize;
             let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);

             match byte_indexes.entry(byte_index) {
@@ -151,7 +149,7 @@ fn main() -> Result<(), Box<Error>> {
         let mut matching_attributes = HashSet::new();
         for _match in doc.matches {
-            let attr = SchemaAttr::new(_match.attribute.attribute());
+            let attr = SchemaAttr::new(_match.attribute);
             let name = schema.attribute_name(attr);
             matching_attributes.insert(name);
         }
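
The first hunk above now reads char_index and char_length directly off the match before handing them to char_to_byte_range. For reference, a minimal sketch of what such a helper has to do, assuming it maps a character-based (index, length) pair onto a byte-based one in the UTF-8 text; the real implementation is not shown in this diff:

    // Hypothetical reconstruction of the helper's contract: convert a
    // character offset and length into a byte offset and length so the
    // highlighted area can be sliced out of the UTF-8 string.
    fn char_to_byte_range(char_index: usize, char_length: usize, text: &str) -> (usize, usize) {
        // Byte offset of the first highlighted character.
        let byte_index = text
            .char_indices()
            .nth(char_index)
            .map_or(text.len(), |(i, _)| i);
        // Byte offset just past the last highlighted character.
        let byte_end = text
            .char_indices()
            .nth(char_index + char_length)
            .map_or(text.len(), |(i, _)| i);
        (byte_index, byte_end - byte_index)
    }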

View File

@@ -158,18 +158,24 @@ mod tests {
     fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
         let a = DocIndex {
             document_id: DocumentId(0),
-            attribute: Attribute::new_faillible(3, 11),
-            word_area: WordArea::new_faillible(30, 4)
+            attribute: 3,
+            word_index: 11,
+            char_index: 30,
+            char_length: 4,
         };
         let b = DocIndex {
             document_id: DocumentId(1),
-            attribute: Attribute::new_faillible(4, 21),
-            word_area: WordArea::new_faillible(35, 6)
+            attribute: 4,
+            word_index: 21,
+            char_index: 35,
+            char_length: 6,
         };
         let c = DocIndex {
             document_id: DocumentId(2),
-            attribute: Attribute::new_faillible(8, 2),
-            word_area: WordArea::new_faillible(89, 6)
+            attribute: 8,
+            word_index: 2,
+            char_index: 89,
+            char_length: 6,
         };

         let mut builder = DocIndexesBuilder::memory();
@@ -193,18 +199,24 @@ mod tests {
     fn serialize_deserialize() -> Result<(), Box<Error>> {
         let a = DocIndex {
             document_id: DocumentId(0),
-            attribute: Attribute::new_faillible(3, 11),
-            word_area: WordArea::new_faillible(30, 4)
+            attribute: 3,
+            word_index: 11,
+            char_index: 30,
+            char_length: 4,
         };
         let b = DocIndex {
             document_id: DocumentId(1),
-            attribute: Attribute::new_faillible(4, 21),
-            word_area: WordArea::new_faillible(35, 6)
+            attribute: 4,
+            word_index: 21,
+            char_index: 35,
+            char_length: 6,
        };
         let c = DocIndex {
             document_id: DocumentId(2),
-            attribute: Attribute::new_faillible(8, 2),
-            word_area: WordArea::new_faillible(89, 6)
+            attribute: 8,
+            word_index: 2,
+            char_index: 89,
+            char_length: 6,
         };

         let mut builder = DocIndexesBuilder::memory();

View File

@@ -54,10 +54,8 @@ where B: TokenizerBuilder
         let document_id = self.document_id;

         // FIXME must u32::try_from instead
-        let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
-            Ok(attribute) => attribute,
-            Err(_) => return Ok(()),
-        };
+        let attribute = self.attribute.0;
+        let word_index = word_index as u32;

         // insert the exact representation
         let word_lower = word.to_lowercase();
@@ -68,21 +66,17 @@ where B: TokenizerBuilder
             // and the unidecoded lowercased version
             let word_unidecoded = unidecode::unidecode(word).to_lowercase();
             if word_lower != word_unidecoded {
-                let word_area = match WordArea::new(char_index as u32, length) {
-                    Ok(word_area) => word_area,
-                    Err(_) => return Ok(()),
-                };
-                let doc_index = DocIndex { document_id, attribute, word_area };
+                let char_index = char_index as u32;
+                let char_length = length;
+                let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
                 self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
             }

-            let word_area = match WordArea::new(char_index as u32, length) {
-                Ok(word_area) => word_area,
-                Err(_) => return Ok(()),
-            };
+            let char_index = char_index as u32;
+            let char_length = length;

-            let doc_index = DocIndex { document_id, attribute, word_area };
+            let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
             self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
         }

         Ok(())
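
The first hunk of this file keeps the "// FIXME must u32::try_from instead" note: the old code silently skipped a doc index whenever Attribute::new or WordArea::new failed, while the new code narrows the indexes with plain `as` casts. A hypothetical follow-up for that FIXME, assuming the intent is still to skip out-of-range values rather than truncate them:

    use std::convert::TryFrom; // in the prelude only from the 2021 edition onwards

    // Hypothetical helper: narrow a usize index to u32, returning None instead
    // of silently truncating, so the caller can keep the old behaviour of
    // skipping the doc index (`return Ok(())`) on overflow.
    fn narrow_index(index: usize) -> Option<u32> {
        u32::try_from(index).ok()
    }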

View File

@@ -36,14 +36,16 @@ pub struct DocIndex {
     /// The attribute in the document where the word was found
     /// along with the index in it.
-    pub attribute: Attribute,
+    pub attribute: u16,
+    pub word_index: u32,

     /// The position in bytes where the word was found
     /// along with the length of it.
     ///
     /// It informs on the original word area in the text indexed
     /// without needing to run the tokenizer again.
-    pub word_area: WordArea,
+    pub char_index: u32,
+    pub char_length: u16,
 }

 /// This structure represent a matching word with informations
@@ -68,7 +70,8 @@ pub struct Match {
     /// The attribute in the document where the word was found
     /// along with the index in it.
-    pub attribute: Attribute,
+    pub attribute: u16,
+    pub word_index: u32,

     /// Whether the word that match is an exact match or a prefix.
     pub is_exact: bool,
@@ -78,7 +81,8 @@ pub struct Match {
     ///
     /// It informs on the original word area in the text indexed
     /// without needing to run the tokenizer again.
-    pub word_area: WordArea,
+    pub char_index: u32,
+    pub char_length: u16,
 }

 impl Match {
@@ -86,9 +90,11 @@ impl Match {
         Match {
             query_index: 0,
             distance: 0,
-            attribute: Attribute::new_faillible(0, 0),
+            attribute: 0,
+            word_index: 0,
             is_exact: false,
-            word_area: WordArea::new_faillible(0, 0),
+            char_index: 0,
+            char_length: 0,
         }
     }
@@ -96,9 +102,11 @@ impl Match {
         Match {
             query_index: u32::max_value(),
             distance: u8::max_value(),
-            attribute: Attribute::max_value(),
+            attribute: u16::max_value(),
+            word_index: u32::max_value(),
             is_exact: true,
-            word_area: WordArea::max_value(),
+            char_index: u32::max_value(),
+            char_length: u16::max_value(),
         }
     }
 }
@@ -110,6 +118,6 @@ mod tests {
     #[test]
     fn docindex_mem_size() {
-        assert_eq!(mem::size_of::<DocIndex>(), 16);
+        assert_eq!(mem::size_of::<DocIndex>(), 24);
     }
 }
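
The assertion above moves from 16 to 24 bytes. As a rough cross-check, assuming DocumentId wraps a u64 (its definition is not part of this diff): the new fields sum to 8 + 2 + 4 + 4 + 2 = 20 bytes, which the 8-byte alignment of the u64 pads up to 24:

    use std::mem;

    // Hypothetical layout check mirroring the new DocIndex fields, with
    // DocumentId assumed to be a plain u64 newtype.
    #[allow(dead_code)]
    struct LayoutCheck {
        document_id: u64,
        attribute: u16,
        word_index: u32,
        char_index: u32,
        char_length: u16,
    }

    fn main() {
        // 20 bytes of data rounded up to a multiple of the 8-byte alignment.
        assert_eq!(mem::size_of::<LayoutCheck>(), 24);
    }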

View File

@@ -111,8 +111,10 @@ where D: Deref<Target=DB>,
                     query_index: iv.index as u32,
                     distance: distance,
                     attribute: doc_index.attribute,
+                    word_index: doc_index.word_index,
                     is_exact: is_exact,
-                    word_area: doc_index.word_area,
+                    char_index: doc_index.char_index,
+                    char_length: doc_index.char_length,
                 };

                 matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_);
} }