mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-19 01:18:31 +08:00
feat: Replace compressed Match fields by uncompressed ones
This commit is contained in:
parent
b0b3175641
commit
a3a28c56fa
@ -70,12 +70,10 @@ fn create_highlight_areas(text: &str, matches: &[Match], attribute: SchemaAttr)
|
|||||||
let mut byte_indexes = BTreeMap::new();
|
let mut byte_indexes = BTreeMap::new();
|
||||||
|
|
||||||
for match_ in matches {
|
for match_ in matches {
|
||||||
let match_attribute = match_.attribute.attribute();
|
let match_attribute = match_.attribute;
|
||||||
if SchemaAttr::new(match_attribute) == attribute {
|
if SchemaAttr::new(match_attribute) == attribute {
|
||||||
let word_area = match_.word_area;
|
let char_index = match_.char_index as usize;
|
||||||
|
let char_length = match_.char_length as usize;
|
||||||
let char_index = word_area.char_index() as usize;
|
|
||||||
let char_length = word_area.length() as usize;
|
|
||||||
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
|
let (byte_index, byte_length) = char_to_byte_range(char_index, char_length, text);
|
||||||
|
|
||||||
match byte_indexes.entry(byte_index) {
|
match byte_indexes.entry(byte_index) {
|
||||||
@ -151,7 +149,7 @@ fn main() -> Result<(), Box<Error>> {
|
|||||||
|
|
||||||
let mut matching_attributes = HashSet::new();
|
let mut matching_attributes = HashSet::new();
|
||||||
for _match in doc.matches {
|
for _match in doc.matches {
|
||||||
let attr = SchemaAttr::new(_match.attribute.attribute());
|
let attr = SchemaAttr::new(_match.attribute);
|
||||||
let name = schema.attribute_name(attr);
|
let name = schema.attribute_name(attr);
|
||||||
matching_attributes.insert(name);
|
matching_attributes.insert(name);
|
||||||
}
|
}
|
||||||
|
@ -158,18 +158,24 @@ mod tests {
|
|||||||
fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
|
fn builder_serialize_deserialize() -> Result<(), Box<Error>> {
|
||||||
let a = DocIndex {
|
let a = DocIndex {
|
||||||
document_id: DocumentId(0),
|
document_id: DocumentId(0),
|
||||||
attribute: Attribute::new_faillible(3, 11),
|
attribute: 3,
|
||||||
word_area: WordArea::new_faillible(30, 4)
|
word_index: 11,
|
||||||
|
char_index: 30,
|
||||||
|
char_length: 4,
|
||||||
};
|
};
|
||||||
let b = DocIndex {
|
let b = DocIndex {
|
||||||
document_id: DocumentId(1),
|
document_id: DocumentId(1),
|
||||||
attribute: Attribute::new_faillible(4, 21),
|
attribute: 4,
|
||||||
word_area: WordArea::new_faillible(35, 6)
|
word_index: 21,
|
||||||
|
char_index: 35,
|
||||||
|
char_length: 6,
|
||||||
};
|
};
|
||||||
let c = DocIndex {
|
let c = DocIndex {
|
||||||
document_id: DocumentId(2),
|
document_id: DocumentId(2),
|
||||||
attribute: Attribute::new_faillible(8, 2),
|
attribute: 8,
|
||||||
word_area: WordArea::new_faillible(89, 6)
|
word_index: 2,
|
||||||
|
char_index: 89,
|
||||||
|
char_length: 6,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut builder = DocIndexesBuilder::memory();
|
let mut builder = DocIndexesBuilder::memory();
|
||||||
@ -193,18 +199,24 @@ mod tests {
|
|||||||
fn serialize_deserialize() -> Result<(), Box<Error>> {
|
fn serialize_deserialize() -> Result<(), Box<Error>> {
|
||||||
let a = DocIndex {
|
let a = DocIndex {
|
||||||
document_id: DocumentId(0),
|
document_id: DocumentId(0),
|
||||||
attribute: Attribute::new_faillible(3, 11),
|
attribute: 3,
|
||||||
word_area: WordArea::new_faillible(30, 4)
|
word_index: 11,
|
||||||
|
char_index: 30,
|
||||||
|
char_length: 4,
|
||||||
};
|
};
|
||||||
let b = DocIndex {
|
let b = DocIndex {
|
||||||
document_id: DocumentId(1),
|
document_id: DocumentId(1),
|
||||||
attribute: Attribute::new_faillible(4, 21),
|
attribute: 4,
|
||||||
word_area: WordArea::new_faillible(35, 6)
|
word_index: 21,
|
||||||
|
char_index: 35,
|
||||||
|
char_length: 6,
|
||||||
};
|
};
|
||||||
let c = DocIndex {
|
let c = DocIndex {
|
||||||
document_id: DocumentId(2),
|
document_id: DocumentId(2),
|
||||||
attribute: Attribute::new_faillible(8, 2),
|
attribute: 8,
|
||||||
word_area: WordArea::new_faillible(89, 6)
|
word_index: 2,
|
||||||
|
char_index: 89,
|
||||||
|
char_length: 6,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut builder = DocIndexesBuilder::memory();
|
let mut builder = DocIndexesBuilder::memory();
|
||||||
|
@ -54,10 +54,8 @@ where B: TokenizerBuilder
|
|||||||
let document_id = self.document_id;
|
let document_id = self.document_id;
|
||||||
|
|
||||||
// FIXME must u32::try_from instead
|
// FIXME must u32::try_from instead
|
||||||
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
|
let attribute = self.attribute.0;
|
||||||
Ok(attribute) => attribute,
|
let word_index = word_index as u32;
|
||||||
Err(_) => return Ok(()),
|
|
||||||
};
|
|
||||||
|
|
||||||
// insert the exact representation
|
// insert the exact representation
|
||||||
let word_lower = word.to_lowercase();
|
let word_lower = word.to_lowercase();
|
||||||
@ -68,21 +66,17 @@ where B: TokenizerBuilder
|
|||||||
// and the unidecoded lowercased version
|
// and the unidecoded lowercased version
|
||||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||||
if word_lower != word_unidecoded {
|
if word_lower != word_unidecoded {
|
||||||
let word_area = match WordArea::new(char_index as u32, length) {
|
let char_index = char_index as u32;
|
||||||
Ok(word_area) => word_area,
|
let char_length = length;
|
||||||
Err(_) => return Ok(()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let doc_index = DocIndex { document_id, attribute, word_area };
|
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
|
||||||
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
|
self.update.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
let word_area = match WordArea::new(char_index as u32, length) {
|
let char_index = char_index as u32;
|
||||||
Ok(word_area) => word_area,
|
let char_length = length;
|
||||||
Err(_) => return Ok(()),
|
|
||||||
};
|
|
||||||
|
|
||||||
let doc_index = DocIndex { document_id, attribute, word_area };
|
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
|
||||||
self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
|
self.update.insert_doc_index(word_lower.into_bytes(), doc_index);
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
26
src/lib.rs
26
src/lib.rs
@ -36,14 +36,16 @@ pub struct DocIndex {
|
|||||||
|
|
||||||
/// The attribute in the document where the word was found
|
/// The attribute in the document where the word was found
|
||||||
/// along with the index in it.
|
/// along with the index in it.
|
||||||
pub attribute: Attribute,
|
pub attribute: u16,
|
||||||
|
pub word_index: u32,
|
||||||
|
|
||||||
/// The position in bytes where the word was found
|
/// The position in bytes where the word was found
|
||||||
/// along with the length of it.
|
/// along with the length of it.
|
||||||
///
|
///
|
||||||
/// It informs on the original word area in the text indexed
|
/// It informs on the original word area in the text indexed
|
||||||
/// without needing to run the tokenizer again.
|
/// without needing to run the tokenizer again.
|
||||||
pub word_area: WordArea,
|
pub char_index: u32,
|
||||||
|
pub char_length: u16,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This structure represent a matching word with informations
|
/// This structure represent a matching word with informations
|
||||||
@ -68,7 +70,8 @@ pub struct Match {
|
|||||||
|
|
||||||
/// The attribute in the document where the word was found
|
/// The attribute in the document where the word was found
|
||||||
/// along with the index in it.
|
/// along with the index in it.
|
||||||
pub attribute: Attribute,
|
pub attribute: u16,
|
||||||
|
pub word_index: u32,
|
||||||
|
|
||||||
/// Whether the word that match is an exact match or a prefix.
|
/// Whether the word that match is an exact match or a prefix.
|
||||||
pub is_exact: bool,
|
pub is_exact: bool,
|
||||||
@ -78,7 +81,8 @@ pub struct Match {
|
|||||||
///
|
///
|
||||||
/// It informs on the original word area in the text indexed
|
/// It informs on the original word area in the text indexed
|
||||||
/// without needing to run the tokenizer again.
|
/// without needing to run the tokenizer again.
|
||||||
pub word_area: WordArea,
|
pub char_index: u32,
|
||||||
|
pub char_length: u16,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Match {
|
impl Match {
|
||||||
@ -86,9 +90,11 @@ impl Match {
|
|||||||
Match {
|
Match {
|
||||||
query_index: 0,
|
query_index: 0,
|
||||||
distance: 0,
|
distance: 0,
|
||||||
attribute: Attribute::new_faillible(0, 0),
|
attribute: 0,
|
||||||
|
word_index: 0,
|
||||||
is_exact: false,
|
is_exact: false,
|
||||||
word_area: WordArea::new_faillible(0, 0),
|
char_index: 0,
|
||||||
|
char_length: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -96,9 +102,11 @@ impl Match {
|
|||||||
Match {
|
Match {
|
||||||
query_index: u32::max_value(),
|
query_index: u32::max_value(),
|
||||||
distance: u8::max_value(),
|
distance: u8::max_value(),
|
||||||
attribute: Attribute::max_value(),
|
attribute: u16::max_value(),
|
||||||
|
word_index: u32::max_value(),
|
||||||
is_exact: true,
|
is_exact: true,
|
||||||
word_area: WordArea::max_value(),
|
char_index: u32::max_value(),
|
||||||
|
char_length: u16::max_value(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -110,6 +118,6 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn docindex_mem_size() {
|
fn docindex_mem_size() {
|
||||||
assert_eq!(mem::size_of::<DocIndex>(), 16);
|
assert_eq!(mem::size_of::<DocIndex>(), 24);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -111,8 +111,10 @@ where D: Deref<Target=DB>,
|
|||||||
query_index: iv.index as u32,
|
query_index: iv.index as u32,
|
||||||
distance: distance,
|
distance: distance,
|
||||||
attribute: doc_index.attribute,
|
attribute: doc_index.attribute,
|
||||||
|
word_index: doc_index.word_index,
|
||||||
is_exact: is_exact,
|
is_exact: is_exact,
|
||||||
word_area: doc_index.word_area,
|
char_index: doc_index.char_index,
|
||||||
|
char_length: doc_index.char_length,
|
||||||
};
|
};
|
||||||
matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_);
|
matches.entry(doc_index.document_id).or_insert_with(Vec::new).push(match_);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user