mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-26 20:15:07 +08:00
fix: Reduce the size of the DocIndex type
This commit is contained in:
parent
aef7d7825f
commit
a45cc4b618
@ -126,7 +126,7 @@ fn crop_text(
|
|||||||
(m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
|
(m.char_index as usize) + (m.char_length as usize) <= start + (context * 2)
|
||||||
})
|
})
|
||||||
.map(|match_| {
|
.map(|match_| {
|
||||||
Match { char_index: match_.char_index - start as u32, ..match_ }
|
Match { char_index: match_.char_index - start as u16, ..match_ }
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
@ -56,7 +56,7 @@ where B: TokenizerBuilder
|
|||||||
|
|
||||||
// FIXME must u32::try_from instead
|
// FIXME must u16::try_from instead
|
||||||
let attribute = self.attribute.0;
|
let attribute = self.attribute.0;
|
||||||
let word_index = word_index as u32;
|
let word_index = word_index as u16;
|
||||||
|
|
||||||
// insert the exact representation
|
// insert the exact representation
|
||||||
let word_lower = word.to_lowercase();
|
let word_lower = word.to_lowercase();
|
||||||
@ -69,7 +69,7 @@ where B: TokenizerBuilder
|
|||||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||||
let word_unidecoded = word_unidecoded.trim();
|
let word_unidecoded = word_unidecoded.trim();
|
||||||
if word_lower != word_unidecoded {
|
if word_lower != word_unidecoded {
|
||||||
let char_index = char_index as u32;
|
let char_index = char_index as u16;
|
||||||
let char_length = length;
|
let char_length = length;
|
||||||
|
|
||||||
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
|
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
|
||||||
@ -77,7 +77,7 @@ where B: TokenizerBuilder
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let char_index = char_index as u32;
|
let char_index = char_index as u16;
|
||||||
let char_length = length;
|
let char_length = length;
|
||||||
|
|
||||||
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
|
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
|
||||||
|
14
src/lib.rs
14
src/lib.rs
@ -50,14 +50,14 @@ pub struct DocIndex {
|
|||||||
/// The attribute in the document where the word was found
|
/// The attribute in the document where the word was found
|
||||||
/// along with the index in it.
|
/// along with the index in it.
|
||||||
pub attribute: u16,
|
pub attribute: u16,
|
||||||
pub word_index: u32,
|
pub word_index: u16,
|
||||||
|
|
||||||
/// The position in bytes where the word was found
|
/// The position in bytes where the word was found
|
||||||
/// along with the length of it.
|
/// along with the length of it.
|
||||||
///
|
///
|
||||||
/// It informs on the original word area in the text indexed
|
/// It informs on the original word area in the text indexed
|
||||||
/// without needing to run the tokenizer again.
|
/// without needing to run the tokenizer again.
|
||||||
pub char_index: u32,
|
pub char_index: u16,
|
||||||
pub char_length: u16,
|
pub char_length: u16,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -84,7 +84,7 @@ pub struct Match {
|
|||||||
/// The attribute in the document where the word was found
|
/// The attribute in the document where the word was found
|
||||||
/// along with the index in it.
|
/// along with the index in it.
|
||||||
pub attribute: u16,
|
pub attribute: u16,
|
||||||
pub word_index: u32,
|
pub word_index: u16,
|
||||||
|
|
||||||
/// Whether the word that matches is an exact match or a prefix.
|
/// Whether the word that matches is an exact match or a prefix.
|
||||||
pub is_exact: bool,
|
pub is_exact: bool,
|
||||||
@ -94,7 +94,7 @@ pub struct Match {
|
|||||||
///
|
///
|
||||||
/// It informs on the original word area in the text indexed
|
/// It informs on the original word area in the text indexed
|
||||||
/// without needing to run the tokenizer again.
|
/// without needing to run the tokenizer again.
|
||||||
pub char_index: u32,
|
pub char_index: u16,
|
||||||
pub char_length: u16,
|
pub char_length: u16,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -116,9 +116,9 @@ impl Match {
|
|||||||
query_index: u32::max_value(),
|
query_index: u32::max_value(),
|
||||||
distance: u8::max_value(),
|
distance: u8::max_value(),
|
||||||
attribute: u16::max_value(),
|
attribute: u16::max_value(),
|
||||||
word_index: u32::max_value(),
|
word_index: u16::max_value(),
|
||||||
is_exact: true,
|
is_exact: true,
|
||||||
char_index: u32::max_value(),
|
char_index: u16::max_value(),
|
||||||
char_length: u16::max_value(),
|
char_length: u16::max_value(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -131,6 +131,6 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn docindex_mem_size() {
|
fn docindex_mem_size() {
|
||||||
assert_eq!(mem::size_of::<DocIndex>(), 24);
|
assert_eq!(mem::size_of::<DocIndex>(), 16);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -6,7 +6,7 @@ use crate::rank::criterion::Criterion;
|
|||||||
use crate::rank::RawDocument;
|
use crate::rank::RawDocument;
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u32]) -> usize {
|
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
|
||||||
let mut sum_word_index = 0;
|
let mut sum_word_index = 0;
|
||||||
let mut index = 0;
|
let mut index = 0;
|
||||||
|
|
||||||
|
@ -5,14 +5,14 @@ use slice_group_by::GroupBy;
|
|||||||
use crate::rank::criterion::Criterion;
|
use crate::rank::criterion::Criterion;
|
||||||
use crate::rank::RawDocument;
|
use crate::rank::RawDocument;
|
||||||
|
|
||||||
const MAX_DISTANCE: u32 = 8;
|
const MAX_DISTANCE: u16 = 8;
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
|
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
|
||||||
(a.clone(), b.clone())
|
(a.clone(), b.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
|
||||||
if lhs < rhs {
|
if lhs < rhs {
|
||||||
cmp::min(rhs - lhs, MAX_DISTANCE)
|
cmp::min(rhs - lhs, MAX_DISTANCE)
|
||||||
} else {
|
} else {
|
||||||
@ -20,13 +20,13 @@ fn index_proximity(lhs: u32, rhs: u32) -> u32 {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn attribute_proximity((lattr, lwi): (u16, u32), (rattr, rwi): (u16, u32)) -> u32 {
|
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
|
||||||
if lattr != rattr { return MAX_DISTANCE }
|
if lattr != rattr { return MAX_DISTANCE }
|
||||||
index_proximity(lwi, rwi)
|
index_proximity(lwi, rwi)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn min_proximity((lattr, lwi): (&[u16], &[u32]), (rattr, rwi): (&[u16], &[u32])) -> u32 {
|
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
|
||||||
let mut min_prox = u32::max_value();
|
let mut min_prox = u16::max_value();
|
||||||
|
|
||||||
for a in lattr.iter().zip(lwi) {
|
for a in lattr.iter().zip(lwi) {
|
||||||
for b in rattr.iter().zip(rwi) {
|
for b in rattr.iter().zip(rwi) {
|
||||||
@ -43,8 +43,8 @@ fn matches_proximity(
|
|||||||
query_index: &[u32],
|
query_index: &[u32],
|
||||||
distance: &[u8],
|
distance: &[u8],
|
||||||
attribute: &[u16],
|
attribute: &[u16],
|
||||||
word_index: &[u32],
|
word_index: &[u16],
|
||||||
) -> u32
|
) -> u16
|
||||||
{
|
{
|
||||||
let mut query_index_groups = query_index.linear_group();
|
let mut query_index_groups = query_index.linear_group();
|
||||||
let mut proximity = 0;
|
let mut proximity = 0;
|
||||||
|
@ -79,7 +79,7 @@ impl RawDocument {
|
|||||||
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
|
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn word_index(&self) -> &[u32] {
|
pub fn word_index(&self) -> &[u16] {
|
||||||
let r = self.matches.range;
|
let r = self.matches.range;
|
||||||
// it is safe because construction/modifications
|
// it is safe because construction/modifications
|
||||||
// can only be done in this module
|
// can only be done in this module
|
||||||
@ -93,7 +93,7 @@ impl RawDocument {
|
|||||||
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
|
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn char_index(&self) -> &[u32] {
|
pub fn char_index(&self) -> &[u16] {
|
||||||
let r = self.matches.range;
|
let r = self.matches.range;
|
||||||
// it is safe because construction/modifications
|
// it is safe because construction/modifications
|
||||||
// can only be done in this module
|
// can only be done in this module
|
||||||
@ -150,9 +150,9 @@ struct Matches {
|
|||||||
query_index: Vec<u32>,
|
query_index: Vec<u32>,
|
||||||
distance: Vec<u8>,
|
distance: Vec<u8>,
|
||||||
attribute: Vec<u16>,
|
attribute: Vec<u16>,
|
||||||
word_index: Vec<u32>,
|
word_index: Vec<u16>,
|
||||||
is_exact: Vec<bool>,
|
is_exact: Vec<bool>,
|
||||||
char_index: Vec<u32>,
|
char_index: Vec<u16>,
|
||||||
char_length: Vec<u16>,
|
char_length: Vec<u16>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user