Deduplicate matching words

This commit is contained in:
Loïc Lecrenier 2022-10-31 13:33:49 +01:00
parent 84dd2e4df1
commit 86c34a996b
3 changed files with 151 additions and 44 deletions

View File

@ -2,6 +2,7 @@ use std::cmp::{min, Reverse};
use std::collections::BTreeMap;
use std::fmt;
use std::ops::{Index, IndexMut};
use std::rc::Rc;
use charabia::Token;
use levenshtein_automata::{Distance, DFA};
@ -14,11 +15,11 @@ type IsPrefix = bool;
/// referencing words that match the given query tree.
#[derive(Default)]
pub struct MatchingWords {
inner: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
inner: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
}
impl MatchingWords {
pub fn new(mut matching_words: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>) -> Self {
pub fn new(mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>) -> Self {
// Sort words by length in DESC order, prioritizing the longest matches,
// in order to highlight the longest part of the matched word.
matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
@ -35,7 +36,8 @@ impl MatchingWords {
/// Iterator over terms that match the given token.
/// This allows matches to be evaluated lazily.
pub struct MatchesIter<'a, 'b> {
inner: Box<dyn Iterator<Item = &'a (Vec<MatchingWord>, Vec<PrimitiveWordId>)> + 'a>,
#[allow(clippy::type_complexity)]
inner: Box<dyn Iterator<Item = &'a (Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)> + 'a>,
token: &'b Token<'b>,
}
@ -126,7 +128,7 @@ pub enum MatchType<'a> {
/// Structure helper to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
matching_words: &'a [MatchingWord],
matching_words: &'a [Rc<MatchingWord>],
ids: &'a [PrimitiveWordId],
char_len: usize,
}
@ -332,10 +334,15 @@ mod tests {
#[test]
fn matching_words() {
let all = vec![
Rc::new(MatchingWord::new("split".to_string(), 1, true)),
Rc::new(MatchingWord::new("this".to_string(), 0, false)),
Rc::new(MatchingWord::new("world".to_string(), 1, true)),
];
let matching_words = vec![
(vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]),
(vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]),
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
(vec![all[0].clone()], vec![0]),
(vec![all[1].clone()], vec![1]),
(vec![all[2].clone()], vec![2]),
];
let matching_words = MatchingWords::new(matching_words);

View File

@ -494,16 +494,23 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
#[cfg(test)]
mod tests {
use std::rc::Rc;
use charabia::TokenizerBuilder;
use super::*;
use crate::search::matches::matching_words::MatchingWord;
fn matching_words() -> MatchingWords {
let all = vec![
Rc::new(MatchingWord::new("split".to_string(), 0, false)),
Rc::new(MatchingWord::new("the".to_string(), 0, false)),
Rc::new(MatchingWord::new("world".to_string(), 1, true)),
];
let matching_words = vec![
(vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]),
(vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]),
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
(vec![all[0].clone()], vec![0]),
(vec![all[1].clone()], vec![1]),
(vec![all[2].clone()], vec![2]),
];
MatchingWords::new(matching_words)
@ -587,10 +594,11 @@ mod tests {
#[test]
fn highlight_unicode() {
let matching_words = vec![
(vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]),
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]),
let all = vec![
Rc::new(MatchingWord::new("wessfali".to_string(), 1, true)),
Rc::new(MatchingWord::new("world".to_string(), 1, true)),
];
let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])];
let matching_words = MatchingWords::new(matching_words);
@ -823,24 +831,20 @@ mod tests {
#[test]
fn partial_matches() {
let all = vec![
Rc::new(MatchingWord::new("the".to_string(), 0, false)),
Rc::new(MatchingWord::new("t".to_string(), 0, false)),
Rc::new(MatchingWord::new("he".to_string(), 0, false)),
Rc::new(MatchingWord::new("door".to_string(), 0, false)),
Rc::new(MatchingWord::new("do".to_string(), 0, false)),
Rc::new(MatchingWord::new("or".to_string(), 0, false)),
];
let matching_words = vec![
(vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]),
(
vec![
MatchingWord::new("t".to_string(), 0, false),
MatchingWord::new("he".to_string(), 0, false),
],
vec![0],
),
(vec![MatchingWord::new("door".to_string(), 0, false)], vec![1]),
(
vec![
MatchingWord::new("do".to_string(), 0, false),
MatchingWord::new("or".to_string(), 0, false),
],
vec![1],
),
(vec![MatchingWord::new("do".to_string(), 0, false)], vec![2]),
(vec![all[0].clone()], vec![0]),
(vec![all[1].clone(), all[2].clone()], vec![0]),
(vec![all[3].clone()], vec![1]),
(vec![all[4].clone(), all[5].clone()], vec![1]),
(vec![all[4].clone()], vec![2]),
];
let matching_words = MatchingWords::new(matching_words);

View File

@ -1,5 +1,9 @@
use std::borrow::Cow;
use std::cmp::max;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::hash::Hash;
use std::rc::Rc;
use std::{fmt, mem};
use charabia::classifier::ClassifiedTokenIter;
@ -540,6 +544,30 @@ fn create_query_tree(
Ok(Operation::or(true, operation_children))
}
#[derive(Default, Debug)]
struct MatchingWordCache {
all: Vec<Rc<MatchingWord>>,
map: HashMap<(String, u8, bool), Rc<MatchingWord>>,
}
impl MatchingWordCache {
fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Rc<MatchingWord> {
// Toggle the (un)commented code to switch between cached and non-cached
// implementations.
// self.all.push(MatchingWord::new(word, typo, prefix));
// self.all.len() - 1
match self.map.entry((word.clone(), typo, prefix)) {
Entry::Occupied(idx) => idx.get().clone(),
Entry::Vacant(vacant) => {
let matching_word = Rc::new(MatchingWord::new(word, typo, prefix));
self.all.push(matching_word.clone());
vacant.insert(matching_word.clone());
matching_word
}
}
}
}
/// Main function that creates the matching words used for crop and highlight.
fn create_matching_words(
ctx: &impl Context,
@ -551,7 +579,8 @@ fn create_matching_words(
ctx: &impl Context,
authorize_typos: bool,
part: PrimitiveQueryPart,
matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
matching_words: &mut Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
matching_word_cache: &mut MatchingWordCache,
id: PrimitiveWordId,
) -> Result<()> {
match part {
@ -562,15 +591,15 @@ fn create_matching_words(
for synonym in synonyms {
let synonym = synonym
.into_iter()
.map(|syn| MatchingWord::new(syn, 0, false))
.map(|syn| matching_word_cache.insert(syn, 0, false))
.collect();
matching_words.push((synonym, vec![id]));
}
}
if let Some((left, right)) = split_best_frequency(ctx, &word)? {
let left = MatchingWord::new(left.to_string(), 0, false);
let right = MatchingWord::new(right.to_string(), 0, false);
let left = matching_word_cache.insert(left.to_string(), 0, false);
let right = matching_word_cache.insert(right.to_string(), 0, false);
matching_words.push((vec![left, right], vec![id]));
}
@ -580,8 +609,10 @@ fn create_matching_words(
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
let matching_word = match typos(word, authorize_typos, config) {
QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix),
QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix),
QueryKind::Exact { word, .. } => matching_word_cache.insert(word, 0, prefix),
QueryKind::Tolerant { typo, word } => {
matching_word_cache.insert(word, typo, prefix)
}
};
matching_words.push((vec![matching_word], vec![id]));
}
@ -589,8 +620,11 @@ fn create_matching_words(
PrimitiveQueryPart::Phrase(words) => {
let ids: Vec<_> =
(0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
let words =
words.into_iter().flatten().map(|w| MatchingWord::new(w, 0, false)).collect();
let words = words
.into_iter()
.flatten()
.map(|w| matching_word_cache.insert(w, 0, false))
.collect();
matching_words.push((words, ids));
}
}
@ -603,7 +637,8 @@ fn create_matching_words(
ctx: &impl Context,
authorize_typos: bool,
query: &[PrimitiveQueryPart],
matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
matching_words: &mut Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
matching_word_cache: &mut MatchingWordCache,
mut id: PrimitiveWordId,
) -> Result<()> {
const MAX_NGRAM: usize = 3;
@ -621,6 +656,7 @@ fn create_matching_words(
authorize_typos,
part.clone(),
matching_words,
matching_word_cache,
id,
)?;
}
@ -645,7 +681,7 @@ fn create_matching_words(
for synonym in synonyms {
let synonym = synonym
.into_iter()
.map(|syn| MatchingWord::new(syn, 0, false))
.map(|syn| matching_word_cache.insert(syn, 0, false))
.collect();
matching_words.push((synonym, ids.clone()));
}
@ -662,10 +698,10 @@ fn create_matching_words(
};
let matching_word = match typos(word, authorize_typos, config) {
QueryKind::Exact { word, .. } => {
MatchingWord::new(word, 0, is_prefix)
matching_word_cache.insert(word, 0, is_prefix)
}
QueryKind::Tolerant { typo, word } => {
MatchingWord::new(word, typo, is_prefix)
matching_word_cache.insert(word, typo, is_prefix)
}
};
matching_words.push((vec![matching_word], ids));
@ -673,7 +709,14 @@ fn create_matching_words(
}
if !is_last {
ngrams(ctx, authorize_typos, tail, matching_words, id + 1)?;
ngrams(
ctx,
authorize_typos,
tail,
matching_words,
matching_word_cache,
id + 1,
)?;
}
}
}
@ -683,8 +726,9 @@ fn create_matching_words(
Ok(())
}
let mut matching_word_cache = MatchingWordCache::default();
let mut matching_words = Vec::new();
ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?;
ngrams(ctx, authorize_typos, query, &mut matching_words, &mut matching_word_cache, 0)?;
Ok(MatchingWords::new(matching_words))
}
@ -806,7 +850,9 @@ pub fn maximum_proximity(operation: &Operation) -> usize {
#[cfg(test)]
mod test {
use std::alloc::{GlobalAlloc, System};
use std::collections::HashMap;
use std::sync::atomic::{self, AtomicI64};
use charabia::Tokenize;
use maplit::hashmap;
@ -814,6 +860,7 @@ mod test {
use rand::{Rng, SeedableRng};
use super::*;
use crate::index::tests::TempIndex;
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
#[derive(Debug)]
@ -1310,4 +1357,53 @@ mod test {
Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } })
));
}
/// Allocator wrapper that forwards every request to [`System`] while keeping
/// running byte counters.
pub struct CountingAlloc {
    /// Bytes currently live: increased by `alloc`, decreased by `dealloc`.
    pub resident: AtomicI64,
    /// Total bytes ever requested through `alloc`; never decreased.
    pub allocated: AtomicI64,
}

/// The counting allocator, registered as the global allocator of the binary
/// that contains this module, so its counters cover every heap allocation.
#[global_allocator]
static ALLOC: CountingAlloc =
    CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) };

// SAFETY: both methods hand the untouched `layout` (and `ptr`) straight to
// `System`; the only extra work is updating the atomic counters, which never
// touches the managed memory.
unsafe impl GlobalAlloc for CountingAlloc {
    /// Records `layout.size()` in both counters, then allocates via `System`.
    /// The counters are updated before the allocation is attempted.
    unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
        let requested = layout.size() as i64;
        self.allocated.fetch_add(requested, atomic::Ordering::SeqCst);
        self.resident.fetch_add(requested, atomic::Ordering::SeqCst);
        System.alloc(layout)
    }

    /// Subtracts `layout.size()` from the resident counter, then frees via `System`.
    unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
        let released = layout.size() as i64;
        self.resident.fetch_sub(released, atomic::Ordering::SeqCst);
        System.dealloc(ptr, layout)
    }
}
// Measures how much memory building the query tree for a ten-word query uses,
// based on the counters of the `ALLOC` global allocator.
// NOTE(review): the original comment read "This test must be run" and appears
// truncated — presumably "on its own": the counters are process-wide, so
// allocations made by other tests running concurrently would skew the deltas.
// TODO confirm.
#[test]
fn ten_words() {
// Baseline counter values, read before any of the measured work happens.
let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst);
let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst);
let index = TempIndex::new();
let rtxn = index.read_txn().unwrap();
let query = "a beautiful summer house by the beach overlooking what seems";
let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap();
builder.words_limit(10);
let x = builder.build(query.tokenize()).unwrap().unwrap();
// Read the counters while `x` is still alive, so whatever it owns is still
// counted as resident.
let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst);
let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst);
// The deltas are pinned as inline snapshots; any change in allocation
// behaviour between the two reads will change these numbers.
insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4521710");
insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7259092");
// Note, if the matching word cache is deactivated, the memory usage is:
// insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91311265");
// insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125948410");
// or about 20x more resident memory (90MB vs 4.5MB)
// Keep `x` alive until here so it is not dropped before the counters are read.
let _x = x;
}
}