Remove old query_tree code and make clippy happy

Loïc Lecrenier 2023-03-23 09:39:16 +01:00
parent f5f5f03ec0
commit 7169d85115
8 changed files with 82 additions and 1590 deletions

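Most of the one-line fixes in this commit address the same two clippy lints: useless_conversion, for calling .into_iter() on a range (ranges already implement Iterator), and explicit_into_iter_loop, for calling .into_iter() on the subject of a for loop, which the loop desugaring does anyway. A minimal standalone sketch of the before/after pattern (not taken from the diff):

    fn main() {
        // Before: `(0..4)` is already an Iterator, so `.into_iter()` is an
        // identity call that clippy reports as `useless_conversion`.
        let _noisy: Vec<u32> = (0..4).into_iter().collect();
        // After: collect the range directly.
        let clean: Vec<u32> = (0..4).collect();

        // A `for` loop already calls IntoIterator::into_iter on its subject,
        // so `for x in clean.into_iter()` trips `explicit_into_iter_loop`.
        for x in clean {
            println!("{x}");
        }
    }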
View File

@@ -73,7 +73,7 @@ impl<'a> FacetDistribution<'a> {
let distribution_prelength = distribution.len();
let db = self.index.field_id_docid_facet_f64s;
-for docid in candidates.into_iter() {
+for docid in candidates {
key_buffer.truncate(mem::size_of::<FieldId>());
key_buffer.extend_from_slice(&docid.to_be_bytes());
let iter = db
@@ -97,7 +97,7 @@ impl<'a> FacetDistribution<'a> {
let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();
let db = self.index.field_id_docid_facet_strings;
-'outer: for docid in candidates.into_iter() {
+'outer: for docid in candidates {
key_buffer.truncate(mem::size_of::<FieldId>());
key_buffer.extend_from_slice(&docid.to_be_bytes());
let iter = db
@@ -505,7 +505,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((0..10_000).into_iter().collect())
+.candidates((0..10_000).collect())
.execute()
.unwrap();
@@ -513,7 +513,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((0..5_000).into_iter().collect())
+.candidates((0..5_000).collect())
.execute()
.unwrap();
@@ -521,7 +521,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((0..5_000).into_iter().collect())
+.candidates((0..5_000).collect())
.execute()
.unwrap();
@@ -529,7 +529,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((0..5_000).into_iter().collect())
+.candidates((0..5_000).collect())
.max_values_per_facet(1)
.execute()
.unwrap();
@@ -546,7 +546,7 @@ mod tests {
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
.unwrap();
-let facet_values = (0..1000).into_iter().map(|x| format!("{x:x}")).collect::<Vec<_>>();
+let facet_values = (0..1000).map(|x| format!("{x:x}")).collect::<Vec<_>>();
let mut documents = vec![];
for i in 0..10_000 {
@@ -582,7 +582,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((0..10_000).into_iter().collect())
+.candidates((0..10_000).collect())
.execute()
.unwrap();
@@ -590,7 +590,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((0..5_000).into_iter().collect())
+.candidates((0..5_000).collect())
.execute()
.unwrap();
@@ -606,7 +606,7 @@ mod tests {
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
.unwrap();
-let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
+let facet_values = (0..1000).collect::<Vec<_>>();
let mut documents = vec![];
for i in 0..1000 {
@@ -634,7 +634,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((0..1000).into_iter().collect())
+.candidates((0..1000).collect())
.compute_stats()
.unwrap();
@@ -642,7 +642,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((217..777).into_iter().collect())
+.candidates((217..777).collect())
.compute_stats()
.unwrap();
@@ -658,7 +658,7 @@ mod tests {
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
.unwrap();
-let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
+let facet_values = (0..1000).collect::<Vec<_>>();
let mut documents = vec![];
for i in 0..1000 {
@@ -686,7 +686,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((0..1000).into_iter().collect())
+.candidates((0..1000).collect())
.compute_stats()
.unwrap();
@@ -694,7 +694,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((217..777).into_iter().collect())
+.candidates((217..777).collect())
.compute_stats()
.unwrap();
@@ -710,7 +710,7 @@ mod tests {
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
.unwrap();
-let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
+let facet_values = (0..1000).collect::<Vec<_>>();
let mut documents = vec![];
for i in 0..1000 {
@@ -738,7 +738,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((0..1000).into_iter().collect())
+.candidates((0..1000).collect())
.compute_stats()
.unwrap();
@@ -746,7 +746,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((217..777).into_iter().collect())
+.candidates((217..777).collect())
.compute_stats()
.unwrap();
@@ -762,7 +762,7 @@ mod tests {
.update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
.unwrap();
-let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
+let facet_values = (0..1000).collect::<Vec<_>>();
let mut documents = vec![];
for i in 0..1000 {
@@ -794,7 +794,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((0..1000).into_iter().collect())
+.candidates((0..1000).collect())
.compute_stats()
.unwrap();
@@ -802,7 +802,7 @@ mod tests {
let map = FacetDistribution::new(&txn, &index)
.facets(std::iter::once("colour"))
-.candidates((217..777).into_iter().collect())
+.candidates((217..777).collect())
.compute_stats()
.unwrap();

View File

@@ -142,7 +142,7 @@ mod tests {
let indexes = [get_simple_index(), get_random_looking_index()];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
-let candidates = (0..=255).into_iter().collect::<RoaringBitmap>();
+let candidates = (0..=255).collect::<RoaringBitmap>();
let mut results = String::new();
iterate_over_facet_distribution(
&txn,
@@ -166,7 +166,7 @@ mod tests {
let indexes = [get_simple_index(), get_random_looking_index()];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
-let candidates = (0..=255).into_iter().collect::<RoaringBitmap>();
+let candidates = (0..=255).collect::<RoaringBitmap>();
let mut results = String::new();
let mut nbr_facets = 0;
iterate_over_facet_distribution(

View File

@@ -410,7 +410,7 @@ mod tests {
let mut results = String::new();
-for i in (0..=255).into_iter().rev() {
+for i in (0..=255).rev() {
let i = i as f64;
let start = Bound::Included(i);
let end = Bound::Included(255.);
@@ -431,7 +431,7 @@ mod tests {
let mut results = String::new();
-for i in (0..=255).into_iter().rev() {
+for i in (0..=255).rev() {
let i = i as f64;
let start = Bound::Excluded(i);
let end = Bound::Excluded(255.);
@@ -466,7 +466,7 @@ mod tests {
let mut results = String::new();
-for i in (0..=128).into_iter().rev() {
+for i in (0..=128).rev() {
let i = i as f64;
let start = Bound::Included(i);
let end = Bound::Included(255. - i);
@@ -491,7 +491,7 @@ mod tests {
let mut results = String::new();
-for i in (0..=128).into_iter().rev() {
+for i in (0..=128).rev() {
let i = i as f64;
let start = Bound::Excluded(i);
let end = Bound::Excluded(255. - i);

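The tests above sweep numeric facet ranges with std::ops::Bound pairs. For reference, a pair of bounds is itself a RangeBounds in the standard library; a tiny standalone illustration:

    use std::ops::{Bound, RangeBounds};

    fn main() {
        let start = Bound::Included(10.0_f64);
        let end = Bound::Excluded(255.0_f64);
        // (start, end) implements RangeBounds<f64>: the interval [10, 255).
        assert!((start, end).contains(&10.0));
        assert!(!(start, end).contains(&255.0));
    }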
View File

@@ -132,7 +132,7 @@ mod tests {
let indexes = [get_simple_index(), get_random_looking_index()];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
-let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
+let candidates = (200..=300).collect::<RoaringBitmap>();
let mut results = String::new();
let iter = ascending_facet_sort(&txn, index.content, 0, candidates).unwrap();
for el in iter {
@@ -154,7 +154,7 @@ mod tests {
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
-let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
+let candidates = (200..=300).collect::<RoaringBitmap>();
let mut results = String::new();
let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
for el in iter {

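The (200..=300).collect::<RoaringBitmap>() form works because RoaringBitmap implements FromIterator<u32>, so a range of document ids collects directly and the explicit .into_iter() was never needed. A small standalone sketch, assuming the roaring crate:

    use roaring::RoaringBitmap;

    fn main() {
        // FromIterator<u32> lets collect() build the bitmap straight from a range.
        let candidates: RoaringBitmap = (200..=300).collect();
        assert_eq!(candidates.len(), 101);
        assert!(candidates.contains(250));
    }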
View File

@@ -142,7 +142,7 @@ mod tests {
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
-let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
+let candidates = (200..=300).collect::<RoaringBitmap>();
let mut results = String::new();
let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap();
@@ -165,7 +165,7 @@ mod tests {
];
for (i, index) in indexes.iter().enumerate() {
let txn = index.env.read_txn().unwrap();
-let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
+let candidates = (200..=300).collect::<RoaringBitmap>();
let mut results = String::new();
let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap();

View File

@@ -1,21 +1,14 @@
pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
-use self::fst_utils::{Complement, Intersection, StartsWith, Union};
pub use self::matches::{
FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
};
use crate::{
execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
};
-use fst::automaton::Str;
-use fst::{Automaton, IntoStreamer, Streamer};
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
-use std::borrow::Cow;
-use std::collections::hash_map::{Entry, HashMap};
use std::fmt;
-use std::result::Result as StdResult;
-use std::str::Utf8Error;
// Building these factories is not free.
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
@@ -26,7 +19,6 @@ pub mod facet;
mod fst_utils;
mod matches;
pub mod new;
-mod query_tree;
pub struct Search<'a> {
query: Option<String>,
@@ -200,70 +192,6 @@ impl Default for TermsMatchingStrategy {
}
}
-pub type WordDerivationsCache = HashMap<(String, bool, u8), Vec<(String, u8)>>;
-pub fn word_derivations<'c>(
-word: &str,
-is_prefix: bool,
-max_typo: u8,
-fst: &fst::Set<Cow<[u8]>>,
-cache: &'c mut WordDerivationsCache,
-) -> StdResult<&'c [(String, u8)], Utf8Error> {
-match cache.entry((word.to_string(), is_prefix, max_typo)) {
-Entry::Occupied(entry) => Ok(entry.into_mut()),
-Entry::Vacant(entry) => {
-// println!("word derivations {word} {is_prefix} {max_typo}");
-let mut derived_words = Vec::new();
-if max_typo == 0 {
-if is_prefix {
-let prefix = Str::new(word).starts_with();
-let mut stream = fst.search(prefix).into_stream();
-while let Some(word) = stream.next() {
-let word = std::str::from_utf8(word)?;
-derived_words.push((word.to_string(), 0));
-}
-} else if fst.contains(word) {
-derived_words.push((word.to_string(), 0));
-}
-} else if max_typo == 1 {
-let dfa = build_dfa(word, 1, is_prefix);
-let starts = StartsWith(Str::new(get_first(word)));
-let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream();
-while let Some((word, state)) = stream.next() {
-let word = std::str::from_utf8(word)?;
-let d = dfa.distance(state.1);
-derived_words.push((word.to_string(), d.to_u8()));
-}
-} else {
-let starts = StartsWith(Str::new(get_first(word)));
-let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
-let second_dfa = build_dfa(word, 2, is_prefix);
-let second = Intersection(&second_dfa, &starts);
-let automaton = Union(first, &second);
-let mut stream = fst.search_with_state(automaton).into_stream();
-while let Some((found_word, state)) = stream.next() {
-let found_word = std::str::from_utf8(found_word)?;
-// in the case the typo is on the first letter, we know the number of typo
-// is two
-if get_first(found_word) != get_first(word) {
-derived_words.push((found_word.to_string(), 2));
-} else {
-// Else, we know that it is the second dfa that matched and compute the
-// correct distance
-let d = second_dfa.distance((state.1).0);
-derived_words.push((found_word.to_string(), d.to_u8()));
-}
-}
-}
-Ok(entry.insert(derived_words))
-}
-}
-}
fn get_first(s: &str) -> &str {
match s.chars().next() {
Some(c) => &s[..c.len_utf8()],
@@ -337,66 +265,66 @@ mod test {
assert!(!search.is_typo_authorized().unwrap());
}
-#[test]
-fn test_one_typos_tolerance() {
-let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-let mut cache = HashMap::new();
-let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap();
+// #[test]
+// fn test_one_typos_tolerance() {
+// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
+// let mut cache = HashMap::new();
+// let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap();
-assert_eq!(found, &[("zealand".to_string(), 1)]);
-}
+// assert_eq!(found, &[("zealand".to_string(), 1)]);
+// }
-#[test]
-fn test_one_typos_first_letter() {
-let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-let mut cache = HashMap::new();
-let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap();
+// #[test]
+// fn test_one_typos_first_letter() {
+// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
+// let mut cache = HashMap::new();
+// let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap();
-assert_eq!(found, &[]);
-}
+// assert_eq!(found, &[]);
+// }
-#[test]
-fn test_two_typos_tolerance() {
-let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-let mut cache = HashMap::new();
-let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap();
+// #[test]
+// fn test_two_typos_tolerance() {
+// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
+// let mut cache = HashMap::new();
+// let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap();
-assert_eq!(found, &[("zealand".to_string(), 2)]);
-}
+// assert_eq!(found, &[("zealand".to_string(), 2)]);
+// }
-#[test]
-fn test_two_typos_first_letter() {
-let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-let mut cache = HashMap::new();
-let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap();
+// #[test]
+// fn test_two_typos_first_letter() {
+// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
+// let mut cache = HashMap::new();
+// let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap();
-assert_eq!(found, &[("zealand".to_string(), 2)]);
-}
+// assert_eq!(found, &[("zealand".to_string(), 2)]);
+// }
-#[test]
-fn test_prefix() {
-let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-let mut cache = HashMap::new();
-let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap();
+// #[test]
+// fn test_prefix() {
+// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
+// let mut cache = HashMap::new();
+// let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap();
-assert_eq!(found, &[("zealand".to_string(), 0)]);
-}
+// assert_eq!(found, &[("zealand".to_string(), 0)]);
+// }
-#[test]
-fn test_bad_prefix() {
-let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-let mut cache = HashMap::new();
-let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap();
+// #[test]
+// fn test_bad_prefix() {
+// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
+// let mut cache = HashMap::new();
+// let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap();
-assert_eq!(found, &[]);
-}
+// assert_eq!(found, &[]);
+// }
-#[test]
-fn test_prefix_with_typo() {
-let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-let mut cache = HashMap::new();
-let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap();
+// #[test]
+// fn test_prefix_with_typo() {
+// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
+// let mut cache = HashMap::new();
+// let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap();
-assert_eq!(found, &[("zealand".to_string(), 1)]);
-}
+// assert_eq!(found, &[("zealand".to_string(), 1)]);
+// }
}

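The word_derivations function deleted above enumerated dictionary words within a typo budget by intersecting an fst::Set with a Levenshtein automaton. A minimal sketch of that core technique, assuming the fst crate plus levenshtein_automata compiled with its fst_automaton feature (so the DFA implements fst::Automaton); this illustrates the removed approach, not the new engine's replacement:

    use fst::{IntoStreamer, Streamer};
    use levenshtein_automata::LevenshteinAutomatonBuilder;

    fn main() {
        // fst sets must be built from keys in lexicographic order.
        let dictionary = fst::Set::from_iter(["sealand", "zealand"]).unwrap();

        // Distance-1 automaton counting a transposition as one typo,
        // mirroring LevBuilder::new(1, true) in the code above.
        let lev = LevenshteinAutomatonBuilder::new(1, true);
        let dfa = lev.build_dfa("zealend");

        // Stream every dictionary entry the DFA accepts.
        let mut stream = dictionary.search(&dfa).into_stream();
        while let Some(word) = stream.next() {
            // Prints "zealand", one substitution away from "zealend".
            println!("{}", String::from_utf8_lossy(word));
        }
    }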
View File

@@ -15,10 +15,9 @@ mod sort;
// TODO: documentation + comments
mod words;
// #[cfg(test)]
use std::collections::{BTreeSet, HashSet};
-use charabia::{Tokenize, TokenizerBuilder};
+use charabia::TokenizerBuilder;
use db_cache::DatabaseCache;
use graph_based_ranking_rule::{Proximity, Typo};
use heed::RoTxn;
@@ -254,7 +253,7 @@ pub fn execute_search(
}
let tokenizer = tokbuilder.build();
-let tokens = tokenizer.tokenize(&query);
+let tokens = tokenizer.tokenize(query);
let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
let graph = QueryGraph::from_query(ctx, query_terms)?;

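The final hunk drops a needless borrow: query is already a borrowed string, so tokenize(&query) hands over a double reference that the compiler immediately peels off again (clippy: needless_borrow). A rough usage sketch, assuming charabia's builder API as it appears in this hunk:

    use charabia::TokenizerBuilder;

    fn main() {
        let query = "hello world";
        let mut builder = TokenizerBuilder::default();
        let tokenizer = builder.build();
        // tokenize takes a &str; writing `&query` would add an extra layer
        // of reference only for auto-deref to strip off again.
        for token in tokenizer.tokenize(query) {
            println!("{:?}", token.lemma());
        }
    }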
File diff suppressed because it is too large