mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-18 17:11:15 +08:00
feat: Normalize synonym strings and query strings to search for synonyms
This commit is contained in:
parent
3dcbc737f3
commit
0a5d4eb7ed
@ -6,6 +6,7 @@ edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.3.1"
|
||||
deunicode = "1.0.0"
|
||||
hashbrown = "0.2.2"
|
||||
lazy_static = "1.2.0"
|
||||
log = "0.4.6"
|
||||
|
@ -15,7 +15,7 @@ use serde::{Serialize, Deserialize};
|
||||
use slice_group_by::GroupBy;
|
||||
use zerocopy::{AsBytes, FromBytes};
|
||||
|
||||
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder};
|
||||
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str};
|
||||
pub use self::store::Store;
|
||||
|
||||
/// Represent an internally generated document unique identifier.
|
||||
|
@ -37,6 +37,16 @@ impl Automaton {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn normalize_str(string: &str) -> String {
|
||||
let mut string = string.to_lowercase();
|
||||
|
||||
if !string.contains(is_cjk) {
|
||||
string = deunicode::deunicode_with_tofu(&string, "");
|
||||
}
|
||||
|
||||
string
|
||||
}
|
||||
|
||||
fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
|
||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||
@ -55,7 +65,10 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
|
||||
let has_following_word = ngrams.peek().is_some();
|
||||
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
||||
|
||||
let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) };
|
||||
let lev = {
|
||||
let normalized = normalize_str(&ngram);
|
||||
if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) }
|
||||
};
|
||||
let mut stream = synonyms.search(&lev).into_stream();
|
||||
while let Some(base) = stream.next() {
|
||||
|
||||
@ -82,6 +95,7 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
|
||||
}
|
||||
|
||||
if n == 1 {
|
||||
let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) };
|
||||
let automaton = Automaton::original(index, ngram_nb_words, lev);
|
||||
automatons.push((automaton, ngram));
|
||||
}
|
||||
@ -443,7 +457,7 @@ mod tests {
|
||||
|
||||
fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set {
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
let set = SetBuf::from_dirty(set.into_iter().map(|s| s.to_lowercase()).collect());
|
||||
let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect());
|
||||
builder.extend_iter(set.into_iter()).unwrap();
|
||||
builder.into_inner().and_then(Set::from_bytes).unwrap()
|
||||
}
|
||||
@ -953,4 +967,65 @@ mod tests {
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn deunicoded_synonyms() {
|
||||
let mut store = InMemorySetStore::from_iter(vec![
|
||||
("iPhone", &[doc_index(0, 0)][..]),
|
||||
("telephone", &[doc_index(1, 0)][..]), // meilidb-data indexes the unidecoded
|
||||
("téléphone", &[doc_index(1, 0)][..]), // and the original words with the same DocIndex
|
||||
]);
|
||||
|
||||
store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iPhone"]));
|
||||
|
||||
let builder = QueryBuilder::new(&store);
|
||||
let results = builder.query("telephone", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
|
||||
let builder = QueryBuilder::new(&store);
|
||||
let results = builder.query("téléphone", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
|
||||
let builder = QueryBuilder::new(&store);
|
||||
let results = builder.query("télephone", 0..20).unwrap();
|
||||
let mut iter = results.into_iter();
|
||||
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||
let mut iter = matches.into_iter();
|
||||
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||
assert_matches!(iter.next(), None);
|
||||
});
|
||||
assert_matches!(iter.next(), None);
|
||||
}
|
||||
}
|
||||
|
@ -2,6 +2,8 @@ use std::collections::BTreeMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fst::{SetBuilder, set::OpBuilder};
|
||||
use meilidb_tokenizer::is_cjk;
|
||||
use meilidb_core::normalize_str;
|
||||
use sdset::SetBuf;
|
||||
|
||||
use crate::database::index::InnerIndex;
|
||||
@ -20,6 +22,8 @@ impl<'a> SynonymsAddition<'a> {
|
||||
pub fn add_synonym<I>(&mut self, synonym: String, alternatives: I)
|
||||
where I: Iterator<Item=String>,
|
||||
{
|
||||
let mut synonym = normalize_str(&synonym);
|
||||
let alternatives = alternatives.map(|s| s.to_lowercase());
|
||||
self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
|
||||
}
|
||||
|
||||
@ -30,13 +34,11 @@ impl<'a> SynonymsAddition<'a> {
|
||||
|
||||
let mut synonyms_builder = SetBuilder::memory();
|
||||
|
||||
for (synonym, mut alternatives) in self.synonyms {
|
||||
for (synonym, alternatives) in self.synonyms {
|
||||
synonyms_builder.insert(&synonym).unwrap();
|
||||
|
||||
let alternatives = {
|
||||
alternatives.iter_mut().for_each(|s| *s = s.to_lowercase());
|
||||
let alternatives = SetBuf::from_dirty(alternatives);
|
||||
|
||||
let mut alternatives_builder = SetBuilder::memory();
|
||||
alternatives_builder.extend_iter(alternatives).unwrap();
|
||||
alternatives_builder.into_inner().unwrap()
|
||||
|
Loading…
Reference in New Issue
Block a user