mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-30 17:14:59 +08:00
feat: Normalize synonym strings and query strings to search for synonyms
This commit is contained in:
parent
3dcbc737f3
commit
0a5d4eb7ed
@ -6,6 +6,7 @@ edition = "2018"
|
|||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
byteorder = "1.3.1"
|
byteorder = "1.3.1"
|
||||||
|
deunicode = "1.0.0"
|
||||||
hashbrown = "0.2.2"
|
hashbrown = "0.2.2"
|
||||||
lazy_static = "1.2.0"
|
lazy_static = "1.2.0"
|
||||||
log = "0.4.6"
|
log = "0.4.6"
|
||||||
|
@ -15,7 +15,7 @@ use serde::{Serialize, Deserialize};
|
|||||||
use slice_group_by::GroupBy;
|
use slice_group_by::GroupBy;
|
||||||
use zerocopy::{AsBytes, FromBytes};
|
use zerocopy::{AsBytes, FromBytes};
|
||||||
|
|
||||||
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder};
|
pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder, normalize_str};
|
||||||
pub use self::store::Store;
|
pub use self::store::Store;
|
||||||
|
|
||||||
/// Represent an internally generated document unique identifier.
|
/// Represent an internally generated document unique identifier.
|
||||||
|
@ -37,6 +37,16 @@ impl Automaton {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns a normalized copy of `string`, used so that synonym strings and
/// query strings are compared in the same form.
///
/// The input is always lowercased. If no character of the lowercased string
/// satisfies `is_cjk`, the string is additionally passed through
/// `deunicode::deunicode_with_tofu` with an empty placeholder; a string that
/// contains at least one such character is returned lowercased only.
pub fn normalize_str(string: &str) -> String {
|
||||||
|
let mut string = string.to_lowercase();
|
||||||
|
|
||||||
|
// A single character matching `is_cjk` is enough to skip the deunicode step
// for the whole string.
if !string.contains(is_cjk) {
|
||||||
|
// NOTE(review): per the deunicode crate docs this transliterates to ASCII and
// substitutes the second argument (here "") for characters it cannot map,
// i.e. such characters are dropped — confirm against the pinned 1.0.0 version.
string = deunicode::deunicode_with_tofu(&string, "");
|
||||||
|
}
|
||||||
|
|
||||||
|
string
|
||||||
|
}
|
||||||
|
|
||||||
fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
|
fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton>, S::Error> {
|
||||||
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
|
||||||
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
|
||||||
@ -55,7 +65,10 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
|
|||||||
let has_following_word = ngrams.peek().is_some();
|
let has_following_word = ngrams.peek().is_some();
|
||||||
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
let not_prefix_dfa = has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
|
||||||
|
|
||||||
let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) };
|
let lev = {
|
||||||
|
let normalized = normalize_str(&ngram);
|
||||||
|
if not_prefix_dfa { build_dfa(&normalized) } else { build_prefix_dfa(&normalized) }
|
||||||
|
};
|
||||||
let mut stream = synonyms.search(&lev).into_stream();
|
let mut stream = synonyms.search(&lev).into_stream();
|
||||||
while let Some(base) = stream.next() {
|
while let Some(base) = stream.next() {
|
||||||
|
|
||||||
@ -82,6 +95,7 @@ fn generate_automatons<S: Store>(query: &str, store: &S) -> Result<Vec<Automaton
|
|||||||
}
|
}
|
||||||
|
|
||||||
if n == 1 {
|
if n == 1 {
|
||||||
|
let lev = if not_prefix_dfa { build_dfa(&ngram) } else { build_prefix_dfa(&ngram) };
|
||||||
let automaton = Automaton::original(index, ngram_nb_words, lev);
|
let automaton = Automaton::original(index, ngram_nb_words, lev);
|
||||||
automatons.push((automaton, ngram));
|
automatons.push((automaton, ngram));
|
||||||
}
|
}
|
||||||
@ -443,7 +457,7 @@ mod tests {
|
|||||||
|
|
||||||
fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set {
|
fn sdset_into_fstset(set: &sdset::Set<&str>) -> Set {
|
||||||
let mut builder = fst::SetBuilder::memory();
|
let mut builder = fst::SetBuilder::memory();
|
||||||
let set = SetBuf::from_dirty(set.into_iter().map(|s| s.to_lowercase()).collect());
|
let set = SetBuf::from_dirty(set.into_iter().map(|s| normalize_str(s)).collect());
|
||||||
builder.extend_iter(set.into_iter()).unwrap();
|
builder.extend_iter(set.into_iter()).unwrap();
|
||||||
builder.into_inner().and_then(Set::from_bytes).unwrap()
|
builder.into_inner().and_then(Set::from_bytes).unwrap()
|
||||||
}
|
}
|
||||||
@ -953,4 +967,65 @@ mod tests {
|
|||||||
});
|
});
|
||||||
assert_matches!(iter.next(), None);
|
assert_matches!(iter.next(), None);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Checks that a synonym registered under an accented word is found by
/// queries spelled with and without the accents.
///
/// The store holds "iPhone" for document 0 and both "telephone" and
/// "téléphone" for document 1 (same `DocIndex`), and "iPhone" is registered
/// as a synonym of "téléphone". Three queries are run ("telephone",
/// "téléphone", "télephone"); each one must return document 0 and then
/// document 1, and nothing else.
#[test]
|
||||||
|
fn deunicoded_synonyms() {
|
||||||
|
let mut store = InMemorySetStore::from_iter(vec![
|
||||||
|
("iPhone", &[doc_index(0, 0)][..]),
|
||||||
|
("telephone", &[doc_index(1, 0)][..]), // meilidb-data indexes the unidecoded
|
||||||
|
("téléphone", &[doc_index(1, 0)][..]), // and the original words with the same DocIndex
|
||||||
|
]);
|
||||||
|
|
||||||
|
// The synonym is registered under the accented spelling only.
store.add_synonym("téléphone", SetBuf::from_dirty(vec!["iPhone"]));
|
||||||
|
|
||||||
|
// Query 1: unaccented spelling. Document 0 must come back with one match,
// document 1 with two matches.
let builder = QueryBuilder::new(&store);
|
||||||
|
let results = builder.query("telephone", 0..20).unwrap();
|
||||||
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
|
// Query 2: fully accented spelling. Same expectations as query 1.
let builder = QueryBuilder::new(&store);
|
||||||
|
let results = builder.query("téléphone", 0..20).unwrap();
|
||||||
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
|
||||||
|
// Query 3: partially accented spelling. Document 0 is still returned through
// the synonym; document 1 is expected with a single match here.
let builder = QueryBuilder::new(&store);
|
||||||
|
let results = builder.query("télephone", 0..20).unwrap();
|
||||||
|
let mut iter = results.into_iter();
|
||||||
|
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(0), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
|
assert_matches!(iter.next(), Some(Document { id: DocumentId(1), matches }) => {
|
||||||
|
let mut iter = matches.into_iter();
|
||||||
|
assert_matches!(iter.next(), Some(Match { query_index: 0, .. }));
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
});
|
||||||
|
assert_matches!(iter.next(), None);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,8 @@ use std::collections::BTreeMap;
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use fst::{SetBuilder, set::OpBuilder};
|
use fst::{SetBuilder, set::OpBuilder};
|
||||||
|
use meilidb_tokenizer::is_cjk;
|
||||||
|
use meilidb_core::normalize_str;
|
||||||
use sdset::SetBuf;
|
use sdset::SetBuf;
|
||||||
|
|
||||||
use crate::database::index::InnerIndex;
|
use crate::database::index::InnerIndex;
|
||||||
@ -20,6 +22,8 @@ impl<'a> SynonymsAddition<'a> {
|
|||||||
pub fn add_synonym<I>(&mut self, synonym: String, alternatives: I)
|
pub fn add_synonym<I>(&mut self, synonym: String, alternatives: I)
|
||||||
where I: Iterator<Item=String>,
|
where I: Iterator<Item=String>,
|
||||||
{
|
{
|
||||||
|
let mut synonym = normalize_str(&synonym);
|
||||||
|
let alternatives = alternatives.map(|s| s.to_lowercase());
|
||||||
self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
|
self.synonyms.entry(synonym).or_insert_with(Vec::new).extend(alternatives);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -30,13 +34,11 @@ impl<'a> SynonymsAddition<'a> {
|
|||||||
|
|
||||||
let mut synonyms_builder = SetBuilder::memory();
|
let mut synonyms_builder = SetBuilder::memory();
|
||||||
|
|
||||||
for (synonym, mut alternatives) in self.synonyms {
|
for (synonym, alternatives) in self.synonyms {
|
||||||
synonyms_builder.insert(&synonym).unwrap();
|
synonyms_builder.insert(&synonym).unwrap();
|
||||||
|
|
||||||
let alternatives = {
|
let alternatives = {
|
||||||
alternatives.iter_mut().for_each(|s| *s = s.to_lowercase());
|
|
||||||
let alternatives = SetBuf::from_dirty(alternatives);
|
let alternatives = SetBuf::from_dirty(alternatives);
|
||||||
|
|
||||||
let mut alternatives_builder = SetBuilder::memory();
|
let mut alternatives_builder = SetBuilder::memory();
|
||||||
alternatives_builder.extend_iter(alternatives).unwrap();
|
alternatives_builder.extend_iter(alternatives).unwrap();
|
||||||
alternatives_builder.into_inner().unwrap()
|
alternatives_builder.into_inner().unwrap()
|
||||||
|
Loading…
Reference in New Issue
Block a user