Feat(Search): Enhance formating search results

This commit is contained in:
ManyTheFish 2022-04-07 11:27:06 +02:00
parent 7f7958f815
commit dc2cc1ee89
5 changed files with 233 additions and 53 deletions

View File

@ -2,7 +2,10 @@ use actix_web::{web, HttpRequest, HttpResponse};
use log::debug; use log::debug;
use meilisearch_auth::IndexSearchRules; use meilisearch_auth::IndexSearchRules;
use meilisearch_error::ResponseError; use meilisearch_error::ResponseError;
use meilisearch_lib::index::{default_crop_length, SearchQuery, DEFAULT_SEARCH_LIMIT}; use meilisearch_lib::index::{
default_crop_length, default_crop_marker, default_highlight_post_tag,
default_highlight_pre_tag, SearchQuery, DEFAULT_SEARCH_LIMIT,
};
use meilisearch_lib::MeiliSearch; use meilisearch_lib::MeiliSearch;
use serde::Deserialize; use serde::Deserialize;
use serde_json::Value; use serde_json::Value;
@ -35,6 +38,12 @@ pub struct SearchQueryGet {
#[serde(default = "Default::default")] #[serde(default = "Default::default")]
matches: bool, matches: bool,
facets_distribution: Option<String>, facets_distribution: Option<String>,
#[serde(default = "default_highlight_pre_tag")]
highlight_pre_tag: String,
#[serde(default = "default_highlight_post_tag")]
highlight_post_tag: String,
#[serde(default = "default_crop_marker")]
crop_marker: String,
} }
impl From<SearchQueryGet> for SearchQuery { impl From<SearchQueryGet> for SearchQuery {
@ -77,6 +86,9 @@ impl From<SearchQueryGet> for SearchQuery {
sort, sort,
matches: other.matches, matches: other.matches,
facets_distribution, facets_distribution,
highlight_pre_tag: other.highlight_pre_tag,
highlight_post_tag: other.highlight_post_tag,
crop_marker: other.crop_marker,
} }
} }
} }

View File

@ -36,6 +36,38 @@ async fn search_unexisting_parameter() {
.await; .await;
} }
#[actix_rt::test]
async fn search_invalid_highlight_and_crop_tags() {
let server = Server::new().await;
let index = server.index("test");
let fields = &["cropMarker", "highlightPreTag", "highlightPostTag"];
for field in fields {
// object
index
.search(
json!({field.to_string(): {"marker": "<crop>"}}),
|response, code| {
assert_eq!(code, 400, "field {} passing object: {}", &field, response);
assert_eq!(response["code"], "bad_request");
},
)
.await;
// array
index
.search(
json!({field.to_string(): ["marker", "<crop>"]}),
|response, code| {
assert_eq!(code, 400, "field {} passing array: {}", &field, response);
assert_eq!(response["code"], "bad_request");
},
)
.await;
}
}
#[actix_rt::test] #[actix_rt::test]
async fn filter_invalid_syntax_object() { async fn filter_invalid_syntax_object() {
let server = Server::new().await; let server = Server::new().await;

View File

@ -1,4 +1,7 @@
pub use search::{default_crop_length, SearchQuery, SearchResult, DEFAULT_SEARCH_LIMIT}; pub use search::{
default_crop_length, default_crop_marker, default_highlight_post_tag,
default_highlight_pre_tag, SearchQuery, SearchResult, DEFAULT_SEARCH_LIMIT,
};
pub use updates::{apply_settings_to_builder, Checked, Facets, Settings, Unchecked}; pub use updates::{apply_settings_to_builder, Checked, Facets, Settings, Unchecked};
mod dump; mod dump;

View File

@ -30,11 +30,26 @@ const fn default_search_limit() -> usize {
DEFAULT_SEARCH_LIMIT DEFAULT_SEARCH_LIMIT
} }
pub const DEFAULT_CROP_LENGTH: usize = 200; pub const DEFAULT_CROP_LENGTH: usize = 10;
pub const fn default_crop_length() -> usize { pub const fn default_crop_length() -> usize {
DEFAULT_CROP_LENGTH DEFAULT_CROP_LENGTH
} }
const DEFAULT_CROP_MARKER: &str = "";
pub fn default_crop_marker() -> String {
DEFAULT_CROP_MARKER.to_string()
}
const DEFAULT_HIGHLIGHT_PRE_TAG: &str = "<em>";
pub fn default_highlight_pre_tag() -> String {
DEFAULT_HIGHLIGHT_PRE_TAG.to_string()
}
const DEFAULT_HIGHLIGHT_POST_TAG: &str = "</em>";
pub fn default_highlight_post_tag() -> String {
DEFAULT_HIGHLIGHT_POST_TAG.to_string()
}
/// The maximimum number of results that the engine /// The maximimum number of results that the engine
/// will be able to return in one search call. /// will be able to return in one search call.
pub const HARD_RESULT_LIMIT: usize = 1000; pub const HARD_RESULT_LIMIT: usize = 1000;
@ -57,6 +72,12 @@ pub struct SearchQuery {
pub filter: Option<Value>, pub filter: Option<Value>,
pub sort: Option<Vec<String>>, pub sort: Option<Vec<String>>,
pub facets_distribution: Option<Vec<String>>, pub facets_distribution: Option<Vec<String>>,
#[serde(default = "default_highlight_pre_tag")]
pub highlight_pre_tag: String,
#[serde(default = "default_highlight_post_tag")]
pub highlight_post_tag: String,
#[serde(default = "default_crop_marker")]
pub crop_marker: String,
} }
#[derive(Debug, Clone, Serialize, PartialEq)] #[derive(Debug, Clone, Serialize, PartialEq)]
@ -192,7 +213,11 @@ impl Index {
config.stop_words(&stop_words); config.stop_words(&stop_words);
let analyzer = Analyzer::new(config); let analyzer = Analyzer::new(config);
let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>"))); let formatter = Formatter::new(
&analyzer,
(query.highlight_pre_tag, query.highlight_post_tag),
query.crop_marker,
);
let mut documents = Vec::new(); let mut documents = Vec::new();
@ -514,12 +539,21 @@ impl Matcher for MatchingWords {
struct Formatter<'a, A> { struct Formatter<'a, A> {
analyzer: &'a Analyzer<'a, A>, analyzer: &'a Analyzer<'a, A>,
marks: (String, String), highlight_tags: (String, String),
crop_marker: String,
} }
impl<'a, A: AsRef<[u8]>> Formatter<'a, A> { impl<'a, A: AsRef<[u8]>> Formatter<'a, A> {
pub fn new(analyzer: &'a Analyzer<'a, A>, marks: (String, String)) -> Self { pub fn new(
Self { analyzer, marks } analyzer: &'a Analyzer<'a, A>,
highlight_tags: (String, String),
crop_marker: String,
) -> Self {
Self {
analyzer,
highlight_tags,
crop_marker,
}
} }
fn format_value( fn format_value(
@ -583,10 +617,13 @@ impl<'a, A: AsRef<[u8]>> Formatter<'a, A> {
) -> String { ) -> String {
let analyzed = self.analyzer.analyze(&s); let analyzed = self.analyzer.analyze(&s);
let tokens: Box<dyn Iterator<Item = (&str, Token)>> = match format_options.crop { let mut tokens = analyzed.reconstruct();
Some(crop_len) => { let mut crop_marker_before = false;
let tokens_interval: Box<dyn Iterator<Item = (&str, Token)>> = match format_options.crop {
Some(crop_len) if crop_len > 0 => {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
let mut tokens = analyzed.reconstruct().peekable(); let mut tokens = tokens.by_ref().peekable();
while let Some((word, token)) = while let Some((word, token)) =
tokens.next_if(|(_, token)| matcher.matches(token).is_none()) tokens.next_if(|(_, token)| matcher.matches(token).is_none())
@ -596,16 +633,35 @@ impl<'a, A: AsRef<[u8]>> Formatter<'a, A> {
match tokens.next() { match tokens.next() {
Some(token) => { Some(token) => {
let mut total_len: usize = buffer.iter().map(|(word, _)| word.len()).sum(); let mut total_count: usize = buffer
let before_iter = buffer.into_iter().skip_while(move |(word, _)| { .iter()
total_len -= word.len(); .filter(|(_, token)| token.is_separator().is_none())
total_len >= crop_len .count();
let crop_len_before = crop_len / 2;
// check if start will be cropped.
crop_marker_before = total_count > crop_len_before;
let before_iter = buffer.into_iter().skip_while(move |(_, token)| {
if token.is_separator().is_none() {
total_count -= 1;
}
total_count >= crop_len_before
}); });
// rebalance remaining word count after the match.
let crop_len_after = if crop_marker_before {
crop_len.saturating_sub(crop_len_before + 1)
} else {
crop_len.saturating_sub(total_count + 1)
};
let mut taken_after = 0; let mut taken_after = 0;
let after_iter = tokens.take_while(move |(word, _)| { let after_iter = tokens.take_while(move |(_, token)| {
let take = taken_after < crop_len; let take = taken_after < crop_len_after;
taken_after += word.chars().count(); if token.is_separator().is_none() {
taken_after += 1;
}
take take
}); });
@ -616,38 +672,57 @@ impl<'a, A: AsRef<[u8]>> Formatter<'a, A> {
// If no word matches in the attribute // If no word matches in the attribute
None => { None => {
let mut count = 0; let mut count = 0;
let iter = buffer.into_iter().take_while(move |(word, _)| { let mut tokens = buffer.into_iter();
let mut out: String = tokens
.by_ref()
.take_while(move |(_, token)| {
let take = count < crop_len; let take = count < crop_len;
count += word.len(); if token.is_separator().is_none() {
count += 1;
}
take take
}); })
.map(|(word, _)| word)
.collect();
Box::new(iter) // if there are remaining tokens after formatted interval,
// put a crop marker at the end.
if tokens.next().is_some() {
out.push_str(&self.crop_marker);
}
return out;
} }
} }
} }
None => Box::new(analyzed.reconstruct()), _ => Box::new(tokens.by_ref()),
}; };
tokens.fold(String::new(), |mut out, (word, token)| { let out = if crop_marker_before {
self.crop_marker.clone()
} else {
String::new()
};
let mut out = tokens_interval.fold(out, |mut out, (word, token)| {
// Check if we need to do highlighting or computed matches before calling // Check if we need to do highlighting or computed matches before calling
// Matcher::match since the call is expensive. // Matcher::match since the call is expensive.
if format_options.highlight && token.is_word() { if format_options.highlight && token.is_word() {
if let Some(length) = matcher.matches(&token) { if let Some(length) = matcher.matches(&token) {
match word.get(..length).zip(word.get(length..)) { match word.get(..length).zip(word.get(length..)) {
Some((head, tail)) => { Some((head, tail)) => {
out.push_str(&self.marks.0); out.push_str(&self.highlight_tags.0);
out.push_str(head); out.push_str(head);
out.push_str(&self.marks.1); out.push_str(&self.highlight_tags.1);
out.push_str(tail); out.push_str(tail);
} }
// if we are in the middle of a character // if we are in the middle of a character
// or if all the word should be highlighted, // or if all the word should be highlighted,
// we highlight the complete word. // we highlight the complete word.
None => { None => {
out.push_str(&self.marks.0); out.push_str(&self.highlight_tags.0);
out.push_str(word); out.push_str(word);
out.push_str(&self.marks.1); out.push_str(&self.highlight_tags.1);
} }
} }
return out; return out;
@ -655,7 +730,15 @@ impl<'a, A: AsRef<[u8]>> Formatter<'a, A> {
} }
out.push_str(word); out.push_str(word);
out out
}) });
// if there are remaining tokens after formatted interval,
// put a crop marker at the end.
if tokens.next().is_some() {
out.push_str(&self.crop_marker);
}
out
} }
} }
@ -708,7 +791,11 @@ mod test {
let mut config = AnalyzerConfig::default(); let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words); config.stop_words(&stop_words);
let analyzer = Analyzer::new(config); let analyzer = Analyzer::new(config);
let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>"))); let formatter = Formatter::new(
&analyzer,
(String::from("<em>"), String::from("</em>")),
String::from(""),
);
let mut fields = FieldsIdsMap::new(); let mut fields = FieldsIdsMap::new();
let id = fields.insert("test").unwrap(); let id = fields.insert("test").unwrap();
@ -743,7 +830,11 @@ mod test {
let mut config = AnalyzerConfig::default(); let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words); config.stop_words(&stop_words);
let analyzer = Analyzer::new(config); let analyzer = Analyzer::new(config);
let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>"))); let formatter = Formatter::new(
&analyzer,
(String::from("<em>"), String::from("</em>")),
String::from(""),
);
let mut fields = FieldsIdsMap::new(); let mut fields = FieldsIdsMap::new();
let title = fields.insert("title").unwrap(); let title = fields.insert("title").unwrap();
@ -807,7 +898,11 @@ mod test {
let mut config = AnalyzerConfig::default(); let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words); config.stop_words(&stop_words);
let analyzer = Analyzer::new(config); let analyzer = Analyzer::new(config);
let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>"))); let formatter = Formatter::new(
&analyzer,
(String::from("<em>"), String::from("</em>")),
String::from(""),
);
let mut fields = FieldsIdsMap::new(); let mut fields = FieldsIdsMap::new();
let title = fields.insert("title").unwrap(); let title = fields.insert("title").unwrap();
@ -894,7 +989,11 @@ mod test {
let mut config = AnalyzerConfig::default(); let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words); config.stop_words(&stop_words);
let analyzer = Analyzer::new(config); let analyzer = Analyzer::new(config);
let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>"))); let formatter = Formatter::new(
&analyzer,
(String::from("<em>"), String::from("</em>")),
String::from(""),
);
let mut fields = FieldsIdsMap::new(); let mut fields = FieldsIdsMap::new();
let title = fields.insert("title").unwrap(); let title = fields.insert("title").unwrap();
@ -958,7 +1057,11 @@ mod test {
let mut config = AnalyzerConfig::default(); let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words); config.stop_words(&stop_words);
let analyzer = Analyzer::new(config); let analyzer = Analyzer::new(config);
let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>"))); let formatter = Formatter::new(
&analyzer,
(String::from("<em>"), String::from("</em>")),
String::from(""),
);
let mut fields = FieldsIdsMap::new(); let mut fields = FieldsIdsMap::new();
let title = fields.insert("title").unwrap(); let title = fields.insert("title").unwrap();
@ -1019,7 +1122,11 @@ mod test {
let mut config = AnalyzerConfig::default(); let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words); config.stop_words(&stop_words);
let analyzer = Analyzer::new(config); let analyzer = Analyzer::new(config);
let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>"))); let formatter = Formatter::new(
&analyzer,
(String::from("<em>"), String::from("</em>")),
String::from(""),
);
let mut fields = FieldsIdsMap::new(); let mut fields = FieldsIdsMap::new();
let title = fields.insert("title").unwrap(); let title = fields.insert("title").unwrap();
@ -1062,7 +1169,7 @@ mod test {
); );
let mut matching_words = BTreeMap::new(); let mut matching_words = BTreeMap::new();
matching_words.insert("potter", Some(6)); matching_words.insert("potter", Some(3));
let value = format_fields( let value = format_fields(
&fields, &fields,
@ -1073,17 +1180,21 @@ mod test {
) )
.unwrap(); .unwrap();
assert_eq!(value["title"], "Harry Potter and"); assert_eq!(value["title"], "Harry Potter");
assert_eq!(value["author"], "J. K. Rowling"); assert_eq!(value["author"], "J. K. Rowling");
} }
#[test] #[test]
fn formatted_with_crop_10() { fn formatted_with_crop_5() {
let stop_words = fst::Set::default(); let stop_words = fst::Set::default();
let mut config = AnalyzerConfig::default(); let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words); config.stop_words(&stop_words);
let analyzer = Analyzer::new(config); let analyzer = Analyzer::new(config);
let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>"))); let formatter = Formatter::new(
&analyzer,
(String::from("<em>"), String::from("</em>")),
String::from(""),
);
let mut fields = FieldsIdsMap::new(); let mut fields = FieldsIdsMap::new();
let title = fields.insert("title").unwrap(); let title = fields.insert("title").unwrap();
@ -1114,7 +1225,7 @@ mod test {
title, title,
FormatOptions { FormatOptions {
highlight: false, highlight: false,
crop: Some(10), crop: Some(5),
}, },
); );
formatted_options.insert( formatted_options.insert(
@ -1126,7 +1237,7 @@ mod test {
); );
let mut matching_words = BTreeMap::new(); let mut matching_words = BTreeMap::new();
matching_words.insert("potter", Some(6)); matching_words.insert("potter", Some(5));
let value = format_fields( let value = format_fields(
&fields, &fields,
@ -1137,7 +1248,7 @@ mod test {
) )
.unwrap(); .unwrap();
assert_eq!(value["title"], "Harry Potter and the Half"); assert_eq!(value["title"], "Harry Potter and the Half");
assert_eq!(value["author"], "J. K. Rowling"); assert_eq!(value["author"], "J. K. Rowling");
} }
@ -1147,7 +1258,11 @@ mod test {
let mut config = AnalyzerConfig::default(); let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words); config.stop_words(&stop_words);
let analyzer = Analyzer::new(config); let analyzer = Analyzer::new(config);
let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>"))); let formatter = Formatter::new(
&analyzer,
(String::from("<em>"), String::from("</em>")),
String::from(""),
);
let mut fields = FieldsIdsMap::new(); let mut fields = FieldsIdsMap::new();
let title = fields.insert("title").unwrap(); let title = fields.insert("title").unwrap();
@ -1201,7 +1316,7 @@ mod test {
) )
.unwrap(); .unwrap();
assert_eq!(value["title"], "Potter"); assert_eq!(value["title"], "Harry Potter and the Half-Blood Prince");
assert_eq!(value["author"], "J. K. Rowling"); assert_eq!(value["author"], "J. K. Rowling");
} }
@ -1211,7 +1326,11 @@ mod test {
let mut config = AnalyzerConfig::default(); let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words); config.stop_words(&stop_words);
let analyzer = Analyzer::new(config); let analyzer = Analyzer::new(config);
let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>"))); let formatter = Formatter::new(
&analyzer,
(String::from("<em>"), String::from("</em>")),
String::from(""),
);
let mut fields = FieldsIdsMap::new(); let mut fields = FieldsIdsMap::new();
let title = fields.insert("title").unwrap(); let title = fields.insert("title").unwrap();
@ -1242,7 +1361,7 @@ mod test {
title, title,
FormatOptions { FormatOptions {
highlight: false, highlight: false,
crop: Some(6), crop: Some(1),
}, },
); );
formatted_options.insert( formatted_options.insert(
@ -1265,7 +1384,7 @@ mod test {
) )
.unwrap(); .unwrap();
assert_eq!(value["title"], "Harry "); assert_eq!(value["title"], "Harry");
assert_eq!(value["author"], "J. K. Rowling"); assert_eq!(value["author"], "J. K. Rowling");
} }
@ -1275,7 +1394,11 @@ mod test {
let mut config = AnalyzerConfig::default(); let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words); config.stop_words(&stop_words);
let analyzer = Analyzer::new(config); let analyzer = Analyzer::new(config);
let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>"))); let formatter = Formatter::new(
&analyzer,
(String::from("<em>"), String::from("</em>")),
String::from(""),
);
let mut fields = FieldsIdsMap::new(); let mut fields = FieldsIdsMap::new();
let title = fields.insert("title").unwrap(); let title = fields.insert("title").unwrap();
@ -1329,7 +1452,7 @@ mod test {
) )
.unwrap(); .unwrap();
assert_eq!(value["title"], " <em>and</em> "); assert_eq!(value["title"], "…<em>and</em>…");
assert_eq!(value["author"], "J. K. Rowling"); assert_eq!(value["author"], "J. K. Rowling");
} }
@ -1339,7 +1462,11 @@ mod test {
let mut config = AnalyzerConfig::default(); let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words); config.stop_words(&stop_words);
let analyzer = Analyzer::new(config); let analyzer = Analyzer::new(config);
let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>"))); let formatter = Formatter::new(
&analyzer,
(String::from("<em>"), String::from("</em>")),
String::from(""),
);
let mut fields = FieldsIdsMap::new(); let mut fields = FieldsIdsMap::new();
let title = fields.insert("title").unwrap(); let title = fields.insert("title").unwrap();
@ -1370,7 +1497,7 @@ mod test {
title, title,
FormatOptions { FormatOptions {
highlight: true, highlight: true,
crop: Some(9), crop: Some(4),
}, },
); );
formatted_options.insert( formatted_options.insert(
@ -1393,7 +1520,7 @@ mod test {
) )
.unwrap(); .unwrap();
assert_eq!(value["title"], "the Half-<em>Blo</em>od Prince"); assert_eq!(value["title"], "the Half-<em>Blo</em>od Prince");
assert_eq!(value["author"], "J. K. Rowling"); assert_eq!(value["author"], "J. K. Rowling");
} }

View File

@ -651,6 +651,9 @@ mod test {
use crate::index::error::Result as IndexResult; use crate::index::error::Result as IndexResult;
use crate::index::Index; use crate::index::Index;
use crate::index::{
default_crop_marker, default_highlight_post_tag, default_highlight_pre_tag,
};
use crate::index_resolver::index_store::MockIndexStore; use crate::index_resolver::index_store::MockIndexStore;
use crate::index_resolver::meta_store::MockIndexMetaStore; use crate::index_resolver::meta_store::MockIndexMetaStore;
use crate::index_resolver::IndexResolver; use crate::index_resolver::IndexResolver;
@ -691,6 +694,9 @@ mod test {
filter: None, filter: None,
sort: None, sort: None,
facets_distribution: None, facets_distribution: None,
highlight_pre_tag: default_highlight_pre_tag(),
highlight_post_tag: default_highlight_post_tag(),
crop_marker: default_crop_marker(),
}; };
let result = SearchResult { let result = SearchResult {