mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-23 10:37:41 +08:00
Create formater with some tests
This commit is contained in:
parent
900825bac0
commit
d96e72e5dc
@ -1,11 +1,11 @@
|
|||||||
use std::cmp::{min, Reverse};
|
use std::cmp::{min, Reverse};
|
||||||
use std::collections::{BTreeMap, HashSet};
|
use std::collections::{BTreeMap, HashMap};
|
||||||
use std::ops::{Index, IndexMut};
|
use std::ops::{Index, IndexMut};
|
||||||
|
|
||||||
use levenshtein_automata::{Distance, DFA};
|
use levenshtein_automata::{Distance, DFA};
|
||||||
use meilisearch_tokenizer::Token;
|
use meilisearch_tokenizer::Token;
|
||||||
|
|
||||||
use super::build_dfa;
|
use crate::search::build_dfa;
|
||||||
use crate::search::query_tree::{Operation, Query};
|
use crate::search::query_tree::{Operation, Query};
|
||||||
|
|
||||||
type IsPrefix = bool;
|
type IsPrefix = bool;
|
||||||
@ -14,7 +14,7 @@ type IsPrefix = bool;
|
|||||||
/// referencing words that match the given query tree.
|
/// referencing words that match the given query tree.
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct MatchingWords {
|
pub struct MatchingWords {
|
||||||
dfas: Vec<(DFA, String, u8, IsPrefix)>,
|
dfas: Vec<(DFA, String, u8, IsPrefix, usize)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MatchingWords {
|
impl MatchingWords {
|
||||||
@ -23,11 +23,11 @@ impl MatchingWords {
|
|||||||
let mut dfas: Vec<_> = fetch_queries(tree)
|
let mut dfas: Vec<_> = fetch_queries(tree)
|
||||||
.into_iter()
|
.into_iter()
|
||||||
// create DFAs for each word
|
// create DFAs for each word
|
||||||
.map(|(w, t, p)| (build_dfa(w, t, p), w.to_string(), t, p))
|
.map(|((w, t, p), id)| (build_dfa(w, t, p), w.to_string(), t, p, id))
|
||||||
.collect();
|
.collect();
|
||||||
// Sort words by length in descending order, prioritizing the longest word,
|
// Sort words by length in descending order, prioritizing the longest word,
|
||||||
// in order to highlight the longest part of the matched word.
|
// in order to highlight the longest part of the matched word.
|
||||||
dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| {
|
dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix, _id)| {
|
||||||
Reverse(query_word.len())
|
Reverse(query_word.len())
|
||||||
});
|
});
|
||||||
Self { dfas }
|
Self { dfas }
|
||||||
@ -35,14 +35,21 @@ impl MatchingWords {
|
|||||||
|
|
||||||
/// Returns the number of matching bytes if the word matches one of the query words.
|
/// Returns the number of matching bytes if the word matches one of the query words.
|
||||||
pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
|
pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
|
||||||
self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| {
|
self.matching_bytes_with_id(word_to_highlight).map(|(len, _)| len)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn matching_bytes_with_id(&self, word_to_highlight: &Token) -> Option<(usize, usize)> {
|
||||||
|
self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix, id)| {
|
||||||
match dfa.eval(word_to_highlight.text()) {
|
match dfa.eval(word_to_highlight.text()) {
|
||||||
Distance::Exact(t) if t <= *typo => {
|
Distance::Exact(t) if t <= *typo => {
|
||||||
if *is_prefix {
|
if *is_prefix {
|
||||||
let len = bytes_to_highlight(word_to_highlight.text(), query_word);
|
let len = bytes_to_highlight(word_to_highlight.text(), query_word);
|
||||||
Some(word_to_highlight.num_chars_from_bytes(len))
|
Some((word_to_highlight.num_chars_from_bytes(len), *id))
|
||||||
} else {
|
} else {
|
||||||
Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()))
|
Some((
|
||||||
|
word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()),
|
||||||
|
*id,
|
||||||
|
))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
_otherwise => None,
|
_otherwise => None,
|
||||||
@ -52,26 +59,37 @@ impl MatchingWords {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Lists all words which can be considered as a match for the query tree.
|
/// Lists all words which can be considered as a match for the query tree.
|
||||||
fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
|
fn fetch_queries(tree: &Operation) -> HashMap<(&str, u8, IsPrefix), usize> {
|
||||||
fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
|
fn resolve_ops<'a>(
|
||||||
|
tree: &'a Operation,
|
||||||
|
out: &mut HashMap<(&'a str, u8, IsPrefix), usize>,
|
||||||
|
id: &mut usize,
|
||||||
|
) {
|
||||||
match tree {
|
match tree {
|
||||||
Operation::Or(_, ops) | Operation::And(ops) => {
|
Operation::Or(_, ops) | Operation::And(ops) => {
|
||||||
ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
|
ops.as_slice().iter().for_each(|op| resolve_ops(op, out, id));
|
||||||
}
|
}
|
||||||
Operation::Query(Query { prefix, kind }) => {
|
Operation::Query(Query { prefix, kind }) => {
|
||||||
let typo = if kind.is_exact() { 0 } else { kind.typo() };
|
let typo = if kind.is_exact() { 0 } else { kind.typo() };
|
||||||
out.insert((kind.word(), typo, *prefix));
|
out.entry((kind.word(), typo, *prefix)).or_insert_with(|| {
|
||||||
|
*id += 1;
|
||||||
|
*id
|
||||||
|
});
|
||||||
}
|
}
|
||||||
Operation::Phrase(words) => {
|
Operation::Phrase(words) => {
|
||||||
for word in words {
|
for word in words {
|
||||||
out.insert((word, 0, false));
|
out.entry((word, 0, false)).or_insert_with(|| {
|
||||||
|
*id += 1;
|
||||||
|
*id
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut queries = HashSet::new();
|
let mut queries = HashMap::new();
|
||||||
resolve_ops(tree, &mut queries);
|
let mut id = 0;
|
||||||
|
resolve_ops(tree, &mut queries, &mut id);
|
||||||
queries
|
queries
|
||||||
}
|
}
|
||||||
|
|
434
milli/src/search/matches/mod.rs
Normal file
434
milli/src/search/matches/mod.rs
Normal file
@ -0,0 +1,434 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
|
||||||
|
use matching_words::MatchingWords;
|
||||||
|
use meilisearch_tokenizer::token::SeparatorKind;
|
||||||
|
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token};
|
||||||
|
|
||||||
|
use crate::search::query_tree::Operation;
|
||||||
|
|
||||||
|
pub mod matching_words;
|
||||||
|
|
||||||
|
/// Number of words kept when cropping, unless the builder is told otherwise.
const DEFAULT_CROP_SIZE: usize = 10;
/// Marker inserted where text was cropped out, unless a custom one is set.
const DEFAULT_CROP_MARKER: &str = "…";
/// Text inserted before a highlighted match, unless a custom one is set.
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
/// Text inserted after a highlighted match, unless a custom one is set.
const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
|
||||||
|
|
||||||
|
pub struct MatcherBuilder {
|
||||||
|
matching_words: MatchingWords,
|
||||||
|
crop_size: usize,
|
||||||
|
crop_marker: Option<String>,
|
||||||
|
highlight_prefix: Option<String>,
|
||||||
|
highlight_suffix: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MatcherBuilder {
|
||||||
|
pub fn from_query_tree(query_tree: &Operation) -> Self {
|
||||||
|
let matching_words = MatchingWords::from_query_tree(query_tree);
|
||||||
|
|
||||||
|
Self {
|
||||||
|
matching_words,
|
||||||
|
crop_size: DEFAULT_CROP_SIZE,
|
||||||
|
crop_marker: None,
|
||||||
|
highlight_prefix: None,
|
||||||
|
highlight_suffix: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn crop_size(&mut self, word_count: usize) -> &Self {
|
||||||
|
self.crop_size = word_count;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn crop_marker(&mut self, marker: String) -> &Self {
|
||||||
|
self.crop_marker = Some(marker);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn highlight_prefix(&mut self, prefix: String) -> &Self {
|
||||||
|
self.highlight_prefix = Some(prefix);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn highlight_suffix(&mut self, suffix: String) -> &Self {
|
||||||
|
self.highlight_suffix = Some(suffix);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn build<'t, 'm>(&'m self, tokens: &'t [Token], text: &'t str) -> Matcher<'t, 'm> {
|
||||||
|
let crop_marker = match &self.crop_marker {
|
||||||
|
Some(marker) => marker.as_str(),
|
||||||
|
None => &DEFAULT_CROP_MARKER,
|
||||||
|
};
|
||||||
|
|
||||||
|
let highlight_prefix = match &self.highlight_prefix {
|
||||||
|
Some(marker) => marker.as_str(),
|
||||||
|
None => &DEFAULT_HIGHLIGHT_PREFIX,
|
||||||
|
};
|
||||||
|
let highlight_suffix = match &self.highlight_suffix {
|
||||||
|
Some(marker) => marker.as_str(),
|
||||||
|
None => &DEFAULT_HIGHLIGHT_SUFFIX,
|
||||||
|
};
|
||||||
|
Matcher {
|
||||||
|
text,
|
||||||
|
tokens,
|
||||||
|
matching_words: &self.matching_words,
|
||||||
|
crop_size: self.crop_size,
|
||||||
|
crop_marker,
|
||||||
|
highlight_prefix,
|
||||||
|
highlight_suffix,
|
||||||
|
matches: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// impl Default for MatcherBuilder {
|
||||||
|
// fn default() -> Self {
|
||||||
|
// Self {
|
||||||
|
// crop_size: DEFAULT_CROP_SIZE,
|
||||||
|
// crop_marker: None,
|
||||||
|
// highlight_prefix: None,
|
||||||
|
// highlight_suffix: None,
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
pub struct Match<'t> {
|
||||||
|
token: &'t Token<'t>,
|
||||||
|
match_len: usize,
|
||||||
|
// id of the query word that matches.
|
||||||
|
id: usize,
|
||||||
|
// position of the word in the whole text.
|
||||||
|
position: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Location of a match inside the formatted text.
///
/// The fields are public so that callers receiving these bounds from
/// `Matcher::matches` can actually read them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MatchBounds {
    /// Byte offset at which the matched token starts (the token's `byte_start`).
    pub start: usize,
    /// Length of the match, copied from `Match::match_len`.
    // NOTE(review): the unit (bytes vs. chars) depends on what
    // `matching_bytes_with_id` returns — confirm before slicing with it.
    pub length: usize,
}
|
||||||
|
|
||||||
|
impl<'t> From<&Match<'t>> for MatchBounds {
    /// Extracts the bounds of a match: the start comes from the matched
    /// token's `byte_start`, the length from the match's `match_len`.
    fn from(found: &Match) -> Self {
        let start = found.token.byte_start;
        let length = found.match_len;
        Self { start, length }
    }
}
|
||||||
|
|
||||||
|
pub struct Matcher<'t, 'm> {
|
||||||
|
text: &'t str,
|
||||||
|
tokens: &'t [Token<'t>],
|
||||||
|
matching_words: &'m MatchingWords,
|
||||||
|
crop_size: usize,
|
||||||
|
crop_marker: &'m str,
|
||||||
|
highlight_prefix: &'m str,
|
||||||
|
highlight_suffix: &'m str,
|
||||||
|
matches: Option<Vec<Match<'t>>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'t> Matcher<'t, '_> {
|
||||||
|
fn compute_matches(&mut self) -> &mut Self {
|
||||||
|
let mut matches = Vec::new();
|
||||||
|
let mut position = 0;
|
||||||
|
for token in self.tokens {
|
||||||
|
match token.is_separator() {
|
||||||
|
Some(SeparatorKind::Hard) => position += 7,
|
||||||
|
None => {
|
||||||
|
if let Some((match_len, id)) =
|
||||||
|
self.matching_words.matching_bytes_with_id(&token)
|
||||||
|
{
|
||||||
|
matches.push(Match { token, match_len, id, position });
|
||||||
|
}
|
||||||
|
position += 1;
|
||||||
|
}
|
||||||
|
_otherwise => {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
self.matches = Some(matches);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn matches(&mut self) -> Vec<MatchBounds> {
|
||||||
|
match &self.matches {
|
||||||
|
None => self.compute_matches().matches(),
|
||||||
|
Some(matches) => matches.iter().map(MatchBounds::from).collect(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn crop_bounds(&self, matches: &[Match<'t>]) -> (usize, usize) {
|
||||||
|
let byte_end = self
|
||||||
|
.tokens
|
||||||
|
.iter()
|
||||||
|
.filter(|t| t.is_separator().is_none())
|
||||||
|
.enumerate()
|
||||||
|
.take_while(|(i, _)| *i < self.crop_size)
|
||||||
|
.last()
|
||||||
|
.map_or(self.text.len(), |(_, t)| t.byte_end);
|
||||||
|
|
||||||
|
(0, byte_end)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
|
||||||
|
if !highlight && !crop {
|
||||||
|
// compute matches is not needed if no highlight or crop is requested.
|
||||||
|
Cow::Borrowed(self.text)
|
||||||
|
} else {
|
||||||
|
match &self.matches {
|
||||||
|
Some(matches) => {
|
||||||
|
let (byte_start, byte_end) =
|
||||||
|
if crop { self.crop_bounds(matches) } else { (0, self.text.len()) };
|
||||||
|
|
||||||
|
let mut formatted = Vec::new();
|
||||||
|
|
||||||
|
// push crop marker if it's not the start of the text.
|
||||||
|
if byte_start > 0 && !self.crop_marker.is_empty() {
|
||||||
|
formatted.push(self.crop_marker);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut byte_index = byte_start;
|
||||||
|
|
||||||
|
if highlight {
|
||||||
|
// insert highlight markers around matches.
|
||||||
|
for m in matches
|
||||||
|
.iter()
|
||||||
|
.skip_while(|m| m.token.byte_start < byte_start)
|
||||||
|
.take_while(|m| m.token.byte_start < byte_end)
|
||||||
|
{
|
||||||
|
if byte_index < m.token.byte_start {
|
||||||
|
formatted.push(&self.text[byte_index..m.token.byte_start]);
|
||||||
|
}
|
||||||
|
|
||||||
|
formatted.push(self.highlight_prefix);
|
||||||
|
formatted.push(&self.text[m.token.byte_start..m.token.byte_end]);
|
||||||
|
formatted.push(self.highlight_suffix);
|
||||||
|
|
||||||
|
byte_index = m.token.byte_end;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// push the rest of the text between last match and the end of crop.
|
||||||
|
if byte_index < byte_end {
|
||||||
|
formatted.push(&self.text[byte_index..byte_end]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// push crop marker if it's not the end of the text.
|
||||||
|
if byte_end < self.text.len() && !self.crop_marker.is_empty() {
|
||||||
|
formatted.push(self.crop_marker);
|
||||||
|
}
|
||||||
|
|
||||||
|
if formatted.len() == 1 {
|
||||||
|
// avoid concatenating if there is already 1 slice.
|
||||||
|
Cow::Borrowed(&self.text[byte_start..byte_end])
|
||||||
|
} else {
|
||||||
|
Cow::Owned(formatted.concat())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => self.compute_matches().format(highlight, crop),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::search::query_tree::{Query, QueryKind};

    /// Text without any match.
    const NO_MATCH: &str = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
    /// Text containing all matches.
    const ALL_MATCHES: &str = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World";
    /// Text containing some matches.
    const SOME_MATCHES: &str = "Natalie risk her future to build a world with the boy she loves.";
    /// Text containing a match unordered and a match ordered.
    const UNORDERED_THEN_ORDERED: &str = "The world split void void void void void void void void void split the world void void";

    /// Query tree for `split` (prefix), `the` (exact) and `world` (prefix, 1 typo).
    fn query_tree() -> Operation {
        let split = Query { prefix: true, kind: QueryKind::exact("split".to_string()) };
        let the = Query { prefix: false, kind: QueryKind::exact("the".to_string()) };
        let world = Query { prefix: true, kind: QueryKind::tolerant(1, "world".to_string()) };

        Operation::Or(
            false,
            vec![Operation::And(vec![
                Operation::Query(split),
                Operation::Query(the),
                Operation::Query(world),
            ])],
        )
    }

    /// Tokenizes `text` with a default analyzer, builds a matcher from
    /// `builder` and returns the formatted text.
    fn render(builder: &MatcherBuilder, text: &str, highlight: bool, crop: bool) -> String {
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        let rendered = matcher.format(highlight, crop).into_owned();
        rendered
    }

    #[test]
    fn format_identity() {
        let tree = query_tree();
        let builder = MatcherBuilder::from_query_tree(&tree);
        let (highlight, crop) = (false, false);

        // without crop nor highlight, every text must come back untouched.
        assert_eq!(render(&builder, NO_MATCH, highlight, crop), NO_MATCH);
        assert_eq!(render(&builder, ALL_MATCHES, highlight, crop), ALL_MATCHES);
        assert_eq!(render(&builder, SOME_MATCHES, highlight, crop), SOME_MATCHES);
    }

    #[test]
    fn format_highlight() {
        let tree = query_tree();
        let builder = MatcherBuilder::from_query_tree(&tree);
        let (highlight, crop) = (true, false);

        // nothing matches: the complete text is returned as is.
        assert_eq!(render(&builder, NO_MATCH, highlight, crop), NO_MATCH);

        // complete text, with every match highlighted.
        assert_eq!(
            render(&builder, ALL_MATCHES, highlight, crop),
            "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>"
        );

        // complete text, with the few matches highlighted.
        assert_eq!(
            render(&builder, SOME_MATCHES, highlight, crop),
            "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."
        );
    }

    #[test]
    fn format_crop() {
        let tree = query_tree();
        let builder = MatcherBuilder::from_query_tree(&tree);
        let (highlight, crop) = (false, true);

        // nothing matches: the 10 first words followed by a marker.
        assert_eq!(
            render(&builder, NO_MATCH, highlight, crop),
            "A quick brown fox can not jump 32 feet, right…"
        );

        // the 10 last words preceded by a marker.
        assert_eq!(
            render(&builder, ALL_MATCHES, highlight, crop),
            "…she loves. Emily Henry: The Love That Split The World"
        );

        // the 10 last words preceded by a marker.
        assert_eq!(
            render(&builder, SOME_MATCHES, highlight, crop),
            "…future to build a world with the boy she loves."
        );

        // the crop must favour the ordered match.
        assert_eq!(
            render(&builder, UNORDERED_THEN_ORDERED, highlight, crop),
            "…void void void void void split the world void void"
        );
    }

    #[test]
    fn format_highlight_crop() {
        let tree = query_tree();
        let builder = MatcherBuilder::from_query_tree(&tree);
        let (highlight, crop) = (true, true);

        // nothing matches: the 10 first words followed by a marker.
        assert_eq!(
            render(&builder, NO_MATCH, highlight, crop),
            "A quick brown fox can not jump 32 feet, right…"
        );

        // the 10 last words preceded by a marker, matches highlighted.
        assert_eq!(
            render(&builder, ALL_MATCHES, highlight, crop),
            "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>"
        );

        // the 10 last words preceded by a marker, matches highlighted.
        assert_eq!(
            render(&builder, SOME_MATCHES, highlight, crop),
            "…future to build a <em>world</em> with <em>the</em> boy she loves."
        );

        // the crop must favour the ordered match, which is highlighted.
        assert_eq!(
            render(&builder, UNORDERED_THEN_ORDERED, highlight, crop),
            "…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"
        );
    }
}
|
@ -17,7 +17,7 @@ use roaring::bitmap::RoaringBitmap;
|
|||||||
|
|
||||||
pub use self::facet::{FacetDistribution, FacetNumberIter, Filter};
|
pub use self::facet::{FacetDistribution, FacetNumberIter, Filter};
|
||||||
use self::fst_utils::{Complement, Intersection, StartsWith, Union};
|
use self::fst_utils::{Complement, Intersection, StartsWith, Union};
|
||||||
pub use self::matching_words::MatchingWords;
|
pub use self::matches::matching_words::MatchingWords;
|
||||||
use self::query_tree::QueryTreeBuilder;
|
use self::query_tree::QueryTreeBuilder;
|
||||||
use crate::error::UserError;
|
use crate::error::UserError;
|
||||||
use crate::search::criteria::r#final::{Final, FinalResult};
|
use crate::search::criteria::r#final::{Final, FinalResult};
|
||||||
@ -32,7 +32,7 @@ mod criteria;
|
|||||||
mod distinct;
|
mod distinct;
|
||||||
mod facet;
|
mod facet;
|
||||||
mod fst_utils;
|
mod fst_utils;
|
||||||
mod matching_words;
|
mod matches;
|
||||||
mod query_tree;
|
mod query_tree;
|
||||||
|
|
||||||
pub struct Search<'a> {
|
pub struct Search<'a> {
|
||||||
|
Loading…
Reference in New Issue
Block a user