meilisearch/meilisearch-lib/src/index/search.rs

1518 lines
47 KiB
Rust
Raw Normal View History

2022-03-31 01:06:15 +08:00
use std::cmp::min;
2021-06-17 22:59:01 +08:00
use std::collections::{BTreeMap, BTreeSet, HashSet};
2021-08-24 18:31:35 +08:00
use std::str::FromStr;
2021-03-16 01:11:10 +08:00
use std::time::Instant;
2021-03-04 18:56:32 +08:00
2021-03-16 01:11:10 +08:00
use either::Either;
2021-04-21 03:19:37 +08:00
use indexmap::IndexMap;
use milli::tokenizer::{Analyzer, AnalyzerConfig, Token};
use milli::{AscDesc, FieldId, FieldsIdsMap, Filter, MatchingWords, SortError};
2021-09-27 21:41:14 +08:00
use regex::Regex;
2021-03-16 01:11:10 +08:00
use serde::{Deserialize, Serialize};
2021-09-27 21:41:14 +08:00
use serde_json::{json, Value};
2021-03-04 18:56:32 +08:00
use crate::index::error::FacetError;
2021-10-06 19:01:02 +08:00
use super::error::{IndexError, Result};
2021-10-04 18:15:21 +08:00
use super::index::Index;
2021-03-04 18:56:32 +08:00
2021-04-21 03:19:37 +08:00
/// A document as returned to the user: a map from attribute name to its JSON value.
pub type Document = IndexMap<String, Value>;
2021-06-22 05:38:59 +08:00
/// Match locations found in a document, keyed by attribute name (built by `compute_matches`).
type MatchesInfo = BTreeMap<String, Vec<MatchInfo>>;
2021-10-06 19:01:02 +08:00
#[derive(Serialize, Debug, Clone, PartialEq)]
2021-06-22 05:38:59 +08:00
/// Location of one matching word inside a string value, as recorded by
/// `compute_value_matches`.
pub struct MatchInfo {
// Offset of the word from the beginning of the analyzed string; it is accumulated
// from `str::len` of the preceding reconstructed tokens, i.e. counted in bytes.
start: usize,
// Match length as reported by `Matcher::matches` for that word.
length: usize,
}
2021-04-19 22:22:41 +08:00
2021-03-04 18:56:32 +08:00
/// Value of `SearchQuery::limit` when the request does not provide one.
pub const DEFAULT_SEARCH_LIMIT: usize = 20;
/// Default provider referenced by the `#[serde(default = "...")]` attribute of
/// `SearchQuery::limit`.
const fn default_search_limit() -> usize {
DEFAULT_SEARCH_LIMIT
}
/// Value of `SearchQuery::crop_length` when the request does not provide one.
pub const DEFAULT_CROP_LENGTH: usize = 200;
2021-06-22 20:22:36 +08:00
/// Default provider referenced by the `#[serde(default = "...")]` attribute of
/// `SearchQuery::crop_length`.
pub const fn default_crop_length() -> usize {
DEFAULT_CROP_LENGTH
}
/// The maximum number of results that the engine
/// will be able to return in one search call.
/// `perform_search` clamps both the offset and `offset + limit` to this value.
pub const HARD_RESULT_LIMIT: usize = 1000;
2021-10-06 19:01:02 +08:00
#[derive(Deserialize, Debug, Clone, PartialEq)]
2021-03-04 18:56:32 +08:00
#[serde(rename_all = "camelCase", deny_unknown_fields)]
pub struct SearchQuery {
pub q: Option<String>,
pub offset: Option<usize>,
#[serde(default = "default_search_limit")]
pub limit: usize,
2021-06-16 22:18:55 +08:00
pub attributes_to_retrieve: Option<BTreeSet<String>>,
pub attributes_to_crop: Option<Vec<String>>,
#[serde(default = "default_crop_length")]
pub crop_length: usize,
2021-03-04 18:56:32 +08:00
pub attributes_to_highlight: Option<HashSet<String>>,
2021-06-22 05:38:59 +08:00
// Default to false
#[serde(default = "Default::default")]
pub matches: bool,
pub filter: Option<Value>,
2021-08-24 18:31:35 +08:00
pub sort: Option<Vec<String>>,
2021-06-23 02:07:23 +08:00
pub facets_distribution: Option<Vec<String>>,
2021-03-04 18:56:32 +08:00
}
2021-10-06 19:01:02 +08:00
#[derive(Debug, Clone, Serialize, PartialEq)]
2021-04-19 16:13:13 +08:00
pub struct SearchHit {
#[serde(flatten)]
2021-04-19 22:22:41 +08:00
pub document: Document,
2021-04-20 19:10:50 +08:00
#[serde(rename = "_formatted", skip_serializing_if = "Document::is_empty")]
2021-04-19 22:22:41 +08:00
pub formatted: Document,
2021-06-22 17:06:30 +08:00
#[serde(rename = "_matchesInfo", skip_serializing_if = "Option::is_none")]
2021-06-22 05:38:59 +08:00
pub matches_info: Option<MatchesInfo>,
2021-04-19 16:13:13 +08:00
}
2021-10-06 19:01:02 +08:00
#[derive(Serialize, Debug, Clone, PartialEq)]
2021-03-04 18:56:32 +08:00
#[serde(rename_all = "camelCase")]
pub struct SearchResult {
2021-04-19 16:13:13 +08:00
pub hits: Vec<SearchHit>,
2021-03-04 18:56:32 +08:00
pub nb_hits: u64,
pub exhaustive_nb_hits: bool,
2021-03-04 18:56:32 +08:00
pub query: String,
pub limit: usize,
pub offset: usize,
pub processing_time_ms: u128,
#[serde(skip_serializing_if = "Option::is_none")]
2021-06-23 02:07:23 +08:00
pub facets_distribution: Option<BTreeMap<String, BTreeMap<String, u64>>>,
2021-06-23 17:05:30 +08:00
#[serde(skip_serializing_if = "Option::is_none")]
pub exhaustive_facets_count: Option<bool>,
2021-03-04 18:56:32 +08:00
}
2021-06-14 05:51:33 +08:00
/// Formatting requested for one attribute of the `_formatted` document.
#[derive(Copy, Clone)]
struct FormatOptions {
// Whether matching words are wrapped in the formatter's highlight marks.
highlight: bool,
// When set, the value is cropped around its first matching token using this length.
crop: Option<usize>,
}
2021-03-04 18:56:32 +08:00
impl Index {
pub fn perform_search(&self, query: SearchQuery) -> Result<SearchResult> {
2021-03-04 18:56:32 +08:00
let before_search = Instant::now();
let rtxn = self.read_txn()?;
let mut search = self.search(&rtxn);
if let Some(ref query) = query.q {
search.query(query);
}
2022-03-31 01:06:15 +08:00
// Make sure that a user can't get more documents than the hard limit,
// we align that on the offset too.
let offset = min(query.offset.unwrap_or(0), HARD_RESULT_LIMIT);
let limit = min(query.limit, HARD_RESULT_LIMIT.saturating_sub(offset));
search.offset(offset);
search.limit(limit);
2021-03-04 18:56:32 +08:00
if let Some(ref filter) = query.filter {
if let Some(facets) = parse_filter(filter)? {
2021-06-04 01:36:25 +08:00
search.filter(facets);
2021-03-04 18:56:32 +08:00
}
}
2021-08-24 18:31:35 +08:00
if let Some(ref sort) = query.sort {
let sort = match sort.iter().map(|s| AscDesc::from_str(s)).collect() {
Ok(sorts) => sorts,
2021-09-28 20:49:13 +08:00
Err(asc_desc_error) => {
return Err(IndexError::Milli(SortError::from(asc_desc_error).into()))
2021-09-27 21:41:14 +08:00
}
2021-08-24 18:31:35 +08:00
};
search.sort_criteria(sort);
}
2021-03-04 18:56:32 +08:00
let milli::SearchResult {
documents_ids,
2021-03-12 02:40:18 +08:00
matching_words,
2021-03-04 18:56:32 +08:00
candidates,
..
} = search.execute()?;
2021-06-17 20:36:32 +08:00
2021-03-04 18:56:32 +08:00
let fields_ids_map = self.fields_ids_map(&rtxn).unwrap();
2021-05-31 22:03:39 +08:00
let displayed_ids = self
.displayed_fields_ids(&rtxn)?
2021-06-16 22:18:55 +08:00
.map(|fields| fields.into_iter().collect::<BTreeSet<_>>())
2021-04-20 22:21:30 +08:00
.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
2021-04-20 19:10:50 +08:00
2021-06-16 22:18:55 +08:00
let fids = |attrs: &BTreeSet<String>| {
let mut ids = BTreeSet::new();
2021-04-20 19:10:50 +08:00
for attr in attrs {
if attr == "*" {
ids = displayed_ids.clone();
break;
}
if let Some(id) = fields_ids_map.id(attr) {
ids.insert(id);
}
}
ids
2021-04-19 22:22:41 +08:00
};
2021-06-15 22:21:41 +08:00
// The attributes to retrieve are the ones explicitly marked as to retrieve (all by default),
// but these attributes must be also be present
// - in the fields_ids_map
// - in the the displayed attributes
2021-06-16 22:18:55 +08:00
let to_retrieve_ids: BTreeSet<_> = query
2021-04-19 22:22:41 +08:00
.attributes_to_retrieve
.as_ref()
.map(fids)
2021-06-15 22:21:41 +08:00
.unwrap_or_else(|| displayed_ids.clone())
2021-06-14 05:51:33 +08:00
.intersection(&displayed_ids)
.cloned()
2021-06-15 22:21:41 +08:00
.collect();
2021-04-19 22:22:41 +08:00
2021-06-22 05:38:59 +08:00
let attr_to_highlight = query.attributes_to_highlight.unwrap_or_default();
2021-04-19 22:22:41 +08:00
2021-06-22 05:38:59 +08:00
let attr_to_crop = query.attributes_to_crop.unwrap_or_default();
2021-04-19 22:22:41 +08:00
// Attributes in `formatted_options` correspond to the attributes that will be in `_formatted`
// These attributes are:
// - the attributes asked to be highlighted or cropped (with `attributesToCrop` or `attributesToHighlight`)
// - the attributes asked to be retrieved: these attributes will not be highlighted/cropped
// But these attributes must be also present in displayed attributes
2021-06-16 20:23:08 +08:00
let formatted_options = compute_formatted_options(
&attr_to_highlight,
&attr_to_crop,
query.crop_length,
&to_retrieve_ids,
&fields_ids_map,
&displayed_ids,
);
2021-03-04 18:56:32 +08:00
let stop_words = fst::Set::default();
let mut config = AnalyzerConfig::default();
config.stop_words(&stop_words);
let analyzer = Analyzer::new(config);
let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>")));
2021-03-04 18:56:32 +08:00
let mut documents = Vec::new();
2021-06-17 20:36:32 +08:00
let documents_iter = self.documents(&rtxn, documents_ids)?;
2021-06-21 18:09:59 +08:00
for (_id, obkv) in documents_iter {
2021-09-27 21:41:14 +08:00
let mut document = make_document(&to_retrieve_ids, &fields_ids_map, obkv)?;
2021-06-22 05:38:59 +08:00
2021-06-22 16:08:38 +08:00
let matches_info = query
.matches
.then(|| compute_matches(&matching_words, &document, &analyzer));
2021-06-22 05:38:59 +08:00
2021-06-16 20:23:08 +08:00
let formatted = format_fields(
2021-04-19 22:22:41 +08:00
&fields_ids_map,
obkv,
&formatter,
2021-04-19 22:22:41 +08:00
&matching_words,
2021-06-14 05:51:33 +08:00
&formatted_options,
2021-04-19 22:22:41 +08:00
)?;
2021-09-27 21:41:14 +08:00
if let Some(sort) = query.sort.as_ref() {
insert_geo_distance(sort, &mut document);
}
2021-04-19 16:13:13 +08:00
let hit = SearchHit {
2021-04-19 22:22:41 +08:00
document,
formatted,
2021-06-22 05:38:59 +08:00
matches_info,
2021-04-19 16:13:13 +08:00
};
documents.push(hit);
2021-03-04 18:56:32 +08:00
}
let nb_hits = candidates.len();
2021-06-23 02:07:23 +08:00
let facets_distribution = match query.facets_distribution {
2021-03-04 18:56:32 +08:00
Some(ref fields) => {
2021-06-23 02:07:23 +08:00
let mut facets_distribution = self.facets_distribution(&rtxn);
2021-03-04 18:56:32 +08:00
if fields.iter().all(|f| f != "*") {
2021-06-23 02:07:23 +08:00
facets_distribution.facets(fields);
2021-03-04 18:56:32 +08:00
}
2021-06-23 02:07:23 +08:00
let distribution = facets_distribution.candidates(candidates).execute()?;
Some(distribution)
2021-03-04 18:56:32 +08:00
}
None => None,
};
2021-06-23 17:23:57 +08:00
let exhaustive_facets_count = facets_distribution.as_ref().map(|_| false); // not implemented yet
2021-06-23 17:05:30 +08:00
2021-03-04 18:56:32 +08:00
let result = SearchResult {
exhaustive_nb_hits: false, // not implemented yet
2021-03-04 18:56:32 +08:00
hits: documents,
nb_hits,
query: query.q.clone().unwrap_or_default(),
limit: query.limit,
offset: query.offset.unwrap_or_default(),
processing_time_ms: before_search.elapsed().as_millis(),
2021-06-23 02:07:23 +08:00
facets_distribution,
2021-06-23 17:05:30 +08:00
exhaustive_facets_count,
2021-03-04 18:56:32 +08:00
};
Ok(result)
}
}
2021-09-27 21:41:14 +08:00
/// Adds a `_geoDistance` field to `document` when one of the `sorts` is a
/// `_geoPoint(lat, lng)` criterion and the document has a `_geo` object with numeric
/// `lat` and `lng` fields.
///
/// The distance between the sort point and the document point is computed with
/// `milli::distance_between_two_points`, rounded and stored as an integer.
///
/// The function does nothing when no sort criterion is a `_geoPoint`, when the
/// captured coordinates are not valid floats, or when the document has no usable
/// `_geo` field.
fn insert_geo_distance(sorts: &[String], document: &mut Document) {
    lazy_static::lazy_static! {
        static ref GEO_REGEX: Regex =
            Regex::new(r"_geoPoint\(\s*([[:digit:].\-]+)\s*,\s*([[:digit:].\-]+)\s*\)").unwrap();
    };

    let capture_group = match sorts.iter().find_map(|sort| GEO_REGEX.captures(sort)) {
        Some(capture_group) => capture_group,
        None => return,
    };

    // The regex is looser than the float grammar (it accepts `1.2.3` or `--`), so the
    // captures may not parse: skip the distance rather than panic in that case.
    let base: [f64; 2] = match (
        capture_group[1].parse::<f64>(),
        capture_group[2].parse::<f64>(),
    ) {
        (Ok(lat), Ok(lng)) => [lat, lng],
        _ => return,
    };

    let geo_point = &document.get("_geo").unwrap_or(&json!(null));
    if let Some((lat, lng)) = geo_point["lat"].as_f64().zip(geo_point["lng"].as_f64()) {
        let distance = milli::distance_between_two_points(&base, &[lat, lng]);
        document.insert("_geoDistance".to_string(), json!(distance.round() as usize));
    }
}
fn compute_matches<A: AsRef<[u8]>>(
matcher: &impl Matcher,
document: &Document,
2021-06-23 20:48:33 +08:00
analyzer: &Analyzer<A>,
) -> MatchesInfo {
2021-06-22 05:38:59 +08:00
let mut matches = BTreeMap::new();
for (key, value) in document {
let mut infos = Vec::new();
2021-07-30 00:14:36 +08:00
compute_value_matches(&mut infos, value, matcher, analyzer);
2021-06-22 05:38:59 +08:00
if !infos.is_empty() {
matches.insert(key.clone(), infos);
}
}
matches
}
fn compute_value_matches<'a, A: AsRef<[u8]>>(
infos: &mut Vec<MatchInfo>,
value: &Value,
matcher: &impl Matcher,
analyzer: &Analyzer<'a, A>,
) {
match value {
Value::String(s) => {
let analyzed = analyzer.analyze(s);
let mut start = 0;
for (word, token) in analyzed.reconstruct() {
if token.is_word() {
2022-01-19 18:21:19 +08:00
if let Some(length) = matcher.matches(&token) {
2021-06-22 05:38:59 +08:00
infos.push(MatchInfo { start, length });
}
}
start += word.len();
}
}
Value::Array(vals) => vals
.iter()
.for_each(|val| compute_value_matches(infos, val, matcher, analyzer)),
Value::Object(vals) => vals
.values()
.for_each(|val| compute_value_matches(infos, val, matcher, analyzer)),
2021-11-10 23:10:30 +08:00
Value::Number(number) => {
compute_value_matches(infos, &Value::String(number.to_string()), matcher, analyzer)
}
2021-06-22 05:38:59 +08:00
_ => (),
}
}
2021-06-16 20:23:08 +08:00
fn compute_formatted_options(
attr_to_highlight: &HashSet<String>,
attr_to_crop: &[String],
query_crop_length: usize,
to_retrieve_ids: &BTreeSet<FieldId>,
fields_ids_map: &FieldsIdsMap,
displayed_ids: &BTreeSet<FieldId>,
2021-06-22 05:38:59 +08:00
) -> BTreeMap<FieldId, FormatOptions> {
let mut formatted_options = BTreeMap::new();
add_highlight_to_formatted_options(
&mut formatted_options,
2021-06-16 23:13:21 +08:00
attr_to_highlight,
fields_ids_map,
displayed_ids,
);
add_crop_to_formatted_options(
&mut formatted_options,
2021-06-16 23:13:21 +08:00
attr_to_crop,
query_crop_length,
fields_ids_map,
displayed_ids,
);
// Should not return `_formatted` if no valid attributes to highlight/crop
if !formatted_options.is_empty() {
2021-06-22 05:38:59 +08:00
add_non_formatted_ids_to_formatted_options(&mut formatted_options, to_retrieve_ids);
}
2021-06-16 23:13:21 +08:00
formatted_options
}
fn add_highlight_to_formatted_options(
formatted_options: &mut BTreeMap<FieldId, FormatOptions>,
2021-06-16 23:13:21 +08:00
attr_to_highlight: &HashSet<String>,
fields_ids_map: &FieldsIdsMap,
displayed_ids: &BTreeSet<FieldId>,
) {
for attr in attr_to_highlight {
let new_format = FormatOptions {
highlight: true,
crop: None,
};
if attr == "*" {
2021-06-16 00:44:56 +08:00
for id in displayed_ids {
formatted_options.insert(*id, new_format);
}
break;
}
2021-07-30 00:14:36 +08:00
if let Some(id) = fields_ids_map.id(attr) {
if displayed_ids.contains(&id) {
formatted_options.insert(id, new_format);
}
}
2021-06-16 00:44:56 +08:00
}
2021-06-16 23:13:21 +08:00
}
2021-06-16 23:13:21 +08:00
fn add_crop_to_formatted_options(
formatted_options: &mut BTreeMap<FieldId, FormatOptions>,
2021-06-16 23:13:21 +08:00
attr_to_crop: &[String],
crop_length: usize,
fields_ids_map: &FieldsIdsMap,
displayed_ids: &BTreeSet<FieldId>,
) {
for attr in attr_to_crop {
2021-06-17 22:59:01 +08:00
let mut split = attr.rsplitn(2, ':');
let (attr_name, attr_len) = match split.next().zip(split.next()) {
2021-06-16 00:44:56 +08:00
Some((len, name)) => {
2021-06-17 22:59:01 +08:00
let crop_len = len.parse::<usize>().unwrap_or(crop_length);
(name, crop_len)
2021-06-22 05:38:59 +08:00
}
2021-06-17 22:59:01 +08:00
None => (attr.as_str(), crop_length),
2021-06-16 00:44:56 +08:00
};
if attr_name == "*" {
2021-06-16 00:44:56 +08:00
for id in displayed_ids {
formatted_options
.entry(*id)
.and_modify(|f| f.crop = Some(attr_len))
.or_insert(FormatOptions {
highlight: false,
crop: Some(attr_len),
});
}
}
2021-07-30 00:14:36 +08:00
if let Some(id) = fields_ids_map.id(attr_name) {
if displayed_ids.contains(&id) {
formatted_options
.entry(id)
.and_modify(|f| f.crop = Some(attr_len))
.or_insert(FormatOptions {
highlight: false,
crop: Some(attr_len),
});
}
}
}
}
fn add_non_formatted_ids_to_formatted_options(
formatted_options: &mut BTreeMap<FieldId, FormatOptions>,
to_retrieve_ids: &BTreeSet<FieldId>,
) {
for id in to_retrieve_ids {
2021-06-22 05:38:59 +08:00
formatted_options.entry(*id).or_insert(FormatOptions {
highlight: false,
crop: None,
});
}
}
2021-04-21 03:19:37 +08:00
fn make_document(
2021-06-16 22:18:55 +08:00
attributes_to_retrieve: &BTreeSet<FieldId>,
2021-04-21 03:19:37 +08:00
field_ids_map: &FieldsIdsMap,
2021-07-28 16:52:47 +08:00
obkv: obkv::KvReaderU16,
) -> Result<Document> {
2021-04-21 03:19:37 +08:00
let mut document = Document::new();
2021-06-22 05:38:59 +08:00
2021-04-21 03:19:37 +08:00
for attr in attributes_to_retrieve {
if let Some(value) = obkv.get(*attr) {
let value = serde_json::from_slice(value)?;
// This unwrap must be safe since we got the ids from the fields_ids_map just
// before.
let key = field_ids_map
.name(*attr)
.expect("Missing field name")
.to_string();
document.insert(key, value);
}
}
Ok(document)
}
2021-06-16 20:23:08 +08:00
fn format_fields<A: AsRef<[u8]>>(
2021-04-19 22:22:41 +08:00
field_ids_map: &FieldsIdsMap,
2021-07-28 16:52:47 +08:00
obkv: obkv::KvReaderU16,
formatter: &Formatter<A>,
2021-04-20 01:03:53 +08:00
matching_words: &impl Matcher,
formatted_options: &BTreeMap<FieldId, FormatOptions>,
) -> Result<Document> {
2021-04-19 22:22:41 +08:00
let mut document = Document::new();
for (id, format) in formatted_options {
if let Some(value) = obkv.get(*id) {
2021-04-19 22:22:41 +08:00
let mut value: Value = serde_json::from_slice(value)?;
2021-06-22 05:38:59 +08:00
value = formatter.format_value(value, matching_words, *format);
2021-04-19 22:22:41 +08:00
// This unwrap must be safe since we got the ids from the fields_ids_map just
// before.
let key = field_ids_map
.name(*id)
2021-04-19 22:22:41 +08:00
.expect("Missing field name")
.to_string();
document.insert(key, value);
}
}
Ok(document)
}
2021-06-16 20:23:08 +08:00
/// trait to allow unit testing of `format_fields`
2021-04-20 01:03:53 +08:00
trait Matcher {
2022-01-19 18:21:19 +08:00
fn matches(&self, w: &Token) -> Option<usize>;
2021-04-20 01:03:53 +08:00
}
/// Test-only matcher: maps the text of a token to the match length to report.
#[cfg(test)]
impl Matcher for BTreeMap<&str, Option<usize>> {
    fn matches(&self, w: &Token) -> Option<usize> {
        // An unknown text and a `None` entry both mean "no match".
        self.get(w.text()).copied().flatten()
    }
}
impl Matcher for MatchingWords {
2022-01-19 18:21:19 +08:00
fn matches(&self, w: &Token) -> Option<usize> {
self.matching_bytes(w)
2021-04-20 01:03:53 +08:00
}
}
2021-05-12 00:30:55 +08:00
struct Formatter<'a, A> {
analyzer: &'a Analyzer<'a, A>,
2021-04-19 22:22:41 +08:00
marks: (String, String),
2021-03-04 18:56:32 +08:00
}
2021-05-12 00:30:55 +08:00
impl<'a, A: AsRef<[u8]>> Formatter<'a, A> {
pub fn new(analyzer: &'a Analyzer<'a, A>, marks: (String, String)) -> Self {
2021-04-19 22:22:41 +08:00
Self { analyzer, marks }
2021-03-04 18:56:32 +08:00
}
2021-05-05 23:31:40 +08:00
fn format_value(
&self,
value: Value,
matcher: &impl Matcher,
2021-06-16 22:18:55 +08:00
format_options: FormatOptions,
2021-05-12 00:30:55 +08:00
) -> Value {
2021-03-04 18:56:32 +08:00
match value {
Value::String(old_string) => {
2021-06-22 05:38:59 +08:00
let value = self.format_string(old_string, matcher, format_options);
2021-05-05 23:31:40 +08:00
Value::String(value)
2021-03-04 18:56:32 +08:00
}
Value::Array(values) => Value::Array(
values
2021-03-16 01:11:10 +08:00
.into_iter()
2021-06-22 05:38:59 +08:00
.map(|v| {
self.format_value(
v,
matcher,
FormatOptions {
highlight: format_options.highlight,
crop: None,
},
)
})
2021-03-16 01:11:10 +08:00
.collect(),
2021-03-04 18:56:32 +08:00
),
Value::Object(object) => Value::Object(
object
2021-03-16 01:11:10 +08:00
.into_iter()
2021-06-22 05:38:59 +08:00
.map(|(k, v)| {
(
k,
self.format_value(
v,
matcher,
FormatOptions {
highlight: format_options.highlight,
crop: None,
},
),
)
})
2021-03-16 01:11:10 +08:00
.collect(),
2021-03-04 18:56:32 +08:00
),
Value::Number(number) => {
let number_string_value =
self.format_string(number.to_string(), matcher, format_options);
Value::String(number_string_value)
}
2021-05-05 23:31:40 +08:00
value => value,
2021-03-04 18:56:32 +08:00
}
}
2021-06-13 17:53:29 +08:00
2021-05-12 00:30:55 +08:00
fn format_string(
&self,
s: String,
matcher: &impl Matcher,
2021-06-16 22:18:55 +08:00
format_options: FormatOptions,
2021-05-12 00:30:55 +08:00
) -> String {
let analyzed = self.analyzer.analyze(&s);
2021-05-11 23:27:31 +08:00
2021-06-16 22:18:55 +08:00
let tokens: Box<dyn Iterator<Item = (&str, Token)>> = match format_options.crop {
2021-05-11 23:27:31 +08:00
Some(crop_len) => {
2021-06-17 22:59:01 +08:00
let mut buffer = Vec::new();
2021-05-12 00:30:55 +08:00
let mut tokens = analyzed.reconstruct().peekable();
2021-06-17 22:59:01 +08:00
2021-06-22 05:38:59 +08:00
while let Some((word, token)) =
2022-01-19 18:21:19 +08:00
tokens.next_if(|(_, token)| matcher.matches(token).is_none())
2021-06-22 05:38:59 +08:00
{
2021-06-17 22:59:01 +08:00
buffer.push((word, token));
2021-05-12 00:30:55 +08:00
}
2021-06-17 22:59:01 +08:00
match tokens.next() {
Some(token) => {
let mut total_len: usize = buffer.iter().map(|(word, _)| word.len()).sum();
let before_iter = buffer.into_iter().skip_while(move |(word, _)| {
total_len -= word.len();
2021-06-17 23:03:43 +08:00
total_len >= crop_len
2021-06-17 22:59:01 +08:00
});
let mut taken_after = 0;
2021-06-22 05:38:59 +08:00
let after_iter = tokens.take_while(move |(word, _)| {
2021-06-17 22:59:01 +08:00
let take = taken_after < crop_len;
taken_after += word.chars().count();
take
});
2021-06-22 05:38:59 +08:00
let iter = before_iter.chain(Some(token)).chain(after_iter);
2021-06-17 22:59:01 +08:00
Box::new(iter)
2021-06-22 05:38:59 +08:00
}
2021-06-17 22:59:01 +08:00
// If no word matches in the attribute
None => {
let mut count = 0;
let iter = buffer.into_iter().take_while(move |(word, _)| {
let take = count < crop_len;
count += word.len();
take
});
Box::new(iter)
}
2021-05-12 00:30:55 +08:00
}
}
2021-05-11 23:27:31 +08:00
None => Box::new(analyzed.reconstruct()),
2021-05-05 23:31:40 +08:00
};
2021-06-22 16:17:39 +08:00
tokens.fold(String::new(), |mut out, (word, token)| {
// Check if we need to do highlighting or computed matches before calling
// Matcher::match since the call is expensive.
if format_options.highlight && token.is_word() {
2022-01-19 18:21:19 +08:00
if let Some(length) = matcher.matches(&token) {
match word.get(..length).zip(word.get(length..)) {
Some((head, tail)) => {
out.push_str(&self.marks.0);
out.push_str(head);
out.push_str(&self.marks.1);
out.push_str(tail);
}
// if we are in the middle of a character
// or if all the word should be highlighted,
// we highlight the complete word.
None => {
out.push_str(&self.marks.0);
2021-07-30 00:14:36 +08:00
out.push_str(word);
out.push_str(&self.marks.1);
}
}
return out;
2021-05-12 00:30:55 +08:00
}
2021-06-22 16:17:39 +08:00
}
out.push_str(word);
out
})
2021-05-05 23:31:40 +08:00
}
2021-03-04 18:56:32 +08:00
}
fn parse_filter(facets: &Value) -> Result<Option<Filter>> {
2021-03-04 18:56:32 +08:00
match facets {
Value::String(expr) => {
let condition = Filter::from_str(expr)?;
2022-01-19 18:21:19 +08:00
Ok(condition)
}
Value::Array(arr) => parse_filter_array(arr),
2021-06-17 20:38:52 +08:00
v => Err(FacetError::InvalidExpression(&["Array"], v.clone()).into()),
2021-03-04 18:56:32 +08:00
}
}
2021-04-20 01:03:53 +08:00
fn parse_filter_array(arr: &[Value]) -> Result<Option<Filter>> {
2021-05-05 00:22:48 +08:00
let mut ands = Vec::new();
for value in arr {
match value {
Value::String(s) => ands.push(Either::Right(s.as_str())),
2021-05-05 00:22:48 +08:00
Value::Array(arr) => {
let mut ors = Vec::new();
for value in arr {
match value {
Value::String(s) => ors.push(s.as_str()),
v => {
return Err(FacetError::InvalidExpression(&["String"], v.clone()).into())
}
2021-05-05 00:22:48 +08:00
}
}
ands.push(Either::Left(ors));
}
v => {
return Err(
FacetError::InvalidExpression(&["String", "[String]"], v.clone()).into(),
)
}
2021-05-05 00:22:48 +08:00
}
}
Ok(Filter::from_array(ands)?)
2021-05-05 00:22:48 +08:00
}
2021-04-20 01:03:53 +08:00
#[cfg(test)]
mod test {
use super::*;
#[test]
fn no_ids_no_formatted() {
    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);
    let marks = ("<em>".to_string(), "</em>".to_string());
    let formatter = Formatter::new(&analyzer, marks);

    let mut fields = FieldsIdsMap::new();
    let id = fields.insert("test").unwrap();

    let mut buf = Vec::new();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(id, Value::from("hello").to_string().as_bytes())
        .unwrap();
    writer.finish().unwrap();

    let reader = obkv::KvReader::new(&buf);

    // No formatting options at all: nothing must end up in the formatted document.
    let formatted_options = BTreeMap::new();
    let matching_words = MatchingWords::default();

    let value = format_fields(
        &fields,
        reader,
        &formatter,
        &matching_words,
        &formatted_options,
    )
    .unwrap();

    assert!(value.is_empty());
}
#[test]
fn formatted_with_highlight_in_word() {
    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);
    let marks = ("<em>".to_string(), "</em>".to_string());
    let formatter = Formatter::new(&analyzer, marks);

    let mut fields = FieldsIdsMap::new();
    let title = fields.insert("title").unwrap();
    let author = fields.insert("author").unwrap();

    // One writer per field, all appending to the same buffer.
    let mut buf = Vec::new();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(title, Value::from("The Hobbit").to_string().as_bytes())
        .unwrap();
    writer.finish().unwrap();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            author,
            Value::from("J. R. R. Tolkien").to_string().as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();

    let reader = obkv::KvReader::new(&buf);

    let formatted_options: BTreeMap<_, _> = vec![
        (
            title,
            FormatOptions {
                highlight: true,
                crop: None,
            },
        ),
        (
            author,
            FormatOptions {
                highlight: false,
                crop: None,
            },
        ),
    ]
    .into_iter()
    .collect();

    // Only the first 3 bytes of "hobbit" match.
    let matching_words: BTreeMap<&str, Option<usize>> =
        vec![("hobbit", Some(3))].into_iter().collect();

    let value = format_fields(
        &fields,
        reader,
        &formatter,
        &matching_words,
        &formatted_options,
    )
    .unwrap();

    assert_eq!(value["title"], "The <em>Hob</em>bit");
    assert_eq!(value["author"], "J. R. R. Tolkien");
}
2021-10-08 21:07:45 +08:00
#[test]
fn formatted_with_highlight_in_number() {
    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);
    let marks = ("<em>".to_string(), "</em>".to_string());
    let formatter = Formatter::new(&analyzer, marks);

    let mut fields = FieldsIdsMap::new();
    let title = fields.insert("title").unwrap();
    let author = fields.insert("author").unwrap();
    let publication_year = fields.insert("publication_year").unwrap();

    // One writer per field, all appending to the same buffer.
    let mut buf = Vec::new();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(title, Value::from("The Hobbit").to_string().as_bytes())
        .unwrap();
    writer.finish().unwrap();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            author,
            Value::from("J. R. R. Tolkien").to_string().as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            publication_year,
            Value::Number(1937.into()).to_string().as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();

    let reader = obkv::KvReader::new(&buf);

    let formatted_options: BTreeMap<_, _> = vec![
        (
            title,
            FormatOptions {
                highlight: false,
                crop: None,
            },
        ),
        (
            author,
            FormatOptions {
                highlight: false,
                crop: None,
            },
        ),
        (
            publication_year,
            FormatOptions {
                highlight: true,
                crop: None,
            },
        ),
    ]
    .into_iter()
    .collect();

    let matching_words: BTreeMap<&str, Option<usize>> =
        vec![("1937", Some(4))].into_iter().collect();

    let value = format_fields(
        &fields,
        reader,
        &formatter,
        &matching_words,
        &formatted_options,
    )
    .unwrap();

    assert_eq!(value["title"], "The Hobbit");
    assert_eq!(value["author"], "J. R. R. Tolkien");
    // The highlighted number comes back as a string.
    assert_eq!(value["publication_year"], "<em>1937</em>");
}
/// https://github.com/meilisearch/meilisearch/issues/1368
#[test]
fn formatted_with_highlight_emoji() {
    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);
    let marks = ("<em>".to_string(), "</em>".to_string());
    let formatter = Formatter::new(&analyzer, marks);

    let mut fields = FieldsIdsMap::new();
    let title = fields.insert("title").unwrap();
    let author = fields.insert("author").unwrap();

    // One writer per field, all appending to the same buffer.
    let mut buf = Vec::new();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(title, Value::from("Go💼od luck.").to_string().as_bytes())
        .unwrap();
    writer.finish().unwrap();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(author, Value::from("JacobLey").to_string().as_bytes())
        .unwrap();
    writer.finish().unwrap();

    let reader = obkv::KvReader::new(&buf);

    let formatted_options: BTreeMap<_, _> = vec![
        (
            title,
            FormatOptions {
                highlight: true,
                crop: None,
            },
        ),
        (
            author,
            FormatOptions {
                highlight: false,
                crop: None,
            },
        ),
    ]
    .into_iter()
    .collect();

    // emojis are deunicoded during tokenization
    // TODO Tokenizer should remove spaces after deunicode
    let matching_words: BTreeMap<&str, Option<usize>> =
        vec![("gobriefcase od", Some(11))].into_iter().collect();

    let value = format_fields(
        &fields,
        reader,
        &formatter,
        &matching_words,
        &formatted_options,
    )
    .unwrap();

    assert_eq!(value["title"], "<em>Go💼od</em> luck.");
    assert_eq!(value["author"], "JacobLey");
}
2021-07-01 22:44:17 +08:00
#[test]
fn formatted_with_highlight_in_unicode_word() {
    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);
    let marks = ("<em>".to_string(), "</em>".to_string());
    let formatter = Formatter::new(&analyzer, marks);

    let mut fields = FieldsIdsMap::new();
    let title = fields.insert("title").unwrap();
    let author = fields.insert("author").unwrap();

    // One writer per field, all appending to the same buffer.
    let mut buf = Vec::new();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(title, Value::from("étoile").to_string().as_bytes())
        .unwrap();
    writer.finish().unwrap();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            author,
            Value::from("J. R. R. Tolkien").to_string().as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();

    let reader = obkv::KvReader::new(&buf);

    let formatted_options: BTreeMap<_, _> = vec![
        (
            title,
            FormatOptions {
                highlight: true,
                crop: None,
            },
        ),
        (
            author,
            FormatOptions {
                highlight: false,
                crop: None,
            },
        ),
    ]
    .into_iter()
    .collect();

    // A length of 1 falls in the middle of the two-byte "é": the whole word is
    // expected to be highlighted.
    let matching_words: BTreeMap<&str, Option<usize>> =
        vec![("etoile", Some(1))].into_iter().collect();

    let value = format_fields(
        &fields,
        reader,
        &formatter,
        &matching_words,
        &formatted_options,
    )
    .unwrap();

    assert_eq!(value["title"], "<em>étoile</em>");
    assert_eq!(value["author"], "J. R. R. Tolkien");
}
2021-04-20 01:03:53 +08:00
#[test]
fn formatted_with_crop_2() {
    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);
    let marks = ("<em>".to_string(), "</em>".to_string());
    let formatter = Formatter::new(&analyzer, marks);

    let mut fields = FieldsIdsMap::new();
    let title = fields.insert("title").unwrap();
    let author = fields.insert("author").unwrap();

    // One writer per field, all appending to the same buffer.
    let mut buf = Vec::new();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            title,
            Value::from("Harry Potter and the Half-Blood Prince")
                .to_string()
                .as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(author, Value::from("J. K. Rowling").to_string().as_bytes())
        .unwrap();
    writer.finish().unwrap();

    let reader = obkv::KvReader::new(&buf);

    let formatted_options: BTreeMap<_, _> = vec![
        (
            title,
            FormatOptions {
                highlight: false,
                crop: Some(2),
            },
        ),
        (
            author,
            FormatOptions {
                highlight: false,
                crop: None,
            },
        ),
    ]
    .into_iter()
    .collect();

    let matching_words: BTreeMap<&str, Option<usize>> =
        vec![("potter", Some(6))].into_iter().collect();

    let value = format_fields(
        &fields,
        reader,
        &formatter,
        &matching_words,
        &formatted_options,
    )
    .unwrap();

    assert_eq!(value["title"], "Harry Potter and");
    assert_eq!(value["author"], "J. K. Rowling");
}
#[test]
fn formatted_with_crop_10() {
    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);
    let marks = ("<em>".to_string(), "</em>".to_string());
    let formatter = Formatter::new(&analyzer, marks);

    let mut fields = FieldsIdsMap::new();
    let title = fields.insert("title").unwrap();
    let author = fields.insert("author").unwrap();

    // One writer per field, all appending to the same buffer.
    let mut buf = Vec::new();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            title,
            Value::from("Harry Potter and the Half-Blood Prince")
                .to_string()
                .as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(author, Value::from("J. K. Rowling").to_string().as_bytes())
        .unwrap();
    writer.finish().unwrap();

    let reader = obkv::KvReader::new(&buf);

    let formatted_options: BTreeMap<_, _> = vec![
        (
            title,
            FormatOptions {
                highlight: false,
                crop: Some(10),
            },
        ),
        (
            author,
            FormatOptions {
                highlight: false,
                crop: None,
            },
        ),
    ]
    .into_iter()
    .collect();

    let matching_words: BTreeMap<&str, Option<usize>> =
        vec![("potter", Some(6))].into_iter().collect();

    let value = format_fields(
        &fields,
        reader,
        &formatter,
        &matching_words,
        &formatted_options,
    )
    .unwrap();

    assert_eq!(value["title"], "Harry Potter and the Half");
    assert_eq!(value["author"], "J. K. Rowling");
}
/// Crops the title with `crop: Some(0)` (no highlighting) while the matcher
/// knows the word "potter".
///
/// Expects only the matched word, `"Potter"`, to remain in the title and the
/// un-cropped `author` field to be returned unchanged.
#[test]
fn formatted_with_crop_0() {
    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);
    let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>")));

    let mut fields = FieldsIdsMap::new();
    let title = fields.insert("title").unwrap();
    let author = fields.insert("author").unwrap();

    // Each field value is stored as the bytes of its JSON encoding; both
    // writers append to the same buffer.
    let mut buf = Vec::new();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            title,
            Value::String("Harry Potter and the Half-Blood Prince".into())
                .to_string()
                .as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();

    writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            author,
            Value::String("J. K. Rowling".into()).to_string().as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();

    let obkv = obkv::KvReader::new(&buf);

    let mut formatted_options = BTreeMap::new();
    formatted_options.insert(
        title,
        FormatOptions {
            highlight: false,
            crop: Some(0),
        },
    );
    formatted_options.insert(
        author,
        FormatOptions {
            highlight: false,
            crop: None,
        },
    );

    let mut matching_words = BTreeMap::new();
    matching_words.insert("potter", Some(6));

    let value = format_fields(
        &fields,
        obkv,
        &formatter,
        &matching_words,
        &formatted_options,
    )
    .unwrap();

    assert_eq!(value["title"], "Potter");
    assert_eq!(value["author"], "J. K. Rowling");
}
/// Crops both fields (`title` with `Some(6)`, `author` with `Some(20)`) when
/// the only word known to the matcher is "rowling", which does not occur in
/// the title text.
///
/// Expects the title to be reduced to `"Harry "` and the author to be
/// returned in full as `"J. K. Rowling"`.
#[test]
fn formatted_with_crop_and_no_match() {
    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);
    let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>")));

    let mut fields = FieldsIdsMap::new();
    let title = fields.insert("title").unwrap();
    let author = fields.insert("author").unwrap();

    // Each field value is stored as the bytes of its JSON encoding; both
    // writers append to the same buffer.
    let mut buf = Vec::new();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            title,
            Value::String("Harry Potter and the Half-Blood Prince".into())
                .to_string()
                .as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();

    writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            author,
            Value::String("J. K. Rowling".into()).to_string().as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();

    let obkv = obkv::KvReader::new(&buf);

    let mut formatted_options = BTreeMap::new();
    formatted_options.insert(
        title,
        FormatOptions {
            highlight: false,
            crop: Some(6),
        },
    );
    formatted_options.insert(
        author,
        FormatOptions {
            highlight: false,
            crop: Some(20),
        },
    );

    let mut matching_words = BTreeMap::new();
    matching_words.insert("rowling", Some(3));

    let value = format_fields(
        &fields,
        obkv,
        &formatter,
        &matching_words,
        &formatted_options,
    )
    .unwrap();

    assert_eq!(value["title"], "Harry ");
    assert_eq!(value["author"], "J. K. Rowling");
}
/// Combines cropping and highlighting on the title (`highlight: true`,
/// `crop: Some(1)`) with the matcher knowing the word "and".
///
/// Expects the title to become `" <em>and</em> "` (the match wrapped in the
/// formatter's `<em>`/`</em>` markers) and the `author` field, which has
/// neither option enabled, to be returned unchanged.
#[test]
fn formatted_with_crop_and_highlight() {
    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);
    let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>")));

    let mut fields = FieldsIdsMap::new();
    let title = fields.insert("title").unwrap();
    let author = fields.insert("author").unwrap();

    // Each field value is stored as the bytes of its JSON encoding; both
    // writers append to the same buffer.
    let mut buf = Vec::new();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            title,
            Value::String("Harry Potter and the Half-Blood Prince".into())
                .to_string()
                .as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();

    writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            author,
            Value::String("J. K. Rowling".into()).to_string().as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();

    let obkv = obkv::KvReader::new(&buf);

    let mut formatted_options = BTreeMap::new();
    formatted_options.insert(
        title,
        FormatOptions {
            highlight: true,
            crop: Some(1),
        },
    );
    formatted_options.insert(
        author,
        FormatOptions {
            highlight: false,
            crop: None,
        },
    );

    let mut matching_words = BTreeMap::new();
    matching_words.insert("and", Some(3));

    let value = format_fields(
        &fields,
        obkv,
        &formatter,
        &matching_words,
        &formatted_options,
    )
    .unwrap();

    assert_eq!(value["title"], " <em>and</em> ");
    assert_eq!(value["author"], "J. K. Rowling");
}
/// Combines cropping and highlighting on the title (`highlight: true`,
/// `crop: Some(9)`) when the matcher maps "blood" to `Some(3)`.
///
/// Expects only the first three characters of the matched word to be wrapped
/// in the highlight markers, giving `"the Half-<em>Blo</em>od Prince"`, and
/// the `author` field to be returned unchanged.
#[test]
fn formatted_with_crop_and_highlight_in_word() {
    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);
    let formatter = Formatter::new(&analyzer, (String::from("<em>"), String::from("</em>")));

    let mut fields = FieldsIdsMap::new();
    let title = fields.insert("title").unwrap();
    let author = fields.insert("author").unwrap();

    // Each field value is stored as the bytes of its JSON encoding; both
    // writers append to the same buffer.
    let mut buf = Vec::new();
    let mut writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            title,
            Value::String("Harry Potter and the Half-Blood Prince".into())
                .to_string()
                .as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();

    writer = obkv::KvWriter::new(&mut buf);
    writer
        .insert(
            author,
            Value::String("J. K. Rowling".into()).to_string().as_bytes(),
        )
        .unwrap();
    writer.finish().unwrap();

    let obkv = obkv::KvReader::new(&buf);

    let mut formatted_options = BTreeMap::new();
    formatted_options.insert(
        title,
        FormatOptions {
            highlight: true,
            crop: Some(9),
        },
    );
    formatted_options.insert(
        author,
        FormatOptions {
            highlight: false,
            crop: None,
        },
    );

    let mut matching_words = BTreeMap::new();
    matching_words.insert("blood", Some(3));

    let value = format_fields(
        &fields,
        obkv,
        &formatter,
        &matching_words,
        &formatted_options,
    )
    .unwrap();

    assert_eq!(value["title"], "the Half-<em>Blo</em>od Prince");
    assert_eq!(value["author"], "J. K. Rowling");
}
2021-06-22 17:06:30 +08:00
/// Runs `compute_value_matches` over a single JSON string and checks, in
/// order, the text slices designated by the returned `MatchInfo`s.
///
/// The matcher maps each word to the length that should be reported for it,
/// so "ishmael" => 3 is expected to yield `"Ish"` and "particular" => 1 to
/// yield `"p"`; "little" occurs twice in the text and is reported twice.
#[test]
fn test_compute_value_matches() {
    let text = "Call me Ishmael. Some years ago—never mind how long precisely—having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world.";

    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);

    let mut matcher = BTreeMap::new();
    matcher.insert("ishmael", Some(3));
    matcher.insert("little", Some(6));
    matcher.insert("particular", Some(1));

    let value = serde_json::json!(text);
    let mut infos = Vec::new();
    compute_value_matches(&mut infos, &value, &matcher, &analyzer);

    // Resolve every reported (start, length) pair back to the slice of the
    // source text it points at, preserving the order of the matches.
    let matched: Vec<&str> = infos
        .into_iter()
        .map(|info: MatchInfo| &text[info.start..info.start + info.length])
        .collect();

    assert_eq!(matched, ["Ish", "little", "p", "little"]);
}
/// Runs `compute_matches` over a whole JSON document and checks the `Debug`
/// rendering of the result.
///
/// The matcher knows "green", "mollit", "laboris" and "3"; the expected
/// output lists matches for the `about`, `color` and `price` fields only,
/// each as a `MatchInfo { start, length }`.
#[test]
fn test_compute_match() {
    let value = serde_json::from_str(
        r#"{
"color": "Green",
"name": "Lucas Hess",
"gender": "male",
"price": 3.5,
"address": "412 Losee Terrace, Blairstown, Georgia, 2825",
"about": "Mollit ad in exercitation quis Laboris . Anim est ut consequat fugiat duis magna aliquip velit nisi. Commodo eiusmod est consequat proident consectetur aliqua enim fugiat. Aliqua adipisicing laboris elit proident enim veniam laboris mollit. Incididunt fugiat minim ad nostrud deserunt tempor in. Id irure officia labore qui est labore nulla nisi. Magna sit quis tempor esse consectetur amet labore duis aliqua consequat.\r\n"
}"#,
    )
    .unwrap();

    let mut matcher = BTreeMap::new();
    matcher.insert("green", Some(5));
    matcher.insert("mollit", Some(6));
    matcher.insert("laboris", Some(7));
    matcher.insert("3", Some(1));

    let stop_words = fst::Set::default();
    let mut config = AnalyzerConfig::default();
    config.stop_words(&stop_words);
    let analyzer = Analyzer::new(config);

    let matches = compute_matches(&matcher, &value, &analyzer);

    assert_eq!(
        format!("{:?}", matches),
        r##"{"about": [MatchInfo { start: 0, length: 6 }, MatchInfo { start: 31, length: 7 }, MatchInfo { start: 191, length: 7 }, MatchInfo { start: 225, length: 7 }, MatchInfo { start: 233, length: 6 }], "color": [MatchInfo { start: 0, length: 5 }], "price": [MatchInfo { start: 0, length: 1 }]}"##
    );
}
2021-09-27 21:41:14 +08:00
/// Checks which `_geoDistance` value `insert_geo_distance` adds to a document
/// for several lists of sort expressions.
///
/// The document's `_geo` point is identical to the point used in the
/// `_geoPoint(...)` sorters, so the expected distance is `0` whenever such a
/// sorter is present, and the key is absent otherwise.
#[test]
fn test_insert_geo_distance() {
    let value: Document = serde_json::from_str(
        r#"{
"_geo": {
"lat": 50.629973371633746,
"lng": 3.0569447399419567
},
"city": "Lille",
"id": "1"
}"#,
    )
    .unwrap();

    // Applies the sorters to a fresh copy of the document and returns the
    // `_geoDistance` entry that ends up in it, if any.
    let geo_distance = |sorters: &[&str]| {
        let sorters: Vec<String> = sorters.iter().map(|s| s.to_string()).collect();
        let mut document = value.clone();
        insert_geo_distance(&sorters[..], &mut document);
        document.get("_geoDistance").cloned()
    };

    assert_eq!(
        geo_distance(&["_geoPoint(50.629973371633746,3.0569447399419567):desc"]),
        Some(json!(0))
    );

    assert_eq!(
        geo_distance(&["_geoPoint(50.629973371633746, 3.0569447399419567):asc"]),
        Some(json!(0))
    );

    assert_eq!(
        geo_distance(&["_geoPoint( 50.629973371633746 , 3.0569447399419567 ):desc"]),
        Some(json!(0))
    );

    assert_eq!(
        geo_distance(&[
            "prix:asc",
            "villeneuve:desc",
            "_geoPoint(50.629973371633746, 3.0569447399419567):asc",
            "ubu:asc",
        ]),
        Some(json!(0))
    );

    // only the first geoPoint is used to compute the distance
    assert_eq!(
        geo_distance(&[
            "chien:desc",
            "_geoPoint(50.629973371633746, 3.0569447399419567):asc",
            "pangolin:desc",
            "_geoPoint(100.0, -80.0):asc",
            "chat:asc",
        ]),
        Some(json!(0))
    );

    // there was no _geoPoint so nothing is inserted in the document
    assert_eq!(geo_distance(&["chien:asc"]), None);
}
2021-04-20 01:03:53 +08:00
}