diff --git a/Cargo.lock b/Cargo.lock
index 64b683481..7b0897571 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1092,8 +1092,8 @@ dependencies = [
[[package]]
name = "filter-parser"
-version = "0.26.4"
-source = "git+https://github.com/meilisearch/milli.git?tag=v0.26.5#1f6dc31e2f8ee02cdda255a856d15f253daf17ec"
+version = "0.28.0"
+source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969"
dependencies = [
"nom",
"nom_locate",
@@ -1119,8 +1119,8 @@ dependencies = [
[[package]]
name = "flatten-serde-json"
-version = "0.26.4"
-source = "git+https://github.com/meilisearch/milli.git?tag=v0.26.5#1f6dc31e2f8ee02cdda255a856d15f253daf17ec"
+version = "0.28.0"
+source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969"
dependencies = [
"serde_json",
]
@@ -1622,8 +1622,8 @@ dependencies = [
[[package]]
name = "json-depth-checker"
-version = "0.26.4"
-source = "git+https://github.com/meilisearch/milli.git?tag=v0.26.5#1f6dc31e2f8ee02cdda255a856d15f253daf17ec"
+version = "0.28.0"
+source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969"
dependencies = [
"serde_json",
]
@@ -2151,8 +2151,8 @@ dependencies = [
[[package]]
name = "milli"
-version = "0.26.4"
-source = "git+https://github.com/meilisearch/milli.git?tag=v0.26.5#1f6dc31e2f8ee02cdda255a856d15f253daf17ec"
+version = "0.28.0"
+source = "git+https://github.com/meilisearch/milli.git?tag=v0.28.0#19dac01c5ca81543b751f66ad51fcff61608d969"
dependencies = [
"bimap",
"bincode",
@@ -2189,6 +2189,7 @@ dependencies = [
"smallvec",
"smartstring",
"tempfile",
+ "thiserror",
"time 0.3.9",
"uuid",
]
@@ -3360,18 +3361,18 @@ checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb"
[[package]]
name = "thiserror"
-version = "1.0.30"
+version = "1.0.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "854babe52e4df1653706b98fcfc05843010039b406875930a70e4d9644e5c417"
+checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
-version = "1.0.30"
+version = "1.0.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aa32fd3f627f367fe16f893e2597ae3c05020f8bba2666a4e6ea73d377e5714b"
+checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a"
dependencies = [
"proc-macro2 1.0.37",
"quote 1.0.17",
diff --git a/meilisearch-auth/Cargo.toml b/meilisearch-auth/Cargo.toml
index 2d9f229f0..dd12b5b63 100644
--- a/meilisearch-auth/Cargo.toml
+++ b/meilisearch-auth/Cargo.toml
@@ -6,7 +6,7 @@ edition = "2021"
[dependencies]
enum-iterator = "0.7.0"
meilisearch-error = { path = "../meilisearch-error" }
-milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.26.5" }
+milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" }
rand = "0.8.4"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = { version = "1.0.79", features = ["preserve_order"] }
diff --git a/meilisearch-http/tests/documents/add_documents.rs b/meilisearch-http/tests/documents/add_documents.rs
index 911cfd312..0ac0436dc 100644
--- a/meilisearch-http/tests/documents/add_documents.rs
+++ b/meilisearch-http/tests/documents/add_documents.rs
@@ -868,7 +868,12 @@ async fn error_add_documents_bad_document_id() {
let (response, code) = index.get_task(1).await;
assert_eq!(code, 200);
assert_eq!(response["status"], json!("failed"));
- assert_eq!(response["error"]["message"], json!("Document identifier `foo & bar` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)."));
+ assert_eq!(
+ response["error"]["message"],
+ json!(
+ r#"Document identifier `"foo & bar"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)."#
+ )
+ );
assert_eq!(response["error"]["code"], json!("invalid_document_id"));
assert_eq!(response["error"]["type"], json!("invalid_request"));
assert_eq!(
@@ -891,7 +896,12 @@ async fn error_update_documents_bad_document_id() {
index.update_documents(documents, None).await;
let response = index.wait_task(1).await;
assert_eq!(response["status"], json!("failed"));
- assert_eq!(response["error"]["message"], json!("Document identifier `foo & bar` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)."));
+ assert_eq!(
+ response["error"]["message"],
+ json!(
+ r#"Document identifier `"foo & bar"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)."#
+ )
+ );
assert_eq!(response["error"]["code"], json!("invalid_document_id"));
assert_eq!(response["error"]["type"], json!("invalid_request"));
assert_eq!(
diff --git a/meilisearch-http/tests/search/formatted.rs b/meilisearch-http/tests/search/formatted.rs
index 13b8a07d8..19387bdc5 100644
--- a/meilisearch-http/tests/search/formatted.rs
+++ b/meilisearch-http/tests/search/formatted.rs
@@ -16,7 +16,7 @@ async fn formatted_contain_wildcard() {
index.wait_task(1).await;
let (response, code) = index
- .search_post(json!({ "q": "pesti", "attributesToRetrieve": ["father", "mother"], "attributesToHighlight": ["father", "mother", "*"], "attributesToCrop": ["doggos"] }))
+ .search_post(json!({ "q": "pesti", "attributesToRetrieve": ["father", "mother"], "attributesToHighlight": ["father", "mother", "*"], "attributesToCrop": ["doggos"], "matches": true }))
.await;
assert_eq!(code, 200, "{}", response);
assert_eq!(
@@ -25,7 +25,8 @@ async fn formatted_contain_wildcard() {
"_formatted": {
"id": "852",
"cattos": "pesti",
- }
+ },
+ "_matchesInfo": {"cattos": [{"start": 0, "length": 5}]},
})
);
@@ -43,7 +44,7 @@ async fn formatted_contain_wildcard() {
let (response, code) = index
.search_post(
- json!({ "q": "pesti", "attributesToRetrieve": ["*"], "attributesToHighlight": ["id"] }),
+ json!({ "q": "pesti", "attributesToRetrieve": ["*"], "attributesToHighlight": ["id"], "matches": true }),
)
.await;
assert_eq!(code, 200, "{}", response);
@@ -55,7 +56,8 @@ async fn formatted_contain_wildcard() {
"_formatted": {
"id": "852",
"cattos": "pesti",
- }
+ },
+ "_matchesInfo": {"cattos": [{"start": 0, "length": 5}]},
})
);
@@ -141,6 +143,27 @@ async fn format_nested() {
})
);
+ let (response, code) = index
+ .search_post(
+ json!({ "q": "bobby", "attributesToRetrieve": ["doggos.name"], "matches": true }),
+ )
+ .await;
+ assert_eq!(code, 200, "{}", response);
+ assert_eq!(
+ response["hits"][0],
+ json!({
+ "doggos": [
+ {
+ "name": "bobby",
+ },
+ {
+ "name": "buddy",
+ },
+ ],
+ "_matchesInfo": {"doggos.name": [{"start": 0, "length": 5}]},
+ })
+ );
+
let (response, code) = index
.search_post(json!({ "q": "pesti", "attributesToRetrieve": [], "attributesToHighlight": ["doggos.name"] }))
.await;
diff --git a/meilisearch-lib/Cargo.toml b/meilisearch-lib/Cargo.toml
index 0b6596ffd..85ae49f64 100644
--- a/meilisearch-lib/Cargo.toml
+++ b/meilisearch-lib/Cargo.toml
@@ -30,7 +30,7 @@ lazy_static = "1.4.0"
log = "0.4.14"
meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-error = { path = "../meilisearch-error" }
-milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.26.5" }
+milli = { git = "https://github.com/meilisearch/milli.git", tag = "v0.28.0" }
mime = "0.3.16"
num_cpus = "1.13.1"
obkv = "0.2.0"
diff --git a/meilisearch-lib/src/index/search.rs b/meilisearch-lib/src/index/search.rs
index 7c12f985e..bf543b377 100644
--- a/meilisearch-lib/src/index/search.rs
+++ b/meilisearch-lib/src/index/search.rs
@@ -4,8 +4,10 @@ use std::str::FromStr;
use std::time::Instant;
use either::Either;
-use milli::tokenizer::{Analyzer, AnalyzerConfig, Token};
-use milli::{AscDesc, FieldId, FieldsIdsMap, Filter, MatchingWords, SortError};
+use milli::tokenizer::{Analyzer, AnalyzerConfig};
+use milli::{
+ AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, MatchBounds, MatcherBuilder, SortError,
+};
use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::{json, Value};
@@ -16,13 +18,7 @@ use super::error::{IndexError, Result};
use super::index::Index;
pub type Document = serde_json::Map<String, Value>;
-type MatchesInfo = BTreeMap<String, Vec<MatchInfo>>;
-
-#[derive(Serialize, Debug, Clone, PartialEq)]
-pub struct MatchInfo {
- start: usize,
- length: usize,
-}
+type MatchesInfo = BTreeMap<String, Vec<MatchBounds>>;
pub const DEFAULT_SEARCH_LIMIT: usize = 20;
const fn default_search_limit() -> usize {
@@ -105,21 +101,6 @@ pub struct SearchResult {
pub exhaustive_facets_count: Option<bool>,
}
-#[derive(Copy, Clone, Default)]
-struct FormatOptions {
- highlight: bool,
- crop: Option<usize>,
-}
-
-impl FormatOptions {
- pub fn merge(self, other: Self) -> Self {
- Self {
- highlight: self.highlight || other.highlight,
- crop: self.crop.or(other.crop),
- }
- }
-}
-
impl Index {
pub fn perform_search(&self, query: SearchQuery) -> Result<SearchResult> {
let before_search = Instant::now();
@@ -221,11 +202,10 @@ impl Index {
config.stop_words(&stop_words);
let analyzer = Analyzer::new(config);
- let formatter = Formatter::new(
- &analyzer,
- (query.highlight_pre_tag, query.highlight_post_tag),
- query.crop_marker,
- );
+ let mut formatter_builder = MatcherBuilder::from_matching_words(matching_words);
+ formatter_builder.crop_marker(query.crop_marker);
+ formatter_builder.highlight_prefix(query.highlight_pre_tag);
+ formatter_builder.highlight_suffix(query.highlight_post_tag);
let mut documents = Vec::new();
@@ -242,16 +222,14 @@ impl Index {
let mut document =
permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve);
- let matches_info = query
- .matches
- .then(|| compute_matches(&matching_words, &document, &analyzer));
-
- let formatted = format_fields(
+ let (matches_info, formatted) = format_fields(
&displayed_document,
&fields_ids_map,
- &formatter,
- &matching_words,
+ &formatter_builder,
+ &analyzer,
&formatted_options,
+ query.matches,
+ &displayed_ids,
)?;
if let Some(sort) = query.sort.as_ref() {
@@ -317,56 +295,6 @@ fn insert_geo_distance(sorts: &[String], document: &mut Document) {
}
}
-fn compute_matches<A: AsRef<[u8]>>(
- matcher: &impl Matcher,
- document: &Document,
- analyzer: &Analyzer<A>,
-) -> MatchesInfo {
- let mut matches = BTreeMap::new();
-
- for (key, value) in document {
- let mut infos = Vec::new();
- compute_value_matches(&mut infos, value, matcher, analyzer);
- if !infos.is_empty() {
- matches.insert(key.clone(), infos);
- }
- }
- matches
-}
-
-fn compute_value_matches<'a, A: AsRef<[u8]>>(
- infos: &mut Vec<MatchInfo>,
- value: &Value,
- matcher: &impl Matcher,
- analyzer: &Analyzer<'a, A>,
-) {
- match value {
- Value::String(s) => {
- let analyzed = analyzer.analyze(s);
- let mut start = 0;
- for (word, token) in analyzed.reconstruct() {
- if token.is_word() {
- if let Some(length) = matcher.matches(&token) {
- infos.push(MatchInfo { start, length });
- }
- }
-
- start += word.len();
- }
- }
- Value::Array(vals) => vals
- .iter()
- .for_each(|val| compute_value_matches(infos, val, matcher, analyzer)),
- Value::Object(vals) => vals
- .values()
- .for_each(|val| compute_value_matches(infos, val, matcher, analyzer)),
- Value::Number(number) => {
- compute_value_matches(infos, &Value::String(number.to_string()), matcher, analyzer)
- }
- _ => (),
- }
-}
-
fn compute_formatted_options(
attr_to_highlight: &HashSet<String>,
attr_to_crop: &[String],
@@ -509,22 +437,23 @@ fn make_document(
Ok(document)
}
-fn format_fields<A: AsRef<[u8]>>(
+fn format_fields<'a, A: AsRef<[u8]>>(
document: &Document,
field_ids_map: &FieldsIdsMap,
- formatter: &Formatter<A>,
- matching_words: &impl Matcher,
+ builder: &MatcherBuilder,
+ analyzer: &'a Analyzer<'a, A>,
formatted_options: &BTreeMap<FieldId, FormatOptions>,
-) -> Result<Document> {
- let selectors: Vec<_> = formatted_options
- .keys()
- // This unwrap must be safe since we got the ids from the fields_ids_map just
- // before.
- .map(|&fid| field_ids_map.name(fid).unwrap())
- .collect();
- let mut document = permissive_json_pointer::select_values(document, selectors.iter().copied());
+ compute_matches: bool,
+ displayable_ids: &BTreeSet<FieldId>,
+) -> Result<(Option<MatchesInfo>, Document)> {
+ let mut matches = compute_matches.then(BTreeMap::new);
+ let mut document = document.clone();
- permissive_json_pointer::map_leaf_values(&mut document, selectors, |key, value| {
+ // select the attributes to retrieve
+ let displayable_names = displayable_ids
+ .iter()
+ .map(|&fid| field_ids_map.name(fid).expect("Missing field name"));
+ permissive_json_pointer::map_leaf_values(&mut document, displayable_names, |key, value| {
// To get the formatting option of each key we need to see all the rules that applies
// to the value and merge them together. eg. If a user said he wanted to highlight `doggo`
// and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only
@@ -535,235 +464,124 @@ fn format_fields<A: AsRef<[u8]>>(
let name = field_ids_map.name(**field).unwrap();
milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name)
})
- .fold(FormatOptions::default(), |acc, (_, option)| {
- acc.merge(*option)
- });
- *value = formatter.format_value(std::mem::take(value), matching_words, format);
+ .map(|(_, option)| *option)
+ .reduce(|acc, option| acc.merge(option));
+ let mut infos = Vec::new();
+
+ *value = format_value(
+ std::mem::take(value),
+ builder,
+ format,
+ analyzer,
+ &mut infos,
+ compute_matches,
+ );
+
+ if let Some(matches) = matches.as_mut() {
+ if !infos.is_empty() {
+ matches.insert(key.to_owned(), infos);
+ }
+ }
});
- Ok(document)
+ let selectors = formatted_options
+ .keys()
+ // This unwrap must be safe since we got the ids from the fields_ids_map just
+ // before.
+ .map(|&fid| field_ids_map.name(fid).unwrap());
+ let document = permissive_json_pointer::select_values(&document, selectors);
+
+ Ok((matches, document))
}
-/// trait to allow unit testing of `format_fields`
-trait Matcher {
- fn matches(&self, w: &Token) -> Option<usize>;
-}
-
-#[cfg(test)]
-impl Matcher for BTreeMap<&str, Option<usize>> {
- fn matches(&self, w: &Token) -> Option<usize> {
- self.get(w.text()).cloned().flatten()
- }
-}
-
-impl Matcher for MatchingWords {
- fn matches(&self, w: &Token) -> Option<usize> {
- self.matching_bytes(w)
- }
-}
-
-struct Formatter<'a, A> {
+fn format_value<'a, A: AsRef<[u8]>>(
+ value: Value,
+ builder: &MatcherBuilder,
+ format_options: Option<FormatOptions>,
analyzer: &'a Analyzer<'a, A>,
- highlight_tags: (String, String),
- crop_marker: String,
-}
+ infos: &mut Vec<MatchBounds>,
+ compute_matches: bool,
+) -> Value {
+ match value {
+ Value::String(old_string) => {
+ // this will be removed with charabia
+ let analyzed = analyzer.analyze(&old_string);
+ let tokens: Vec<_> = analyzed.tokens().collect();
-impl<'a, A: AsRef<[u8]>> Formatter<'a, A> {
- pub fn new(
- analyzer: &'a Analyzer<'a, A>,
- highlight_tags: (String, String),
- crop_marker: String,
- ) -> Self {
- Self {
- analyzer,
- highlight_tags,
- crop_marker,
- }
- }
-
- fn format_value(
- &self,
- value: Value,
- matcher: &impl Matcher,
- format_options: FormatOptions,
- ) -> Value {
- match value {
- Value::String(old_string) => {
- let value = self.format_string(old_string, matcher, format_options);
- Value::String(value)
+ let mut matcher = builder.build(&tokens[..], &old_string);
+ if compute_matches {
+ let matches = matcher.matches();
+ infos.extend_from_slice(&matches[..]);
}
- Value::Array(values) => Value::Array(
- values
- .into_iter()
- .map(|v| {
- self.format_value(
+
+ match format_options {
+ Some(format_options) => {
+ let value = matcher.format(format_options);
+ Value::String(value.into_owned())
+ }
+ None => Value::String(old_string),
+ }
+ }
+ Value::Array(values) => Value::Array(
+ values
+ .into_iter()
+ .map(|v| {
+ format_value(
+ v,
+ builder,
+ format_options.map(|format_options| FormatOptions {
+ highlight: format_options.highlight,
+ crop: None,
+ }),
+ analyzer,
+ infos,
+ compute_matches,
+ )
+ })
+ .collect(),
+ ),
+ Value::Object(object) => Value::Object(
+ object
+ .into_iter()
+ .map(|(k, v)| {
+ (
+ k,
+ format_value(
v,
- matcher,
- FormatOptions {
+ builder,
+ format_options.map(|format_options| FormatOptions {
highlight: format_options.highlight,
crop: None,
- },
- )
- })
- .collect(),
- ),
- Value::Object(object) => Value::Object(
- object
- .into_iter()
- .map(|(k, v)| {
- (
- k,
- self.format_value(
- v,
- matcher,
- FormatOptions {
- highlight: format_options.highlight,
- crop: None,
- },
- ),
- )
- })
- .collect(),
- ),
- Value::Number(number) => {
- let number_string_value =
- self.format_string(number.to_string(), matcher, format_options);
- Value::String(number_string_value)
+ }),
+ analyzer,
+ infos,
+ compute_matches,
+ ),
+ )
+ })
+ .collect(),
+ ),
+ Value::Number(number) => {
+ // this will be removed with charabia
+ let s = number.to_string();
+ let analyzed = analyzer.analyze(&s);
+ let tokens: Vec<_> = analyzed.tokens().collect();
+
+ let mut matcher = builder.build(&tokens[..], &s);
+ if compute_matches {
+ let matches = matcher.matches();
+ infos.extend_from_slice(&matches[..]);
+ }
+
+ match format_options {
+ Some(format_options) => {
+ let value = matcher.format(format_options);
+ Value::String(value.into_owned())
+ }
+ None => Value::Number(number),
}
- value => value,
}
- }
-
- fn format_string(
- &self,
- s: String,
- matcher: &impl Matcher,
- format_options: FormatOptions,
- ) -> String {
- let analyzed = self.analyzer.analyze(&s);
-
- let mut tokens = analyzed.reconstruct();
- let mut crop_marker_before = false;
-
- let tokens_interval: Box<dyn Iterator<Item = (&str, Token)>> = match format_options.crop {
- Some(crop_len) if crop_len > 0 => {
- let mut buffer = Vec::new();
- let mut tokens = tokens.by_ref().peekable();
-
- while let Some((word, token)) =
- tokens.next_if(|(_, token)| matcher.matches(token).is_none())
- {
- buffer.push((word, token));
- }
-
- match tokens.next() {
- Some(token) => {
- let mut total_count: usize = buffer
- .iter()
- .filter(|(_, token)| token.is_separator().is_none())
- .count();
-
- let crop_len_before = crop_len / 2;
- // check if start will be cropped.
- crop_marker_before = total_count > crop_len_before;
-
- let before_iter = buffer.into_iter().skip_while(move |(_, token)| {
- if token.is_separator().is_none() {
- total_count -= 1;
- }
- total_count >= crop_len_before
- });
-
- // rebalance remaining word count after the match.
- let crop_len_after = if crop_marker_before {
- crop_len.saturating_sub(crop_len_before + 1)
- } else {
- crop_len.saturating_sub(total_count + 1)
- };
-
- let mut taken_after = 0;
- let after_iter = tokens.take_while(move |(_, token)| {
- let take = taken_after < crop_len_after;
- if token.is_separator().is_none() {
- taken_after += 1;
- }
- take
- });
-
- let iter = before_iter.chain(Some(token)).chain(after_iter);
-
- Box::new(iter)
- }
- // If no word matches in the attribute
- None => {
- let mut count = 0;
- let mut tokens = buffer.into_iter();
- let mut out: String = tokens
- .by_ref()
- .take_while(move |(_, token)| {
- let take = count < crop_len;
- if token.is_separator().is_none() {
- count += 1;
- }
- take
- })
- .map(|(word, _)| word)
- .collect();
-
- // if there are remaining tokens after formatted interval,
- // put a crop marker at the end.
- if tokens.next().is_some() {
- out.push_str(&self.crop_marker);
- }
-
- return out;
- }
- }
- }
- _ => Box::new(tokens.by_ref()),
- };
-
- let out = if crop_marker_before {
- self.crop_marker.clone()
- } else {
- String::new()
- };
-
- let mut out = tokens_interval.fold(out, |mut out, (word, token)| {
- // Check if we need to do highlighting or computed matches before calling
- // Matcher::match since the call is expensive.
- if format_options.highlight && token.is_word() {
- if let Some(length) = matcher.matches(&token) {
- match word.get(..length).zip(word.get(length..)) {
- Some((head, tail)) => {
- out.push_str(&self.highlight_tags.0);
- out.push_str(head);
- out.push_str(&self.highlight_tags.1);
- out.push_str(tail);
- }
- // if we are in the middle of a character
- // or if all the word should be highlighted,
- // we highlight the complete word.
- None => {
- out.push_str(&self.highlight_tags.0);
- out.push_str(word);
- out.push_str(&self.highlight_tags.1);
- }
- }
- return out;
- }
- }
- out.push_str(word);
- out
- });
-
- // if there are remaining tokens after formatted interval,
- // put a crop marker at the end.
- if tokens.next().is_some() {
- out.push_str(&self.crop_marker);
- }
-
- out
+ value => value,
}
}
@@ -810,740 +628,17 @@ fn parse_filter_array(arr: &[Value]) -> Result