From edcb4c60ba0bc416152bdfd931598bfa0df87467 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:44:37 +0300 Subject: [PATCH 01/92] Change Matcher so that phrases are counted as one instead of word by word --- milli/src/search/new/matches/mod.rs | 45 +++++++++++------------------ 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 4688b8f32..6ddb81c6a 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -132,37 +132,21 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { mut partial: PartialMatch<'a>, token_position: usize, word_position: usize, + first_word_char_start: &usize, words_positions: &mut impl Iterator)>, matches: &mut Vec, ) -> bool { - let mut potential_matches = vec![(token_position, word_position, partial.char_len())]; - - for (token_position, word_position, word) in words_positions { + for (_, _, word) in words_positions { partial = match partial.match_token(word) { // token matches the partial match, but the match is not full, // we temporarily save the current token then we try to match the next one. - Some(MatchType::Partial(partial)) => { - potential_matches.push((token_position, word_position, partial.char_len())); - partial - } + Some(MatchType::Partial(partial)) => partial, // partial match is now full, we keep this matches and we advance positions - Some(MatchType::Full { char_len, ids }) => { - let ids: Vec<_> = ids.clone().collect(); - // save previously matched tokens as matches. - let iter = potential_matches.into_iter().map( - |(token_position, word_position, match_len)| Match { - match_len, - ids: ids.clone(), - word_position, - token_position, - }, - ); - matches.extend(iter); - + Some(MatchType::Full { ids, .. }) => { // save the token that closes the partial match as a match. 
matches.push(Match { - match_len: char_len, - ids, + match_len: word.char_end - first_word_char_start, + ids: ids.clone().collect(), word_position, token_position, }); @@ -221,6 +205,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { partial, token_position, word_position, + &word.char_start, &mut wp, &mut matches, ) { @@ -472,15 +457,17 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { .enumerate() .find(|(i, _)| *i == m.match_len) .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); + formatted.push(self.highlight_prefix); formatted.push(&self.text[token.byte_start..highlight_byte_index]); formatted.push(self.highlight_suffix); + // if it's a prefix highlight, we put the end of the word after the highlight marker. if highlight_byte_index < token.byte_end { formatted.push(&self.text[highlight_byte_index..token.byte_end]); } - byte_index = token.byte_end; + byte_index = token.byte_start + m.match_len; } } @@ -821,22 +808,24 @@ mod tests { fn format_highlight_crop_phrase_query() { //! testing: https://github.com/meilisearch/meilisearch/issues/3975 let temp_index = TempIndex::new(); + + let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!"; temp_index .add_documents(documents!([ - { "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" } + { "id": 1, "text": text } ])) .unwrap(); + let rtxn = temp_index.read_txn().unwrap(); let format_options = FormatOptions { highlight: true, crop: Some(10) }; - let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!"; let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); let mut matcher = builder.build(text, None); // should return 10 words with a marker at the start as well the end, and the highlighted matches. 
insta::assert_snapshot!( matcher.format(format_options), - @"…had the power to split the world between those who…" + @"…had the power to split the world between those who…" ); let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); @@ -844,7 +833,7 @@ mod tests { // should highlight "those" and the phrase "and those". insta::assert_snapshot!( matcher.format(format_options), - @"…world between those who embraced progress and those who resisted…" + @"…world between those who embraced progress and those who resisted…" ); } @@ -900,7 +889,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), - @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_" + @"_the_ _do or_ die can't be he do and or isn'_t he_" ); } } From e7af499314f24e51f1bff27ff231ceb898aa27a1 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Thu, 12 Sep 2024 16:58:13 +0300 Subject: [PATCH 02/92] Improve changes to Matcher --- milli/src/search/new/matches/mod.rs | 136 +++++++++++++++++++++------- 1 file changed, 104 insertions(+), 32 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 6ddb81c6a..26dd6f6e8 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -93,15 +93,28 @@ impl FormatOptions { } } +#[derive(Clone, Debug)] +pub enum MatchPosition { + Word { + // position of the word in the whole text. + word_position: usize, + // position of the token in the whole text. + token_position: usize, + }, + Phrase { + // position of the first and last word in the phrase in the whole text. + word_positions: (usize, usize), + // position of the first and last token in the phrase in the whole text. + token_positions: (usize, usize), + }, +} + #[derive(Clone, Debug)] pub struct Match { match_len: usize, // ids of the query words that matches. 
ids: Vec, - // position of the word in the whole text. - word_position: usize, - // position of the token in the whole text. - token_position: usize, + position: MatchPosition, } #[derive(Serialize, Debug, Clone, PartialEq, Eq)] @@ -130,13 +143,13 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { /// compute_partial_match peek into next words to validate if the match is complete. fn compute_partial_match<'a>( mut partial: PartialMatch<'a>, - token_position: usize, - word_position: usize, + first_token_position: usize, + first_word_position: usize, first_word_char_start: &usize, words_positions: &mut impl Iterator)>, matches: &mut Vec, ) -> bool { - for (_, _, word) in words_positions { + for (token_position, word_position, word) in words_positions { partial = match partial.match_token(word) { // token matches the partial match, but the match is not full, // we temporarily save the current token then we try to match the next one. @@ -145,10 +158,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { Some(MatchType::Full { ids, .. }) => { // save the token that closes the partial match as a match. matches.push(Match { - match_len: word.char_end - first_word_char_start, + match_len: word.char_end - *first_word_char_start, ids: ids.clone().collect(), - word_position, - token_position, + position: MatchPosition::Phrase { + word_positions: (first_word_position, word_position), + token_positions: (first_token_position, token_position), + }, }); // the match is complete, we return true. 
@@ -191,8 +206,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { matches.push(Match { match_len: char_len, ids, - word_position, - token_position, + position: MatchPosition::Word { word_position, token_position }, }); break; } @@ -228,13 +242,47 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { Some((tokens, matches)) => matches .iter() .map(|m| MatchBounds { - start: tokens[m.token_position].byte_start, + start: tokens[match m.position { + MatchPosition::Word { token_position, .. } => token_position, + MatchPosition::Phrase { + token_positions: (first_token_position, _), + .. + } => first_token_position, + }] + .byte_start, length: m.match_len, }) .collect(), } } + // @TODO: This should be improved, looks nasty + fn get_match_pos(&self, m: &Match, is_first: bool, is_word: bool) -> usize { + match m.position { + MatchPosition::Word { word_position, token_position } => { + if is_word { + word_position + } else { + token_position + } + } + MatchPosition::Phrase { word_positions: (wpf, wpl), token_positions: (tpf, tpl) } => { + if is_word { + if is_first { + return wpf; + } else { + return wpl; + } + } + if is_first { + tpf + } else { + tpl + } + } + } + } + /// Returns the bounds in byte index of the crop window. fn crop_bounds( &self, @@ -243,10 +291,14 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { crop_size: usize, ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. 
- let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); - let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); - let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); - let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); + let first_match_word_position = + matches.first().map(|m| self.get_match_pos(m, true, true)).unwrap_or(0); + let first_match_token_position = + matches.first().map(|m| self.get_match_pos(m, true, false)).unwrap_or(0); + let last_match_word_position = + matches.last().map(|m| self.get_match_pos(m, false, true)).unwrap_or(0); + let last_match_token_position = + matches.last().map(|m| self.get_match_pos(m, false, false)).unwrap_or(0); // matches needs to be counted in the crop len. let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; @@ -350,7 +402,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } // compute distance between matches - distance_score -= (next_match.word_position - m.word_position).min(7) as i16; + distance_score -= (self.get_match_pos(next_match, true, true) + - self.get_match_pos(m, true, true)) + .min(7) as i16; } ids.extend(m.ids.iter()); @@ -378,7 +432,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. 
- if next_match.word_position - matches[interval_first].word_position >= crop_size { + let next_match_word_position = self.get_match_pos(next_match, true, true); + + if next_match_word_position + - self.get_match_pos(&matches[interval_first], false, true) + >= crop_size + { let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); @@ -389,10 +448,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } // advance start of the interval while interval is longer than crop_size. - while next_match.word_position - matches[interval_first].word_position - >= crop_size - { + loop { interval_first += 1; + + if next_match_word_position + - self.get_match_pos(&matches[interval_first], false, true) + < crop_size + { + break; + } } } interval_last = index; @@ -441,33 +505,41 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { if format_options.highlight { // insert highlight markers around matches. for m in matches { - let token = &tokens[m.token_position]; + let (current_byte_start, current_byte_end) = match m.position { + MatchPosition::Word { token_position, .. } => { + let token = &tokens[token_position]; + (&token.byte_start, &token.byte_end) + } + MatchPosition::Phrase { token_positions: (ftp, ltp), .. } => { + (&tokens[ftp].byte_start, &tokens[ltp].byte_end) + } + }; // skip matches out of the crop window. - if token.byte_start < byte_start || token.byte_end > byte_end { + if *current_byte_start < byte_start || *current_byte_end > byte_end { continue; } - if byte_index < token.byte_start { - formatted.push(&self.text[byte_index..token.byte_start]); + if byte_index < *current_byte_start { + formatted.push(&self.text[byte_index..*current_byte_start]); } - let highlight_byte_index = self.text[token.byte_start..] + let highlight_byte_index = self.text[*current_byte_start..] 
.char_indices() .enumerate() .find(|(i, _)| *i == m.match_len) - .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); + .map_or(*current_byte_end, |(_, (i, _))| i + *current_byte_start); formatted.push(self.highlight_prefix); - formatted.push(&self.text[token.byte_start..highlight_byte_index]); + formatted.push(&self.text[*current_byte_start..highlight_byte_index]); formatted.push(self.highlight_suffix); // if it's a prefix highlight, we put the end of the word after the highlight marker. - if highlight_byte_index < token.byte_end { - formatted.push(&self.text[highlight_byte_index..token.byte_end]); + if highlight_byte_index < *current_byte_end { + formatted.push(&self.text[highlight_byte_index..*current_byte_end]); } - byte_index = token.byte_start + m.match_len; + byte_index = *current_byte_end; } } From cc6a2aec06ebd6cb7332afb0478affe3e63185af Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 13 Sep 2024 13:31:07 +0300 Subject: [PATCH 03/92] Improve changes to Matcher --- milli/src/search/new/matches/mod.rs | 78 +++++++++++++++-------------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 26dd6f6e8..a84b25923 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -93,6 +93,16 @@ impl FormatOptions { } } +enum FL { + First, + Last, +} + +enum WT { + Word, + Token, +} + #[derive(Clone, Debug)] pub enum MatchPosition { Word { @@ -256,28 +266,22 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } } - // @TODO: This should be improved, looks nasty - fn get_match_pos(&self, m: &Match, is_first: bool, is_word: bool) -> usize { + fn get_match_pos(&self, m: &Match, wt: WT, fl: FL) -> usize { match m.position { - MatchPosition::Word { word_position, token_position } => { - if is_word { - word_position - } else { - token_position - } - } - MatchPosition::Phrase { word_positions: 
(wpf, wpl), token_positions: (tpf, tpl) } => { - if is_word { - if is_first { - return wpf; - } else { - return wpl; - } - } - if is_first { - tpf - } else { - tpl + MatchPosition::Word { word_position, token_position } => match wt { + WT::Word => word_position, + WT::Token => token_position, + }, + MatchPosition::Phrase { word_positions: (fwp, lwp), token_positions: (ftp, ltp) } => { + match wt { + WT::Word => match fl { + FL::First => fwp, + FL::Last => lwp, + }, + WT::Token => match fl { + FL::First => ftp, + FL::Last => ltp, + }, } } } @@ -292,13 +296,13 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. let first_match_word_position = - matches.first().map(|m| self.get_match_pos(m, true, true)).unwrap_or(0); + matches.first().map(|m| self.get_match_pos(m, WT::Word, FL::First)).unwrap_or(0); let first_match_token_position = - matches.first().map(|m| self.get_match_pos(m, true, false)).unwrap_or(0); + matches.first().map(|m| self.get_match_pos(m, WT::Token, FL::First)).unwrap_or(0); let last_match_word_position = - matches.last().map(|m| self.get_match_pos(m, false, true)).unwrap_or(0); + matches.last().map(|m| self.get_match_pos(m, WT::Word, FL::Last)).unwrap_or(0); let last_match_token_position = - matches.last().map(|m| self.get_match_pos(m, false, false)).unwrap_or(0); + matches.last().map(|m| self.get_match_pos(m, WT::Token, FL::Last)).unwrap_or(0); // matches needs to be counted in the crop len. 
let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; @@ -401,10 +405,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { order_score += 1; } + let next_match_first_word_pos = self.get_match_pos(next_match, WT::Word, FL::First); + let current_match_first_word_pos = self.get_match_pos(m, WT::Word, FL::First); + // compute distance between matches - distance_score -= (self.get_match_pos(next_match, true, true) - - self.get_match_pos(m, true, true)) - .min(7) as i16; + distance_score -= + (next_match_first_word_pos - current_match_first_word_pos).min(7) as i16; } ids.extend(m.ids.iter()); @@ -432,12 +438,11 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. - let next_match_word_position = self.get_match_pos(next_match, true, true); + let next_match_word_pos = self.get_match_pos(next_match, WT::Word, FL::First); + let mut interval_first_match_word_pos = + self.get_match_pos(&matches[interval_first], WT::Word, FL::Last); - if next_match_word_position - - self.get_match_pos(&matches[interval_first], false, true) - >= crop_size - { + if next_match_word_pos - interval_first_match_word_pos >= crop_size { let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); @@ -450,11 +455,10 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // advance start of the interval while interval is longer than crop_size. loop { interval_first += 1; + interval_first_match_word_pos = + self.get_match_pos(&matches[interval_first], WT::Word, FL::Last); - if next_match_word_position - - self.get_match_pos(&matches[interval_first], false, true) - < crop_size - { + if next_match_word_pos - interval_first_match_word_pos < crop_size { break; } } From 65e3d61a955dd9b0f4b877d17a0b2b0dc087816c Mon Sep 17 00:00:00 2001 From: "F. 
Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 13 Sep 2024 13:35:58 +0300 Subject: [PATCH 04/92] Make use of helper function in one more place --- milli/src/search/new/matches/mod.rs | 35 ++++++++++++----------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index a84b25923..5a4f0b914 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -245,27 +245,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { self } - /// Returns boundaries of the words that match the query. - pub fn matches(&mut self) -> Vec { - match &self.matches { - None => self.compute_matches().matches(), - Some((tokens, matches)) => matches - .iter() - .map(|m| MatchBounds { - start: tokens[match m.position { - MatchPosition::Word { token_position, .. } => token_position, - MatchPosition::Phrase { - token_positions: (first_token_position, _), - .. - } => first_token_position, - }] - .byte_start, - length: m.match_len, - }) - .collect(), - } - } - fn get_match_pos(&self, m: &Match, wt: WT, fl: FL) -> usize { match m.position { MatchPosition::Word { word_position, token_position } => match wt { @@ -287,6 +266,20 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } } + /// Returns boundaries of the words that match the query. + pub fn matches(&mut self) -> Vec { + match &self.matches { + None => self.compute_matches().matches(), + Some((tokens, matches)) => matches + .iter() + .map(|m| MatchBounds { + start: tokens[self.get_match_pos(m, WT::Token, FL::First)].byte_start, + length: m.match_len, + }) + .collect(), + } + } + /// Returns the bounds in byte index of the crop window. fn crop_bounds( &self, From cab63abc845d87350ab36c07d3999b58eebd0eaa Mon Sep 17 00:00:00 2001 From: "F. 
Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 13 Sep 2024 14:35:28 +0300 Subject: [PATCH 05/92] Improve MatchesPosition enum with an impl --- milli/src/search/new/matches/mod.rs | 81 ++++++++++++++--------------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 5a4f0b914..ce878a1eb 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -93,16 +93,6 @@ impl FormatOptions { } } -enum FL { - First, - Last, -} - -enum WT { - Word, - Token, -} - #[derive(Clone, Debug)] pub enum MatchPosition { Word { @@ -127,6 +117,36 @@ pub struct Match { position: MatchPosition, } +impl MatchPosition { + fn get_first_word(m: &Match) -> usize { + match m.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp, + } + } + + fn get_last_word(m: &Match) -> usize { + match m.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: (_, lwp), .. } => lwp, + } + } + + fn get_first_token(m: &Match) -> usize { + match m.position { + MatchPosition::Word { token_position, .. } => token_position, + MatchPosition::Phrase { token_positions: (ftp, _), .. } => ftp, + } + } + + fn get_last_token(m: &Match) -> usize { + match m.position { + MatchPosition::Word { token_position, .. } => token_position, + MatchPosition::Phrase { token_positions: (_, ltp), .. 
} => ltp, + } + } +} + #[derive(Serialize, Debug, Clone, PartialEq, Eq)] pub struct MatchBounds { pub start: usize, @@ -245,27 +265,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { self } - fn get_match_pos(&self, m: &Match, wt: WT, fl: FL) -> usize { - match m.position { - MatchPosition::Word { word_position, token_position } => match wt { - WT::Word => word_position, - WT::Token => token_position, - }, - MatchPosition::Phrase { word_positions: (fwp, lwp), token_positions: (ftp, ltp) } => { - match wt { - WT::Word => match fl { - FL::First => fwp, - FL::Last => lwp, - }, - WT::Token => match fl { - FL::First => ftp, - FL::Last => ltp, - }, - } - } - } - } - /// Returns boundaries of the words that match the query. pub fn matches(&mut self) -> Vec { match &self.matches { @@ -273,7 +272,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { Some((tokens, matches)) => matches .iter() .map(|m| MatchBounds { - start: tokens[self.get_match_pos(m, WT::Token, FL::First)].byte_start, + start: tokens[MatchPosition::get_first_token(m)].byte_start, length: m.match_len, }) .collect(), @@ -289,13 +288,13 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. 
let first_match_word_position = - matches.first().map(|m| self.get_match_pos(m, WT::Word, FL::First)).unwrap_or(0); + matches.first().map(|m| MatchPosition::get_first_word(m)).unwrap_or(0); let first_match_token_position = - matches.first().map(|m| self.get_match_pos(m, WT::Token, FL::First)).unwrap_or(0); + matches.first().map(|m| MatchPosition::get_first_token(m)).unwrap_or(0); let last_match_word_position = - matches.last().map(|m| self.get_match_pos(m, WT::Word, FL::Last)).unwrap_or(0); + matches.last().map(|m| MatchPosition::get_last_word(m)).unwrap_or(0); let last_match_token_position = - matches.last().map(|m| self.get_match_pos(m, WT::Token, FL::Last)).unwrap_or(0); + matches.last().map(|m| MatchPosition::get_last_token(m)).unwrap_or(0); // matches needs to be counted in the crop len. let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; @@ -398,8 +397,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { order_score += 1; } - let next_match_first_word_pos = self.get_match_pos(next_match, WT::Word, FL::First); - let current_match_first_word_pos = self.get_match_pos(m, WT::Word, FL::First); + let next_match_first_word_pos = MatchPosition::get_first_word(next_match); + let current_match_first_word_pos = MatchPosition::get_first_word(m); // compute distance between matches distance_score -= @@ -431,9 +430,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. 
- let next_match_word_pos = self.get_match_pos(next_match, WT::Word, FL::First); + let next_match_word_pos = MatchPosition::get_first_word(next_match); let mut interval_first_match_word_pos = - self.get_match_pos(&matches[interval_first], WT::Word, FL::Last); + MatchPosition::get_last_word(&matches[interval_first]); if next_match_word_pos - interval_first_match_word_pos >= crop_size { let interval_score = @@ -449,7 +448,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { loop { interval_first += 1; interval_first_match_word_pos = - self.get_match_pos(&matches[interval_first], WT::Word, FL::Last); + MatchPosition::get_last_word(&matches[interval_first]); if next_match_word_pos - interval_first_match_word_pos < crop_size { break; From a2a16bf846066f422a5e6bd9bcb0009a894dcad0 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 13 Sep 2024 21:20:06 +0300 Subject: [PATCH 06/92] Move MatchPosition impl to Match, adjust counting score for phrases --- milli/src/search/new/matches/mod.rs | 66 +++++++++++++++++++---------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index ce878a1eb..e63920145 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -117,30 +117,30 @@ pub struct Match { position: MatchPosition, } -impl MatchPosition { - fn get_first_word(m: &Match) -> usize { - match m.position { +impl Match { + fn get_first_word_pos(&self) -> usize { + match self.position { MatchPosition::Word { word_position, .. } => word_position, MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp, } } - fn get_last_word(m: &Match) -> usize { - match m.position { + fn get_last_word_pos(&self) -> usize { + match self.position { MatchPosition::Word { word_position, .. } => word_position, MatchPosition::Phrase { word_positions: (_, lwp), .. 
} => lwp, } } - fn get_first_token(m: &Match) -> usize { - match m.position { + fn get_first_token_pos(&self) -> usize { + match self.position { MatchPosition::Word { token_position, .. } => token_position, MatchPosition::Phrase { token_positions: (ftp, _), .. } => ftp, } } - fn get_last_token(m: &Match) -> usize { - match m.position { + fn get_last_token_pos(&self) -> usize { + match self.position { MatchPosition::Word { token_position, .. } => token_position, MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp, } @@ -272,7 +272,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { Some((tokens, matches)) => matches .iter() .map(|m| MatchBounds { - start: tokens[MatchPosition::get_first_token(m)].byte_start, + start: tokens[m.get_first_token_pos()].byte_start, length: m.match_len, }) .collect(), @@ -288,13 +288,11 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. let first_match_word_position = - matches.first().map(|m| MatchPosition::get_first_word(m)).unwrap_or(0); + matches.first().map(|m| m.get_first_word_pos()).unwrap_or(0); let first_match_token_position = - matches.first().map(|m| MatchPosition::get_first_token(m)).unwrap_or(0); - let last_match_word_position = - matches.last().map(|m| MatchPosition::get_last_word(m)).unwrap_or(0); - let last_match_token_position = - matches.last().map(|m| MatchPosition::get_last_token(m)).unwrap_or(0); + matches.first().map(|m| m.get_first_token_pos()).unwrap_or(0); + let last_match_word_position = matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0); + let last_match_token_position = matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0); // matches needs to be counted in the crop len. 
let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; @@ -389,6 +387,16 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { let mut order_score = 0; let mut distance_score = 0; + // Count score for phrases + let tally_phrase_scores = + |fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16| { + let words_in_phrase_minus_one = (lwp - fwp) as i16; + // will always be ordered, so +1 for each space between words + *order_score += words_in_phrase_minus_one; + // distance will always be 1, so -1 for each space between words + *distance_score -= words_in_phrase_minus_one; + }; + let mut iter = matches.iter().peekable(); while let Some(m) = iter.next() { if let Some(next_match) = iter.peek() { @@ -397,12 +405,24 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { order_score += 1; } - let next_match_first_word_pos = MatchPosition::get_first_word(next_match); - let current_match_first_word_pos = MatchPosition::get_first_word(m); + let m_last_word_pos = match m.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => { + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); + lwp + } + }; + + let next_match_first_word_pos = match next_match.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp, + }; // compute distance between matches - distance_score -= - (next_match_first_word_pos - current_match_first_word_pos).min(7) as i16; + distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; + } else if let MatchPosition::Phrase { word_positions: (fwp, lwp), .. 
} = m.position { + // in case last match is a phrase, count score for its words + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); } ids.extend(m.ids.iter()); @@ -430,9 +450,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. - let next_match_word_pos = MatchPosition::get_first_word(next_match); + let next_match_word_pos = next_match.get_last_word_pos(); let mut interval_first_match_word_pos = - MatchPosition::get_last_word(&matches[interval_first]); + matches[interval_first].get_first_word_pos(); if next_match_word_pos - interval_first_match_word_pos >= crop_size { let interval_score = @@ -448,7 +468,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { loop { interval_first += 1; interval_first_match_word_pos = - MatchPosition::get_last_word(&matches[interval_first]); + matches[interval_first].get_first_word_pos(); if next_match_word_pos - interval_first_match_word_pos < crop_size { break; From 51085206ccab6e8e0098c4cf8b2a3e67e06558a4 Mon Sep 17 00:00:00 2001 From: "F. 
Levi" <55688616+flevi29@users.noreply.github.com> Date: Sat, 14 Sep 2024 10:14:07 +0300 Subject: [PATCH 07/92] Misc adjustments --- milli/src/search/new/matches/mod.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index e63920145..414509cd3 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -387,7 +387,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { let mut order_score = 0; let mut distance_score = 0; - // Count score for phrases + // count score for phrases let tally_phrase_scores = |fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16| { let words_in_phrase_minus_one = (lwp - fwp) as i16; @@ -450,11 +450,11 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. - let next_match_word_pos = next_match.get_last_word_pos(); - let mut interval_first_match_word_pos = + let next_match_last_word_pos = next_match.get_last_word_pos(); + let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); - if next_match_word_pos - interval_first_match_word_pos >= crop_size { + if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); @@ -467,10 +467,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // advance start of the interval while interval is longer than crop_size. 
loop { interval_first += 1; - interval_first_match_word_pos = + interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); - if next_match_word_pos - interval_first_match_word_pos < crop_size { + if next_match_last_word_pos - interval_first_match_first_word_pos + < crop_size + { break; } } From 993408d3ba65cbcea9920caeab8b421160a931ac Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Sun, 15 Sep 2024 16:15:09 +0300 Subject: [PATCH 08/92] Change closure to fn --- milli/src/search/new/matches/mod.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 414509cd3..df110aff9 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -388,14 +388,18 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { let mut distance_score = 0; // count score for phrases - let tally_phrase_scores = - |fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16| { - let words_in_phrase_minus_one = (lwp - fwp) as i16; - // will always be ordered, so +1 for each space between words - *order_score += words_in_phrase_minus_one; - // distance will always be 1, so -1 for each space between words - *distance_score -= words_in_phrase_minus_one; - }; + fn tally_phrase_scores( + fwp: &usize, + lwp: &usize, + order_score: &mut i16, + distance_score: &mut i16, + ) { + let words_in_phrase_minus_one = (lwp - fwp) as i16; + // will always be ordered, so +1 for each space between words + *order_score += words_in_phrase_minus_one; + // distance will always be 1, so -1 for each space between words + *distance_score -= words_in_phrase_minus_one; + } let mut iter = matches.iter().peekable(); while let Some(m) = iter.next() { From f7337affd6342ae495d99312862b300e7af461e0 Mon Sep 17 00:00:00 2001 From: "F. 
Levi" <55688616+flevi29@users.noreply.github.com> Date: Tue, 17 Sep 2024 17:31:09 +0300 Subject: [PATCH 09/92] Adjust tests to changes --- meilisearch/tests/search/locales.rs | 44 ++++++++++++++--------------- milli/src/search/new/matches/mod.rs | 2 +- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/meilisearch/tests/search/locales.rs b/meilisearch/tests/search/locales.rs index dbc4fcc30..b9e70c5b1 100644 --- a/meilisearch/tests/search/locales.rs +++ b/meilisearch/tests/search/locales.rs @@ -400,9 +400,9 @@ async fn force_locales() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -447,9 +447,9 @@ async fn force_locales() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -524,9 +524,9 @@ async fn force_locales_with_pattern() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -571,9 +571,9 @@ async fn force_locales_with_pattern() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -689,8 +689,8 @@ async fn force_locales_with_pattern_nested() { "author": "諫山 創" }, "document_zh": { - "name": "巨人", - "description": "巨人是日本的漫画系列,由諫山 創作画。", + "name": "进击的巨人", + "description": "进击的巨人是日本的漫画系列,由諫山 創作画。", "author": "諫山創" }, "id": "852", @@ -788,9 +788,9 @@ async fn force_different_locales_with_pattern() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": 
"巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -889,9 +889,9 @@ async fn auto_infer_locales_at_search_with_attributes_to_search_on() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -965,9 +965,9 @@ async fn auto_infer_locales_at_search() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -1011,9 +1011,9 @@ async fn auto_infer_locales_at_search() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -1057,9 +1057,9 @@ async fn auto_infer_locales_at_search() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -1177,8 +1177,8 @@ async fn force_different_locales_with_pattern_nested() { "author": "諫山 創" }, "document_zh": { - "name": "巨人", - "description": "巨人是日本的漫画系列,由諫山 創作画。", + "name": "进击的巨人", + "description": "进击的巨人是日本的漫画系列,由諫山 創作画。", "author": "諫山創" }, "id": "852", diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index df110aff9..09d3db575 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -919,7 +919,7 @@ mod tests { // should return 10 words with a marker at the start as well the end, and the highlighted matches. 
insta::assert_snapshot!( matcher.format(format_options), - @"…had the power to split the world between those who…" + @"…the power to split the world between those who embraced…" ); let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); From 83113998f99bb6d59bb9e94e9ef3e527f4c93f62 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Wed, 18 Sep 2024 10:35:23 +0300 Subject: [PATCH 10/92] Add more test assertions --- milli/src/search/new/matches/mod.rs | 36 +++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 09d3db575..8a84f91bd 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -929,6 +929,42 @@ mod tests { matcher.format(format_options), @"…world between those who embraced progress and those who resisted…" ); + + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention had the power to split the world\"", + ); + let mut matcher = builder.build(text, None); + // should highlight "those" and the phrase "and those". + insta::assert_snapshot!( + matcher.format(format_options), + @"The groundbreaking invention had the power to split the world…" + ); + + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention had the power to split the world between\"", + ); + let mut matcher = builder.build(text, None); + // should highlight "those" and the phrase "and those". + insta::assert_snapshot!( + matcher.format(format_options), + @"The groundbreaking invention had the power to split the world …" + ); + + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention\" \"embraced progress and those who resisted change\"", + ); + let mut matcher = builder.build(text, None); + // should highlight "those" and the phrase "and those". 
+ insta::assert_snapshot!( + matcher.format(format_options), + @"…between those who embraced progress and those who resisted change…" + ); } #[test] From 0ffeea5a5209f1e206720e3cf63d7fe627b8cee0 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Thu, 19 Sep 2024 09:06:40 +0300 Subject: [PATCH 11/92] Remove wrong comments --- milli/src/search/new/matches/mod.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 8a84f91bd..26115c39b 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -936,7 +936,6 @@ mod tests { "\"The groundbreaking invention had the power to split the world\"", ); let mut matcher = builder.build(text, None); - // should highlight "those" and the phrase "and those". insta::assert_snapshot!( matcher.format(format_options), @"The groundbreaking invention had the power to split the world…" @@ -948,7 +947,6 @@ mod tests { "\"The groundbreaking invention had the power to split the world between\"", ); let mut matcher = builder.build(text, None); - // should highlight "those" and the phrase "and those". insta::assert_snapshot!( matcher.format(format_options), @"The groundbreaking invention had the power to split the world …" @@ -960,7 +958,6 @@ mod tests { "\"The groundbreaking invention\" \"embraced progress and those who resisted change\"", ); let mut matcher = builder.build(text, None); - // should highlight "those" and the phrase "and those". 
insta::assert_snapshot!( matcher.format(format_options), @"…between those who embraced progress and those who resisted change…" From afa3ae0cbd9c7223d4068dd438d043a43d0d4fae Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 19 Sep 2024 17:42:52 +0200 Subject: [PATCH 12/92] WIP --- milli/src/update/index_documents/mod.rs | 17 ++----- .../src/update/index_documents/typed_chunk.rs | 16 ++---- milli/src/vector/mod.rs | 51 +++++++++++-------- 3 files changed, 38 insertions(+), 46 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 326dd842d..b03ab259a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -689,9 +689,8 @@ where key: None, }, )?; - let first_id = crate::vector::arroy_db_range_for_embedder(index).next().unwrap(); let reader = - ArroyWrapper::new(self.index.vector_arroy, first_id, action.was_quantized); + ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized); let dim = reader.dimensions(self.wtxn)?; dimension.insert(name.to_string(), dim); } @@ -713,17 +712,11 @@ where let is_quantizing = embedder_config.map_or(false, |action| action.is_being_quantized); pool.install(|| { - for k in crate::vector::arroy_db_range_for_embedder(embedder_index) { - let mut writer = ArroyWrapper::new(vector_arroy, k, was_quantized); - if is_quantizing { - writer.quantize(wtxn, k, dimension)?; - } - if writer.need_build(wtxn, dimension)? { - writer.build(wtxn, &mut rng, dimension)?; - } else if writer.is_empty(wtxn, dimension)? 
{ - break; - } + let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized); + if is_quantizing { + writer.quantize(wtxn, dimension)?; } + writer.build(wtxn, &mut rng, dimension)?; Result::Ok(()) }) .map_err(InternalError::from)??; diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 97a4bf712..e340137e2 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -673,22 +673,14 @@ pub(crate) fn write_typed_chunk_into_index( .get(&embedder_name) .map_or(false, |conf| conf.2); // FIXME: allow customizing distance - let writers: Vec<_> = crate::vector::arroy_db_range_for_embedder(embedder_index) - .map(|k| ArroyWrapper::new(index.vector_arroy, k, binary_quantized)) - .collect(); + let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized); // remove vectors for docids we want them removed let merger = remove_vectors_builder.build(); let mut iter = merger.into_stream_merger_iter()?; while let Some((key, _)) = iter.next()? { let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); - - for writer in &writers { - // Uses invariant: vectors are packed in the first writers. - if !writer.del_item(wtxn, expected_dimension, docid)? 
{ - break; - } - } + writer.del_item(wtxn, expected_dimension, docid)?; } // add generated embeddings @@ -716,9 +708,7 @@ pub(crate) fn write_typed_chunk_into_index( embeddings.embedding_count(), ))); } - for (embedding, writer) in embeddings.iter().zip(&writers) { - writer.add_item(wtxn, expected_dimension, docid, embedding)?; - } + writer.add_items(wtxn, expected_dimension, docid, embeddings)?; } // perform the manual diff diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index d52e68bbe..644826dcd 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -32,60 +32,69 @@ pub const REQUEST_PARALLELISM: usize = 40; pub struct ArroyWrapper { quantized: bool, - index: u16, + index: u8, database: arroy::Database, } impl ArroyWrapper { - pub fn new(database: arroy::Database, index: u16, quantized: bool) -> Self { + pub fn new(database: arroy::Database, index: u8, quantized: bool) -> Self { Self { database, index, quantized } } - pub fn index(&self) -> u16 { + pub fn index(&self) -> u8 { self.index } pub fn dimensions(&self, rtxn: &RoTxn) -> Result { + let first_id = arroy_db_range_for_embedder(self.index).next().unwrap(); if self.quantized { - Ok(arroy::Reader::open(rtxn, self.index, self.quantized_db())?.dimensions()) + Ok(arroy::Reader::open(rtxn, first_id, self.quantized_db())?.dimensions()) } else { - Ok(arroy::Reader::open(rtxn, self.index, self.angular_db())?.dimensions()) + Ok(arroy::Reader::open(rtxn, first_id, self.angular_db())?.dimensions()) } } - pub fn quantize( - &mut self, - wtxn: &mut RwTxn, - index: u16, - dimension: usize, - ) -> Result<(), arroy::Error> { + pub fn quantize(&mut self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> { if !self.quantized { - let writer = arroy::Writer::new(self.angular_db(), index, dimension); - writer.prepare_changing_distance::(wtxn)?; + for index in arroy_db_range_for_embedder(self.index) { + let writer = arroy::Writer::new(self.angular_db(), index, dimension); + 
writer.prepare_changing_distance::(wtxn)?; + } self.quantized = true; } Ok(()) } + // TODO: We can stop early when we find an empty DB pub fn need_build(&self, rtxn: &RoTxn, dimension: usize) -> Result { - if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension).need_build(rtxn) - } else { - arroy::Writer::new(self.angular_db(), self.index, dimension).need_build(rtxn) + for index in arroy_db_range_for_embedder(self.index) { + let need_build = if self.quantized { + arroy::Writer::new(self.quantized_db(), index, dimension).need_build(rtxn) + } else { + arroy::Writer::new(self.angular_db(), index, dimension).need_build(rtxn) + }; + if need_build? { + return Ok(true); + } } + Ok(false) } + /// TODO: We should early exit when it doesn't need to be built pub fn build( &self, wtxn: &mut RwTxn, rng: &mut R, dimension: usize, ) -> Result<(), arroy::Error> { - if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension).build(wtxn, rng, None) - } else { - arroy::Writer::new(self.angular_db(), self.index, dimension).build(wtxn, rng, None) + for index in arroy_db_range_for_embedder(self.index) { + if self.quantized { + arroy::Writer::new(self.quantized_db(), index, dimension).build(wtxn, rng, None)? + } else { + arroy::Writer::new(self.angular_db(), index, dimension).build(wtxn, rng, None)? 
+ } } + Ok(()) } pub fn add_item( From 6ba4baecbf47e39339c22c67b60a5d0953f53fc5 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 23 Sep 2024 15:15:26 +0200 Subject: [PATCH 13/92] first ugly step --- milli/src/search/similar.rs | 26 +- .../src/update/index_documents/typed_chunk.rs | 45 +--- milli/src/vector/mod.rs | 232 ++++++++++++++---- 3 files changed, 203 insertions(+), 100 deletions(-) diff --git a/milli/src/search/similar.rs b/milli/src/search/similar.rs index 0cb8d723d..e408c94b1 100644 --- a/milli/src/search/similar.rs +++ b/milli/src/search/similar.rs @@ -4,7 +4,7 @@ use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use crate::score_details::{self, ScoreDetails}; -use crate::vector::Embedder; +use crate::vector::{ArroyWrapper, Embedder}; use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult}; pub struct Similar<'a> { @@ -71,23 +71,13 @@ impl<'a> Similar<'a> { .get(self.rtxn, &self.embedder_name)? .ok_or_else(|| crate::UserError::InvalidEmbedder(self.embedder_name.to_owned()))?; - let mut results = Vec::new(); - - for reader in self.index.arroy_readers(self.rtxn, embedder_index, self.quantized) { - let nns_by_item = reader?.nns_by_item( - self.rtxn, - self.id, - self.limit + self.offset + 1, - Some(&universe), - )?; - if let Some(mut nns_by_item) = nns_by_item { - results.append(&mut nns_by_item); - } else { - break; - } - } - - results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); + let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized); + let results = reader.nns_by_item( + self.rtxn, + self.id, + self.limit + self.offset + 1, + Some(&universe), + )?; let mut documents_ids = Vec::with_capacity(self.limit); let mut document_scores = Vec::with_capacity(self.limit); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e340137e2..e118420d8 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ 
b/milli/src/update/index_documents/typed_chunk.rs @@ -680,7 +680,7 @@ pub(crate) fn write_typed_chunk_into_index( let mut iter = merger.into_stream_merger_iter()?; while let Some((key, _)) = iter.next()? { let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); - writer.del_item(wtxn, expected_dimension, docid)?; + writer.del_item_raw(wtxn, expected_dimension, docid)?; } // add generated embeddings @@ -708,7 +708,7 @@ pub(crate) fn write_typed_chunk_into_index( embeddings.embedding_count(), ))); } - writer.add_items(wtxn, expected_dimension, docid, embeddings)?; + writer.add_items(wtxn, docid, &embeddings)?; } // perform the manual diff @@ -723,51 +723,14 @@ pub(crate) fn write_typed_chunk_into_index( if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) { let vector: Vec = pod_collect_to_vec(value); - let mut deleted_index = None; - for (index, writer) in writers.iter().enumerate() { - let Some(candidate) = writer.item_vector(wtxn, docid)? else { - // uses invariant: vectors are packed in the first writers. - break; - }; - if candidate == vector { - writer.del_item(wtxn, expected_dimension, docid)?; - deleted_index = Some(index); - } - } - - // 🥲 enforce invariant: vectors are packed in the first writers. - if let Some(deleted_index) = deleted_index { - let mut last_index_with_a_vector = None; - for (index, writer) in writers.iter().enumerate().skip(deleted_index) { - let Some(candidate) = writer.item_vector(wtxn, docid)? 
else { - break; - }; - last_index_with_a_vector = Some((index, candidate)); - } - if let Some((last_index, vector)) = last_index_with_a_vector { - // unwrap: computed the index from the list of writers - let writer = writers.get(last_index).unwrap(); - writer.del_item(wtxn, expected_dimension, docid)?; - writers.get(deleted_index).unwrap().add_item( - wtxn, - expected_dimension, - docid, - &vector, - )?; - } - } + writer.del_item(wtxn, docid, &vector)?; } if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) { let vector = pod_collect_to_vec(value); // overflow was detected during vector extraction. - for writer in &writers { - if !writer.contains_item(wtxn, expected_dimension, docid)? { - writer.add_item(wtxn, expected_dimension, docid, &vector)?; - break; - } - } + writer.add_item(wtxn, docid, &vector)?; } } diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 644826dcd..54765cfef 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -97,49 +97,165 @@ impl ArroyWrapper { Ok(()) } + pub fn add_items( + &self, + wtxn: &mut RwTxn, + item_id: arroy::ItemId, + embeddings: &Embeddings, + ) -> Result<(), arroy::Error> { + let dimension = embeddings.dimension(); + for (index, vector) in arroy_db_range_for_embedder(self.index).zip(embeddings.iter()) { + if self.quantized { + arroy::Writer::new(self.quantized_db(), index, dimension) + .add_item(wtxn, item_id, vector)? + } else { + arroy::Writer::new(self.angular_db(), index, dimension) + .add_item(wtxn, item_id, vector)? 
+ } + } + Ok(()) + } + pub fn add_item( &self, wtxn: &mut RwTxn, - dimension: usize, item_id: arroy::ItemId, vector: &[f32], ) -> Result<(), arroy::Error> { - if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension) - .add_item(wtxn, item_id, vector) - } else { - arroy::Writer::new(self.angular_db(), self.index, dimension) - .add_item(wtxn, item_id, vector) + let dimension = vector.len(); + + for index in arroy_db_range_for_embedder(self.index) { + if self.quantized { + let writer = arroy::Writer::new(self.quantized_db(), index, dimension); + if !writer.contains_item(wtxn, item_id)? { + writer.add_item(wtxn, item_id, &vector)?; + break; + } + } else { + arroy::Writer::new(self.angular_db(), index, dimension) + .add_item(wtxn, item_id, vector)? + } } + + Ok(()) } - pub fn del_item( + pub fn del_item_raw( &self, wtxn: &mut RwTxn, dimension: usize, item_id: arroy::ItemId, ) -> Result { - if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension).del_item(wtxn, item_id) - } else { - arroy::Writer::new(self.angular_db(), self.index, dimension).del_item(wtxn, item_id) + for index in arroy_db_range_for_embedder(self.index) { + if self.quantized { + let writer = arroy::Writer::new(self.quantized_db(), index, dimension); + if writer.del_item(wtxn, item_id)? { + return Ok(true); + } + } else { + let writer = arroy::Writer::new(self.angular_db(), index, dimension); + if writer.del_item(wtxn, item_id)? { + return Ok(true); + } + } } + + Ok(false) + } + + pub fn del_item( + &self, + wtxn: &mut RwTxn, + itemid: arroy::ItemId, + vector: &[f32], + ) -> Result { + let dimension = vector.len(); + let mut deleted_index = None; + + for index in arroy_db_range_for_embedder(self.index) { + if self.quantized { + let writer = arroy::Writer::new(self.quantized_db(), index, dimension); + let Some(candidate) = writer.item_vector(wtxn, itemid)? else { + // uses invariant: vectors are packed in the first writers. 
+ break; + }; + if candidate == vector { + writer.del_item(wtxn, itemid)?; + deleted_index = Some(index); + } + } else { + let writer = arroy::Writer::new(self.angular_db(), index, dimension); + let Some(candidate) = writer.item_vector(wtxn, itemid)? else { + // uses invariant: vectors are packed in the first writers. + break; + }; + if candidate == vector { + writer.del_item(wtxn, itemid)?; + deleted_index = Some(index); + } + } + } + + // 🥲 enforce invariant: vectors are packed in the first writers. + if let Some(deleted_index) = deleted_index { + let mut last_index_with_a_vector = None; + for index in arroy_db_range_for_embedder(self.index).skip(deleted_index as usize) { + if self.quantized { + let writer = arroy::Writer::new(self.quantized_db(), index, dimension); + let Some(candidate) = writer.item_vector(wtxn, itemid)? else { + break; + }; + last_index_with_a_vector = Some((index, candidate)); + } else { + let writer = arroy::Writer::new(self.angular_db(), index, dimension); + let Some(candidate) = writer.item_vector(wtxn, itemid)? 
else { + break; + }; + last_index_with_a_vector = Some((index, candidate)); + } + } + if let Some((last_index, vector)) = last_index_with_a_vector { + if self.quantized { + // unwrap: computed the index from the list of writers + let writer = arroy::Writer::new(self.quantized_db(), last_index, dimension); + writer.del_item(wtxn, itemid)?; + let writer = arroy::Writer::new(self.quantized_db(), deleted_index, dimension); + writer.add_item(wtxn, itemid, &vector)?; + } else { + // unwrap: computed the index from the list of writers + let writer = arroy::Writer::new(self.angular_db(), last_index, dimension); + writer.del_item(wtxn, itemid)?; + let writer = arroy::Writer::new(self.angular_db(), deleted_index, dimension); + writer.add_item(wtxn, itemid, &vector)?; + } + } + } + Ok(deleted_index.is_some()) } pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> { - if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension).clear(wtxn) - } else { - arroy::Writer::new(self.angular_db(), self.index, dimension).clear(wtxn) + for index in arroy_db_range_for_embedder(self.index) { + if self.quantized { + arroy::Writer::new(self.quantized_db(), index, dimension).clear(wtxn)?; + } else { + arroy::Writer::new(self.angular_db(), index, dimension).clear(wtxn)?; + } } + Ok(()) } pub fn is_empty(&self, rtxn: &RoTxn, dimension: usize) -> Result { - if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension).is_empty(rtxn) - } else { - arroy::Writer::new(self.angular_db(), self.index, dimension).is_empty(rtxn) + for index in arroy_db_range_for_embedder(self.index) { + let empty = if self.quantized { + arroy::Writer::new(self.quantized_db(), index, dimension).is_empty(rtxn)? + } else { + arroy::Writer::new(self.angular_db(), index, dimension).is_empty(rtxn)? 
+ }; + if !empty { + return Ok(false); + } } + Ok(true) } pub fn contains_item( @@ -148,11 +264,18 @@ impl ArroyWrapper { dimension: usize, item: arroy::ItemId, ) -> Result { - if self.quantized { - arroy::Writer::new(self.quantized_db(), self.index, dimension).contains_item(rtxn, item) - } else { - arroy::Writer::new(self.angular_db(), self.index, dimension).contains_item(rtxn, item) + for index in arroy_db_range_for_embedder(self.index) { + let contains = if self.quantized { + arroy::Writer::new(self.quantized_db(), index, dimension) + .contains_item(rtxn, item)? + } else { + arroy::Writer::new(self.angular_db(), index, dimension).contains_item(rtxn, item)? + }; + if contains { + return Ok(contains); + } } + Ok(false) } pub fn nns_by_item( @@ -161,14 +284,26 @@ impl ArroyWrapper { item: ItemId, limit: usize, filter: Option<&RoaringBitmap>, - ) -> Result>, arroy::Error> { - if self.quantized { - arroy::Reader::open(rtxn, self.index, self.quantized_db())? - .nns_by_item(rtxn, item, limit, None, None, filter) - } else { - arroy::Reader::open(rtxn, self.index, self.angular_db())? - .nns_by_item(rtxn, item, limit, None, None, filter) + ) -> Result, arroy::Error> { + let mut results = Vec::new(); + + for index in arroy_db_range_for_embedder(self.index) { + let ret = if self.quantized { + arroy::Reader::open(rtxn, index, self.quantized_db())? + .nns_by_item(rtxn, item, limit, None, None, filter)? + } else { + arroy::Reader::open(rtxn, index, self.angular_db())? + .nns_by_item(rtxn, item, limit, None, None, filter)? + }; + if let Some(mut ret) = ret { + results.append(&mut ret); + } else { + break; + } } + results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); + + Ok(results) } pub fn nns_by_vector( @@ -178,21 +313,36 @@ impl ArroyWrapper { limit: usize, filter: Option<&RoaringBitmap>, ) -> Result, arroy::Error> { - if self.quantized { - arroy::Reader::open(txn, self.index, self.quantized_db())? 
- .nns_by_vector(txn, item, limit, None, None, filter) - } else { - arroy::Reader::open(txn, self.index, self.angular_db())? - .nns_by_vector(txn, item, limit, None, None, filter) + let mut results = Vec::new(); + + for index in arroy_db_range_for_embedder(self.index) { + let mut ret = if self.quantized { + arroy::Reader::open(txn, index, self.quantized_db())? + .nns_by_vector(txn, item, limit, None, None, filter)? + } else { + arroy::Reader::open(txn, index, self.angular_db())? + .nns_by_vector(txn, item, limit, None, None, filter)? + }; + results.append(&mut ret); } + + results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); + + Ok(results) } pub fn item_vector(&self, rtxn: &RoTxn, docid: u32) -> Result>, arroy::Error> { - if self.quantized { - arroy::Reader::open(rtxn, self.index, self.quantized_db())?.item_vector(rtxn, docid) - } else { - arroy::Reader::open(rtxn, self.index, self.angular_db())?.item_vector(rtxn, docid) + for index in arroy_db_range_for_embedder(self.index) { + let ret = if self.quantized { + arroy::Reader::open(rtxn, index, self.quantized_db())?.item_vector(rtxn, docid)? + } else { + arroy::Reader::open(rtxn, index, self.angular_db())?.item_vector(rtxn, docid)? 
+ }; + if ret.is_some() { + return Ok(ret); + } } + Ok(None) } fn angular_db(&self) -> arroy::Database { From 1e4d4e69c4cebee8f09d905c5cc8130b08214f04 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 23 Sep 2024 18:56:15 +0200 Subject: [PATCH 14/92] finish the arroywrapper --- milli/src/index.rs | 29 +-- milli/src/search/new/vector_sort.rs | 12 +- milli/src/search/similar.rs | 1 - milli/src/update/index_documents/transform.rs | 63 ++---- milli/src/vector/mod.rs | 211 +++++++++++------- 5 files changed, 155 insertions(+), 161 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index c47896df7..5b7a9c58c 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1610,24 +1610,6 @@ impl Index { .unwrap_or_default()) } - pub fn arroy_readers<'a>( - &'a self, - rtxn: &'a RoTxn<'a>, - embedder_id: u8, - quantized: bool, - ) -> impl Iterator> + 'a { - crate::vector::arroy_db_range_for_embedder(embedder_id).map_while(move |k| { - let reader = ArroyWrapper::new(self.vector_arroy, k, quantized); - // Here we don't care about the dimensions, but we want to know if we can read - // in the database or if its metadata are missing because there is no document with that many vectors. 
- match reader.dimensions(rtxn) { - Ok(_) => Some(Ok(reader)), - Err(arroy::Error::MissingMetadata(_)) => None, - Err(e) => Some(Err(e.into())), - } - }) - } - pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> { self.main.remap_types::().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff) } @@ -1649,14 +1631,9 @@ impl Index { let embedding_configs = self.embedding_configs(rtxn)?; for config in embedding_configs { let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap(); - let embeddings = self - .arroy_readers(rtxn, embedder_id, config.config.quantized()) - .map_while(|reader| { - reader - .and_then(|r| r.item_vector(rtxn, docid).map_err(|e| e.into())) - .transpose() - }) - .collect::>>()?; + let reader = + ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); + let embeddings = reader.item_vectors(rtxn, docid)?; res.insert(config.name.to_owned(), embeddings); } Ok(res) diff --git a/milli/src/search/new/vector_sort.rs b/milli/src/search/new/vector_sort.rs index de1dacbe7..90377c09c 100644 --- a/milli/src/search/new/vector_sort.rs +++ b/milli/src/search/new/vector_sort.rs @@ -1,11 +1,10 @@ use std::iter::FromIterator; -use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; use crate::score_details::{self, ScoreDetails}; -use crate::vector::{DistributionShift, Embedder}; +use crate::vector::{ArroyWrapper, DistributionShift, Embedder}; use crate::{DocumentId, Result, SearchContext, SearchLogger}; pub struct VectorSort { @@ -53,14 +52,9 @@ impl VectorSort { vector_candidates: &RoaringBitmap, ) -> Result<()> { let target = &self.target; - let mut results = Vec::new(); - for reader in ctx.index.arroy_readers(ctx.txn, self.embedder_index, self.quantized) { - let nns_by_vector = - reader?.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?; - results.extend(nns_by_vector.into_iter()); - } 
- results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); + let reader = ArroyWrapper::new(ctx.index.vector_arroy, self.embedder_index, self.quantized); + let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?; self.cached_sorted_docids = results.into_iter(); Ok(()) diff --git a/milli/src/search/similar.rs b/milli/src/search/similar.rs index e408c94b1..5547d800e 100644 --- a/milli/src/search/similar.rs +++ b/milli/src/search/similar.rs @@ -1,6 +1,5 @@ use std::sync::Arc; -use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use crate::score_details::{self, ScoreDetails}; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index bb2cfe56c..763f30d0f 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -990,27 +990,24 @@ impl<'a, 'i> Transform<'a, 'i> { None }; - let readers: Result, &RoaringBitmap)>> = settings_diff + let readers: BTreeMap<&str, (ArroyWrapper, &RoaringBitmap)> = settings_diff .embedding_config_updates .iter() .filter_map(|(name, action)| { if let Some(WriteBackToDocuments { embedder_id, user_provided }) = action.write_back() { - let readers: Result> = self - .index - .arroy_readers(wtxn, *embedder_id, action.was_quantized) - .collect(); - match readers { - Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))), - Err(error) => Some(Err(error)), - } + let reader = ArroyWrapper::new( + self.index.vector_arroy, + *embedder_id, + action.was_quantized, + ); + Some((name.as_str(), (reader, user_provided))) } else { None } }) .collect(); - let readers = readers?; let old_vectors_fid = settings_diff .old @@ -1048,34 +1045,24 @@ impl<'a, 'i> Transform<'a, 'i> { arroy::Error, > = readers .iter() - .filter_map(|(name, (readers, user_provided))| { + .filter_map(|(name, (reader, user_provided))| { if !user_provided.contains(docid) { return None; } - let mut vectors = Vec::new(); 
- for reader in readers { - let Some(vector) = reader.item_vector(wtxn, docid).transpose() else { - break; - }; - - match vector { - Ok(vector) => vectors.push(vector), - Err(error) => return Some(Err(error)), - } + match reader.item_vectors(wtxn, docid) { + Ok(vectors) if vectors.is_empty() => None, + Ok(vectors) => Some(Ok(( + name.to_string(), + serde_json::to_value(ExplicitVectors { + embeddings: Some( + VectorOrArrayOfVectors::from_array_of_vectors(vectors), + ), + regenerate: false, + }) + .unwrap(), + ))), + Err(e) => Some(Err(e)), } - if vectors.is_empty() { - return None; - } - Some(Ok(( - name.to_string(), - serde_json::to_value(ExplicitVectors { - embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( - vectors, - )), - regenerate: false, - }) - .unwrap(), - ))) }) .collect(); @@ -1104,11 +1091,9 @@ impl<'a, 'i> Transform<'a, 'i> { } // delete all vectors from the embedders that need removal - for (_, (readers, _)) in readers { - for reader in readers { - let dimensions = reader.dimensions(wtxn)?; - reader.clear(wtxn, dimensions)?; - } + for (_, (reader, _)) in readers { + let dimensions = reader.dimensions(wtxn)?; + reader.clear(wtxn, dimensions)?; } let grenad_params = GrenadParameters { diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 54765cfef..b5b6cd953 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -45,6 +45,20 @@ impl ArroyWrapper { self.index } + fn readers<'a, D: arroy::Distance>( + &'a self, + rtxn: &'a RoTxn<'a>, + db: arroy::Database, + ) -> impl Iterator, arroy::Error>> + 'a { + arroy_db_range_for_embedder(self.index).map_while(move |index| { + match arroy::Reader::open(rtxn, index, db) { + Ok(reader) => Some(Ok(reader)), + Err(arroy::Error::MissingMetadata(_)) => None, + Err(e) => Some(Err(e)), + } + }) + } + pub fn dimensions(&self, rtxn: &RoTxn) -> Result { let first_id = arroy_db_range_for_embedder(self.index).next().unwrap(); if self.quantized { @@ -97,6 +111,7 @@ impl ArroyWrapper { 
Ok(()) } + /// Overwrite all the embeddings associated to the index and item id. pub fn add_items( &self, wtxn: &mut RwTxn, @@ -116,30 +131,41 @@ impl ArroyWrapper { Ok(()) } + /// Add one document int for this index where we can find an empty spot. pub fn add_item( &self, wtxn: &mut RwTxn, item_id: arroy::ItemId, vector: &[f32], + ) -> Result<(), arroy::Error> { + if self.quantized { + self._add_item(wtxn, self.quantized_db(), item_id, vector) + } else { + self._add_item(wtxn, self.angular_db(), item_id, vector) + } + } + + fn _add_item( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, + vector: &[f32], ) -> Result<(), arroy::Error> { let dimension = vector.len(); for index in arroy_db_range_for_embedder(self.index) { - if self.quantized { - let writer = arroy::Writer::new(self.quantized_db(), index, dimension); - if !writer.contains_item(wtxn, item_id)? { - writer.add_item(wtxn, item_id, &vector)?; - break; - } - } else { - arroy::Writer::new(self.angular_db(), index, dimension) - .add_item(wtxn, item_id, vector)? + let writer = arroy::Writer::new(db, index, dimension); + if !writer.contains_item(wtxn, item_id)? { + writer.add_item(wtxn, item_id, vector)?; + break; } } - Ok(()) } + /// Delete an item from the index. It **does not** take care of fixing the hole + /// made after deleting the item. pub fn del_item_raw( &self, wtxn: &mut RwTxn, @@ -163,36 +189,39 @@ impl ArroyWrapper { Ok(false) } + /// Delete one item. 
pub fn del_item( &self, wtxn: &mut RwTxn, - itemid: arroy::ItemId, + item_id: arroy::ItemId, + vector: &[f32], + ) -> Result { + if self.quantized { + self._del_item(wtxn, self.quantized_db(), item_id, vector) + } else { + self._del_item(wtxn, self.angular_db(), item_id, vector) + } + } + + fn _del_item( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, vector: &[f32], ) -> Result { let dimension = vector.len(); let mut deleted_index = None; for index in arroy_db_range_for_embedder(self.index) { - if self.quantized { - let writer = arroy::Writer::new(self.quantized_db(), index, dimension); - let Some(candidate) = writer.item_vector(wtxn, itemid)? else { - // uses invariant: vectors are packed in the first writers. - break; - }; - if candidate == vector { - writer.del_item(wtxn, itemid)?; - deleted_index = Some(index); - } - } else { - let writer = arroy::Writer::new(self.angular_db(), index, dimension); - let Some(candidate) = writer.item_vector(wtxn, itemid)? else { - // uses invariant: vectors are packed in the first writers. - break; - }; - if candidate == vector { - writer.del_item(wtxn, itemid)?; - deleted_index = Some(index); - } + let writer = arroy::Writer::new(db, index, dimension); + let Some(candidate) = writer.item_vector(wtxn, item_id)? else { + // uses invariant: vectors are packed in the first writers. + break; + }; + if candidate == vector { + writer.del_item(wtxn, item_id)?; + deleted_index = Some(index); } } @@ -200,34 +229,18 @@ impl ArroyWrapper { if let Some(deleted_index) = deleted_index { let mut last_index_with_a_vector = None; for index in arroy_db_range_for_embedder(self.index).skip(deleted_index as usize) { - if self.quantized { - let writer = arroy::Writer::new(self.quantized_db(), index, dimension); - let Some(candidate) = writer.item_vector(wtxn, itemid)? 
else { - break; - }; - last_index_with_a_vector = Some((index, candidate)); - } else { - let writer = arroy::Writer::new(self.angular_db(), index, dimension); - let Some(candidate) = writer.item_vector(wtxn, itemid)? else { - break; - }; - last_index_with_a_vector = Some((index, candidate)); - } + let writer = arroy::Writer::new(db, index, dimension); + let Some(candidate) = writer.item_vector(wtxn, item_id)? else { + break; + }; + last_index_with_a_vector = Some((index, candidate)); } if let Some((last_index, vector)) = last_index_with_a_vector { - if self.quantized { - // unwrap: computed the index from the list of writers - let writer = arroy::Writer::new(self.quantized_db(), last_index, dimension); - writer.del_item(wtxn, itemid)?; - let writer = arroy::Writer::new(self.quantized_db(), deleted_index, dimension); - writer.add_item(wtxn, itemid, &vector)?; - } else { - // unwrap: computed the index from the list of writers - let writer = arroy::Writer::new(self.angular_db(), last_index, dimension); - writer.del_item(wtxn, itemid)?; - let writer = arroy::Writer::new(self.angular_db(), deleted_index, dimension); - writer.add_item(wtxn, itemid, &vector)?; - } + // unwrap: computed the index from the list of writers + let writer = arroy::Writer::new(db, last_index, dimension); + writer.del_item(wtxn, item_id)?; + let writer = arroy::Writer::new(db, deleted_index, dimension); + writer.add_item(wtxn, item_id, &vector)?; } } Ok(deleted_index.is_some()) @@ -284,17 +297,26 @@ impl ArroyWrapper { item: ItemId, limit: usize, filter: Option<&RoaringBitmap>, + ) -> Result, arroy::Error> { + if self.quantized { + self._nns_by_item(rtxn, self.quantized_db(), item, limit, filter) + } else { + self._nns_by_item(rtxn, self.angular_db(), item, limit, filter) + } + } + + fn _nns_by_item( + &self, + rtxn: &RoTxn, + db: arroy::Database, + item: ItemId, + limit: usize, + filter: Option<&RoaringBitmap>, ) -> Result, arroy::Error> { let mut results = Vec::new(); - for index in 
arroy_db_range_for_embedder(self.index) { - let ret = if self.quantized { - arroy::Reader::open(rtxn, index, self.quantized_db())? - .nns_by_item(rtxn, item, limit, None, None, filter)? - } else { - arroy::Reader::open(rtxn, index, self.angular_db())? - .nns_by_item(rtxn, item, limit, None, None, filter)? - }; + for reader in self.readers(rtxn, db) { + let ret = reader?.nns_by_item(rtxn, item, limit, None, None, filter)?; if let Some(mut ret) = ret { results.append(&mut ret); } else { @@ -302,27 +324,35 @@ impl ArroyWrapper { } } results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); - Ok(results) } pub fn nns_by_vector( &self, - txn: &RoTxn, - item: &[f32], + rtxn: &RoTxn, + vector: &[f32], + limit: usize, + filter: Option<&RoaringBitmap>, + ) -> Result, arroy::Error> { + if self.quantized { + self._nns_by_vector(rtxn, self.quantized_db(), vector, limit, filter) + } else { + self._nns_by_vector(rtxn, self.angular_db(), vector, limit, filter) + } + } + + fn _nns_by_vector( + &self, + rtxn: &RoTxn, + db: arroy::Database, + vector: &[f32], limit: usize, filter: Option<&RoaringBitmap>, ) -> Result, arroy::Error> { let mut results = Vec::new(); - for index in arroy_db_range_for_embedder(self.index) { - let mut ret = if self.quantized { - arroy::Reader::open(txn, index, self.quantized_db())? - .nns_by_vector(txn, item, limit, None, None, filter)? - } else { - arroy::Reader::open(txn, index, self.angular_db())? - .nns_by_vector(txn, item, limit, None, None, filter)? - }; + for reader in self.readers(rtxn, db) { + let mut ret = reader?.nns_by_vector(rtxn, vector, limit, None, None, filter)?; results.append(&mut ret); } @@ -331,18 +361,27 @@ impl ArroyWrapper { Ok(results) } - pub fn item_vector(&self, rtxn: &RoTxn, docid: u32) -> Result>, arroy::Error> { - for index in arroy_db_range_for_embedder(self.index) { - let ret = if self.quantized { - arroy::Reader::open(rtxn, index, self.quantized_db())?.item_vector(rtxn, docid)? 
- } else { - arroy::Reader::open(rtxn, index, self.angular_db())?.item_vector(rtxn, docid)? - }; - if ret.is_some() { - return Ok(ret); + pub fn item_vectors(&self, rtxn: &RoTxn, item_id: u32) -> Result>, arroy::Error> { + let mut vectors = Vec::new(); + + if self.quantized { + for reader in self.readers(rtxn, self.quantized_db()) { + if let Some(vec) = reader?.item_vector(rtxn, item_id)? { + vectors.push(vec); + } else { + break; + } + } + } else { + for reader in self.readers(rtxn, self.angular_db()) { + if let Some(vec) = reader?.item_vector(rtxn, item_id)? { + vectors.push(vec); + } else { + break; + } } } - Ok(None) + Ok(vectors) } fn angular_db(&self) -> arroy::Database { From 0704fb71e97ce20fbe3ed5f5af6ad53da3a3d67f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 24 Sep 2024 09:44:29 +0200 Subject: [PATCH 15/92] Fix bench by adding embedder --- .../search/embeddings-movies-subset-hf.json | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/workloads/search/embeddings-movies-subset-hf.json b/workloads/search/embeddings-movies-subset-hf.json index aeeecac59..36f45cfb9 100644 --- a/workloads/search/embeddings-movies-subset-hf.json +++ b/workloads/search/embeddings-movies-subset-hf.json @@ -77,7 +77,8 @@ "q": "puppy cute comforting movie", "limit": 100, "hybrid": { - "semanticRatio": 0.1 + "semanticRatio": 0.1, + "embedder": "default" } } }, @@ -91,7 +92,8 @@ "q": "puppy cute comforting movie", "limit": 100, "hybrid": { - "semanticRatio": 0.5 + "semanticRatio": 0.5, + "embedder": "default" } } }, @@ -105,7 +107,8 @@ "q": "puppy cute comforting movie", "limit": 100, "hybrid": { - "semanticRatio": 0.9 + "semanticRatio": 0.9, + "embedder": "default" } } }, @@ -119,7 +122,8 @@ "q": "puppy cute comforting movie", "limit": 100, "hybrid": { - "semanticRatio": 1.0 + "semanticRatio": 1.0, + "embedder": "default" } } }, @@ -133,7 +137,8 @@ "q": "shrek", "limit": 100, "hybrid": { - "semanticRatio": 1.0 + "semanticRatio": 1.0, + 
"embedder": "default" } } }, @@ -147,7 +152,8 @@ "q": "shrek", "limit": 100, "hybrid": { - "semanticRatio": 0.5 + "semanticRatio": 0.5, + "embedder": "default" } } }, @@ -161,7 +167,8 @@ "q": "shrek", "limit": 100, "hybrid": { - "semanticRatio": 0.1 + "semanticRatio": 0.1, + "embedder": "default" } } }, From 86da0e83fe9043ff84d27ec7eb98e0ccd312b98e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 24 Sep 2024 10:02:53 +0200 Subject: [PATCH 16/92] Upgrade "batch failed" log to ERROR level --- index-scheduler/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index fe8244f9b..e0e2bfb75 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1263,7 +1263,7 @@ impl IndexScheduler { #[cfg(test)] self.maybe_fail(tests::FailureLocation::UpdatingTaskAfterProcessBatchFailure)?; - tracing::info!("Batch failed {}", error); + tracing::error!("Batch failed {}", error); self.update_task(&mut wtxn, &task) .map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?; From 79d8a7a51a13fc089c3ebe58721302c856191d8d Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 24 Sep 2024 10:36:28 +0200 Subject: [PATCH 17/92] rename the embedder index for clarity --- milli/src/vector/mod.rs | 42 ++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index b5b6cd953..2da8ecd57 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -32,17 +32,21 @@ pub const REQUEST_PARALLELISM: usize = 40; pub struct ArroyWrapper { quantized: bool, - index: u8, + embedder_index: u8, database: arroy::Database, } impl ArroyWrapper { - pub fn new(database: arroy::Database, index: u8, quantized: bool) -> Self { - Self { database, index, quantized } + pub fn new( + database: arroy::Database, + embedder_index: u8, + quantized: bool, + ) -> Self { + Self { database, embedder_index, quantized } } pub fn 
index(&self) -> u8 { - self.index + self.embedder_index } fn readers<'a, D: arroy::Distance>( @@ -50,7 +54,7 @@ impl ArroyWrapper { rtxn: &'a RoTxn<'a>, db: arroy::Database, ) -> impl Iterator, arroy::Error>> + 'a { - arroy_db_range_for_embedder(self.index).map_while(move |index| { + arroy_db_range_for_embedder(self.embedder_index).map_while(move |index| { match arroy::Reader::open(rtxn, index, db) { Ok(reader) => Some(Ok(reader)), Err(arroy::Error::MissingMetadata(_)) => None, @@ -60,7 +64,7 @@ impl ArroyWrapper { } pub fn dimensions(&self, rtxn: &RoTxn) -> Result { - let first_id = arroy_db_range_for_embedder(self.index).next().unwrap(); + let first_id = arroy_db_range_for_embedder(self.embedder_index).next().unwrap(); if self.quantized { Ok(arroy::Reader::open(rtxn, first_id, self.quantized_db())?.dimensions()) } else { @@ -70,7 +74,7 @@ impl ArroyWrapper { pub fn quantize(&mut self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> { if !self.quantized { - for index in arroy_db_range_for_embedder(self.index) { + for index in arroy_db_range_for_embedder(self.embedder_index) { let writer = arroy::Writer::new(self.angular_db(), index, dimension); writer.prepare_changing_distance::(wtxn)?; } @@ -81,7 +85,7 @@ impl ArroyWrapper { // TODO: We can stop early when we find an empty DB pub fn need_build(&self, rtxn: &RoTxn, dimension: usize) -> Result { - for index in arroy_db_range_for_embedder(self.index) { + for index in arroy_db_range_for_embedder(self.embedder_index) { let need_build = if self.quantized { arroy::Writer::new(self.quantized_db(), index, dimension).need_build(rtxn) } else { @@ -101,7 +105,7 @@ impl ArroyWrapper { rng: &mut R, dimension: usize, ) -> Result<(), arroy::Error> { - for index in arroy_db_range_for_embedder(self.index) { + for index in arroy_db_range_for_embedder(self.embedder_index) { if self.quantized { arroy::Writer::new(self.quantized_db(), index, dimension).build(wtxn, rng, None)? 
} else { @@ -119,7 +123,9 @@ impl ArroyWrapper { embeddings: &Embeddings, ) -> Result<(), arroy::Error> { let dimension = embeddings.dimension(); - for (index, vector) in arroy_db_range_for_embedder(self.index).zip(embeddings.iter()) { + for (index, vector) in + arroy_db_range_for_embedder(self.embedder_index).zip(embeddings.iter()) + { if self.quantized { arroy::Writer::new(self.quantized_db(), index, dimension) .add_item(wtxn, item_id, vector)? @@ -154,7 +160,7 @@ impl ArroyWrapper { ) -> Result<(), arroy::Error> { let dimension = vector.len(); - for index in arroy_db_range_for_embedder(self.index) { + for index in arroy_db_range_for_embedder(self.embedder_index) { let writer = arroy::Writer::new(db, index, dimension); if !writer.contains_item(wtxn, item_id)? { writer.add_item(wtxn, item_id, vector)?; @@ -172,7 +178,7 @@ impl ArroyWrapper { dimension: usize, item_id: arroy::ItemId, ) -> Result { - for index in arroy_db_range_for_embedder(self.index) { + for index in arroy_db_range_for_embedder(self.embedder_index) { if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); if writer.del_item(wtxn, item_id)? { @@ -213,7 +219,7 @@ impl ArroyWrapper { let dimension = vector.len(); let mut deleted_index = None; - for index in arroy_db_range_for_embedder(self.index) { + for index in arroy_db_range_for_embedder(self.embedder_index) { let writer = arroy::Writer::new(db, index, dimension); let Some(candidate) = writer.item_vector(wtxn, item_id)? else { // uses invariant: vectors are packed in the first writers. @@ -228,7 +234,9 @@ impl ArroyWrapper { // 🥲 enforce invariant: vectors are packed in the first writers. 
if let Some(deleted_index) = deleted_index { let mut last_index_with_a_vector = None; - for index in arroy_db_range_for_embedder(self.index).skip(deleted_index as usize) { + for index in + arroy_db_range_for_embedder(self.embedder_index).skip(deleted_index as usize) + { let writer = arroy::Writer::new(db, index, dimension); let Some(candidate) = writer.item_vector(wtxn, item_id)? else { break; @@ -247,7 +255,7 @@ impl ArroyWrapper { } pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> { - for index in arroy_db_range_for_embedder(self.index) { + for index in arroy_db_range_for_embedder(self.embedder_index) { if self.quantized { arroy::Writer::new(self.quantized_db(), index, dimension).clear(wtxn)?; } else { @@ -258,7 +266,7 @@ impl ArroyWrapper { } pub fn is_empty(&self, rtxn: &RoTxn, dimension: usize) -> Result { - for index in arroy_db_range_for_embedder(self.index) { + for index in arroy_db_range_for_embedder(self.embedder_index) { let empty = if self.quantized { arroy::Writer::new(self.quantized_db(), index, dimension).is_empty(rtxn)? } else { @@ -277,7 +285,7 @@ impl ArroyWrapper { dimension: usize, item: arroy::ItemId, ) -> Result { - for index in arroy_db_range_for_embedder(self.index) { + for index in arroy_db_range_for_embedder(self.embedder_index) { let contains = if self.quantized { arroy::Writer::new(self.quantized_db(), index, dimension) .contains_item(rtxn, item)? 
From f2d187ba3e779c0644ad0e1dbf3174dea2614d35 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 24 Sep 2024 10:39:40 +0200 Subject: [PATCH 18/92] rename the index method to embedder_index --- milli/src/vector/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 2da8ecd57..ca607c892 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -45,7 +45,7 @@ impl ArroyWrapper { Self { database, embedder_index, quantized } } - pub fn index(&self) -> u8 { + pub fn embedder_index(&self) -> u8 { self.embedder_index } From fd8447c5214b62b724f18ec5de9b92fa34537462 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 24 Sep 2024 10:52:05 +0200 Subject: [PATCH 19/92] fix the del items thing --- milli/src/update/index_documents/typed_chunk.rs | 2 +- milli/src/vector/mod.rs | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index e118420d8..20e70b2a6 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -680,7 +680,7 @@ pub(crate) fn write_typed_chunk_into_index( let mut iter = merger.into_stream_merger_iter()?; while let Some((key, _)) = iter.next()? { let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); - writer.del_item_raw(wtxn, expected_dimension, docid)?; + writer.del_items(wtxn, expected_dimension, docid)?; } // add generated embeddings diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index ca607c892..4b322ddf4 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -170,29 +170,28 @@ impl ArroyWrapper { Ok(()) } - /// Delete an item from the index. It **does not** take care of fixing the hole - /// made after deleting the item. 
- pub fn del_item_raw( + /// Delete all embeddings from a specific `item_id` + pub fn del_items( &self, wtxn: &mut RwTxn, dimension: usize, item_id: arroy::ItemId, - ) -> Result { + ) -> Result<(), arroy::Error> { for index in arroy_db_range_for_embedder(self.embedder_index) { if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); - if writer.del_item(wtxn, item_id)? { - return Ok(true); + if !writer.del_item(wtxn, item_id)? { + break; } } else { let writer = arroy::Writer::new(self.angular_db(), index, dimension); - if writer.del_item(wtxn, item_id)? { - return Ok(true); + if !writer.del_item(wtxn, item_id)? { + break; } } } - Ok(false) + Ok(()) } /// Delete one item. From b8a74e04647af60a396539b6ba3b47d19771cc49 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 24 Sep 2024 10:59:15 +0200 Subject: [PATCH 20/92] fix comments --- milli/src/vector/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 4b322ddf4..8341ab923 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -115,7 +115,10 @@ impl ArroyWrapper { Ok(()) } - /// Overwrite all the embeddings associated to the index and item id. + /// Overwrite all the embeddings associated with the index and item ID. + /// /!\ It won't remove embeddings after the last passed embedding, which can leave stale embeddings. + /// You should call `del_items` on the `item_id` before calling this method. + /// /!\ Cannot insert more than u8::MAX embeddings; after inserting u8::MAX embeddings, all the remaining ones will be silently ignored. 
pub fn add_items( &self, wtxn: &mut RwTxn, @@ -243,7 +246,6 @@ impl ArroyWrapper { last_index_with_a_vector = Some((index, candidate)); } if let Some((last_index, vector)) = last_index_with_a_vector { - // unwrap: computed the index from the list of writers let writer = arroy::Writer::new(db, last_index, dimension); writer.del_item(wtxn, item_id)?; let writer = arroy::Writer::new(db, deleted_index, dimension); From 645a55317af91f37d68d26527568032016bf5393 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 24 Sep 2024 14:54:24 +0200 Subject: [PATCH 21/92] merge the build and quantize method --- milli/src/update/index_documents/mod.rs | 5 +-- milli/src/vector/mod.rs | 43 ++++++++++++++----------- 2 files changed, 25 insertions(+), 23 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index b03ab259a..e164a0817 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -713,10 +713,7 @@ where pool.install(|| { let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized); - if is_quantizing { - writer.quantize(wtxn, dimension)?; - } - writer.build(wtxn, &mut rng, dimension)?; + writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing)?; Result::Ok(()) }) .map_err(InternalError::from)??; diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 8341ab923..a33f76559 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -98,18 +98,37 @@ impl ArroyWrapper { Ok(false) } - /// TODO: We should early exit when it doesn't need to be built - pub fn build( - &self, + pub fn build_and_quantize( + &mut self, wtxn: &mut RwTxn, rng: &mut R, dimension: usize, + quantizing: bool, ) -> Result<(), arroy::Error> { for index in arroy_db_range_for_embedder(self.embedder_index) { if self.quantized { - arroy::Writer::new(self.quantized_db(), index, dimension).build(wtxn, rng, None)? 
+ let writer = arroy::Writer::new(self.quantized_db(), index, dimension); + if writer.need_build(wtxn)? { + writer.build(wtxn, rng, None)? + } else if writer.is_empty(wtxn)? { + break; + } } else { - arroy::Writer::new(self.angular_db(), index, dimension).build(wtxn, rng, None)? + let writer = arroy::Writer::new(self.angular_db(), index, dimension); + // If we are quantizing the databases, we can't know from meilisearch + // if the db was empty but still contained the wrong metadata, thus we need + // to quantize everything and can't stop early. Since this operation can + // only happens once in the life of an embedder, it's not very performances + // sensitive. + if quantizing && !self.quantized { + let writer = + writer.prepare_changing_distance::(wtxn)?; + writer.build(wtxn, rng, None)? + } else if writer.need_build(wtxn)? { + writer.build(wtxn, rng, None)? + } else if writer.is_empty(wtxn)? { + break; + } } } Ok(()) @@ -266,20 +285,6 @@ impl ArroyWrapper { Ok(()) } - pub fn is_empty(&self, rtxn: &RoTxn, dimension: usize) -> Result { - for index in arroy_db_range_for_embedder(self.embedder_index) { - let empty = if self.quantized { - arroy::Writer::new(self.quantized_db(), index, dimension).is_empty(rtxn)? - } else { - arroy::Writer::new(self.angular_db(), index, dimension).is_empty(rtxn)? 
- }; - if !empty { - return Ok(false); - } - } - Ok(true) - } - pub fn contains_item( &self, rtxn: &RoTxn, From 8b4e2c7b1798e58a71dfb0538dbc980155b688cc Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 24 Sep 2024 15:00:25 +0200 Subject: [PATCH 22/92] Remove now unused method --- milli/src/vector/mod.rs | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index a33f76559..39655e72a 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -72,32 +72,6 @@ impl ArroyWrapper { } } - pub fn quantize(&mut self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> { - if !self.quantized { - for index in arroy_db_range_for_embedder(self.embedder_index) { - let writer = arroy::Writer::new(self.angular_db(), index, dimension); - writer.prepare_changing_distance::(wtxn)?; - } - self.quantized = true; - } - Ok(()) - } - - // TODO: We can stop early when we find an empty DB - pub fn need_build(&self, rtxn: &RoTxn, dimension: usize) -> Result { - for index in arroy_db_range_for_embedder(self.embedder_index) { - let need_build = if self.quantized { - arroy::Writer::new(self.quantized_db(), index, dimension).need_build(rtxn) - } else { - arroy::Writer::new(self.angular_db(), index, dimension).need_build(rtxn) - }; - if need_build? 
{ - return Ok(true); - } - } - Ok(false) - } - pub fn build_and_quantize( &mut self, wtxn: &mut RwTxn, From 7f048b9732a048624bbe4beacb2e93f59c6d510d Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 24 Sep 2024 15:02:38 +0200 Subject: [PATCH 23/92] early exit in the clear and contains --- milli/src/vector/mod.rs | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 39655e72a..d5b80db83 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -251,9 +251,17 @@ impl ArroyWrapper { pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> { for index in arroy_db_range_for_embedder(self.embedder_index) { if self.quantized { - arroy::Writer::new(self.quantized_db(), index, dimension).clear(wtxn)?; + let writer = arroy::Writer::new(self.quantized_db(), index, dimension); + if writer.is_empty(wtxn)? { + break; + } + writer.clear(wtxn)?; } else { - arroy::Writer::new(self.angular_db(), index, dimension).clear(wtxn)?; + let writer = arroy::Writer::new(self.angular_db(), index, dimension); + if writer.is_empty(wtxn)? { + break; + } + writer.clear(wtxn)?; } } Ok(()) @@ -267,10 +275,17 @@ impl ArroyWrapper { ) -> Result { for index in arroy_db_range_for_embedder(self.embedder_index) { let contains = if self.quantized { - arroy::Writer::new(self.quantized_db(), index, dimension) - .contains_item(rtxn, item)? + let writer = arroy::Writer::new(self.quantized_db(), index, dimension); + if writer.is_empty(rtxn)? { + break; + } + writer.contains_item(rtxn, item)? } else { - arroy::Writer::new(self.angular_db(), index, dimension).contains_item(rtxn, item)? + let writer = arroy::Writer::new(self.angular_db(), index, dimension); + if writer.is_empty(rtxn)? { + break; + } + writer.contains_item(rtxn, item)? 
}; if contains { return Ok(contains); From b31e9bea26c098750dece8fb38eb2f57d6c254b5 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 24 Sep 2024 16:33:17 +0200 Subject: [PATCH 24/92] while retrieving the readers on an arroywrapper, stops at the first empty reader --- milli/src/vector/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index d5b80db83..b6d6510af 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -56,7 +56,11 @@ impl ArroyWrapper { ) -> impl Iterator, arroy::Error>> + 'a { arroy_db_range_for_embedder(self.embedder_index).map_while(move |index| { match arroy::Reader::open(rtxn, index, db) { - Ok(reader) => Some(Ok(reader)), + Ok(reader) => match reader.is_empty(rtxn) { + Ok(false) => Some(Ok(reader)), + Ok(true) => None, + Err(e) => Some(Err(e)), + }, Err(arroy::Error::MissingMetadata(_)) => None, Err(e) => Some(Err(e)), } From e9580fe61946477d83b9222ad4c00058a9868824 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 25 Sep 2024 11:03:17 +0200 Subject: [PATCH 25/92] Add turkish normalization --- meilisearch-types/Cargo.toml | 5 ++++- meilisearch/Cargo.toml | 1 + milli/Cargo.toml | 4 ++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/meilisearch-types/Cargo.toml b/meilisearch-types/Cargo.toml index cb4937e57..0dae024f2 100644 --- a/meilisearch-types/Cargo.toml +++ b/meilisearch-types/Cargo.toml @@ -66,5 +66,8 @@ khmer = ["milli/khmer"] vietnamese = ["milli/vietnamese"] # force swedish character recomposition swedish-recomposition = ["milli/swedish-recomposition"] -# force german character recomposition +# allow german tokenization german = ["milli/german"] +# allow turkish normalization +turkish = ["milli/turkish"] + diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 2a16e1017..c193c89d4 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -154,6 +154,7 @@ khmer = ["meilisearch-types/khmer"] vietnamese = 
["meilisearch-types/vietnamese"] swedish-recomposition = ["meilisearch-types/swedish-recomposition"] german = ["meilisearch-types/german"] +turkish = ["meilisearch-types/turkish"] [package.metadata.mini-dashboard] assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 5fc2d65c8..70d09ce4e 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -108,6 +108,7 @@ all-tokenizations = [ "charabia/vietnamese", "charabia/swedish-recomposition", "charabia/german-segmentation", + "charabia/turkish", ] # Use POSIX semaphores instead of SysV semaphores in LMDB @@ -146,5 +147,8 @@ german = ["charabia/german-segmentation"] # force swedish character recomposition swedish-recomposition = ["charabia/swedish-recomposition"] +# allow turkish specialized tokenization +turkish = ["charabia/turkish"] + # allow CUDA support, see cuda = ["candle-core/cuda"] From dc2cb58cf1ce3fa33f791d095f095c429a6ad9c0 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 25 Sep 2024 11:12:30 +0200 Subject: [PATCH 26/92] use charabia default for all-tokenization --- milli/Cargo.toml | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 70d09ce4e..3c4a44639 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -98,17 +98,7 @@ rand = { version = "0.8.5", features = ["small_rng"] } [features] all-tokenizations = [ - "charabia/chinese", - "charabia/hebrew", - "charabia/japanese", - "charabia/thai", - "charabia/korean", - "charabia/greek", - "charabia/khmer", - "charabia/vietnamese", - "charabia/swedish-recomposition", - "charabia/german-segmentation", - "charabia/turkish", + "charabia/default", ] # Use POSIX semaphores instead of SysV semaphores in LMDB From 78a4b7949df6c1f5ee6e95c80b8966ddf5aca957 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 26 Sep 2024 15:04:03 +0200 Subject: [PATCH 27/92] =?UTF-8?q?update=20rhai=20to=20a=20version=20that?= 
=?UTF-8?q?=20shouldn=E2=80=99t=20panic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.lock | 8 +++----- milli/Cargo.toml | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bcca35173..3237d4e16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4581,9 +4581,8 @@ dependencies = [ [[package]] name = "rhai" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61797318be89b1a268a018a92a7657096d83f3ecb31418b9e9c16dcbb043b702" +version = "1.20.0" +source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4" dependencies = [ "ahash 0.8.11", "bitflags 2.6.0", @@ -4600,8 +4599,7 @@ dependencies = [ [[package]] name = "rhai_codegen" version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5a11a05ee1ce44058fa3d5961d05194fdbe3ad6b40f904af764d81b86450e6b" +source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4" dependencies = [ "proc-macro2", "quote", diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 5fc2d65c8..b22d2164f 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -79,7 +79,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", ] } tiktoken-rs = "0.5.9" liquid = "0.26.6" -rhai = { version = "1.19.0", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] } +rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] } arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" } rand = "0.8.5" tracing = "0.1.40" From d20a39b9599f7962b2a316e45cc126f90a3d8eed Mon Sep 17 00:00:00 2001 From: "F. 
Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 27 Sep 2024 15:44:30 +0300 Subject: [PATCH 28/92] Refactor find_best_match_interval --- milli/src/search/new/matches/mod.rs | 154 +++++++++++++++++++--------- 1 file changed, 106 insertions(+), 48 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 26115c39b..bbd39e682 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -442,36 +442,48 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { /// Returns the matches interval where the score computed by match_interval_score is the best. fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { + let matches_len = matches.len(); + // we compute the matches interval if we have at least 2 matches. - if matches.len() > 1 { + if matches_len > 1 { + // current interval positions. + let mut interval_first = 0; // positions of the first and the last match of the best matches interval in `matches`. let mut best_interval = (0, 0); let mut best_interval_score = self.match_interval_score(&matches[0..=0]); - // current interval positions. - let mut interval_first = 0; - let mut interval_last = 0; - for (index, next_match) in matches.iter().enumerate().skip(1) { + + let mut index = 1; + while index < matches_len - 1 { + let next_match = &matches[index]; + // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. 
let next_match_last_word_pos = next_match.get_last_word_pos(); - let mut interval_first_match_first_word_pos = + let interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); + // if the next match would mean that we pass the crop size window, + // we take the last valid match, that didn't pass this boundry, which is `index` - 1, + // and calculate a score for it, and check if it's better than our best so far if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { - let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); + // skip for 1, because it would result in the same as our very first interval score + if index != 1 { + let interval_last = index - 1; + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); - // keep interval if it's the best - if interval_score > best_interval_score { - best_interval = (interval_first, interval_last); - best_interval_score = interval_score; + // keep interval if it's the best + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + best_interval_score = interval_score; + } } // advance start of the interval while interval is longer than crop_size. loop { interval_first += 1; - interval_first_match_first_word_pos = + let interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); if next_match_last_word_pos - interval_first_match_first_word_pos @@ -481,10 +493,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } } } - interval_last = index; + + index += 1; } // compute the last interval score and compare it to the best one. 
+ let interval_last = matches_len - 1; let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); if interval_score > best_interval_score { @@ -914,32 +928,32 @@ mod tests { let format_options = FormatOptions { highlight: true, crop: Some(10) }; - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); - let mut matcher = builder.build(text, None); - // should return 10 words with a marker at the start as well the end, and the highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…the power to split the world between those who embraced…" - ); + // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); + // let mut matcher = builder.build(text, None); + // // should return 10 words with a marker at the start as well the end, and the highlighted matches. + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…the power to split the world between those who embraced…" + // ); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); - let mut matcher = builder.build(text, None); - // should highlight "those" and the phrase "and those". - insta::assert_snapshot!( - matcher.format(format_options), - @"…world between those who embraced progress and those who resisted…" - ); + // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\""); + // let mut matcher = builder.build(text, None); + // // should highlight "those" and the phrase "and those". 
+ // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…groundbreaking invention had the power to split the world between…" + // ); - let builder = MatcherBuilder::new_test( - &rtxn, - &temp_index, - "\"The groundbreaking invention had the power to split the world\"", - ); - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @"The groundbreaking invention had the power to split the world…" - ); + // let builder = MatcherBuilder::new_test( + // &rtxn, + // &temp_index, + // "\"The groundbreaking invention had the power to split the world\"", + // ); + // let mut matcher = builder.build(text, None); + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"The groundbreaking invention had the power to split the world…" + // ); let builder = MatcherBuilder::new_test( &rtxn, @@ -952,16 +966,60 @@ mod tests { @"The groundbreaking invention had the power to split the world …" ); - let builder = MatcherBuilder::new_test( - &rtxn, - &temp_index, - "\"The groundbreaking invention\" \"embraced progress and those who resisted change\"", - ); - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @"…between those who embraced progress and those who resisted change…" - ); + // let builder = MatcherBuilder::new_test( + // &rtxn, + // &temp_index, + // "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"", + // ); + // let mut matcher = builder.build(text, None); + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…between those who embraced progress and those who resisted change…" + // ); + + // let builder = MatcherBuilder::new_test( + // &rtxn, + // &temp_index, + // "\"The groundbreaking invention\" \"split the world between those\"", + // ); + // let mut matcher = builder.build(text, None); + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…the power 
to split the world between those who embraced…" + // ); + + // let builder = MatcherBuilder::new_test( + // &rtxn, + // &temp_index, + // "\"groundbreaking invention\" \"split the world between\"", + // ); + // let mut matcher = builder.build(text, None); + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…groundbreaking invention had the power to split the world between…" + // ); + + // let builder = MatcherBuilder::new_test( + // &rtxn, + // &temp_index, + // "\"groundbreaking invention\" \"had the power to split the world between those\"", + // ); + // let mut matcher = builder.build(text, None); + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…invention had the power to split the world between those…" + // ); + + // let builder = MatcherBuilder::new_test( + // &rtxn, + // &temp_index, + // "\"The groundbreaking invention\" \"had the power to split the world between those\"", + // ); + // let mut matcher = builder.build(text, None); + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…invention had the power to split the world between those…" + // ); } #[test] From eabc14c26858d9f0bda89e6fa38f0aa4b0244be8 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Mon, 30 Sep 2024 21:24:41 +0300 Subject: [PATCH 29/92] Refactor, handle more cases for phrases --- .../src/search/new/matches/matching_words.rs | 2 +- milli/src/search/new/matches/mod.rs | 497 ++++++++++-------- 2 files changed, 291 insertions(+), 208 deletions(-) diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 4ad5c37ec..4deaff6a0 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -181,7 +181,7 @@ impl<'a> PartialMatch<'a> { // return a new Partial match allowing the highlighter to continue. 
if is_matching && matching_words.len() > 1 { matching_words.remove(0); - Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len })) + Some(MatchType::Partial(Self { matching_words, ids, char_len })) // if there is no remaining word to match in the phrase and the current token is matching, // return a Full match. } else if is_matching { diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index bbd39e682..624287f5f 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -1,6 +1,6 @@ use std::borrow::Cow; -use charabia::{Language, SeparatorKind, Token, Tokenizer}; +use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer}; pub use matching_words::MatchingWords; use matching_words::{MatchType, PartialMatch, WordId}; use serde::Serialize; @@ -145,6 +145,13 @@ impl Match { MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp, } } + + fn get_word_count(&self) -> usize { + match self.position { + MatchPosition::Word { .. } => 1, + MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => lwp - fwp + 1, + } + } } #[derive(Serialize, Debug, Clone, PartialEq, Eq)] @@ -153,6 +160,27 @@ pub struct MatchBounds { pub length: usize, } +enum SimpleTokenKind { + Separator(SeparatorKind), + NotSeparator, +} + +impl SimpleTokenKind { + fn get(token: &&Token<'_>) -> Self { + match token.kind { + TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind), + _ => Self::NotSeparator, + } + } + + fn is_not_separator(&self) -> bool { + match self { + SimpleTokenKind::NotSeparator => true, + SimpleTokenKind::Separator(_) => false, + } + } +} + /// Structure used to analyze a string, compute words that match, /// and format the source string, returning a highlighted and cropped sub-string. 
pub struct Matcher<'t, 'tokenizer, 'b, 'lang> { @@ -287,95 +315,130 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { crop_size: usize, ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. - let first_match_word_position = + let first_match_first_word_position = matches.first().map(|m| m.get_first_word_pos()).unwrap_or(0); - let first_match_token_position = + let first_match_first_token_position = matches.first().map(|m| m.get_first_token_pos()).unwrap_or(0); - let last_match_word_position = matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0); - let last_match_token_position = matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0); + let last_match_last_word_position = + matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0); + let last_match_last_token_position = + matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0); - // matches needs to be counted in the crop len. - let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; + let matches_window_len = + last_match_last_word_position - first_match_first_word_position + 1; - // create the initial state of the crop window: 2 iterators starting from the matches positions, - // a reverse iterator starting from the first match token position and going towards the beginning of the text, - let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable(); - // an iterator starting from the last match token position and going towards the end of the text. - let mut after_tokens = tokens[last_match_token_position..].iter().peekable(); + if crop_size >= matches_window_len { + // matches needs to be counted in the crop len. 
+ let mut remaining_words = crop_size - matches_window_len; - // grows the crop window peeking in both directions - // until the window contains the good number of words: - while remaining_words > 0 { - let before_token = before_tokens.peek().map(|t| t.separator_kind()); - let after_token = after_tokens.peek().map(|t| t.separator_kind()); + // create the initial state of the crop window: 2 iterators starting from the matches positions, + // a reverse iterator starting from the first match token position and going towards the beginning of the text, + let mut before_tokens = + tokens[..first_match_first_token_position].iter().rev().peekable(); + // an iterator starting from the last match token position and going towards the end of the text. + let mut after_tokens = tokens[last_match_last_token_position + 1..].iter().peekable(); - match (before_token, after_token) { - // we can expand both sides. - (Some(before_token), Some(after_token)) => { - match (before_token, after_token) { - // if they are both separators and are the same kind then advance both, - // or expand in the soft separator separator side. - (Some(before_token_kind), Some(after_token_kind)) => { - if before_token_kind == after_token_kind { - before_tokens.next(); + // grows the crop window peeking in both directions + // until the window contains the good number of words: + while remaining_words > 0 { + let before_token_kind = before_tokens.peek().map(SimpleTokenKind::get); + let after_token_kind = after_tokens.peek().map(SimpleTokenKind::get); - // this avoid having an ending separator before crop marker. - if remaining_words > 1 { + match (before_token_kind, after_token_kind) { + // we can expand both sides. + (Some(before_token_kind), Some(after_token_kind)) => { + match (before_token_kind, after_token_kind) { + // if they are both separators and are the same kind then advance both, + // or expand in the soft separator separator side. 
+ ( + SimpleTokenKind::Separator(before_token_separator_kind), + SimpleTokenKind::Separator(after_token_separator_kind), + ) => { + if before_token_separator_kind == after_token_separator_kind { + before_tokens.next(); + + // this avoid having an ending separator before crop marker. + if remaining_words > 1 { + after_tokens.next(); + } + } else if let SeparatorKind::Hard = before_token_separator_kind { after_tokens.next(); + } else { + before_tokens.next(); } - } else if before_token_kind == SeparatorKind::Hard { - after_tokens.next(); - } else { - before_tokens.next(); } - } - // if one of the tokens is a word, we expend in the side of the word. - // left is a word, advance left. - (None, Some(_)) => { - before_tokens.next(); - remaining_words -= 1; - } - // right is a word, advance right. - (Some(_), None) => { - after_tokens.next(); - remaining_words -= 1; - } - // both are words, advance left then right if remaining_word > 0. - (None, None) => { - before_tokens.next(); - remaining_words -= 1; - - if remaining_words > 0 { + // if one of the tokens is a word, we expend in the side of the word. + // left is a word, advance left. + (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => { + before_tokens.next(); + remaining_words -= 1; + } + // right is a word, advance right. + (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => { after_tokens.next(); remaining_words -= 1; } + // both are words, advance left then right if remaining_word > 0. + (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => { + before_tokens.next(); + remaining_words -= 1; + + if remaining_words > 0 { + after_tokens.next(); + remaining_words -= 1; + } + } } } - } - // the end of the text is reached, advance left. - (Some(before_token), None) => { - before_tokens.next(); - if before_token.is_none() { - remaining_words -= 1; + // the end of the text is reached, advance left. 
+ (Some(before_token_kind), None) => { + before_tokens.next(); + if let SimpleTokenKind::NotSeparator = before_token_kind { + remaining_words -= 1; + } } - } - // the start of the text is reached, advance right. - (None, Some(after_token)) => { - after_tokens.next(); - if after_token.is_none() { - remaining_words -= 1; + // the start of the text is reached, advance right. + (None, Some(after_token_kind)) => { + after_tokens.next(); + if let SimpleTokenKind::NotSeparator = after_token_kind { + remaining_words -= 1; + } } + // no more token to add. + (None, None) => break, } - // no more token to add. - (None, None) => break, } + + // finally, keep the byte index of each bound of the crop window. + let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); + let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); + + (crop_byte_start, crop_byte_end) + } else { + // there's one match? and it's longer than the crop window, so we have to advance inward + let mut remaining_extra_words = matches_window_len - crop_size; + let mut tokens_from_end = + tokens[..=last_match_last_token_position].iter().rev().peekable(); + + while remaining_extra_words > 0 { + let token_from_end_kind = + tokens_from_end.peek().map(SimpleTokenKind::get).expect("TODO"); + if token_from_end_kind.is_not_separator() { + remaining_extra_words -= 1; + } + + tokens_from_end.next(); + } + + let crop_byte_start = if first_match_first_token_position > 0 { + &tokens[first_match_first_token_position - 1].byte_end + } else { + &0 + }; + let crop_byte_end = tokens_from_end.next().map(|t| t.byte_start).expect("TODO"); + + (*crop_byte_start, crop_byte_end) } - - // finally, keep the byte index of each bound of the crop window. 
- let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); - let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); - - (crop_byte_start, crop_byte_end) } /// Compute the score of a match interval: @@ -416,11 +479,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { lwp } }; - - let next_match_first_word_pos = match next_match.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp, - }; + let next_match_first_word_pos = next_match.get_first_word_pos(); // compute distance between matches distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; @@ -443,72 +502,96 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { /// Returns the matches interval where the score computed by match_interval_score is the best. fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { let matches_len = matches.len(); + if matches_len <= 1 { + return matches; + } + + // positions of the first and the last match of the best matches interval in `matches`. + struct BestInterval { + interval: (usize, usize), + score: (i16, i16, i16), + } + + fn save_best_interval( + best_interval: &mut Option, + interval_first: usize, + interval_last: usize, + interval_score: (i16, i16, i16), + ) { + if let Some(best_interval) = best_interval { + if interval_score > best_interval.score { + best_interval.interval = (interval_first, interval_last); + best_interval.score = interval_score; + } + } else { + *best_interval = Some(BestInterval { + interval: (interval_first, interval_last), + score: interval_score, + }); + } + } + + let mut best_interval: Option = None; // we compute the matches interval if we have at least 2 matches. - if matches_len > 1 { - // current interval positions. - let mut interval_first = 0; - // positions of the first and the last match of the best matches interval in `matches`. 
- let mut best_interval = (0, 0); - let mut best_interval_score = self.match_interval_score(&matches[0..=0]); + // current interval positions. + let mut interval_first = 0; + let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); - let mut index = 1; - while index < matches_len - 1 { - let next_match = &matches[index]; + for (index, next_match) in matches.iter().enumerate() { + // if next match would make interval gross more than crop_size, + // we compare the current interval with the best one, + // then we increase `interval_first` until next match can be added. + let next_match_last_word_pos = next_match.get_last_word_pos(); - // if next match would make interval gross more than crop_size, - // we compare the current interval with the best one, - // then we increase `interval_first` until next match can be added. - let next_match_last_word_pos = next_match.get_last_word_pos(); - let interval_first_match_first_word_pos = - matches[interval_first].get_first_word_pos(); + // if the next match would mean that we pass the crop size window, + // we take the last valid match, that didn't pass this boundry, which is `index` - 1, + // and calculate a score for it, and check if it's better than our best so far + if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { + // if index is 0 there is no last viable match + if index != 0 { + let interval_last = index - 1; + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); - // if the next match would mean that we pass the crop size window, - // we take the last valid match, that didn't pass this boundry, which is `index` - 1, - // and calculate a score for it, and check if it's better than our best so far - if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { - // skip for 1, because it would result in the same as our very first interval score - if index != 1 { - let interval_last = index - 1; - 
let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); - - // keep interval if it's the best - if interval_score > best_interval_score { - best_interval = (interval_first, interval_last); - best_interval_score = interval_score; - } - } - - // advance start of the interval while interval is longer than crop_size. - loop { - interval_first += 1; - let interval_first_match_first_word_pos = - matches[interval_first].get_first_word_pos(); - - if next_match_last_word_pos - interval_first_match_first_word_pos - < crop_size - { - break; - } - } + // keep interval if it's the best + save_best_interval( + &mut best_interval, + interval_first, + interval_last, + interval_score, + ); } - index += 1; - } + // advance start of the interval while interval is longer than crop_size. + loop { + interval_first += 1; + interval_first_match_first_word_pos = + matches[interval_first].get_first_word_pos(); - // compute the last interval score and compare it to the best one. - let interval_last = matches_len - 1; + if interval_first_match_first_word_pos > next_match_last_word_pos + || next_match_last_word_pos - interval_first_match_first_word_pos + < crop_size + { + break; + } + } + } + } + + // compute the last interval score and compare it to the best one. 
+ let interval_last = matches_len - 1; + // if it's the last match with itself, we need to make sure it's + // not a phrase longer than the crop window + if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); - if interval_score > best_interval_score { - best_interval = (interval_first, interval_last); - } - - &matches[best_interval.0..=best_interval.1] - } else { - matches + save_best_interval(&mut best_interval, interval_first, interval_last, interval_score); } + + // if none of the matches fit the criteria above, default to the first one + let best_interval = best_interval.map_or((0, 0), |v| v.interval); + &matches[best_interval.0..=best_interval.1] } // Returns the formatted version of the original text. @@ -928,98 +1011,98 @@ mod tests { let format_options = FormatOptions { highlight: true, crop: Some(10) }; - // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); - // let mut matcher = builder.build(text, None); - // // should return 10 words with a marker at the start as well the end, and the highlighted matches. - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…the power to split the world between those who embraced…" - // ); + let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); + let mut matcher = builder.build(text, None); + // should return 10 words with a marker at the start as well the end, and the highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"…the power to split the world between those who embraced…" + ); - // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\""); - // let mut matcher = builder.build(text, None); - // // should highlight "those" and the phrase "and those". 
- // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…groundbreaking invention had the power to split the world between…" - // ); - - // let builder = MatcherBuilder::new_test( - // &rtxn, - // &temp_index, - // "\"The groundbreaking invention had the power to split the world\"", - // ); - // let mut matcher = builder.build(text, None); - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"The groundbreaking invention had the power to split the world…" - // ); + let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\""); + let mut matcher = builder.build(text, None); + // should highlight "those" and the phrase "and those". + insta::assert_snapshot!( + matcher.format(format_options), + @"…groundbreaking invention had the power to split the world between…" + ); let builder = MatcherBuilder::new_test( &rtxn, &temp_index, - "\"The groundbreaking invention had the power to split the world between\"", + "\"The groundbreaking invention had the power to split the world\"", ); let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), - @"The groundbreaking invention had the power to split the world …" + @"The groundbreaking invention had the power to split the world…" ); - // let builder = MatcherBuilder::new_test( - // &rtxn, - // &temp_index, - // "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"", - // ); - // let mut matcher = builder.build(text, None); - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…between those who embraced progress and those who resisted change…" - // ); + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention had the power to split the world between those\"", + ); + let mut matcher = builder.build(text, None); + insta::assert_snapshot!( + matcher.format(format_options), + @"The groundbreaking invention had the power to split the 
world…" + ); - // let builder = MatcherBuilder::new_test( - // &rtxn, - // &temp_index, - // "\"The groundbreaking invention\" \"split the world between those\"", - // ); - // let mut matcher = builder.build(text, None); - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…the power to split the world between those who embraced…" - // ); + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"", + ); + let mut matcher = builder.build(text, None); + insta::assert_snapshot!( + matcher.format(format_options), + @"…between those who embraced progress and those who resisted change…" + ); - // let builder = MatcherBuilder::new_test( - // &rtxn, - // &temp_index, - // "\"groundbreaking invention\" \"split the world between\"", - // ); - // let mut matcher = builder.build(text, None); - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…groundbreaking invention had the power to split the world between…" - // ); + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention\" \"split the world between those\"", + ); + let mut matcher = builder.build(text, None); + insta::assert_snapshot!( + matcher.format(format_options), + @"…the power to split the world between those who embraced…" + ); - // let builder = MatcherBuilder::new_test( - // &rtxn, - // &temp_index, - // "\"groundbreaking invention\" \"had the power to split the world between those\"", - // ); - // let mut matcher = builder.build(text, None); - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…invention had the power to split the world between those…" - // ); + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"groundbreaking invention\" \"split the world between\"", + ); + let mut matcher = builder.build(text, None); + insta::assert_snapshot!( + matcher.format(format_options), + 
@"…groundbreaking invention had the power to split the world between…" + ); - // let builder = MatcherBuilder::new_test( - // &rtxn, - // &temp_index, - // "\"The groundbreaking invention\" \"had the power to split the world between those\"", - // ); - // let mut matcher = builder.build(text, None); - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…invention had the power to split the world between those…" - // ); + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"groundbreaking invention\" \"had the power to split the world between those\"", + ); + let mut matcher = builder.build(text, None); + insta::assert_snapshot!( + matcher.format(format_options), + @"…invention had the power to split the world between those…" + ); + + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention\" \"had the power to split the world between those\"", + ); + let mut matcher = builder.build(text, None); + insta::assert_snapshot!( + matcher.format(format_options), + @"…invention had the power to split the world between those…" + ); } #[test] From 6d16230f17eb000407adb21dc2f3e9fa49767cc8 Mon Sep 17 00:00:00 2001 From: "F. 
Levi" <55688616+flevi29@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:19:15 +0300 Subject: [PATCH 30/92] Refactor --- milli/src/search/new/matches/mod.rs | 327 ++++++++++++++-------------- 1 file changed, 158 insertions(+), 169 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 624287f5f..804b59553 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -181,6 +181,149 @@ impl SimpleTokenKind { } } +#[derive(PartialEq, PartialOrd)] +struct MatchIntervalScore(i16, i16, i16); + +impl MatchIntervalScore { + /// Compute the score of a match interval: + /// 1) count unique matches + /// 2) calculate distance between matches + /// 3) count ordered matches + fn new(matches: &[Match]) -> Self { + let mut ids: Vec = Vec::with_capacity(matches.len()); + let mut order_score = 0; + let mut distance_score = 0; + + // count score for phrases + fn tally_phrase_scores( + fwp: &usize, + lwp: &usize, + order_score: &mut i16, + distance_score: &mut i16, + ) { + let words_in_phrase_minus_one = (lwp - fwp) as i16; + // will always be ordered, so +1 for each space between words + *order_score += words_in_phrase_minus_one; + // distance will always be 1, so -1 for each space between words + *distance_score -= words_in_phrase_minus_one; + } + + let mut iter = matches.iter().peekable(); + while let Some(m) = iter.next() { + if let Some(next_match) = iter.peek() { + // if matches are ordered + if next_match.ids.iter().min() > m.ids.iter().min() { + order_score += 1; + } + + let m_last_word_pos = match m.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: (fwp, lwp), .. 
} => { + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); + lwp + } + }; + let next_match_first_word_pos = next_match.get_first_word_pos(); + + // compute distance between matches + distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; + } else if let MatchPosition::Phrase { word_positions: (fwp, lwp), .. } = m.position { + // in case last match is a phrase, count score for its words + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); + } + + ids.extend(m.ids.iter()); + } + + ids.sort_unstable(); + ids.dedup(); + let uniq_score = ids.len() as i16; + + // rank by unique match count, then by distance between matches, then by ordered match count. + Self(uniq_score, distance_score, order_score) + } +} + +struct MatchIntervalWithScore { + interval: (usize, usize), + score: MatchIntervalScore, +} + +impl MatchIntervalWithScore { + /// Returns the matches interval where the score computed by match_interval_score is the best. + fn find_best_match_interval(matches: &[Match], crop_size: usize) -> &[Match] { + let matches_len = matches.len(); + if matches_len <= 1 { + return matches; + } + + // positions of the first and the last match of the best matches interval in `matches`. + let mut best_interval: Option = None; + let mut save_best_interval = |interval_first, interval_last, interval_score| { + let is_interval_score_better = + &best_interval.as_ref().map_or(true, |Self { score, .. }| interval_score > *score); + if *is_interval_score_better { + best_interval = + Some(Self { interval: (interval_first, interval_last), score: interval_score }); + } + }; + + // we compute the matches interval if we have at least 2 matches. + // current interval positions. 
+ let mut interval_first = 0; + let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); + + for (index, next_match) in matches.iter().enumerate() { + // if next match would make interval gross more than crop_size, + // we compare the current interval with the best one, + // then we increase `interval_first` until next match can be added. + let next_match_last_word_pos = next_match.get_last_word_pos(); + + // if the next match would mean that we pass the crop size window, + // we take the last valid match, that didn't pass this boundry, which is `index` - 1, + // and calculate a score for it, and check if it's better than our best so far + if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { + // if index is 0 there is no last viable match + if index != 0 { + let interval_last = index - 1; + let interval_score = + MatchIntervalScore::new(&matches[interval_first..=interval_last]); + + // keep interval if it's the best + save_best_interval(interval_first, interval_last, interval_score); + } + + // advance start of the interval while interval is longer than crop_size. + loop { + interval_first += 1; + interval_first_match_first_word_pos = + matches[interval_first].get_first_word_pos(); + + if interval_first_match_first_word_pos > next_match_last_word_pos + || next_match_last_word_pos - interval_first_match_first_word_pos + < crop_size + { + break; + } + } + } + } + + // compute the last interval score and compare it to the best one. 
+ let interval_last = matches_len - 1; + // if it's the last match with itself, we need to make sure it's + // not a phrase longer than the crop window + if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { + let interval_score = MatchIntervalScore::new(&matches[interval_first..=interval_last]); + save_best_interval(interval_first, interval_last, interval_score); + } + + // if none of the matches fit the criteria above, default to the first one + let best_interval = best_interval.map_or((0, 0), |v| v.interval); + &matches[best_interval.0..=best_interval.1] + } +} + /// Structure used to analyze a string, compute words that match, /// and format the source string, returning a highlighted and cropped sub-string. pub struct Matcher<'t, 'tokenizer, 'b, 'lang> { @@ -415,14 +558,16 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { (crop_byte_start, crop_byte_end) } else { - // there's one match? and it's longer than the crop window, so we have to advance inward + // there's one match and it's longer than the crop window, so we have to advance inward let mut remaining_extra_words = matches_window_len - crop_size; let mut tokens_from_end = tokens[..=last_match_last_token_position].iter().rev().peekable(); while remaining_extra_words > 0 { - let token_from_end_kind = - tokens_from_end.peek().map(SimpleTokenKind::get).expect("TODO"); + let token_from_end_kind = tokens_from_end + .peek() + .map(SimpleTokenKind::get) + .expect("Expected iterator to not reach end"); if token_from_end_kind.is_not_separator() { remaining_extra_words -= 1; } @@ -435,165 +580,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } else { &0 }; - let crop_byte_end = tokens_from_end.next().map(|t| t.byte_start).expect("TODO"); + let crop_byte_end = tokens_from_end + .next() + .map(|t| t.byte_start) + .expect("Expected iterator to not reach end"); (*crop_byte_start, crop_byte_end) } } - /// Compute the score of a match interval: - /// 1) count 
unique matches - /// 2) calculate distance between matches - /// 3) count ordered matches - fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { - let mut ids: Vec = Vec::with_capacity(matches.len()); - let mut order_score = 0; - let mut distance_score = 0; - - // count score for phrases - fn tally_phrase_scores( - fwp: &usize, - lwp: &usize, - order_score: &mut i16, - distance_score: &mut i16, - ) { - let words_in_phrase_minus_one = (lwp - fwp) as i16; - // will always be ordered, so +1 for each space between words - *order_score += words_in_phrase_minus_one; - // distance will always be 1, so -1 for each space between words - *distance_score -= words_in_phrase_minus_one; - } - - let mut iter = matches.iter().peekable(); - while let Some(m) = iter.next() { - if let Some(next_match) = iter.peek() { - // if matches are ordered - if next_match.ids.iter().min() > m.ids.iter().min() { - order_score += 1; - } - - let m_last_word_pos = match m.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => { - tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); - lwp - } - }; - let next_match_first_word_pos = next_match.get_first_word_pos(); - - // compute distance between matches - distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; - } else if let MatchPosition::Phrase { word_positions: (fwp, lwp), .. } = m.position { - // in case last match is a phrase, count score for its words - tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); - } - - ids.extend(m.ids.iter()); - } - - ids.sort_unstable(); - ids.dedup(); - let uniq_score = ids.len() as i16; - - // rank by unique match count, then by distance between matches, then by ordered match count. - (uniq_score, distance_score, order_score) - } - - /// Returns the matches interval where the score computed by match_interval_score is the best. 
- fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { - let matches_len = matches.len(); - if matches_len <= 1 { - return matches; - } - - // positions of the first and the last match of the best matches interval in `matches`. - struct BestInterval { - interval: (usize, usize), - score: (i16, i16, i16), - } - - fn save_best_interval( - best_interval: &mut Option, - interval_first: usize, - interval_last: usize, - interval_score: (i16, i16, i16), - ) { - if let Some(best_interval) = best_interval { - if interval_score > best_interval.score { - best_interval.interval = (interval_first, interval_last); - best_interval.score = interval_score; - } - } else { - *best_interval = Some(BestInterval { - interval: (interval_first, interval_last), - score: interval_score, - }); - } - } - - let mut best_interval: Option = None; - - // we compute the matches interval if we have at least 2 matches. - // current interval positions. - let mut interval_first = 0; - let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); - - for (index, next_match) in matches.iter().enumerate() { - // if next match would make interval gross more than crop_size, - // we compare the current interval with the best one, - // then we increase `interval_first` until next match can be added. 
- let next_match_last_word_pos = next_match.get_last_word_pos(); - - // if the next match would mean that we pass the crop size window, - // we take the last valid match, that didn't pass this boundry, which is `index` - 1, - // and calculate a score for it, and check if it's better than our best so far - if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { - // if index is 0 there is no last viable match - if index != 0 { - let interval_last = index - 1; - let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); - - // keep interval if it's the best - save_best_interval( - &mut best_interval, - interval_first, - interval_last, - interval_score, - ); - } - - // advance start of the interval while interval is longer than crop_size. - loop { - interval_first += 1; - interval_first_match_first_word_pos = - matches[interval_first].get_first_word_pos(); - - if interval_first_match_first_word_pos > next_match_last_word_pos - || next_match_last_word_pos - interval_first_match_first_word_pos - < crop_size - { - break; - } - } - } - } - - // compute the last interval score and compare it to the best one. - let interval_last = matches_len - 1; - // if it's the last match with itself, we need to make sure it's - // not a phrase longer than the crop window - if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { - let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); - save_best_interval(&mut best_interval, interval_first, interval_last, interval_score); - } - - // if none of the matches fit the criteria above, default to the first one - let best_interval = best_interval.map_or((0, 0), |v| v.interval); - &matches[best_interval.0..=best_interval.1] - } - // Returns the formatted version of the original text. 
pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> { if !format_options.highlight && format_options.crop.is_none() { @@ -606,7 +601,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // crop around the best interval. let (byte_start, byte_end) = match format_options.crop { Some(crop_size) if crop_size > 0 => { - let matches = self.find_best_match_interval(matches, crop_size); + let matches = MatchIntervalWithScore::find_best_match_interval( + matches, crop_size, + ); self.crop_bounds(tokens, matches, crop_size) } _ => (0, self.text.len()), @@ -1046,6 +1043,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), + // @TODO: Should probably highlight it all, even if it didn't fit the whole phrase @"The groundbreaking invention had the power to split the world…" ); @@ -1057,6 +1055,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), + // @TODO: Should probably include end of string in this case? @"…between those who embraced progress and those who resisted change…" ); @@ -1090,17 +1089,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), - @"…invention had the power to split the world between those…" - ); - - let builder = MatcherBuilder::new_test( - &rtxn, - &temp_index, - "\"The groundbreaking invention\" \"had the power to split the world between those\"", - ); - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), + // @TODO: "invention" should be highlighted as well @"…invention had the power to split the world between those…" ); } From d9e4db9983e7017bb13a89f7e28def43069e1a58 Mon Sep 17 00:00:00 2001 From: "F. 
Levi" <55688616+flevi29@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:50:59 +0300 Subject: [PATCH 31/92] Refactor --- milli/src/search/new/matches/mod.rs | 38 ++++++++++++----------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 804b59553..1552de8aa 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -166,19 +166,12 @@ enum SimpleTokenKind { } impl SimpleTokenKind { - fn get(token: &&Token<'_>) -> Self { + fn new(token: &&Token<'_>) -> Self { match token.kind { TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind), _ => Self::NotSeparator, } } - - fn is_not_separator(&self) -> bool { - match self { - SimpleTokenKind::NotSeparator => true, - SimpleTokenKind::Separator(_) => false, - } - } } #[derive(PartialEq, PartialOrd)] @@ -259,9 +252,12 @@ impl MatchIntervalWithScore { // positions of the first and the last match of the best matches interval in `matches`. let mut best_interval: Option = None; - let mut save_best_interval = |interval_first, interval_last, interval_score| { + + let mut save_best_interval = |interval_first, interval_last| { + let interval_score = MatchIntervalScore::new(&matches[interval_first..=interval_last]); let is_interval_score_better = &best_interval.as_ref().map_or(true, |Self { score, .. 
}| interval_score > *score); + if *is_interval_score_better { best_interval = Some(Self { interval: (interval_first, interval_last), score: interval_score }); @@ -286,11 +282,8 @@ impl MatchIntervalWithScore { // if index is 0 there is no last viable match if index != 0 { let interval_last = index - 1; - let interval_score = - MatchIntervalScore::new(&matches[interval_first..=interval_last]); - // keep interval if it's the best - save_best_interval(interval_first, interval_last, interval_score); + save_best_interval(interval_first, interval_last); } // advance start of the interval while interval is longer than crop_size. @@ -314,8 +307,7 @@ impl MatchIntervalWithScore { // if it's the last match with itself, we need to make sure it's // not a phrase longer than the crop window if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { - let interval_score = MatchIntervalScore::new(&matches[interval_first..=interval_last]); - save_best_interval(interval_first, interval_last, interval_score); + save_best_interval(interval_first, interval_last); } // if none of the matches fit the criteria above, default to the first one @@ -359,6 +351,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { Some(MatchType::Full { ids, .. }) => { // save the token that closes the partial match as a match. matches.push(Match { + // @TODO: Shouldn't this be +1? 
match_len: word.char_end - *first_word_char_start, ids: ids.clone().collect(), position: MatchPosition::Phrase { @@ -484,8 +477,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // grows the crop window peeking in both directions // until the window contains the good number of words: while remaining_words > 0 { - let before_token_kind = before_tokens.peek().map(SimpleTokenKind::get); - let after_token_kind = after_tokens.peek().map(SimpleTokenKind::get); + let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new); + let after_token_kind = after_tokens.peek().map(SimpleTokenKind::new); match (before_token_kind, after_token_kind) { // we can expand both sides. @@ -504,7 +497,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { if remaining_words > 1 { after_tokens.next(); } - } else if let SeparatorKind::Hard = before_token_separator_kind { + } else if matches!(before_token_separator_kind, SeparatorKind::Hard) + { after_tokens.next(); } else { before_tokens.next(); @@ -536,14 +530,14 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // the end of the text is reached, advance left. (Some(before_token_kind), None) => { before_tokens.next(); - if let SimpleTokenKind::NotSeparator = before_token_kind { + if matches!(before_token_kind, SimpleTokenKind::NotSeparator) { remaining_words -= 1; } } // the start of the text is reached, advance right. 
(None, Some(after_token_kind)) => { after_tokens.next(); - if let SimpleTokenKind::NotSeparator = after_token_kind { + if matches!(after_token_kind, SimpleTokenKind::NotSeparator) { remaining_words -= 1; } } @@ -566,9 +560,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { while remaining_extra_words > 0 { let token_from_end_kind = tokens_from_end .peek() - .map(SimpleTokenKind::get) + .map(SimpleTokenKind::new) .expect("Expected iterator to not reach end"); - if token_from_end_kind.is_not_separator() { + if matches!(token_from_end_kind, SimpleTokenKind::NotSeparator) { remaining_extra_words -= 1; } From 4b598fa648944a5f5f1cdd7ecbdadd1cb8d3d659 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 30 Sep 2024 13:12:01 +0200 Subject: [PATCH 32/92] update arroy --- Cargo.lock | 5 +++-- index-scheduler/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- milli/src/error.rs | 1 + milli/src/vector/mod.rs | 33 +++++++++++++++++++++------------ 5 files changed, 27 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3237d4e16..c85a59952 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -386,8 +386,9 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arroy" -version = "0.4.0" -source = "git+https://github.com/meilisearch/arroy/?rev=2386594dfb009ce08821a925ccc89fb8e30bf73d#2386594dfb009ce08821a925ccc89fb8e30bf73d" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc5f272f38fa063bbff0a7ab5219404e221493de005e2b4078c62d626ef567e" dependencies = [ "bytemuck", "byteorder", diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 432a86382..e80311005 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -40,7 +40,7 @@ ureq = "2.10.0" uuid = { version = "1.10.0", features = ["serde", "v4"] } [dev-dependencies] -arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" } +arroy = "0.5.0" 
big_s = "1.0.2" crossbeam = "0.8.4" insta = { version = "1.39.0", features = ["json", "redactions"] } diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 01384f496..df0e59496 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -80,7 +80,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", tiktoken-rs = "0.5.9" liquid = "0.26.6" rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] } -arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" } +arroy = "0.5.0" rand = "0.8.5" tracing = "0.1.40" ureq = { version = "2.10.0", features = ["json"] } diff --git a/milli/src/error.rs b/milli/src/error.rs index 400d3d3be..840db7606 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -297,6 +297,7 @@ impl From for Error { arroy::Error::InvalidVecDimension { expected, received } => { Error::UserError(UserError::InvalidVectorDimensions { expected, found: received }) } + arroy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation), arroy::Error::DatabaseFull | arroy::Error::InvalidItemAppend | arroy::Error::UnmatchingDistance { .. } diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index b6d6510af..097e93ad2 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::sync::Arc; -use arroy::distances::{Angular, BinaryQuantizedAngular}; +use arroy::distances::{BinaryQuantizedCosine, Cosine}; use arroy::ItemId; use deserr::{DeserializeError, Deserr}; use heed::{RoTxn, RwTxn, Unspecified}; @@ -87,7 +87,7 @@ impl ArroyWrapper { if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); if writer.need_build(wtxn)? { - writer.build(wtxn, rng, None)? + writer.builder(rng).build(wtxn)? } else if writer.is_empty(wtxn)? 
{ break; } @@ -99,11 +99,10 @@ impl ArroyWrapper { // only happens once in the life of an embedder, it's not very performances // sensitive. if quantizing && !self.quantized { - let writer = - writer.prepare_changing_distance::(wtxn)?; - writer.build(wtxn, rng, None)? + let writer = writer.prepare_changing_distance::(wtxn)?; + writer.builder(rng).build(wtxn)?; } else if writer.need_build(wtxn)? { - writer.build(wtxn, rng, None)? + writer.builder(rng).build(wtxn)?; } else if writer.is_empty(wtxn)? { break; } @@ -323,8 +322,13 @@ impl ArroyWrapper { let mut results = Vec::new(); for reader in self.readers(rtxn, db) { - let ret = reader?.nns_by_item(rtxn, item, limit, None, None, filter)?; - if let Some(mut ret) = ret { + let reader = reader?; + let mut searcher = reader.nns(limit); + if let Some(filter) = filter { + searcher.candidates(filter); + } + + if let Some(mut ret) = searcher.by_item(rtxn, item)? { results.append(&mut ret); } else { break; @@ -359,8 +363,13 @@ impl ArroyWrapper { let mut results = Vec::new(); for reader in self.readers(rtxn, db) { - let mut ret = reader?.nns_by_vector(rtxn, vector, limit, None, None, filter)?; - results.append(&mut ret); + let reader = reader?; + let mut searcher = reader.nns(limit); + if let Some(filter) = filter { + searcher.candidates(filter); + } + + results.append(&mut searcher.by_vector(rtxn, vector)?); } results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); @@ -391,11 +400,11 @@ impl ArroyWrapper { Ok(vectors) } - fn angular_db(&self) -> arroy::Database { + fn angular_db(&self) -> arroy::Database { self.database.remap_data_type() } - fn quantized_db(&self) -> arroy::Database { + fn quantized_db(&self) -> arroy::Database { self.database.remap_data_type() } } From b1dc10e771a757826fe400280c8bac84976ce95b Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 1 Oct 2024 17:45:49 +0200 Subject: [PATCH 33/92] uses the new cancellation method in arroy --- milli/src/update/index_documents/mod.rs | 3 ++- 
milli/src/vector/mod.rs | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e164a0817..88d20fff0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -699,6 +699,7 @@ where for (embedder_name, dimension) in dimension { let wtxn = &mut *self.wtxn; let vector_arroy = self.index.vector_arroy; + let cancel = &self.should_abort; let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, @@ -713,7 +714,7 @@ where pool.install(|| { let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized); - writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing)?; + writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing, cancel)?; Result::Ok(()) }) .map_err(InternalError::from)??; diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 097e93ad2..571c02c8c 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -82,6 +82,7 @@ impl ArroyWrapper { rng: &mut R, dimension: usize, quantizing: bool, + cancel: &(impl Fn() -> bool + Sync + Send), ) -> Result<(), arroy::Error> { for index in arroy_db_range_for_embedder(self.embedder_index) { if self.quantized { @@ -100,9 +101,9 @@ impl ArroyWrapper { // sensitive. if quantizing && !self.quantized { let writer = writer.prepare_changing_distance::(wtxn)?; - writer.builder(rng).build(wtxn)?; + writer.builder(rng).cancel(cancel).build(wtxn)?; } else if writer.need_build(wtxn)? { - writer.builder(rng).build(wtxn)?; + writer.builder(rng).cancel(cancel).build(wtxn)?; } else if writer.is_empty(wtxn)? { break; } From 37a9d64c4441bb6a4a199ad018ab4ddb44d4d958 Mon Sep 17 00:00:00 2001 From: "F. 
Levi" <55688616+flevi29@users.noreply.github.com> Date: Tue, 1 Oct 2024 22:52:01 +0300 Subject: [PATCH 34/92] Fix failing test, refactor --- milli/src/search/new/matches/mod.rs | 44 ++++++++++++++++++----------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 1552de8aa..ae1264482 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -245,8 +245,7 @@ struct MatchIntervalWithScore { impl MatchIntervalWithScore { /// Returns the matches interval where the score computed by match_interval_score is the best. fn find_best_match_interval(matches: &[Match], crop_size: usize) -> &[Match] { - let matches_len = matches.len(); - if matches_len <= 1 { + if matches.len() <= 1 { return matches; } @@ -303,7 +302,7 @@ impl MatchIntervalWithScore { } // compute the last interval score and compare it to the best one. - let interval_last = matches_len - 1; + let interval_last = matches.len() - 1; // if it's the last match with itself, we need to make sure it's // not a phrase longer than the crop window if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { @@ -451,28 +450,39 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { crop_size: usize, ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. 
- let first_match_first_word_position = - matches.first().map(|m| m.get_first_word_pos()).unwrap_or(0); - let first_match_first_token_position = - matches.first().map(|m| m.get_first_token_pos()).unwrap_or(0); - let last_match_last_word_position = - matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0); - let last_match_last_token_position = - matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0); + let (matches_size, first_match_first_token_position, last_match_last_token_position) = + if !matches.is_empty() { + let matches_first = matches.first().unwrap(); + let matches_last = matches.last().unwrap(); - let matches_window_len = - last_match_last_word_position - first_match_first_word_position + 1; + ( + matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1, + matches_first.get_first_token_pos(), + matches_last.get_last_token_pos(), + ) + } else { + (0, 0, 0) + }; - if crop_size >= matches_window_len { + if crop_size >= matches_size { // matches needs to be counted in the crop len. - let mut remaining_words = crop_size - matches_window_len; + let mut remaining_words = crop_size - matches_size; + + let last_match_last_token_position_plus_one = last_match_last_token_position + 1; + let after_tokens_starting_index = if matches_size == 0 { + 0 + } else if last_match_last_token_position_plus_one < tokens.len() { + last_match_last_token_position_plus_one + } else { + tokens.len() + }; // create the initial state of the crop window: 2 iterators starting from the matches positions, // a reverse iterator starting from the first match token position and going towards the beginning of the text, let mut before_tokens = tokens[..first_match_first_token_position].iter().rev().peekable(); // an iterator starting from the last match token position and going towards the end of the text. 
- let mut after_tokens = tokens[last_match_last_token_position + 1..].iter().peekable(); + let mut after_tokens = tokens[after_tokens_starting_index..].iter().peekable(); // grows the crop window peeking in both directions // until the window contains the good number of words: @@ -553,7 +563,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { (crop_byte_start, crop_byte_end) } else { // there's one match and it's longer than the crop window, so we have to advance inward - let mut remaining_extra_words = matches_window_len - crop_size; + let mut remaining_extra_words = matches_size - crop_size; let mut tokens_from_end = tokens[..=last_match_last_token_position].iter().rev().peekable(); From 62dfbd6255846db8fcfb7c515a9ad041999f7d3a Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Oct 2024 11:20:02 +0200 Subject: [PATCH 35/92] Add binary quantized to allowed fields for source adds its sources --- milli/src/vector/settings.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index 3bb7f09e6..d1cf364a2 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -417,6 +417,8 @@ impl EmbeddingSettings { pub const DISTRIBUTION: &'static str = "distribution"; + pub const BINARY_QUANTIZED: &'static str = "binaryQuantized"; + pub fn allowed_sources_for_field(field: &'static str) -> &'static [EmbedderSource] { match field { Self::SOURCE => &[ @@ -456,6 +458,13 @@ impl EmbeddingSettings { EmbedderSource::Rest, EmbedderSource::UserProvided, ], + Self::BINARY_QUANTIZED => &[ + EmbedderSource::HuggingFace, + EmbedderSource::Ollama, + EmbedderSource::OpenAi, + EmbedderSource::Rest, + EmbedderSource::UserProvided, + ], _other => unreachable!("unknown field"), } } @@ -470,6 +479,7 @@ impl EmbeddingSettings { Self::DIMENSIONS, Self::DISTRIBUTION, Self::URL, + Self::BINARY_QUANTIZED, ], EmbedderSource::HuggingFace => &[ Self::SOURCE, @@ -477,6 +487,7 @@ 
impl EmbeddingSettings { Self::REVISION, Self::DOCUMENT_TEMPLATE, Self::DISTRIBUTION, + Self::BINARY_QUANTIZED, ], EmbedderSource::Ollama => &[ Self::SOURCE, @@ -486,8 +497,11 @@ impl EmbeddingSettings { Self::API_KEY, Self::DIMENSIONS, Self::DISTRIBUTION, + Self::BINARY_QUANTIZED, ], - EmbedderSource::UserProvided => &[Self::SOURCE, Self::DIMENSIONS, Self::DISTRIBUTION], + EmbedderSource::UserProvided => { + &[Self::SOURCE, Self::DIMENSIONS, Self::DISTRIBUTION, Self::BINARY_QUANTIZED] + } EmbedderSource::Rest => &[ Self::SOURCE, Self::API_KEY, @@ -498,6 +512,7 @@ impl EmbeddingSettings { Self::RESPONSE, Self::HEADERS, Self::DISTRIBUTION, + Self::BINARY_QUANTIZED, ], } } From 0c2661ea90f26d3269d0ed53cb47fa69bf9e5600 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Oct 2024 11:20:29 +0200 Subject: [PATCH 36/92] Fix tests --- meilisearch/tests/vector/settings.rs | 47 ++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/meilisearch/tests/vector/settings.rs b/meilisearch/tests/vector/settings.rs index 4f07ca18b..ed45913a8 100644 --- a/meilisearch/tests/vector/settings.rs +++ b/meilisearch/tests/vector/settings.rs @@ -4,6 +4,53 @@ use crate::common::{GetAllDocumentsOptions, Server}; use crate::json; use crate::vector::generate_default_user_provided_documents; +#[actix_rt::test] +async fn field_unavailable_for_source() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false, + "editDocumentsByFunction": false, + "containsFilter": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { "manual": {"source": "userProvided", "documentTemplate": "{{doc.documentTemplate}}"}}, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": 
"`.embedders.manual`: Field `documentTemplate` unavailable for source `userProvided` (only available for sources: `huggingFace`, `openAi`, `ollama`, `rest`). Available fields: `source`, `dimensions`, `distribution`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { "default": {"source": "openAi", "revision": "42"}}, + })) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r###" + { + "message": "`.embedders.default`: Field `revision` unavailable for source `openAi` (only available for sources: `huggingFace`). Available fields: `source`, `model`, `apiKey`, `documentTemplate`, `dimensions`, `distribution`, `url`, `binaryQuantized`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + } + "###); +} + #[actix_rt::test] async fn update_embedder() { let server = Server::new().await; From 40336ce87d46b43123d03cb343b4d3f785001a9c Mon Sep 17 00:00:00 2001 From: "F. 
Levi" <55688616+flevi29@users.noreply.github.com> Date: Thu, 3 Oct 2024 10:40:14 +0300 Subject: [PATCH 37/92] Fix and refactor crop_bounds --- milli/src/search/new/matches/mod.rs | 231 ++++++++++++++-------------- 1 file changed, 113 insertions(+), 118 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index ae1264482..f8d60ef54 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -1,6 +1,7 @@ use std::borrow::Cow; use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer}; +use either::Either; pub use matching_words::MatchingWords; use matching_words::{MatchType, PartialMatch, WordId}; use serde::Serialize; @@ -450,147 +451,141 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { crop_size: usize, ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. - let (matches_size, first_match_first_token_position, last_match_last_token_position) = - if !matches.is_empty() { - let matches_first = matches.first().unwrap(); - let matches_last = matches.last().unwrap(); + let ( + mut remaining_words, + is_iterating_forward, + before_tokens_starting_index, + after_tokens_starting_index, + ) = if !matches.is_empty() { + let matches_first = matches.first().unwrap(); + let matches_last = matches.last().unwrap(); - ( - matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1, - matches_first.get_first_token_pos(), - matches_last.get_last_token_pos(), - ) + let matches_size = + matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1; + + let is_crop_size_gte_match_size = crop_size >= matches_size; + let is_iterating_forward = matches_size == 0 || is_crop_size_gte_match_size; + + let remaining_words = if is_crop_size_gte_match_size { + crop_size - matches_size } else { - (0, 0, 0) + // in case matches size is greater than crop size, which implies there's only one match, + // we count words backwards, 
because we have to remove words, as they're extra words outside of + // crop window + matches_size - crop_size }; - if crop_size >= matches_size { - // matches needs to be counted in the crop len. - let mut remaining_words = crop_size - matches_size; - - let last_match_last_token_position_plus_one = last_match_last_token_position + 1; let after_tokens_starting_index = if matches_size == 0 { 0 - } else if last_match_last_token_position_plus_one < tokens.len() { - last_match_last_token_position_plus_one } else { - tokens.len() + let last_match_last_token_position_plus_one = matches_last.get_last_token_pos() + 1; + if last_match_last_token_position_plus_one < tokens.len() { + last_match_last_token_position_plus_one + } else { + // we have matched the end of possible tokens, there's nothing to advance + tokens.len() - 1 + } }; - // create the initial state of the crop window: 2 iterators starting from the matches positions, - // a reverse iterator starting from the first match token position and going towards the beginning of the text, - let mut before_tokens = - tokens[..first_match_first_token_position].iter().rev().peekable(); - // an iterator starting from the last match token position and going towards the end of the text. 
- let mut after_tokens = tokens[after_tokens_starting_index..].iter().peekable(); + ( + remaining_words, + is_iterating_forward, + if is_iterating_forward { matches_first.get_first_token_pos() } else { 0 }, + after_tokens_starting_index, + ) + } else { + (crop_size, true, 0, 0) + }; - // grows the crop window peeking in both directions - // until the window contains the good number of words: - while remaining_words > 0 { - let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new); - let after_token_kind = after_tokens.peek().map(SimpleTokenKind::new); + // create the initial state of the crop window: 2 iterators starting from the matches positions, + // a reverse iterator starting from the first match token position and going towards the beginning of the text, + let mut before_tokens = tokens[..before_tokens_starting_index].iter().rev().peekable(); + // an iterator ... + let mut after_tokens = if is_iterating_forward { + // ... starting from the last match token position and going towards the end of the text. + Either::Left(tokens[after_tokens_starting_index..].iter().peekable()) + } else { + // ... starting from the last match token position and going towards the start of the text. + Either::Right(tokens[..=after_tokens_starting_index].iter().rev().peekable()) + }; - match (before_token_kind, after_token_kind) { - // we can expand both sides. - (Some(before_token_kind), Some(after_token_kind)) => { - match (before_token_kind, after_token_kind) { - // if they are both separators and are the same kind then advance both, - // or expand in the soft separator separator side. 
- ( - SimpleTokenKind::Separator(before_token_separator_kind), - SimpleTokenKind::Separator(after_token_separator_kind), - ) => { - if before_token_separator_kind == after_token_separator_kind { - before_tokens.next(); + // grows the crop window peeking in both directions + // until the window contains the good number of words: + while remaining_words > 0 { + let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new); + let after_token_kind = + after_tokens.as_mut().either(|v| v.peek(), |v| v.peek()).map(SimpleTokenKind::new); - // this avoid having an ending separator before crop marker. - if remaining_words > 1 { - after_tokens.next(); - } - } else if matches!(before_token_separator_kind, SeparatorKind::Hard) - { - after_tokens.next(); - } else { - before_tokens.next(); - } - } - // if one of the tokens is a word, we expend in the side of the word. - // left is a word, advance left. - (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => { + match (before_token_kind, after_token_kind) { + // we can expand both sides. + (Some(before_token_kind), Some(after_token_kind)) => { + match (before_token_kind, after_token_kind) { + // if they are both separators and are the same kind then advance both, + // or expand in the soft separator separator side. + ( + SimpleTokenKind::Separator(before_token_separator_kind), + SimpleTokenKind::Separator(after_token_separator_kind), + ) => { + if before_token_separator_kind == after_token_separator_kind { + before_tokens.next(); + + // this avoid having an ending separator before crop marker. + if remaining_words > 1 { + after_tokens.next(); + } + } else if matches!(before_token_separator_kind, SeparatorKind::Hard) { + after_tokens.next(); + } else { before_tokens.next(); - remaining_words -= 1; } - // right is a word, advance right. - (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => { + } + // if one of the tokens is a word, we expend in the side of the word. 
+ // left is a word, advance left. + (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => { + before_tokens.next(); + remaining_words -= 1; + } + // right is a word, advance right. + (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => { + after_tokens.next(); + remaining_words -= 1; + } + // both are words, advance left then right if remaining_word > 0. + (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => { + before_tokens.next(); + remaining_words -= 1; + + if remaining_words > 0 { after_tokens.next(); remaining_words -= 1; } - // both are words, advance left then right if remaining_word > 0. - (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => { - before_tokens.next(); - remaining_words -= 1; - - if remaining_words > 0 { - after_tokens.next(); - remaining_words -= 1; - } - } } } - // the end of the text is reached, advance left. - (Some(before_token_kind), None) => { - before_tokens.next(); - if matches!(before_token_kind, SimpleTokenKind::NotSeparator) { - remaining_words -= 1; - } - } - // the start of the text is reached, advance right. - (None, Some(after_token_kind)) => { - after_tokens.next(); - if matches!(after_token_kind, SimpleTokenKind::NotSeparator) { - remaining_words -= 1; - } - } - // no more token to add. - (None, None) => break, } - } - - // finally, keep the byte index of each bound of the crop window. 
- let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); - let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); - - (crop_byte_start, crop_byte_end) - } else { - // there's one match and it's longer than the crop window, so we have to advance inward - let mut remaining_extra_words = matches_size - crop_size; - let mut tokens_from_end = - tokens[..=last_match_last_token_position].iter().rev().peekable(); - - while remaining_extra_words > 0 { - let token_from_end_kind = tokens_from_end - .peek() - .map(SimpleTokenKind::new) - .expect("Expected iterator to not reach end"); - if matches!(token_from_end_kind, SimpleTokenKind::NotSeparator) { - remaining_extra_words -= 1; + // the end of the text is reached, advance left. + (Some(before_token_kind), None) => { + before_tokens.next(); + if matches!(before_token_kind, SimpleTokenKind::NotSeparator) { + remaining_words -= 1; + } } - - tokens_from_end.next(); + // the start of the text is reached, advance right. + (None, Some(after_token_kind)) => { + after_tokens.next(); + if matches!(after_token_kind, SimpleTokenKind::NotSeparator) { + remaining_words -= 1; + } + } + // no more token to add. + (None, None) => break, } - - let crop_byte_start = if first_match_first_token_position > 0 { - &tokens[first_match_first_token_position - 1].byte_end - } else { - &0 - }; - let crop_byte_end = tokens_from_end - .next() - .map(|t| t.byte_start) - .expect("Expected iterator to not reach end"); - - (*crop_byte_start, crop_byte_end) } + + // finally, keep the byte index of each bound of the crop window. + let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); + let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); + + (crop_byte_start, crop_byte_end) } // Returns the formatted version of the original text. From 8221c94e7f5666c73944cc5f57211a0eb4035b59 Mon Sep 17 00:00:00 2001 From: "F. 
Levi" <55688616+flevi29@users.noreply.github.com> Date: Thu, 3 Oct 2024 15:37:51 +0300 Subject: [PATCH 38/92] Split into multiple files, refactor --- .../search/new/matches/best_match_interval.rs | 139 ++++++++++ milli/src/search/new/matches/match.rs | 62 +++++ milli/src/search/new/matches/mod.rs | 244 +----------------- .../search/new/matches/simple_token_kind.rs | 15 ++ 4 files changed, 230 insertions(+), 230 deletions(-) create mode 100644 milli/src/search/new/matches/best_match_interval.rs create mode 100644 milli/src/search/new/matches/match.rs create mode 100644 milli/src/search/new/matches/simple_token_kind.rs diff --git a/milli/src/search/new/matches/best_match_interval.rs b/milli/src/search/new/matches/best_match_interval.rs new file mode 100644 index 000000000..a6497f351 --- /dev/null +++ b/milli/src/search/new/matches/best_match_interval.rs @@ -0,0 +1,139 @@ +use super::matching_words::WordId; +use super::{Match, MatchPosition}; + +struct MatchIntervalWithScore { + interval: [usize; 2], + score: [i16; 3], +} + +// count score for phrases +fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) { + let words_in_phrase_minus_one = (lwp - fwp) as i16; + // will always be ordered, so +1 for each space between words + *order_score += words_in_phrase_minus_one; + // distance will always be 1, so -1 for each space between words + *distance_score -= words_in_phrase_minus_one; +} + +/// Compute the score of a match interval: +/// 1) count unique matches +/// 2) calculate distance between matches +/// 3) count ordered matches +fn get_interval_score(matches: &[Match]) -> [i16; 3] { + let mut ids: Vec = Vec::with_capacity(matches.len()); + let mut order_score = 0; + let mut distance_score = 0; + + let mut iter = matches.iter().peekable(); + while let Some(m) = iter.next() { + if let Some(next_match) = iter.peek() { + // if matches are ordered + if next_match.ids.iter().min() > m.ids.iter().min() { + order_score += 1; + } + + 
let m_last_word_pos = match m.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => { + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); + lwp + } + }; + let next_match_first_word_pos = next_match.get_first_word_pos(); + + // compute distance between matches + distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; + } else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position { + // in case last match is a phrase, count score for its words + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); + } + + ids.extend(m.ids.iter()); + } + + ids.sort_unstable(); + ids.dedup(); + let uniq_score = ids.len() as i16; + + // rank by unique match count, then by distance between matches, then by ordered match count. + [uniq_score, distance_score, order_score] +} + +/// Returns the first and last match where the score computed by match_interval_score is the best. +pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] { + if matches.is_empty() { + panic!("`matches` should not be empty at this point"); + } + + // positions of the first and the last match of the best matches interval in `matches`. + let mut best_interval: Option = None; + + let mut save_best_interval = |interval_first, interval_last| { + let interval_score = get_interval_score(&matches[interval_first..=interval_last]); + let is_interval_score_better = &best_interval + .as_ref() + .map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score); + + if *is_interval_score_better { + best_interval = Some(MatchIntervalWithScore { + interval: [interval_first, interval_last], + score: interval_score, + }); + } + }; + + // we compute the matches interval if we have at least 2 matches. + // current interval positions. 
+ let mut interval_first = 0; + let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); + + for (index, next_match) in matches.iter().enumerate() { + // if next match would make interval gross more than crop_size, + // we compare the current interval with the best one, + // then we increase `interval_first` until next match can be added. + let next_match_last_word_pos = next_match.get_last_word_pos(); + + // if the next match would mean that we pass the crop size window, + // we take the last valid match, that didn't pass this boundry, which is `index` - 1, + // and calculate a score for it, and check if it's better than our best so far + if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { + // if index is 0 there is no last viable match + if index != 0 { + let interval_last = index - 1; + // keep interval if it's the best + save_best_interval(interval_first, interval_last); + } + + // advance start of the interval while interval is longer than crop_size. + loop { + interval_first += 1; + if interval_first == matches.len() { + interval_first -= 1; + break; + } + + interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); + + if interval_first_match_first_word_pos > next_match_last_word_pos + || next_match_last_word_pos - interval_first_match_first_word_pos < crop_size + { + break; + } + } + } + } + + // compute the last interval score and compare it to the best one. + let interval_last = matches.len() - 1; + // if it's the last match with itself, we need to make sure it's + // not a phrase longer than the crop window + if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { + save_best_interval(interval_first, interval_last); + } + + // if none of the matches fit the criteria above, default to the first one + best_interval.map_or( + [&matches[0], &matches[0]], + |MatchIntervalWithScore { interval: [first, last], .. 
}| [&matches[first], &matches[last]], + ) +} diff --git a/milli/src/search/new/matches/match.rs b/milli/src/search/new/matches/match.rs new file mode 100644 index 000000000..cc08b006c --- /dev/null +++ b/milli/src/search/new/matches/match.rs @@ -0,0 +1,62 @@ +use super::matching_words::WordId; + +#[derive(Clone, Debug)] +pub enum MatchPosition { + Word { + // position of the word in the whole text. + word_position: usize, + // position of the token in the whole text. + token_position: usize, + }, + Phrase { + // position of the first and last word in the phrase in the whole text. + word_positions: [usize; 2], + // position of the first and last token in the phrase in the whole text. + token_positions: [usize; 2], + }, +} + +#[derive(Clone, Debug)] +pub struct Match { + pub match_len: usize, + // ids of the query words that matches. + pub ids: Vec, + pub position: MatchPosition, +} + +impl Match { + pub(super) fn get_first_word_pos(&self) -> usize { + match self.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp, + } + } + + pub(super) fn get_last_word_pos(&self) -> usize { + match self.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp, + } + } + + pub(super) fn get_first_token_pos(&self) -> usize { + match self.position { + MatchPosition::Word { token_position, .. } => token_position, + MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp, + } + } + + pub(super) fn get_last_token_pos(&self) -> usize { + match self.position { + MatchPosition::Word { token_position, .. } => token_position, + MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp, + } + } + + pub(super) fn get_word_count(&self) -> usize { + match self.position { + MatchPosition::Word { .. } => 1, + MatchPosition::Phrase { word_positions: [fwp, lwp], .. 
} => lwp - fwp + 1, + } + } +} diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index f8d60ef54..3df361702 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -1,12 +1,16 @@ -use std::borrow::Cow; +mod best_match_interval; +mod r#match; +mod matching_words; +mod simple_token_kind; -use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer}; +use charabia::{Language, SeparatorKind, Token, Tokenizer}; use either::Either; pub use matching_words::MatchingWords; -use matching_words::{MatchType, PartialMatch, WordId}; +use matching_words::{MatchType, PartialMatch}; +use r#match::{Match, MatchPosition}; use serde::Serialize; - -pub mod matching_words; +use simple_token_kind::SimpleTokenKind; +use std::borrow::Cow; const DEFAULT_CROP_MARKER: &str = "…"; const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; @@ -94,228 +98,12 @@ impl FormatOptions { } } -#[derive(Clone, Debug)] -pub enum MatchPosition { - Word { - // position of the word in the whole text. - word_position: usize, - // position of the token in the whole text. - token_position: usize, - }, - Phrase { - // position of the first and last word in the phrase in the whole text. - word_positions: (usize, usize), - // position of the first and last token in the phrase in the whole text. - token_positions: (usize, usize), - }, -} - -#[derive(Clone, Debug)] -pub struct Match { - match_len: usize, - // ids of the query words that matches. - ids: Vec, - position: MatchPosition, -} - -impl Match { - fn get_first_word_pos(&self) -> usize { - match self.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp, - } - } - - fn get_last_word_pos(&self) -> usize { - match self.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: (_, lwp), .. 
} => lwp, - } - } - - fn get_first_token_pos(&self) -> usize { - match self.position { - MatchPosition::Word { token_position, .. } => token_position, - MatchPosition::Phrase { token_positions: (ftp, _), .. } => ftp, - } - } - - fn get_last_token_pos(&self) -> usize { - match self.position { - MatchPosition::Word { token_position, .. } => token_position, - MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp, - } - } - - fn get_word_count(&self) -> usize { - match self.position { - MatchPosition::Word { .. } => 1, - MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => lwp - fwp + 1, - } - } -} - #[derive(Serialize, Debug, Clone, PartialEq, Eq)] pub struct MatchBounds { pub start: usize, pub length: usize, } -enum SimpleTokenKind { - Separator(SeparatorKind), - NotSeparator, -} - -impl SimpleTokenKind { - fn new(token: &&Token<'_>) -> Self { - match token.kind { - TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind), - _ => Self::NotSeparator, - } - } -} - -#[derive(PartialEq, PartialOrd)] -struct MatchIntervalScore(i16, i16, i16); - -impl MatchIntervalScore { - /// Compute the score of a match interval: - /// 1) count unique matches - /// 2) calculate distance between matches - /// 3) count ordered matches - fn new(matches: &[Match]) -> Self { - let mut ids: Vec = Vec::with_capacity(matches.len()); - let mut order_score = 0; - let mut distance_score = 0; - - // count score for phrases - fn tally_phrase_scores( - fwp: &usize, - lwp: &usize, - order_score: &mut i16, - distance_score: &mut i16, - ) { - let words_in_phrase_minus_one = (lwp - fwp) as i16; - // will always be ordered, so +1 for each space between words - *order_score += words_in_phrase_minus_one; - // distance will always be 1, so -1 for each space between words - *distance_score -= words_in_phrase_minus_one; - } - - let mut iter = matches.iter().peekable(); - while let Some(m) = iter.next() { - if let Some(next_match) = iter.peek() { - // if matches are ordered - if 
next_match.ids.iter().min() > m.ids.iter().min() { - order_score += 1; - } - - let m_last_word_pos = match m.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => { - tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); - lwp - } - }; - let next_match_first_word_pos = next_match.get_first_word_pos(); - - // compute distance between matches - distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; - } else if let MatchPosition::Phrase { word_positions: (fwp, lwp), .. } = m.position { - // in case last match is a phrase, count score for its words - tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); - } - - ids.extend(m.ids.iter()); - } - - ids.sort_unstable(); - ids.dedup(); - let uniq_score = ids.len() as i16; - - // rank by unique match count, then by distance between matches, then by ordered match count. - Self(uniq_score, distance_score, order_score) - } -} - -struct MatchIntervalWithScore { - interval: (usize, usize), - score: MatchIntervalScore, -} - -impl MatchIntervalWithScore { - /// Returns the matches interval where the score computed by match_interval_score is the best. - fn find_best_match_interval(matches: &[Match], crop_size: usize) -> &[Match] { - if matches.len() <= 1 { - return matches; - } - - // positions of the first and the last match of the best matches interval in `matches`. - let mut best_interval: Option = None; - - let mut save_best_interval = |interval_first, interval_last| { - let interval_score = MatchIntervalScore::new(&matches[interval_first..=interval_last]); - let is_interval_score_better = - &best_interval.as_ref().map_or(true, |Self { score, .. }| interval_score > *score); - - if *is_interval_score_better { - best_interval = - Some(Self { interval: (interval_first, interval_last), score: interval_score }); - } - }; - - // we compute the matches interval if we have at least 2 matches. 
- // current interval positions. - let mut interval_first = 0; - let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); - - for (index, next_match) in matches.iter().enumerate() { - // if next match would make interval gross more than crop_size, - // we compare the current interval with the best one, - // then we increase `interval_first` until next match can be added. - let next_match_last_word_pos = next_match.get_last_word_pos(); - - // if the next match would mean that we pass the crop size window, - // we take the last valid match, that didn't pass this boundry, which is `index` - 1, - // and calculate a score for it, and check if it's better than our best so far - if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { - // if index is 0 there is no last viable match - if index != 0 { - let interval_last = index - 1; - // keep interval if it's the best - save_best_interval(interval_first, interval_last); - } - - // advance start of the interval while interval is longer than crop_size. - loop { - interval_first += 1; - interval_first_match_first_word_pos = - matches[interval_first].get_first_word_pos(); - - if interval_first_match_first_word_pos > next_match_last_word_pos - || next_match_last_word_pos - interval_first_match_first_word_pos - < crop_size - { - break; - } - } - } - } - - // compute the last interval score and compare it to the best one. 
- let interval_last = matches.len() - 1; - // if it's the last match with itself, we need to make sure it's - // not a phrase longer than the crop window - if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { - save_best_interval(interval_first, interval_last); - } - - // if none of the matches fit the criteria above, default to the first one - let best_interval = best_interval.map_or((0, 0), |v| v.interval); - &matches[best_interval.0..=best_interval.1] - } -} - /// Structure used to analyze a string, compute words that match, /// and format the source string, returning a highlighted and cropped sub-string. pub struct Matcher<'t, 'tokenizer, 'b, 'lang> { @@ -355,8 +143,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { match_len: word.char_end - *first_word_char_start, ids: ids.clone().collect(), position: MatchPosition::Phrase { - word_positions: (first_word_position, word_position), - token_positions: (first_token_position, token_position), + word_positions: [first_word_position, word_position], + token_positions: [first_token_position, token_position], }, }); @@ -450,15 +238,14 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { matches: &[Match], crop_size: usize, ) -> (usize, usize) { - // if there is no match, we start from the beginning of the string by default. let ( mut remaining_words, is_iterating_forward, before_tokens_starting_index, after_tokens_starting_index, ) = if !matches.is_empty() { - let matches_first = matches.first().unwrap(); - let matches_last = matches.last().unwrap(); + let [matches_first, matches_last] = + best_match_interval::find_best_match_interval(matches, crop_size); let matches_size = matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1; @@ -600,9 +387,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // crop around the best interval. 
let (byte_start, byte_end) = match format_options.crop { Some(crop_size) if crop_size > 0 => { - let matches = MatchIntervalWithScore::find_best_match_interval( - matches, crop_size, - ); self.crop_bounds(tokens, matches, crop_size) } _ => (0, self.text.len()), @@ -625,7 +409,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { let token = &tokens[token_position]; (&token.byte_start, &token.byte_end) } - MatchPosition::Phrase { token_positions: (ftp, ltp), .. } => { + MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => { (&tokens[ftp].byte_start, &tokens[ltp].byte_end) } }; diff --git a/milli/src/search/new/matches/simple_token_kind.rs b/milli/src/search/new/matches/simple_token_kind.rs new file mode 100644 index 000000000..b34a8c985 --- /dev/null +++ b/milli/src/search/new/matches/simple_token_kind.rs @@ -0,0 +1,15 @@ +use charabia::{SeparatorKind, Token, TokenKind}; + +pub enum SimpleTokenKind { + Separator(SeparatorKind), + NotSeparator, +} + +impl SimpleTokenKind { + pub fn new(token: &&Token<'_>) -> Self { + match token.kind { + TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind), + _ => Self::NotSeparator, + } + } +} From c3de3a9ab75e6be99314400137b8329cdf46ff12 Mon Sep 17 00:00:00 2001 From: "F. 
Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 4 Oct 2024 11:30:31 +0300 Subject: [PATCH 39/92] Refactor --- milli/src/search/new/matches/matching_words.rs | 12 +++--------- milli/src/search/new/matches/mod.rs | 1 - 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 4deaff6a0..e4d2785ca 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -130,7 +130,7 @@ impl<'a> Iterator for MatchesIter<'a, '_> { word.map(|word| self.matching_words.word_interner.get(word).as_str()) }) .collect(); - let partial = PartialMatch { matching_words: words, ids, char_len: 0 }; + let partial = PartialMatch { matching_words: words, ids }; partial.match_token(self.token).or_else(|| self.next()) } @@ -158,7 +158,6 @@ pub enum MatchType<'a> { pub struct PartialMatch<'a> { matching_words: Vec>, ids: &'a RangeInclusive, - char_len: usize, } impl<'a> PartialMatch<'a> { @@ -176,25 +175,20 @@ impl<'a> PartialMatch<'a> { None => token.is_stopword(), }; - let char_len = token.char_end - token.char_start; // if there are remaining words to match in the phrase and the current token is matching, // return a new Partial match allowing the highlighter to continue. if is_matching && matching_words.len() > 1 { matching_words.remove(0); - Some(MatchType::Partial(Self { matching_words, ids, char_len })) + Some(MatchType::Partial(Self { matching_words, ids })) // if there is no remaining word to match in the phrase and the current token is matching, // return a Full match. } else if is_matching { - Some(MatchType::Full { char_len, ids }) + Some(MatchType::Full { char_len: token.char_end - token.char_start, ids }) // if the current token doesn't match, return None to break the match sequence. 
} else { None } } - - pub fn char_len(&self) -> usize { - self.char_len - } } impl fmt::Debug for MatchingWords { diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 3df361702..9ca560529 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -139,7 +139,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { Some(MatchType::Full { ids, .. }) => { // save the token that closes the partial match as a match. matches.push(Match { - // @TODO: Shouldn't this be +1? match_len: word.char_end - *first_word_char_start, ids: ids.clone().collect(), position: MatchPosition::Phrase { From 03579aba13853560059cec3c881e284b4f7a307a Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 4 Oct 2024 11:38:47 +0300 Subject: [PATCH 40/92] Adjust test --- milli/src/search/new/matches/mod.rs | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 9ca560529..ac0fb7e7b 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -798,12 +798,12 @@ mod tests { @"…the power to split the world between those who embraced…" ); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\""); + let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); let mut matcher = builder.build(text, None); // should highlight "those" and the phrase "and those". 
insta::assert_snapshot!( matcher.format(format_options), - @"…groundbreaking invention had the power to split the world between…" + @"…world between those who embraced progress and those who resisted…" ); let builder = MatcherBuilder::new_test( @@ -841,17 +841,6 @@ mod tests { @"…between those who embraced progress and those who resisted change…" ); - let builder = MatcherBuilder::new_test( - &rtxn, - &temp_index, - "\"The groundbreaking invention\" \"split the world between those\"", - ); - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @"…the power to split the world between those who embraced…" - ); - let builder = MatcherBuilder::new_test( &rtxn, &temp_index, From 7f5d0837c3343b9ce154197867bd153b12390e5c Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 9 Oct 2024 11:46:57 +0200 Subject: [PATCH 41/92] fix the bad experimental search queue size --- meilisearch/src/option.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 3799bdcb7..82c783115 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -357,8 +357,8 @@ pub struct Opt { /// Lets you customize the size of the search queue. Meilisearch processes your search requests as fast as possible but once the /// queue is full it starts returning HTTP 503, Service Unavailable. /// The default value is 1000. - #[clap(long, env = MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, default_value_t = 1000)] - #[serde(default)] + #[clap(long, env = MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, default_value_t = default_experimental_search_queue_size())] + #[serde(default = "default_experimental_search_queue_size")] pub experimental_search_queue_size: usize, /// Experimental logs mode feature. 
For more information, see: @@ -890,6 +890,10 @@ fn default_dump_dir() -> PathBuf { PathBuf::from(DEFAULT_DUMP_DIR) } +fn default_experimental_search_queue_size() -> usize { + 1000 +} + /// Indicates if a snapshot was scheduled, and if yes with which interval. #[derive(Debug, Default, Copy, Clone, Deserialize, Serialize)] pub enum ScheduleSnapshot { From 6e37ae8619ebf52aa1f9a703fc12723764f4ebe5 Mon Sep 17 00:00:00 2001 From: curquiza Date: Wed, 9 Oct 2024 19:13:14 +0200 Subject: [PATCH 42/92] Update mini-dashboard --- meilisearch/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index c193c89d4..6c2fb4060 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -157,5 +157,5 @@ german = ["meilisearch-types/german"] turkish = ["meilisearch-types/turkish"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip" -sha1 = "592d1b5a3459d621d0aae1dded8fe3154f5c38fe" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.15/build.zip" +sha1 = "d057600b4a839a2e0c0be7a372cd1b2683f3ca7e" From 466604725ec017234db3e61c58c957a3802d2bb9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 10 Oct 2024 23:47:15 +0200 Subject: [PATCH 43/92] Do not send empty edit document by function --- meilisearch/src/analytics/segment_analytics.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index f8d6a0fdc..0ea0de572 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -1572,6 +1572,10 @@ impl EditDocumentsByFunctionAggregator { pub fn into_event(self, user: &User, event_name: &str) -> Option { let Self { timestamp, user_agents, index_creation, filtered, with_context } = self; + // if we had no timestamp it means we never 
encountered any events and + // thus we don't need to send this event. + let timestamp = timestamp?; + let properties = json!({ "user-agent": user_agents, "filtered": filtered, @@ -1580,7 +1584,7 @@ impl EditDocumentsByFunctionAggregator { }); Some(Track { - timestamp, + timestamp: Some(timestamp), user: user.clone(), event: event_name.to_string(), properties, From 92070a3578ded5a78bb42e8fb0ab02242fd11bc4 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 10 Oct 2024 13:17:25 +0200 Subject: [PATCH 44/92] Implement the experimental drop search after and nb search per core --- .../src/analytics/segment_analytics.rs | 6 +++ meilisearch/src/main.rs | 11 +++++- meilisearch/src/option.rs | 38 ++++++++++++++++++- 3 files changed, 52 insertions(+), 3 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 0ea0de572..476b3264e 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -265,6 +265,8 @@ struct Infos { experimental_contains_filter: bool, experimental_enable_metrics: bool, experimental_search_queue_size: usize, + experimental_drop_search_after: usize, + experimental_nb_searches_per_core: usize, experimental_logs_mode: LogMode, experimental_replication_parameters: bool, experimental_enable_logs_route: bool, @@ -308,6 +310,8 @@ impl From for Infos { experimental_contains_filter, experimental_enable_metrics, experimental_search_queue_size, + experimental_drop_search_after, + experimental_nb_searches_per_core, experimental_logs_mode, experimental_replication_parameters, experimental_enable_logs_route, @@ -359,6 +363,8 @@ impl From for Infos { experimental_contains_filter, experimental_enable_metrics, experimental_search_queue_size, + experimental_drop_search_after: experimental_drop_search_after.into(), + experimental_nb_searches_per_core: experimental_nb_searches_per_core.into(), experimental_logs_mode, experimental_replication_parameters, 
experimental_enable_logs_route, diff --git a/meilisearch/src/main.rs b/meilisearch/src/main.rs index b66bfc5b8..de9784d15 100644 --- a/meilisearch/src/main.rs +++ b/meilisearch/src/main.rs @@ -5,6 +5,7 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; use std::thread::available_parallelism; +use std::time::Duration; use actix_web::http::KeepAlive; use actix_web::web::Data; @@ -153,8 +154,14 @@ async fn run_http( let auth_controller = Data::from(auth_controller); let search_queue = SearchQueue::new( opt.experimental_search_queue_size, - available_parallelism().unwrap_or(NonZeroUsize::new(2).unwrap()), - ); + available_parallelism() + .unwrap_or(NonZeroUsize::new(2).unwrap()) + .checked_mul(opt.experimental_nb_searches_per_core) + .unwrap_or(NonZeroUsize::MAX), + ) + .with_time_to_abort(Duration::from_secs( + usize::from(opt.experimental_drop_search_after) as u64 + )); let search_queue = Data::new(search_queue); let http_server = HttpServer::new(move || { diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 82c783115..bbeb94577 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -2,7 +2,7 @@ use std::env::VarError; use std::ffi::OsStr; use std::fmt::Display; use std::io::{BufReader, Read}; -use std::num::ParseIntError; +use std::num::{NonZeroUsize, ParseIntError}; use std::ops::Deref; use std::path::PathBuf; use std::str::FromStr; @@ -55,6 +55,8 @@ const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LO const MEILI_EXPERIMENTAL_CONTAINS_FILTER: &str = "MEILI_EXPERIMENTAL_CONTAINS_FILTER"; const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS"; const MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE: &str = "MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE"; +const MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER: &str = "MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER"; +const MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE: &str = "MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE"; const 
MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str = "MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE"; const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str = @@ -361,6 +363,22 @@ pub struct Opt { #[serde(default = "default_experimental_search_queue_size")] pub experimental_search_queue_size: usize, + /// Experimental drop search after. For more information, see: + /// + /// Lets you customize after how much seconds should Meilisearch consider a search as irrelevant and drop it. + /// The default value is 60. + #[clap(long, env = MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER, default_value_t = default_drop_search_after())] + #[serde(default = "default_drop_search_after")] + pub experimental_drop_search_after: NonZeroUsize, + + /// Experimental number of searches per core. For more information, see: + /// + /// Lets you customize after how many search requests can run on each cores. + /// The default value is 4. + #[clap(long, env = MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE, default_value_t = default_nb_searches_per_core())] + #[serde(default = "default_drop_search_after")] + pub experimental_nb_searches_per_core: NonZeroUsize, + /// Experimental logs mode feature. For more information, see: /// /// Change the mode of the logs on the console. 
@@ -492,6 +510,8 @@ impl Opt { experimental_contains_filter, experimental_enable_metrics, experimental_search_queue_size, + experimental_drop_search_after, + experimental_nb_searches_per_core, experimental_logs_mode, experimental_enable_logs_route, experimental_replication_parameters, @@ -559,6 +579,14 @@ impl Opt { MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, experimental_search_queue_size.to_string(), ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER, + experimental_drop_search_after.to_string(), + ); + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE, + experimental_nb_searches_per_core.to_string(), + ); export_to_env_if_not_present( MEILI_EXPERIMENTAL_LOGS_MODE, experimental_logs_mode.to_string(), @@ -894,6 +922,14 @@ fn default_experimental_search_queue_size() -> usize { 1000 } +fn default_drop_search_after() -> NonZeroUsize { + NonZeroUsize::new(60).unwrap() +} + +fn default_nb_searches_per_core() -> NonZeroUsize { + NonZeroUsize::new(4).unwrap() +} + /// Indicates if a snapshot was scheduled, and if yes with which interval. #[derive(Debug, Default, Copy, Clone, Deserialize, Serialize)] pub enum ScheduleSnapshot { From c32282acb1f14e65bb124003c34fa1de9c01f869 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 10 Oct 2024 13:21:18 +0200 Subject: [PATCH 45/92] improve doc --- meilisearch/src/option.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index bbeb94577..a231eb058 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -365,7 +365,7 @@ pub struct Opt { /// Experimental drop search after. For more information, see: /// - /// Lets you customize after how much seconds should Meilisearch consider a search as irrelevant and drop it. + /// Let you customize after how many seconds Meilisearch should consider a search as irrelevant and drop it. /// The default value is 60. 
#[clap(long, env = MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER, default_value_t = default_drop_search_after())] #[serde(default = "default_drop_search_after")] @@ -373,7 +373,7 @@ pub struct Opt { /// Experimental number of searches per core. For more information, see: /// - /// Lets you customize after how many search requests can run on each cores. + /// Lets you customize how many search requests can run on each core. /// The default value is 4. #[clap(long, env = MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE, default_value_t = default_nb_searches_per_core())] #[serde(default = "default_drop_search_after")] From c4efd1df4e70b2929ee1cb1c22b535b7ff163cc7 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 10 Oct 2024 13:40:21 +0200 Subject: [PATCH 46/92] Update meilisearch/src/option.rs Co-authored-by: Louis Dureuil --- meilisearch/src/option.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index a231eb058..cef787e1a 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -376,7 +376,7 @@ pub struct Opt { /// Lets you customize how many search requests can run on each core. /// The default value is 4. #[clap(long, env = MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE, default_value_t = default_nb_searches_per_core())] - #[serde(default = "default_drop_search_after")] + #[serde(default = "default_nb_searches_per_core")] pub experimental_nb_searches_per_core: NonZeroUsize, /// Experimental logs mode feature. 
For more information, see: From 3085092e04cbc909601b8b290d883b35ff541f89 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 10 Oct 2024 13:40:28 +0200 Subject: [PATCH 47/92] Update meilisearch/src/option.rs Co-authored-by: Louis Dureuil --- meilisearch/src/option.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index cef787e1a..b3f01d208 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -373,7 +373,7 @@ pub struct Opt { /// Experimental number of searches per core. For more information, see: /// - /// Lets you customize how many search requests can run on each core. + /// Lets you customize how many search requests can run on each core concurrently. /// The default value is 4. #[clap(long, env = MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE, default_value_t = default_nb_searches_per_core())] #[serde(default = "default_nb_searches_per_core")] From 4b4a6c78638573721d7b88869fd443236f90d29a Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 10 Oct 2024 15:24:24 +0200 Subject: [PATCH 48/92] Update meilisearch/src/option.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- meilisearch/src/option.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index b3f01d208..02dc660a4 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -365,7 +365,7 @@ pub struct Opt { /// Experimental drop search after. For more information, see: /// - /// Let you customize after how many seconds Meilisearch should consider a search as irrelevant and drop it. + /// Let you customize after how many seconds Meilisearch should consider a search request irrelevant and drop it. /// The default value is 60. 
#[clap(long, env = MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER, default_value_t = default_drop_search_after())] #[serde(default = "default_drop_search_after")] From e44e7b5e81e8644ae1c95d3a3b28f530fcc52eb2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 14 Oct 2024 16:17:19 +0200 Subject: [PATCH 49/92] Fix retrieveVectors when explicitly passed in displayed attributes without any document containing _vectors --- meilisearch/src/search/mod.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/meilisearch/src/search/mod.rs b/meilisearch/src/search/mod.rs index 66b6e56de..7832c1761 100644 --- a/meilisearch/src/search/mod.rs +++ b/meilisearch/src/search/mod.rs @@ -1195,8 +1195,13 @@ impl<'a> HitMaker<'a> { let vectors_is_hidden = match (&displayed_ids, vectors_fid) { // displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid (None, _) => false, - // displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field - (Some(_), None) => true, + // vectors has no fid, so check its explicit name + (Some(_), None) => { + // unwrap as otherwise we'd go to the first one + let displayed_names = index.displayed_fields(rtxn)?.unwrap(); + !displayed_names + .contains(&milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME) + } // displayed_ids is a finit list, so hide if `_vectors` is not part of it (Some(map), Some(vectors_fid)) => map.contains(&vectors_fid), }; From 5a74d4729cdc02a3cea011d4ab6a0f608be867f9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 14 Oct 2024 16:23:28 +0200 Subject: [PATCH 50/92] Add test failing before this PR, OK now --- meilisearch/tests/search/hybrid.rs | 51 ++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index e301c0b05..00a65d9aa 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -568,6 +568,57 @@ async fn 
retrieve_vectors() { ] "###); + // use explicit `_vectors` in displayed attributes + let (response, code) = index + .update_settings(json!({ "displayedAttributes": ["id", "title", "desc", "_vectors"]} )) + .await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "hybrid": {"embedder": "default", "semanticRatio": 0.2}, "retrieveVectors": true}), + ) + .await; + snapshot!(code, @"200 OK"); + insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###" + [ + { + "title": "Captain Planet", + "desc": "He's not part of the Marvel Cinematic Universe", + "id": "2", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + }, + { + "title": "Captain Marvel", + "desc": "a Shazam ersatz", + "id": "3", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + }, + { + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + } + ] + "###); + // remove `_vectors` from displayed attributes let (response, code) = index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await; From 73e87c152a4bd35fd4309141615676210c6b279c Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 16 Oct 2024 15:43:27 +0200 Subject: [PATCH 51/92] rewrite most of the analytics especially the settings --- meilisearch/src/analytics/mock_analytics.rs | 109 -- meilisearch/src/analytics/mod.rs | 179 ++-- .../src/analytics/segment_analytics.rs | 211 ++-- meilisearch/src/lib.rs | 4 +- meilisearch/src/routes/dump.rs | 7 +- meilisearch/src/routes/features.rs | 58 +- meilisearch/src/routes/indexes/documents.rs | 318 +++++- .../src/routes/indexes/facet_search.rs | 112 +- meilisearch/src/routes/indexes/mod.rs | 53 +- meilisearch/src/routes/indexes/search.rs | 13 +- 
meilisearch/src/routes/indexes/settings.rs | 962 +++++++++++++----- meilisearch/src/routes/swap_indexes.rs | 2 +- 12 files changed, 1381 insertions(+), 647 deletions(-) delete mode 100644 meilisearch/src/analytics/mock_analytics.rs diff --git a/meilisearch/src/analytics/mock_analytics.rs b/meilisearch/src/analytics/mock_analytics.rs deleted file mode 100644 index 54b8d4f1b..000000000 --- a/meilisearch/src/analytics/mock_analytics.rs +++ /dev/null @@ -1,109 +0,0 @@ -use std::any::Any; -use std::sync::Arc; - -use actix_web::HttpRequest; -use meilisearch_types::InstanceUid; -use serde_json::Value; - -use super::{find_user_id, Analytics, DocumentDeletionKind, DocumentFetchKind}; -use crate::routes::indexes::documents::{DocumentEditionByFunction, UpdateDocumentsQuery}; -use crate::Opt; - -pub struct MockAnalytics { - instance_uid: Option, -} - -#[derive(Default)] -pub struct SearchAggregator; - -#[allow(dead_code)] -impl SearchAggregator { - pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self { - Self - } - - pub fn succeed(&mut self, _: &dyn Any) {} -} - -#[derive(Default)] -pub struct SimilarAggregator; - -#[allow(dead_code)] -impl SimilarAggregator { - pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self { - Self - } - - pub fn succeed(&mut self, _: &dyn Any) {} -} - -#[derive(Default)] -pub struct MultiSearchAggregator; - -#[allow(dead_code)] -impl MultiSearchAggregator { - pub fn from_federated_search(_: &dyn Any, _: &dyn Any) -> Self { - Self - } - - pub fn succeed(&mut self) {} -} - -#[derive(Default)] -pub struct FacetSearchAggregator; - -#[allow(dead_code)] -impl FacetSearchAggregator { - pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self { - Self - } - - pub fn succeed(&mut self, _: &dyn Any) {} -} - -impl MockAnalytics { - #[allow(clippy::new_ret_no_self)] - pub fn new(opt: &Opt) -> Arc { - let instance_uid = find_user_id(&opt.db_path); - Arc::new(Self { instance_uid }) - } -} - -impl Analytics for MockAnalytics { - fn instance_uid(&self) -> 
Option<&meilisearch_types::InstanceUid> { - self.instance_uid.as_ref() - } - - // These methods are noop and should be optimized out - fn publish(&self, _event_name: String, _send: Value, _request: Option<&HttpRequest>) {} - fn get_search(&self, _aggregate: super::SearchAggregator) {} - fn post_search(&self, _aggregate: super::SearchAggregator) {} - fn get_similar(&self, _aggregate: super::SimilarAggregator) {} - fn post_similar(&self, _aggregate: super::SimilarAggregator) {} - fn post_multi_search(&self, _aggregate: super::MultiSearchAggregator) {} - fn post_facet_search(&self, _aggregate: super::FacetSearchAggregator) {} - fn add_documents( - &self, - _documents_query: &UpdateDocumentsQuery, - _index_creation: bool, - _request: &HttpRequest, - ) { - } - fn delete_documents(&self, _kind: DocumentDeletionKind, _request: &HttpRequest) {} - fn update_documents( - &self, - _documents_query: &UpdateDocumentsQuery, - _index_creation: bool, - _request: &HttpRequest, - ) { - } - fn update_documents_by_function( - &self, - _documents_query: &DocumentEditionByFunction, - _index_creation: bool, - _request: &HttpRequest, - ) { - } - fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} - fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} -} diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index 3c7ca0ed3..a8658d830 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -1,45 +1,51 @@ -mod mock_analytics; -#[cfg(feature = "analytics")] -mod segment_analytics; +pub mod segment_analytics; +use std::any::TypeId; +use std::collections::HashMap; use std::fs; use std::path::{Path, PathBuf}; use std::str::FromStr; use actix_web::HttpRequest; use meilisearch_types::InstanceUid; -pub use mock_analytics::MockAnalytics; use once_cell::sync::Lazy; use platform_dirs::AppDirs; -use serde_json::Value; - -use 
crate::routes::indexes::documents::{DocumentEditionByFunction, UpdateDocumentsQuery}; - -// if the analytics feature is disabled -// the `SegmentAnalytics` point to the mock instead of the real analytics -#[cfg(not(feature = "analytics"))] -pub type SegmentAnalytics = mock_analytics::MockAnalytics; -#[cfg(not(feature = "analytics"))] -pub type SearchAggregator = mock_analytics::SearchAggregator; -#[cfg(not(feature = "analytics"))] -pub type SimilarAggregator = mock_analytics::SimilarAggregator; -#[cfg(not(feature = "analytics"))] -pub type MultiSearchAggregator = mock_analytics::MultiSearchAggregator; -#[cfg(not(feature = "analytics"))] -pub type FacetSearchAggregator = mock_analytics::FacetSearchAggregator; +use segment::message::User; +use serde::Serialize; // if the feature analytics is enabled we use the real analytics -#[cfg(feature = "analytics")] pub type SegmentAnalytics = segment_analytics::SegmentAnalytics; -#[cfg(feature = "analytics")] -pub type SearchAggregator = segment_analytics::SearchAggregator; -#[cfg(feature = "analytics")] +pub use segment_analytics::SearchAggregator; pub type SimilarAggregator = segment_analytics::SimilarAggregator; -#[cfg(feature = "analytics")] pub type MultiSearchAggregator = segment_analytics::MultiSearchAggregator; -#[cfg(feature = "analytics")] pub type FacetSearchAggregator = segment_analytics::FacetSearchAggregator; +/// A macro used to quickly define events that don't aggregate or send anything besides an empty event with its name. +#[macro_export] +macro_rules! 
empty_analytics { + ($struct_name:ident, $event_name:literal) => { + #[derive(Default)] + struct $struct_name {} + + impl $crate::analytics::Aggregate for $struct_name { + fn event_name(&self) -> &'static str { + $event_name + } + + fn aggregate(self, other: Self) -> Self + where + Self: Sized, + { + self + } + + fn into_event(self) -> serde_json::Value { + serde_json::json!({}) + } + } + }; +} + /// The Meilisearch config dir: /// `~/.config/Meilisearch` on *NIX or *BSD. /// `~/Library/ApplicationSupport` on macOS. @@ -78,60 +84,73 @@ pub enum DocumentFetchKind { Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, } -pub trait Analytics: Sync + Send { - fn instance_uid(&self) -> Option<&InstanceUid>; +pub trait Aggregate { + fn event_name(&self) -> &'static str; + + fn aggregate(self, other: Self) -> Self + where + Self: Sized; + + fn into_event(self) -> impl Serialize + where + Self: Sized; +} + +/// Helper trait to define multiple aggregate with the same content but a different name. +/// Commonly used when you must aggregate a search with POST or with GET for example. +pub trait AggregateMethod { + fn event_name() -> &'static str; +} + +/// A macro used to quickly define multiple aggregate method with their name +#[macro_export] +macro_rules! 
aggregate_methods { + ($method:ident => $event_name:literal) => { + pub enum $method {} + + impl $crate::analytics::AggregateMethod for $method { + fn event_name() -> &'static str { + $event_name + } + } + }; + ($($method:ident => $event_name:literal,)+) => { + $( + aggregate_methods!($method => $event_name); + )+ + + }; +} + +pub struct Analytics { + // TODO: TAMO: remove + inner: Option, + + instance_uid: Option, + user: Option, + events: HashMap>, +} + +impl Analytics { + fn no_analytics() -> Self { + Self { inner: None, events: HashMap::new(), instance_uid: None, user: None } + } + + fn segment_analytics(segment: SegmentAnalytics) -> Self { + Self { + instance_uid: Some(segment.instance_uid), + user: Some(segment.user), + inner: Some(segment), + events: HashMap::new(), + } + } + + pub fn instance_uid(&self) -> Option<&InstanceUid> { + self.instance_uid + } /// The method used to publish most analytics that do not need to be batched every hours - fn publish(&self, event_name: String, send: Value, request: Option<&HttpRequest>); - - /// This method should be called to aggregate a get search - fn get_search(&self, aggregate: SearchAggregator); - - /// This method should be called to aggregate a post search - fn post_search(&self, aggregate: SearchAggregator); - - /// This method should be called to aggregate a get similar request - fn get_similar(&self, aggregate: SimilarAggregator); - - /// This method should be called to aggregate a post similar request - fn post_similar(&self, aggregate: SimilarAggregator); - - /// This method should be called to aggregate a post array of searches - fn post_multi_search(&self, aggregate: MultiSearchAggregator); - - /// This method should be called to aggregate post facet values searches - fn post_facet_search(&self, aggregate: FacetSearchAggregator); - - // this method should be called to aggregate an add documents request - fn add_documents( - &self, - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: 
&HttpRequest, - ); - - // this method should be called to aggregate a fetch documents request - fn get_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest); - - // this method should be called to aggregate a fetch documents request - fn post_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest); - - // this method should be called to aggregate a add documents request - fn delete_documents(&self, kind: DocumentDeletionKind, request: &HttpRequest); - - // this method should be called to batch an update documents request - fn update_documents( - &self, - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ); - - // this method should be called to batch an update documents by function request - fn update_documents_by_function( - &self, - documents_query: &DocumentEditionByFunction, - index_creation: bool, - request: &HttpRequest, - ); + pub fn publish(&self, send: impl Aggregate, request: Option<&HttpRequest>) { + let Some(segment) = self.inner else { return }; + } } diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 476b3264e..8a6dfd780 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -25,7 +25,8 @@ use tokio::sync::mpsc::{self, Receiver, Sender}; use uuid::Uuid; use super::{ - config_user_id_path, DocumentDeletionKind, DocumentFetchKind, MEILISEARCH_CONFIG_PATH, + config_user_id_path, Aggregate, AggregateMethod, DocumentDeletionKind, DocumentFetchKind, + MEILISEARCH_CONFIG_PATH, }; use crate::analytics::Analytics; use crate::option::{ @@ -40,7 +41,7 @@ use crate::search::{ DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEMANTIC_RATIO, }; -use crate::Opt; +use crate::{aggregate_methods, Opt}; const ANALYTICS_HEADER: &str = "X-Meilisearch-Client"; @@ -87,9 +88,9 @@ pub enum AnalyticsMsg { } pub struct 
SegmentAnalytics { - instance_uid: InstanceUid, + pub instance_uid: InstanceUid, sender: Sender, - user: User, + pub user: User, } impl SegmentAnalytics { @@ -98,7 +99,7 @@ impl SegmentAnalytics { opt: &Opt, index_scheduler: Arc, auth_controller: Arc, - ) -> Arc { + ) -> Arc { let instance_uid = super::find_user_id(&opt.db_path); let first_time_run = instance_uid.is_none(); let instance_uid = instance_uid.unwrap_or_else(Uuid::new_v4); @@ -108,7 +109,7 @@ impl SegmentAnalytics { // if reqwest throws an error we won't be able to send analytics if client.is_err() { - return super::MockAnalytics::new(opt); + return Arc::new(Analytics::no_analytics()); } let client = @@ -161,10 +162,11 @@ impl SegmentAnalytics { let this = Self { instance_uid, sender, user: user.clone() }; - Arc::new(this) + Arc::new(Analytics::segment_analytics(this)) } } +/* impl super::Analytics for SegmentAnalytics { fn instance_uid(&self) -> Option<&InstanceUid> { Some(&self.instance_uid) @@ -253,6 +255,7 @@ impl super::Analytics for SegmentAnalytics { let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate)); } } +*/ /// This structure represent the `infos` field we send in the analytics. 
/// It's quite close to the `Opt` structure except all sensitive informations @@ -607,12 +610,7 @@ impl Segment { } #[derive(Default)] -pub struct SearchAggregator { - timestamp: Option, - - // context - user_agents: HashSet, - +pub struct SearchAggregator { // requests total_received: usize, total_succeeded: usize, @@ -684,9 +682,11 @@ pub struct SearchAggregator { show_ranking_score: bool, show_ranking_score_details: bool, ranking_score_threshold: bool, + + marker: std::marker::PhantomData, } -impl SearchAggregator { +impl SearchAggregator { #[allow(clippy::field_reassign_with_default)] pub fn from_query(query: &SearchQuery, request: &HttpRequest) -> Self { let SearchQuery { @@ -827,12 +827,21 @@ impl SearchAggregator { } self.time_spent.push(*processing_time_ms as usize); } +} - /// Aggregate one [SearchAggregator] into another. - pub fn aggregate(&mut self, mut other: Self) { +aggregate_methods!( + SearchGET => "Documents Searched GET", + SearchPOST => "Documents Searched POST", + +); + +impl Aggregate for SearchAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + fn aggregate(mut self, mut other: Self) -> Self { let Self { - timestamp, - user_agents, total_received, total_succeeded, ref mut time_spent, @@ -871,17 +880,9 @@ impl SearchAggregator { total_used_negative_operator, ranking_score_threshold, ref mut locales, + marker: _, } = other; - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // context - for user_agent in user_agents.into_iter() { - self.user_agents.insert(user_agent); - } - // request self.total_received = self.total_received.saturating_add(total_received); self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); @@ -961,12 +962,12 @@ impl SearchAggregator { // locales self.locales.append(locales); + + self } - pub fn into_event(self, user: &User, event_name: &str) -> Option { + fn into_event(self) -> Option { let Self { - timestamp, - user_agents, total_received, 
total_succeeded, time_spent, @@ -1005,90 +1006,78 @@ impl SearchAggregator { total_used_negative_operator, ranking_score_threshold, locales, + marker: _, } = self; - if total_received == 0 { - None - } else { - // we get all the values in a sorted manner - let time_spent = time_spent.into_sorted_vec(); - // the index of the 99th percentage of value - let percentile_99th = time_spent.len() * 99 / 100; - // We are only interested by the slowest value of the 99th fastest results - let time_spent = time_spent.get(percentile_99th); + // we get all the values in a sorted manner + let time_spent = time_spent.into_sorted_vec(); + // the index of the 99th percentage of value + let percentile_99th = time_spent.len() * 99 / 100; + // We are only interested by the slowest value of the 99th fastest results + let time_spent = time_spent.get(percentile_99th); - let properties = json!({ - "user-agent": user_agents, - "requests": { - "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - "total_degraded": total_degraded, - "total_used_negative_operator": total_used_negative_operator, - }, - "sort": { - "with_geoPoint": sort_with_geo_point, - "avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), - }, - "distinct": distinct, - "filter": { - "with_geoRadius": filter_with_geo_radius, - "with_geoBoundingBox": filter_with_geo_bounding_box, - "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), - "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "attributes_to_search_on": { - "total_number_of_uses": attributes_to_search_on_total_number_of_uses, - }, - "q": { - "max_terms_number": max_terms_number, - }, - 
"vector": { - "max_vector_size": max_vector_size, - "retrieve_vectors": retrieve_vectors, - }, - "hybrid": { - "enabled": hybrid, - "semantic_ratio": semantic_ratio, - }, - "pagination": { - "max_limit": max_limit, - "max_offset": max_offset, - "most_used_navigation": if finite_pagination > (total_received / 2) { "exhaustive" } else { "estimated" }, - }, - "formatting": { - "max_attributes_to_retrieve": max_attributes_to_retrieve, - "max_attributes_to_highlight": max_attributes_to_highlight, - "highlight_pre_tag": highlight_pre_tag, - "highlight_post_tag": highlight_post_tag, - "max_attributes_to_crop": max_attributes_to_crop, - "crop_marker": crop_marker, - "show_matches_position": show_matches_position, - "crop_length": crop_length, - }, - "facets": { - "avg_facets_number": format!("{:.2}", facets_sum_of_terms as f64 / facets_total_number_of_facets as f64), - }, - "matching_strategy": { - "most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "locales": locales, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - "ranking_score_threshold": ranking_score_threshold, - }, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } + json!({ + "requests": { + "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + "total_degraded": total_degraded, + "total_used_negative_operator": total_used_negative_operator, + }, + "sort": { + "with_geoPoint": sort_with_geo_point, + "avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), + }, + "distinct": distinct, + "filter": { + "with_geoRadius": filter_with_geo_radius, 
+ "with_geoBoundingBox": filter_with_geo_bounding_box, + "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), + "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + "attributes_to_search_on": { + "total_number_of_uses": attributes_to_search_on_total_number_of_uses, + }, + "q": { + "max_terms_number": max_terms_number, + }, + "vector": { + "max_vector_size": max_vector_size, + "retrieve_vectors": retrieve_vectors, + }, + "hybrid": { + "enabled": hybrid, + "semantic_ratio": semantic_ratio, + }, + "pagination": { + "max_limit": max_limit, + "max_offset": max_offset, + "most_used_navigation": if finite_pagination > (total_received / 2) { "exhaustive" } else { "estimated" }, + }, + "formatting": { + "max_attributes_to_retrieve": max_attributes_to_retrieve, + "max_attributes_to_highlight": max_attributes_to_highlight, + "highlight_pre_tag": highlight_pre_tag, + "highlight_post_tag": highlight_post_tag, + "max_attributes_to_crop": max_attributes_to_crop, + "crop_marker": crop_marker, + "show_matches_position": show_matches_position, + "crop_length": crop_length, + }, + "facets": { + "avg_facets_number": format!("{:.2}", facets_sum_of_terms as f64 / facets_total_number_of_facets as f64), + }, + "matching_strategy": { + "most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + "locales": locales, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, + }, + }) } } diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs index b24f18fae..80177876a 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -120,7 +120,7 @@ pub fn create_app( search_queue: Data, opt: Opt, logs: (LogRouteHandle, LogStderrHandle), - analytics: Arc, + 
analytics: Arc, enable_dashboard: bool, ) -> actix_web::App< impl ServiceFactory< @@ -473,7 +473,7 @@ pub fn configure_data( search_queue: Data, opt: &Opt, (logs_route, logs_stderr): (LogRouteHandle, LogStderrHandle), - analytics: Arc, + analytics: Arc, ) { let http_payload_size_limit = opt.http_payload_size_limit.as_u64() as usize; config diff --git a/meilisearch/src/routes/dump.rs b/meilisearch/src/routes/dump.rs index 7f3cd06a5..0fdeef5ed 100644 --- a/meilisearch/src/routes/dump.rs +++ b/meilisearch/src/routes/dump.rs @@ -4,7 +4,6 @@ use index_scheduler::IndexScheduler; use meilisearch_auth::AuthController; use meilisearch_types::error::ResponseError; use meilisearch_types::tasks::KindWithContent; -use serde_json::json; use tracing::debug; use crate::analytics::Analytics; @@ -18,14 +17,16 @@ pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump)))); } +crate::empty_analytics!(DumpAnalytics, "Dump Created"); + pub async fn create_dump( index_scheduler: GuardedData, Data>, auth_controller: GuardedData, Data>, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { - analytics.publish("Dump Created".to_string(), json!({}), Some(&req)); + analytics.publish(DumpAnalytics::default(), Some(&req)); let task = KindWithContent::DumpCreation { keys: auth_controller.list_keys()?, diff --git a/meilisearch/src/routes/features.rs b/meilisearch/src/routes/features.rs index bc656bdbb..24c89938d 100644 --- a/meilisearch/src/routes/features.rs +++ b/meilisearch/src/routes/features.rs @@ -6,10 +6,11 @@ use index_scheduler::IndexScheduler; use meilisearch_types::deserr::DeserrJsonError; use meilisearch_types::error::ResponseError; use meilisearch_types::keys::actions; +use serde::Serialize; use serde_json::json; use tracing::debug; -use crate::analytics::Analytics; +use crate::analytics::{Aggregate, Analytics}; use crate::extractors::authentication::policies::ActionPolicy; 
use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; @@ -22,17 +23,19 @@ pub fn configure(cfg: &mut web::ServiceConfig) { ); } +crate::empty_analytics!(GetExperimentalFeatureAnalytics, "Experimental features Seen"); + async fn get_features( index_scheduler: GuardedData< ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_GET }>, Data, >, req: HttpRequest, - analytics: Data, + analytics: Data, ) -> HttpResponse { let features = index_scheduler.features(); - analytics.publish("Experimental features Seen".to_string(), json!(null), Some(&req)); + analytics.publish(GetExperimentalFeatureAnalytics::default(), Some(&req)); let features = features.runtime_features(); debug!(returns = ?features, "Get features"); HttpResponse::Ok().json(features) @@ -53,6 +56,38 @@ pub struct RuntimeTogglableFeatures { pub contains_filter: Option, } +#[derive(Serialize)] +pub struct PatchExperimentalFeatureAnalytics { + vector_store: bool, + metrics: bool, + logs_route: bool, + edit_documents_by_function: bool, + contains_filter: bool, +} + +impl Aggregate for PatchExperimentalFeatureAnalytics { + fn event_name(&self) -> &'static str { + "Experimental features Updated" + } + + fn aggregate(self, other: Self) -> Self + where + Self: Sized, + { + Self { + vector_store: other.vector_store, + metrics: other.metrics, + logs_route: other.logs_route, + edit_documents_by_function: other.edit_documents_by_function, + contains_filter: other.contains_filter, + } + } + + fn into_event(self) -> serde_json::Value { + serde_json::to_value(self).unwrap() + } +} + async fn patch_features( index_scheduler: GuardedData< ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_UPDATE }>, @@ -60,7 +95,7 @@ async fn patch_features( >, new_features: AwebJson, req: HttpRequest, - analytics: Data, + analytics: Data, ) -> Result { let features = index_scheduler.features(); debug!(parameters = ?new_features, "Patch features"); @@ -89,14 +124,13 @@ async fn patch_features( } = 
new_features; analytics.publish( - "Experimental features Updated".to_string(), - json!({ - "vector_store": vector_store, - "metrics": metrics, - "logs_route": logs_route, - "edit_documents_by_function": edit_documents_by_function, - "contains_filter": contains_filter, - }), + PatchExperimentalFeatureAnalytics { + vector_store, + metrics, + logs_route, + edit_documents_by_function, + contains_filter, + }, Some(&req), ); index_scheduler.put_runtime_features(new_features)?; diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 85cf33c54..8f4cd026d 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -1,4 +1,6 @@ +use std::collections::HashSet; use std::io::ErrorKind; +use std::marker::PhantomData; use actix_web::http::header::CONTENT_TYPE; use actix_web::web::Data; @@ -23,14 +25,14 @@ use meilisearch_types::tasks::KindWithContent; use meilisearch_types::{milli, Document, Index}; use mime::Mime; use once_cell::sync::Lazy; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use serde_json::Value; use tempfile::tempfile; use tokio::fs::File; use tokio::io::{AsyncSeekExt, AsyncWriteExt, BufWriter}; use tracing::debug; -use crate::analytics::{Analytics, DocumentDeletionKind, DocumentFetchKind}; +use crate::analytics::{Aggregate, AggregateMethod, Analytics, DocumentDeletionKind}; use crate::error::MeilisearchHttpError; use crate::error::PayloadError::ReceivePayload; use crate::extractors::authentication::policies::*; @@ -41,7 +43,7 @@ use crate::routes::{ get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, }; use crate::search::{parse_filter, RetrieveVectors}; -use crate::Opt; +use crate::{aggregate_methods, Opt}; static ACCEPTED_CONTENT_TYPE: Lazy> = Lazy::new(|| { vec!["application/json".to_string(), "application/x-ndjson".to_string(), "text/csv".to_string()] @@ -100,12 +102,82 @@ pub struct GetDocument { 
retrieve_vectors: Param, } +#[derive(Default, Serialize)] +pub struct DocumentsFetchAggregator { + #[serde(rename = "requests.total_received")] + total_received: usize, + + // a call on ../documents/:doc_id + per_document_id: bool, + // if a filter was used + per_filter: bool, + + #[serde(rename = "vector.retrieve_vectors")] + retrieve_vectors: bool, + + // pagination + #[serde(rename = "pagination.max_limit")] + max_limit: usize, + #[serde(rename = "pagination.max_offset")] + max_offset: usize, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum DocumentFetchKind { + PerDocumentId { retrieve_vectors: bool }, + Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, +} + +impl DocumentsFetchAggregator { + pub fn from_query(query: &DocumentFetchKind) -> Self { + let (limit, offset, retrieve_vectors) = match query { + DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), + DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => { + (*limit, *offset, *retrieve_vectors) + } + }; + Self { + total_received: 1, + per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), + per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. 
} if *with_filter), + max_limit: limit, + max_offset: offset, + retrieve_vectors, + } + } +} + +impl Aggregate for DocumentsFetchAggregator { + // TODO: TAMO: Should we do the same event for the GET requests + fn event_name(&self) -> &'static str { + "Documents Fetched POST" + } + + fn aggregate(self, other: Self) -> Self + where + Self: Sized, + { + Self { + total_received: self.total_received.saturating_add(other.total_received), + per_document_id: self.per_document_id | other.per_document_id, + per_filter: self.per_filter | other.per_filter, + retrieve_vectors: self.retrieve_vectors | other.retrieve_vectors, + max_limit: self.max_limit.max(other.max_limit), + max_offset: self.max_offset.max(other.max_offset), + } + } + + fn into_event(self) -> Value { + serde_json::to_value(self).unwrap() + } +} + pub async fn get_document( index_scheduler: GuardedData, Data>, document_param: web::Path, params: AwebQueryParameter, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let DocumentParam { index_uid, document_id } = document_param.into_inner(); debug!(parameters = ?params, "Get document"); @@ -117,9 +189,12 @@ pub async fn get_document( let features = index_scheduler.features(); let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?; - analytics.get_fetch_documents( - &DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 }, - &req, + analytics.publish( + DocumentsFetchAggregator { + retrieve_vectors: param_retrieve_vectors.0, + ..Default::default() + }, + Some(&req), ); let index = index_scheduler.index(&index_uid)?; @@ -129,17 +204,57 @@ pub async fn get_document( Ok(HttpResponse::Ok().json(document)) } +#[derive(Default, Serialize)] +pub struct DocumentsDeletionAggregator { + #[serde(rename = "requests.total_received")] + total_received: usize, + per_document_id: bool, + clear_all: bool, + per_batch: bool, + per_filter: bool, +} + +impl Aggregate for DocumentsDeletionAggregator { + fn 
event_name(&self) -> &'static str { + "Documents Deleted" + } + + fn aggregate(self, other: Self) -> Self + where + Self: Sized, + { + Self { + total_received: self.total_received.saturating_add(other.total_received), + per_document_id: self.per_document_id | other.per_document_id, + clear_all: self.clear_all | other.clear_all, + per_batch: self.per_batch | other.per_batch, + per_filter: self.per_filter | other.per_filter, + } + } + + fn into_event(self) -> Value { + serde_json::to_value(self).unwrap() + } +} + pub async fn delete_document( index_scheduler: GuardedData, Data>, path: web::Path, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let DocumentParam { index_uid, document_id } = path.into_inner(); let index_uid = IndexUid::try_from(index_uid)?; - analytics.delete_documents(DocumentDeletionKind::PerDocumentId, &req); + analytics.publish( + DocumentsDeletionAggregator { + total_received: 1, + per_document_id: true, + ..Default::default() + }, + Some(&req), + ); let task = KindWithContent::DocumentDeletion { index_uid: index_uid.to_string(), @@ -190,19 +305,21 @@ pub async fn documents_by_query_post( index_uid: web::Path, body: AwebJson, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let body = body.into_inner(); debug!(parameters = ?body, "Get documents POST"); - analytics.post_fetch_documents( - &DocumentFetchKind::Normal { - with_filter: body.filter.is_some(), - limit: body.limit, - offset: body.offset, + analytics.publish( + DocumentsFetchAggregator { + total_received: 1, + per_filter: body.filter.is_some(), retrieve_vectors: body.retrieve_vectors, + max_limit: body.limit, + max_offset: body.offset, + ..Default::default() }, - &req, + Some(&req), ); documents_by_query(&index_scheduler, index_uid, body) @@ -213,7 +330,7 @@ pub async fn get_documents( index_uid: web::Path, params: AwebQueryParameter, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { 
debug!(parameters = ?params, "Get documents GET"); @@ -235,14 +352,16 @@ pub async fn get_documents( filter, }; - analytics.get_fetch_documents( - &DocumentFetchKind::Normal { - with_filter: query.filter.is_some(), - limit: query.limit, - offset: query.offset, + analytics.publish( + DocumentsFetchAggregator { + total_received: 1, + per_filter: query.filter.is_some(), retrieve_vectors: query.retrieve_vectors, + max_limit: query.limit, + max_offset: query.offset, + ..Default::default() }, - &req, + Some(&req), ); documents_by_query(&index_scheduler, index_uid, query) @@ -298,6 +417,42 @@ fn from_char_csv_delimiter( } } +aggregate_methods!( + Replaced => "Documents Added", + Updated => "Documents Updated", +); + +#[derive(Default, Serialize)] +pub struct DocumentsAggregator { + payload_types: HashSet, + primary_key: HashSet, + index_creation: bool, + #[serde(skip)] + method: PhantomData, +} + +impl Aggregate for DocumentsAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + fn aggregate(mut self, other: Self) -> Self + where + Self: Sized, + { + Self { + payload_types: self.payload_types.union(&other.payload_types).collect(), + primary_key: self.primary_key.union(&other.primary_key).collect(), + index_creation: self.index_creation | other.index_creation, + method: PhantomData, + } + } + + fn into_event(self) -> Value { + serde_json::to_value(self).unwrap() + } +} + pub async fn replace_documents( index_scheduler: GuardedData, Data>, index_uid: web::Path, @@ -305,17 +460,33 @@ pub async fn replace_documents( body: Payload, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; debug!(parameters = ?params, "Replace documents"); let params = params.into_inner(); - analytics.add_documents( - ¶ms, - index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), - &req, + let mut content_types = HashSet::new(); + let content_type = req + 
.headers() + .get(CONTENT_TYPE) + .and_then(|s| s.to_str().ok()) + .unwrap_or("unknown") + .to_string(); + content_types.insert(content_type); + let mut primary_keys = HashSet::new(); + if let Some(primary_key) = params.primary_key.clone() { + primary_keys.insert(primary_key); + } + analytics.publish( + DocumentsAggregator:: { + payload_types: content_types, + primary_key: primary_keys, + index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), + method: PhantomData, + }, + Some(&req), ); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); @@ -346,17 +517,33 @@ pub async fn update_documents( body: Payload, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; let params = params.into_inner(); debug!(parameters = ?params, "Update documents"); - analytics.add_documents( - ¶ms, - index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), - &req, + let mut content_types = HashSet::new(); + let content_type = req + .headers() + .get(CONTENT_TYPE) + .and_then(|s| s.to_str().ok()) + .unwrap_or("unknown") + .to_string(); + content_types.insert(content_type); + let mut primary_keys = HashSet::new(); + if let Some(primary_key) = params.primary_key.clone() { + primary_keys.insert(primary_key); + } + analytics.publish( + DocumentsAggregator:: { + payload_types: content_types, + primary_key: primary_keys, + index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), + method: PhantomData, + }, + Some(&req), ); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); @@ -524,12 +711,15 @@ pub async fn delete_documents_batch( body: web::Json>, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Delete documents by batch"); let index_uid = IndexUid::try_from(index_uid.into_inner())?; - 
analytics.delete_documents(DocumentDeletionKind::PerBatch, &req); + analytics.publish( + DocumentsDeletionAggregator { total_received: 1, per_batch: true, ..Default::default() }, + Some(&req), + ); let ids = body .iter() @@ -562,14 +752,17 @@ pub async fn delete_documents_by_filter( body: AwebJson, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Delete documents by filter"); let index_uid = IndexUid::try_from(index_uid.into_inner())?; let index_uid = index_uid.into_inner(); let filter = body.into_inner().filter; - analytics.delete_documents(DocumentDeletionKind::PerFilter, &req); + analytics.publish( + DocumentsDeletionAggregator { total_received: 1, per_filter: true, ..Default::default() }, + Some(&req), + ); // we ensure the filter is well formed before enqueuing it crate::search::parse_filter(&filter, Code::InvalidDocumentFilter, index_scheduler.features())? @@ -599,13 +792,44 @@ pub struct DocumentEditionByFunction { pub function: String, } +#[derive(Default, Serialize)] +struct EditDocumentsByFunctionAggregator { + // Set to true if at least one request was filtered + filtered: bool, + // Set to true if at least one request contained a context + with_context: bool, + + index_creation: bool, +} + +impl Aggregate for EditDocumentsByFunctionAggregator { + fn event_name(&self) -> &'static str { + "Documents Edited By Function" + } + + fn aggregate(self, other: Self) -> Self + where + Self: Sized, + { + Self { + filtered: self.filtered | other.filtered, + with_context: self.with_context | other.with_context, + index_creation: self.index_creation | other.index_creation, + } + } + + fn into_event(self) -> Value { + serde_json::to_value(self).unwrap() + } +} + pub async fn edit_documents_by_function( index_scheduler: GuardedData, Data>, index_uid: web::Path, params: AwebJson, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = 
?params, "Edit documents by function"); @@ -617,10 +841,13 @@ pub async fn edit_documents_by_function( let index_uid = index_uid.into_inner(); let params = params.into_inner(); - analytics.update_documents_by_function( - ¶ms, - index_scheduler.index(&index_uid).is_err(), - &req, + analytics.publish( + EditDocumentsByFunctionAggregator { + filtered: params.filter.is_some(), + with_context: params.context.is_some(), + index_creation: index_scheduler.index(&index_uid).is_err(), + }, + Some(&req), ); let DocumentEditionByFunction { filter, context, function } = params; @@ -670,10 +897,13 @@ pub async fn clear_all_documents( index_uid: web::Path, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - analytics.delete_documents(DocumentDeletionKind::ClearAll, &req); + analytics.publish( + DocumentsDeletionAggregator { total_received: 1, clear_all: true, ..Default::default() }, + Some(&req), + ); let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() }; let uid = get_task_id(&req, &opt)?; diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 1df80711d..1e9d0e15e 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -1,3 +1,5 @@ +use std::collections::{BinaryHeap, HashSet}; + use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use deserr::actix_web::AwebJson; @@ -10,14 +12,15 @@ use meilisearch_types::locales::Locale; use serde_json::Value; use tracing::debug; -use crate::analytics::{Analytics, FacetSearchAggregator}; +use crate::analytics::{Aggregate, Analytics}; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::routes::indexes::search::search_kind; use crate::search::{ - add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, 
RankingScoreThreshold, - SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, - DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, + add_search_rules, perform_facet_search, FacetSearchResult, HybridQuery, MatchingStrategy, + RankingScoreThreshold, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, + DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, + DEFAULT_SEARCH_OFFSET, }; use crate::search_queue::SearchQueue; @@ -53,13 +56,110 @@ pub struct FacetSearchQuery { pub locales: Option>, } +#[derive(Default)] +pub struct FacetSearchAggregator { + // requests + total_received: usize, + total_succeeded: usize, + time_spent: BinaryHeap, + + // The set of all facetNames that were used + facet_names: HashSet, + + // As there been any other parameter than the facetName or facetQuery ones? + additional_search_parameters_provided: bool, +} + +impl FacetSearchAggregator { + #[allow(clippy::field_reassign_with_default)] + pub fn from_query(query: &FacetSearchQuery, request: &HttpRequest) -> Self { + let FacetSearchQuery { + facet_query: _, + facet_name, + vector, + q, + filter, + matching_strategy, + attributes_to_search_on, + hybrid, + ranking_score_threshold, + locales, + } = query; + + Self { + total_received: 1, + facet_names: Some(facet_name.clone()).into_iter().collect(), + additional_search_parameters_provided: q.is_some() + || vector.is_some() + || filter.is_some() + || *matching_strategy != MatchingStrategy::default() + || attributes_to_search_on.is_some() + || hybrid.is_some() + || ranking_score_threshold.is_some() + || locales.is_some(), + ..Default::default() + } + } + + pub fn succeed(&mut self, result: &FacetSearchResult) { + let FacetSearchResult { facet_hits: _, facet_query: _, processing_time_ms } = result; + self.total_succeeded = 1; + self.time_spent.push(*processing_time_ms as usize); + } +} + +impl Aggregate for FacetSearchAggregator { + fn event_name(&self) -> &'static str { 
+ "Facet Searched POST" + } + + fn aggregate(mut self, other: Self) -> Self + where + Self: Sized, + { + self.time_spent.insert(other.time_spent); + + Self { + total_received: self.total_received.saturating_add(other.total_received), + total_succeeded: self.total_succeeded.saturating_add(other.total_succeeded), + time_spent: self.time_spent, + facet_names: self.facet_names.union(&other.facet_names).collect(), + additional_search_parameters_provided: self.additional_search_parameters_provided + | other.additional_search_parameters_provided, + } + } + + fn into_event(self) -> Value { + let Self { + total_received, + total_succeeded, + time_spent, + facet_names, + additional_search_parameters_provided, + } = self; + + serde_json::json!({ + "requests": { + "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + }, + "facets": { + "total_distinct_facet_count": facet_names.len(), + "additional_search_parameters_provided": additional_search_parameters_provided, + }, + }) + } +} + pub async fn search( index_scheduler: GuardedData, Data>, search_queue: Data, index_uid: web::Path, params: AwebJson, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -100,7 +200,7 @@ pub async fn search( if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); } - analytics.post_facet_search(aggregate); + analytics.publish(aggregate, Some(&req)); let search_result = search_result?; diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index 35b747ccf..483a48a16 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -1,3 +1,4 @@ +use std::collections::BTreeSet; use std::convert::Infallible; use actix_web::web::Data; @@ 
-18,7 +19,7 @@ use time::OffsetDateTime; use tracing::debug; use super::{get_task_id, Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT}; -use crate::analytics::Analytics; +use crate::analytics::{Aggregate, Analytics}; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; @@ -123,12 +124,34 @@ pub struct IndexCreateRequest { primary_key: Option, } +#[derive(Serialize)] +struct IndexCreatedAggregate { + primary_key: BTreeSet, +} + +impl Aggregate for IndexCreatedAggregate { + fn event_name(&self) -> &'static str { + "Index Created" + } + + fn aggregate(self, other: Self) -> Self + where + Self: Sized, + { + Self { primary_key: self.primary_key.union(&other.primary_key).collect() } + } + + fn into_event(self) -> impl Serialize { + self + } +} + pub async fn create_index( index_scheduler: GuardedData, Data>, body: AwebJson, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Create index"); let IndexCreateRequest { primary_key, uid } = body.into_inner(); @@ -136,8 +159,7 @@ pub async fn create_index( let allow_index_creation = index_scheduler.filters().allow_index_creation(&uid); if allow_index_creation { analytics.publish( - "Index Created".to_string(), - json!({ "primary_key": primary_key }), + IndexCreatedAggregate { primary_key: primary_key.iter().cloned().collect() }, Some(&req), ); @@ -194,20 +216,37 @@ pub async fn get_index( Ok(HttpResponse::Ok().json(index_view)) } +#[derive(Serialize)] +struct IndexUpdatedAggregate { + primary_key: BTreeSet, +} + +impl Aggregate for IndexUpdatedAggregate { + fn event_name(&self) -> &'static str { + "Index Updated" + } + + fn aggregate(self, other: Self) -> Self { + Self { primary_key: self.primary_key.union(&other.primary_key).collect() } + } + + fn into_event(self) -> impl Serialize { + self + } +} pub async fn 
update_index( index_scheduler: GuardedData, Data>, index_uid: web::Path, body: AwebJson, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?body, "Update index"); let index_uid = IndexUid::try_from(index_uid.into_inner())?; let body = body.into_inner(); analytics.publish( - "Index Updated".to_string(), - json!({ "primary_key": body.primary_key }), + IndexUpdatedAggregate { primary_key: body.primary_key.iter().cloned().collect() }, Some(&req), ); diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 6a8eee521..f833a57d2 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -13,6 +13,7 @@ use meilisearch_types::serde_cs::vec::CS; use serde_json::Value; use tracing::debug; +use crate::analytics::segment_analytics::{SearchGET, SearchPOST}; use crate::analytics::{Analytics, SearchAggregator}; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; @@ -225,7 +226,7 @@ pub async fn search_with_url_query( index_uid: web::Path, params: AwebQueryParameter, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { debug!(parameters = ?params, "Search get"); let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -237,7 +238,7 @@ pub async fn search_with_url_query( add_search_rules(&mut query.filter, search_rules); } - let mut aggregate = SearchAggregator::from_query(&query, &req); + let mut aggregate = SearchAggregator::::from_query(&query, &req); let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); @@ -254,7 +255,7 @@ pub async fn search_with_url_query( if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); } - analytics.get_search(aggregate); + analytics.publish(aggregate, Some(&req)); let search_result = search_result?; @@ -268,7 +269,7 @@ pub async fn search_with_post( index_uid: 
web::Path, params: AwebJson, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -280,7 +281,7 @@ pub async fn search_with_post( add_search_rules(&mut query.filter, search_rules); } - let mut aggregate = SearchAggregator::from_query(&query, &req); + let mut aggregate = SearchAggregator::::from_query(&query, &req); let index = index_scheduler.index(&index_uid)?; @@ -302,7 +303,7 @@ pub async fn search_with_post( MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc(); } } - analytics.post_search(aggregate); + analytics.publish(aggregate, Some(&req)); let search_result = search_result?; diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index aaf8673d0..112f8671b 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -1,3 +1,5 @@ +use std::collections::{BTreeSet, HashSet}; + use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use deserr::actix_web::AwebJson; @@ -7,12 +9,15 @@ use meilisearch_types::error::ResponseError; use meilisearch_types::facet_values_sort::FacetValuesSort; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::update::Setting; -use meilisearch_types::settings::{settings, RankingRuleView, SecretPolicy, Settings, Unchecked}; +use meilisearch_types::settings::{ + settings, ProximityPrecisionView, RankingRuleView, SecretPolicy, Settings, Unchecked, +}; use meilisearch_types::tasks::KindWithContent; +use serde::Serialize; use serde_json::json; use tracing::debug; -use crate::analytics::Analytics; +use crate::analytics::{Aggregate, Analytics}; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; @@ -80,7 +85,7 @@ macro_rules! 
make_setting_route { body: deserr::actix_web::AwebJson, $err_ty>, req: HttpRequest, opt: web::Data, - $analytics_var: web::Data, + $analytics_var: web::Data, ) -> std::result::Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -162,16 +167,8 @@ make_setting_route!( "filterableAttributes", analytics, |setting: &Option>, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "FilterableAttributes Updated".to_string(), - json!({ - "filterable_attributes": { - "total": setting.as_ref().map(|filter| filter.len()).unwrap_or(0), - "has_geo": setting.as_ref().map(|filter| filter.contains("_geo")).unwrap_or(false), - } - }), + crate::routes::indexes::settings::FilterableAttributesAnalytics::new(setting.as_ref()).to_settings(), Some(req), ); } @@ -188,16 +185,8 @@ make_setting_route!( "sortableAttributes", analytics, |setting: &Option>, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "SortableAttributes Updated".to_string(), - json!({ - "sortable_attributes": { - "total": setting.as_ref().map(|sort| sort.len()), - "has_geo": setting.as_ref().map(|sort| sort.contains("_geo")), - }, - }), + crate::routes::indexes::settings::SortableAttributesAnalytics::new(setting.as_ref()).to_settings(), Some(req), ); } @@ -214,16 +203,8 @@ make_setting_route!( "displayedAttributes", analytics, |displayed: &Option>, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "DisplayedAttributes Updated".to_string(), - json!({ - "displayed_attributes": { - "total": displayed.as_ref().map(|displayed| displayed.len()), - "with_wildcard": displayed.as_ref().map(|displayed| displayed.iter().any(|displayed| displayed == "*")), - }, - }), + crate::routes::indexes::settings::DisplayedAttributesAnalytics::new(displayed.as_ref()).to_settings(), Some(req), ); } @@ -240,35 +221,8 @@ make_setting_route!( "typoTolerance", analytics, |setting: &Option, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "TypoTolerance 
Updated".to_string(), - json!({ - "typo_tolerance": { - "enabled": setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))), - "disable_on_attributes": setting - .as_ref() - .and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())), - "disable_on_words": setting - .as_ref() - .and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())), - "min_word_size_for_one_typo": setting - .as_ref() - .and_then(|s| s.min_word_size_for_typos - .as_ref() - .set() - .map(|s| s.one_typo.set())) - .flatten(), - "min_word_size_for_two_typos": setting - .as_ref() - .and_then(|s| s.min_word_size_for_typos - .as_ref() - .set() - .map(|s| s.two_typos.set())) - .flatten(), - }, - }), + crate::routes::indexes::settings::TypoToleranceAnalytics::new(setting.as_ref()).to_settings(), Some(req), ); } @@ -285,16 +239,8 @@ make_setting_route!( "searchableAttributes", analytics, |setting: &Option>, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "SearchableAttributes Updated".to_string(), - json!({ - "searchable_attributes": { - "total": setting.as_ref().map(|searchable| searchable.len()), - "with_wildcard": setting.as_ref().map(|searchable| searchable.iter().any(|searchable| searchable == "*")), - }, - }), + crate::routes::indexes::settings::SearchableAttributesAnalytics::new(setting.as_ref()).to_settings(), Some(req), ); } @@ -311,15 +257,8 @@ make_setting_route!( "stopWords", analytics, |stop_words: &Option>, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "StopWords Updated".to_string(), - json!({ - "stop_words": { - "total": stop_words.as_ref().map(|stop_words| stop_words.len()), - }, - }), + crate::routes::indexes::settings::StopWordsAnalytics::new(stop_words.as_ref()).to_settings(), Some(req), ); } @@ -336,15 +275,8 @@ make_setting_route!( "nonSeparatorTokens", analytics, |non_separator_tokens: &Option>, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "nonSeparatorTokens 
Updated".to_string(), - json!({ - "non_separator_tokens": { - "total": non_separator_tokens.as_ref().map(|non_separator_tokens| non_separator_tokens.len()), - }, - }), + crate::routes::indexes::settings::NonSeparatorTokensAnalytics::new(non_separator_tokens.as_ref()).to_settings(), Some(req), ); } @@ -361,15 +293,8 @@ make_setting_route!( "separatorTokens", analytics, |separator_tokens: &Option>, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "separatorTokens Updated".to_string(), - json!({ - "separator_tokens": { - "total": separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()), - }, - }), + crate::routes::indexes::settings::SeparatorTokensAnalytics::new(separator_tokens.as_ref()).to_settings(), Some(req), ); } @@ -386,15 +311,8 @@ make_setting_route!( "dictionary", analytics, |dictionary: &Option>, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "dictionary Updated".to_string(), - json!({ - "dictionary": { - "total": dictionary.as_ref().map(|dictionary| dictionary.len()), - }, - }), + crate::routes::indexes::settings::DictionaryAnalytics::new(dictionary.as_ref()).to_settings(), Some(req), ); } @@ -411,15 +329,8 @@ make_setting_route!( "synonyms", analytics, |synonyms: &Option>>, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "Synonyms Updated".to_string(), - json!({ - "synonyms": { - "total": synonyms.as_ref().map(|synonyms| synonyms.len()), - }, - }), + crate::routes::indexes::settings::SynonymsAnalytics::new(synonyms.as_ref()).to_settings(), Some(req), ); } @@ -436,14 +347,8 @@ make_setting_route!( "distinctAttribute", analytics, |distinct: &Option, req: &HttpRequest| { - use serde_json::json; analytics.publish( - "DistinctAttribute Updated".to_string(), - json!({ - "distinct_attribute": { - "set": distinct.is_some(), - } - }), + crate::routes::indexes::settings::DistinctAttributeAnalytics::new(distinct.as_ref()).to_settings(), Some(req), ); } @@ -460,15 +365,8 @@ 
make_setting_route!( "proximityPrecision", analytics, |precision: &Option, req: &HttpRequest| { - use serde_json::json; analytics.publish( - "ProximityPrecision Updated".to_string(), - json!({ - "proximity_precision": { - "set": precision.is_some(), - "value": precision.unwrap_or_default(), - } - }), + crate::routes::indexes::settings::ProximityPrecisionAnalytics::new(precision.as_ref()).to_settings(), Some(req), ); } @@ -485,12 +383,8 @@ make_setting_route!( "localizedAttributes", analytics, |rules: &Option>, req: &HttpRequest| { - use serde_json::json; analytics.publish( - "LocalizedAttributesRules Updated".to_string(), - json!({ - "locales": rules.as_ref().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::>()) - }), + crate::routes::indexes::settings::LocalesAnalytics::new(rules.as_ref()).to_settings(), Some(req), ); } @@ -507,21 +401,8 @@ make_setting_route!( "rankingRules", analytics, |setting: &Option>, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "RankingRules Updated".to_string(), - json!({ - "ranking_rules": { - "words_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words))), - "typo_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo))), - "proximity_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Proximity))), - "attribute_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Attribute))), - "sort_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort))), - "exactness_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Exactness))), - "values": setting.as_ref().map(|rr| rr.iter().filter(|s| 
matches!(s, meilisearch_types::settings::RankingRuleView::Asc(_) | meilisearch_types::settings::RankingRuleView::Desc(_)) ).map(|x| x.to_string()).collect::>().join(", ")), - } - }), + crate::routes::indexes::settings::RankingRulesAnalytics::new(setting.as_ref()).to_settings(), Some(req), ); } @@ -538,20 +419,8 @@ make_setting_route!( "faceting", analytics, |setting: &Option, req: &HttpRequest| { - use serde_json::json; - use meilisearch_types::facet_values_sort::FacetValuesSort; - analytics.publish( - "Faceting Updated".to_string(), - json!({ - "faceting": { - "max_values_per_facet": setting.as_ref().and_then(|s| s.max_values_per_facet.set()), - "sort_facet_values_by_star_count": setting.as_ref().and_then(|s| { - s.sort_facet_values_by.as_ref().set().map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count)) - }), - "sort_facet_values_by_total": setting.as_ref().and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())), - }, - }), + crate::routes::indexes::settings::FacetingAnalytics::new(setting.as_ref()).to_settings(), Some(req), ); } @@ -568,15 +437,8 @@ make_setting_route!( "pagination", analytics, |setting: &Option, req: &HttpRequest| { - use serde_json::json; - analytics.publish( - "Pagination Updated".to_string(), - json!({ - "pagination": { - "max_total_hits": setting.as_ref().and_then(|s| s.max_total_hits.set()), - }, - }), + crate::routes::indexes::settings::PaginationAnalytics::new(setting.as_ref()).to_settings(), Some(req), ); } @@ -593,11 +455,8 @@ make_setting_route!( "embedders", analytics, |setting: &Option>>, req: &HttpRequest| { - - analytics.publish( - "Embedders Updated".to_string(), - serde_json::json!({"embedders": crate::routes::indexes::settings::embedder_analytics(setting.as_ref())}), + crate::routes::indexes::settings::EmbeddersAnalytics::new(setting.as_ref()).to_settings(), Some(req), ); } @@ -651,10 +510,15 @@ fn embedder_analytics( json!( { + // last "total": setting.as_ref().map(|s| s.len()), + // Merge the 
sources "sources": sources, + // |= "document_template_used": document_template_used, + // max "document_template_max_bytes": document_template_max_bytes, + // |= "binary_quantization_used": binary_quantization_used, } ) @@ -672,8 +536,7 @@ make_setting_route!( analytics, |setting: &Option, req: &HttpRequest| { analytics.publish( - "Search Cutoff Updated".to_string(), - serde_json::json!({"search_cutoff_ms": setting }), + crate::routes::indexes::settings::SearchCutoffMsAnalytics::new(setting.as_ref()).to_settings(), Some(req), ); } @@ -714,13 +577,639 @@ generate_configure!( search_cutoff_ms ); +#[derive(Serialize, Default)] +struct SettingsAnalytics { + ranking_rules: RankingRulesAnalytics, + searchable_attributes: SearchableAttributesAnalytics, + displayed_attributes: DisplayedAttributesAnalytics, + sortable_attributes: SortableAttributesAnalytics, + filterable_attributes: FilterableAttributesAnalytics, + distinct_attribute: DistinctAttributeAnalytics, + proximity_precision: ProximityPrecisionAnalytics, + typo_tolerance: TypoToleranceAnalytics, + faceting: FacetingAnalytics, + pagination: PaginationAnalytics, + stop_words: StopWordsAnalytics, + synonyms: SynonymsAnalytics, + embedders: EmbeddersAnalytics, + search_cutoff_ms: SearchCutoffMsAnalytics, + locales: LocalesAnalytics, + dictionary: DictionaryAnalytics, + separator_tokens: SeparatorTokensAnalytics, + non_separator_tokens: NonSeparatorTokensAnalytics, +} + +impl Aggregate for SettingsAnalytics { + fn event_name(&self) -> &'static str { + "Settings Updated" + } + + fn aggregate(self, other: Self) -> Self + where + Self: Sized, + { + Self { + ranking_rules: RankingRulesAnalytics { + words_position: self + .ranking_rules + .words_position + .or(other.ranking_rules.words_position), + typo_position: self + .ranking_rules + .typo_position + .or(other.ranking_rules.typo_position), + proximity_position: self + .ranking_rules + .proximity_position + .or(other.ranking_rules.proximity_position), + 
attribute_position: self + .ranking_rules + .attribute_position + .or(other.ranking_rules.attribute_position), + sort_position: self + .ranking_rules + .sort_position + .or(other.ranking_rules.sort_position), + exactness_position: self + .ranking_rules + .exactness_position + .or(other.ranking_rules.exactness_position), + values: self.ranking_rules.values.or(other.ranking_rules.values), + }, + searchable_attributes: SearchableAttributesAnalytics { + total: self.searchable_attributes.total.or(other.searchable_attributes.total), + with_wildcard: self + .searchable_attributes + .with_wildcard + .or(other.searchable_attributes.with_wildcard), + }, + displayed_attributes: DisplayedAttributesAnalytics { + total: self.displayed_attributes.total.or(other.displayed_attributes.total), + with_wildcard: self + .displayed_attributes + .with_wildcard + .or(other.displayed_attributes.with_wildcard), + }, + sortable_attributes: SortableAttributesAnalytics { + total: self.sortable_attributes.total.or(other.sortable_attributes.total), + has_geo: self.sortable_attributes.has_geo.or(other.sortable_attributes.has_geo), + }, + filterable_attributes: FilterableAttributesAnalytics { + total: self.filterable_attributes.total.or(other.filterable_attributes.total), + has_geo: self.filterable_attributes.has_geo.or(other.filterable_attributes.has_geo), + }, + distinct_attribute: DistinctAttributeAnalytics { + set: self.distinct_attribute.set.or(other.distinct_attribute.set), + }, + proximity_precision: ProximityPrecisionAnalytics { + set: self.proximity_precision.set(other.proximity_precision.set), + value: self.proximity_precision.value(other.proximity_precision.value), + }, + typo_tolerance: TypoToleranceAnalytics { + enabled: self.typo_tolerance.enabled.or(other.typo_tolerance.enabled), + disable_on_attributes: self + .typo_tolerance + .disable_on_attributes + .or(other.typo_tolerance.disable_on_attributes), + disable_on_words: self + .typo_tolerance + .disable_on_words + 
.or(other.typo_tolerance.disable_on_words), + min_word_size_for_one_typo: self + .typo_tolerance + .min_word_size_for_one_typo + .or(other.typo_tolerance.min_word_size_for_one_typo), + min_word_size_for_two_typos: self + .typo_tolerance + .min_word_size_for_two_typos + .or(other.typo_tolerance.min_word_size_for_two_typos), + }, + faceting: FacetingAnalytics { + max_values_per_facet: self + .faceting + .max_values_per_facet + .or(other.faceting.max_values_per_facet), + sort_facet_values_by_star_count: self + .faceting + .sort_facet_values_by_star_count + .or(other.faceting.sort_facet_values_by_star_count), + sort_facet_values_by_total: self + .faceting + .sort_facet_values_by_total + .or(other.faceting.sort_facet_values_by_total), + }, + pagination: PaginationAnalytics { + max_total_hits: self.pagination.max_total_hits.or(other.pagination.max_total_hits), + }, + stop_words: StopWordsAnalytics { + total: self.stop_words.total.or(other.stop_words.total), + }, + synonyms: SynonymsAnalytics { total: self.synonyms.total.or(other.synonyms.total) }, + embedders: EmbeddersAnalytics { + total: self.embedders.total.or(other.embedders.total), + sources: match (self.embedders.sources, other.embedders.sources) { + (None, None) => None, + (Some(sources), None) | (None, Some(sources)) => Some(sources), + (Some(this), Some(other)) => Some(this.union(&other).collect()), + }, + document_template_used: match ( + self.embedders.document_template_used, + other.embedders.document_template_used, + ) { + (None, None) => None, + (Some(used), None) | (None, Some(used)) => Some(used), + (Some(this), Some(other)) => Some(this | other), + }, + document_template_max_bytes: match ( + self.embedders.document_template_max_bytes, + other.embedders.document_template_max_bytes, + ) { + (None, None) => None, + (Some(bytes), None) | (None, Some(bytes)) => Some(bytes), + (Some(this), Some(other)) => Some(this.max(other)), + }, + binary_quantization_used: match ( + self.embedders.binary_quantization_used, 
+ other.embedders.binary_quantization_used, + ) { + (None, None) => None, + (Some(bq), None) | (None, Some(bq)) => Some(bq), + (Some(this), Some(other)) => Some(this | other), + }, + }, + search_cutoff_ms: SearchCutoffMsAnalytics { + search_cutoff_ms: self + .search_cutoff_ms + .search_cutoff_ms + .or(other.search_cutoff_ms.search_cutoff_ms), + }, + locales: LocalesAnalytics { locales: self.locales.locales.or(other.locales.locales) }, + dictionary: DictionaryAnalytics { + total: self.dictionary.total.or(other.dictionary.total), + }, + separator_tokens: SeparatorTokensAnalytics { + total: self.separator_tokens.total.or(other.non_separator_tokens.total), + }, + non_separator_tokens: NonSeparatorTokensAnalytics { + total: self.non_separator_tokens.total.or(other.non_separator_tokens.total), + }, + } + } + + fn into_event(self) -> impl Serialize + where + Self: Sized, + { + self + } +} + +#[derive(Serialize, Default)] +struct RankingRulesAnalytics { + words_position: Option, + typo_position: Option, + proximity_position: Option, + attribute_position: Option, + sort_position: Option, + exactness_position: Option, + values: Option, +} + +impl RankingRulesAnalytics { + pub fn new(rr: Option<&Vec>) -> Self { + RankingRulesAnalytics { + words_position: rr.as_ref().map(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words)) + }), + typo_position: rr.as_ref().map(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo)) + }), + proximity_position: rr.as_ref().map(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Proximity) + }) + }), + attribute_position: rr.as_ref().map(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Attribute) + }) + }), + sort_position: rr.as_ref().map(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort)) + }), + exactness_position: 
rr.as_ref().map(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Exactness) + }) + }), + values: rr.as_ref().map(|rr| { + rr.iter() + .filter(|s| { + matches!( + s, + meilisearch_types::settings::RankingRuleView::Asc(_) + | meilisearch_types::settings::RankingRuleView::Desc(_) + ) + }) + .map(|x| x.to_string()) + .collect::>() + .join(", ") + }), + } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { ranking_rules: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct SearchableAttributesAnalytics { + total: Option, + with_wildcard: bool, +} + +impl SearchableAttributesAnalytics { + pub fn new(setting: Option<&Vec>) -> Self { + Self { + total: setting.as_ref().map(|searchable| searchable.len()), + with_wildcard: setting + .as_ref() + .map(|searchable| searchable.iter().any(|searchable| searchable == "*")), + } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { searchable_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct DisplayedAttributesAnalytics { + total: usize, + with_wildcard: bool, +} + +impl DisplayedAttributesAnalytics { + pub fn new(displayed: Option<&Vec>) -> Self { + Self { + total: displayed.as_ref().map(|displayed| displayed.len()), + with_wildcard: displayed + .as_ref() + .map(|displayed| displayed.iter().any(|displayed| displayed == "*")), + } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { displayed_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct SortableAttributesAnalytics { + total: usize, + has_geo: bool, +} + +impl SortableAttributesAnalytics { + pub fn new(setting: Option<&std::collections::BTreeSet>) -> Self { + Self { + total: setting.as_ref().map(|sort| sort.len()), + has_geo: setting.as_ref().map(|sort| sort.contains("_geo")), + } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { 
sortable_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct FilterableAttributesAnalytics { + total: usize, + has_geo: bool, +} + +impl FilterableAttributesAnalytics { + pub fn new(setting: Option<&std::collections::BTreeSet>) -> Self { + Self { + total: setting.as_ref().map(|filter| filter.len()).unwrap_or(0), + has_geo: setting.as_ref().map(|filter| filter.contains("_geo")).unwrap_or(false), + } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { filterable_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct DistinctAttributeAnalytics { + set: bool, +} + +impl DistinctAttributeAnalytics { + pub fn new(distinct: Option<&String>) -> Self { + Self { set: distinct.is_some() } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { distinct_attribute: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct ProximityPrecisionAnalytics { + set: bool, + value: Option, +} + +impl ProximityPrecisionAnalytics { + pub fn new(precision: Option<&meilisearch_types::settings::ProximityPrecisionView>) -> Self { + Self { set: precision.is_some(), value: precision.unwrap_or_default() } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { proximity_precision: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct TypoToleranceAnalytics { + enabled: Option, + disable_on_attributes: Option, + disable_on_words: Option, + min_word_size_for_one_typo: Option, + min_word_size_for_two_typos: Option, +} + +impl TypoToleranceAnalytics { + pub fn new(setting: Option<&meilisearch_types::settings::TypoSettings>) -> Self { + Self { + enabled: setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))), + disable_on_attributes: setting + .as_ref() + .and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())), + disable_on_words: setting + .as_ref() + .and_then(|s| 
s.disable_on_words.as_ref().set().map(|m| !m.is_empty())), + min_word_size_for_one_typo: setting + .as_ref() + .and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.one_typo.set())) + .flatten(), + min_word_size_for_two_typos: setting + .as_ref() + .and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.two_typos.set())) + .flatten(), + } + } + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { typo_tolerance: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct FacetingAnalytics { + max_values_per_facet: Option, + sort_facet_values_by_star_count: Option, + sort_facet_values_by_total: Option, +} + +impl FacetingAnalytics { + pub fn new(setting: Option<&meilisearch_types::settings::FacetingSettings>) -> Self { + Self { + max_values_per_facet: setting.as_ref().and_then(|s| s.max_values_per_facet.set()), + sort_facet_values_by_star_count: setting.as_ref().and_then(|s| { + s.sort_facet_values_by + .as_ref() + .set() + .map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count)) + }), + sort_facet_values_by_total: setting + .as_ref() + .and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())), + } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { faceting: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct PaginationAnalytics { + max_total_hits: Option, +} + +impl PaginationAnalytics { + pub fn new(setting: Option<&meilisearch_types::settings::PaginationSettings>) -> Self { + Self { max_total_hits: setting.as_ref().and_then(|s| s.max_total_hits.set()) } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { pagination: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct StopWordsAnalytics { + total: Option, +} + +impl StopWordsAnalytics { + pub fn new(stop_words: Option<&BTreeSet>) -> Self { + Self { total: stop_words.as_ref().map(|stop_words| stop_words.len()) } + } + + pub fn 
to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { stop_words: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct SynonymsAnalytics { + total: Option, +} + +impl SynonymsAnalytics { + pub fn new(synonyms: Option<&std::collections::BTreeMap>>) -> Self { + Self { total: synonyms.as_ref().map(|synonyms| synonyms.len()) } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { synonyms: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct EmbeddersAnalytics { + // last + total: Option, + // Merge the sources + sources: Option>, + // |= + document_template_used: Option, + // max + document_template_max_bytes: Option, + // |= + binary_quantization_used: Option, +} + +impl EmbeddersAnalytics { + pub fn new( + setting: Option< + &std::collections::BTreeMap< + String, + Setting, + >, + >, + ) -> Self { + let mut sources = std::collections::HashSet::new(); + + if let Some(s) = &setting { + for source in s + .values() + .filter_map(|config| config.clone().set()) + .filter_map(|config| config.source.set()) + { + use meilisearch_types::milli::vector::settings::EmbedderSource; + match source { + EmbedderSource::OpenAi => sources.insert("openAi"), + EmbedderSource::HuggingFace => sources.insert("huggingFace"), + EmbedderSource::UserProvided => sources.insert("userProvided"), + EmbedderSource::Ollama => sources.insert("ollama"), + EmbedderSource::Rest => sources.insert("rest"), + }; + } + }; + + Self { + total: setting.as_ref().map(|s| s.len()), + sources, + document_template_used: setting.as_ref().map(|map| { + map.values() + .filter_map(|config| config.clone().set()) + .any(|config| config.document_template.set().is_some()) + }), + document_template_max_bytes: setting.as_ref().and_then(|map| { + map.values() + .filter_map(|config| config.clone().set()) + .filter_map(|config| config.document_template_max_bytes.set()) + .max() + }), + binary_quantization_used: setting.as_ref().map(|map| { + 
map.values() + .filter_map(|config| config.clone().set()) + .any(|config| config.binary_quantized.set().is_some()) + }), + } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { embedders: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +#[serde(transparent)] +struct SearchCutoffMsAnalytics { + search_cutoff_ms: Option, +} + +impl SearchCutoffMsAnalytics { + pub fn new(setting: Option<&u64>) -> Self { + Self { search_cutoff_ms: setting } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { search_cutoff_ms: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +#[serde(transparent)] +struct LocalesAnalytics { + locales: BTreeSet, +} + +impl LocalesAnalytics { + pub fn new( + rules: Option<&Vec>, + ) -> Self { + LocalesAnalytics { + locales: rules.as_ref().map(|rules| { + rules + .iter() + .flat_map(|rule| rule.locales.iter().cloned()) + .collect::>() + }), + } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { locales: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct DictionaryAnalytics { + total: usize, +} + +impl DictionaryAnalytics { + pub fn new(dictionary: Option<&std::collections::BTreeSet>) -> Self { + Self { total: dictionary.as_ref().map(|dictionary| dictionary.len()) } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { dictionary: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct SeparatorTokensAnalytics { + total: usize, +} + +impl SeparatorTokensAnalytics { + pub fn new(separator_tokens: Option<&std::collections::BTreeSet>) -> Self { + Self { total: separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()) } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { separator_tokens: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +struct NonSeparatorTokensAnalytics { + total: usize, +} + +impl 
NonSeparatorTokensAnalytics { + pub fn new(non_separator_tokens: Option<&std::collections::BTreeSet>) -> Self { + Self { + total: non_separator_tokens + .as_ref() + .map(|non_separator_tokens| non_separator_tokens.len()), + } + } + + pub fn to_settings(self) -> SettingsAnalytics { + SettingsAnalytics { non_separator_tokens: self, ..Default::default() } + } +} + pub async fn update_all( index_scheduler: GuardedData, Data>, index_uid: web::Path, body: AwebJson, DeserrJsonError>, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -729,103 +1218,44 @@ pub async fn update_all( let new_settings = validate_settings(new_settings, &index_scheduler)?; analytics.publish( - "Settings Updated".to_string(), - json!({ - "ranking_rules": { - "words_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Words))), - "typo_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Typo))), - "proximity_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Proximity))), - "attribute_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Attribute))), - "sort_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Sort))), - "exactness_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Exactness))), - "values": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().filter(|s| !matches!(s, RankingRuleView::Asc(_) | RankingRuleView::Desc(_)) ).map(|x| x.to_string()).collect::>().join(", ")), - }, - "searchable_attributes": { - "total": new_settings.searchable_attributes.as_ref().set().map(|searchable| searchable.len()), - 
"with_wildcard": new_settings.searchable_attributes.as_ref().set().map(|searchable| searchable.iter().any(|searchable| searchable == "*")), - }, - "displayed_attributes": { - "total": new_settings.displayed_attributes.as_ref().set().map(|displayed| displayed.len()), - "with_wildcard": new_settings.displayed_attributes.as_ref().set().map(|displayed| displayed.iter().any(|displayed| displayed == "*")), - }, - "sortable_attributes": { - "total": new_settings.sortable_attributes.as_ref().set().map(|sort| sort.len()), - "has_geo": new_settings.sortable_attributes.as_ref().set().map(|sort| sort.iter().any(|s| s == "_geo")), - }, - "filterable_attributes": { - "total": new_settings.filterable_attributes.as_ref().set().map(|filter| filter.len()), - "has_geo": new_settings.filterable_attributes.as_ref().set().map(|filter| filter.iter().any(|s| s == "_geo")), - }, - "distinct_attribute": { - "set": new_settings.distinct_attribute.as_ref().set().is_some() - }, - "proximity_precision": { - "set": new_settings.proximity_precision.as_ref().set().is_some(), - "value": new_settings.proximity_precision.as_ref().set().copied().unwrap_or_default() - }, - "typo_tolerance": { - "enabled": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.enabled.as_ref().set()) - .copied(), - "disable_on_attributes": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())), - "disable_on_words": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())), - "min_word_size_for_one_typo": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.min_word_size_for_typos - .as_ref() - .set() - .map(|s| s.one_typo.set())) - .flatten(), - "min_word_size_for_two_typos": new_settings.typo_tolerance - .as_ref() - .set() - .and_then(|s| s.min_word_size_for_typos - .as_ref() - .set() - .map(|s| s.two_typos.set())) - .flatten(), - }, - "faceting": { - 
"max_values_per_facet": new_settings.faceting - .as_ref() - .set() - .and_then(|s| s.max_values_per_facet.as_ref().set()), - "sort_facet_values_by_star_count": new_settings.faceting - .as_ref() - .set() - .and_then(|s| { - s.sort_facet_values_by.as_ref().set().map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count)) - }), - "sort_facet_values_by_total": new_settings.faceting - .as_ref() - .set() - .and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())), - }, - "pagination": { - "max_total_hits": new_settings.pagination - .as_ref() - .set() - .and_then(|s| s.max_total_hits.as_ref().set()), - }, - "stop_words": { - "total": new_settings.stop_words.as_ref().set().map(|stop_words| stop_words.len()), - }, - "synonyms": { - "total": new_settings.synonyms.as_ref().set().map(|synonyms| synonyms.len()), - }, - "embedders": crate::routes::indexes::settings::embedder_analytics(new_settings.embedders.as_ref().set()), - "search_cutoff_ms": new_settings.search_cutoff_ms.as_ref().set(), - "locales": new_settings.localized_attributes.as_ref().set().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::>()), - }), + SettingsAnalytics { + ranking_rules: RankingRulesAnalytics::new(new_settings.ranking_rules.as_ref().set()), + searchable_attributes: SearchableAttributesAnalytics::new( + new_settings.searchable_attributes.as_ref().set(), + ), + displayed_attributes: DisplayedAttributesAnalytics::new( + new_settings.displayed_attributes.as_ref().set(), + ), + sortable_attributes: SortableAttributesAnalytics::new( + new_settings.sortable_attributes.as_ref().set(), + ), + filterable_attributes: FilterableAttributesAnalytics::new( + new_settings.filterable_attributes.as_ref().set(), + ), + distinct_attribute: DistinctAttributeAnalytics::new( + new_settings.distinct_attribute.as_ref().set(), + ), + proximity_precision: ProximityPrecisionAnalytics::new( + new_settings.proximity_precision.as_ref().set(), + ), + typo_tolerance: 
TypoToleranceAnalytics::new(new_settings.typo_tolerance.as_ref().set()), + faceting: FacetingAnalytics::new(new_settings.faceting.as_ref().set()), + pagination: PaginationAnalytics::new(new_settings.pagination.as_ref().set()), + stop_words: StopWordsAnalytics::new(new_settings.stop_words.as_ref().set()), + synonyms: SynonymsAnalytics::new(new_settings.synonyms.as_ref().set()), + embedders: EmbeddersAnalytics::new(new_settings.embedders.as_ref().set()), + search_cutoff_ms: SearchCutoffMsAnalytics::new( + new_settings.search_cutoff_ms.as_ref().set(), + ), + locales: LocalesAnalytics::new(new_settings.localized_attributes.as_ref().set()), + dictionary: DictionaryAnalytics::new(new_settings.dictionary.as_ref().set()), + separator_tokens: SeparatorTokensAnalytics::new( + new_settings.separator_tokens.as_ref().set(), + ), + non_separator_tokens: NonSeparatorTokensAnalytics::new( + new_settings.non_separator_tokens.as_ref().set(), + ), + }, Some(&req), ); diff --git a/meilisearch/src/routes/swap_indexes.rs b/meilisearch/src/routes/swap_indexes.rs index 51a7b0707..34e904230 100644 --- a/meilisearch/src/routes/swap_indexes.rs +++ b/meilisearch/src/routes/swap_indexes.rs @@ -40,7 +40,7 @@ pub async fn swap_indexes( analytics.publish( "Indexes Swapped".to_string(), json!({ - "swap_operation_number": params.len(), + "swap_operation_number": params.len(), // Return the max ever encountered }), Some(&req), ); From e66fccc3f2e8c9ef9f576f9484d1135bf02716e6 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 16 Oct 2024 15:51:48 +0200 Subject: [PATCH 52/92] get rids of the analytics closure --- meilisearch/src/routes/indexes/settings.rs | 216 +++------------------ 1 file changed, 24 insertions(+), 192 deletions(-) diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 112f8671b..db83cb39b 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -14,7 +14,6 @@ use 
meilisearch_types::settings::{ }; use meilisearch_types::tasks::KindWithContent; use serde::Serialize; -use serde_json::json; use tracing::debug; use crate::analytics::{Aggregate, Analytics}; @@ -25,7 +24,7 @@ use crate::Opt; #[macro_export] macro_rules! make_setting_route { - ($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics_var:ident, $analytics:expr) => { + ($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics:ident) => { pub mod $attr { use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse, Resource}; @@ -85,7 +84,7 @@ macro_rules! make_setting_route { body: deserr::actix_web::AwebJson, $err_ty>, req: HttpRequest, opt: web::Data, - $analytics_var: web::Data, + analytics: web::Data, ) -> std::result::Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; @@ -93,7 +92,10 @@ macro_rules! make_setting_route { debug!(parameters = ?body, "Update settings"); #[allow(clippy::redundant_closure_call)] - $analytics(&body, &req); + analytics.publish( + $crate::routes::indexes::settings::$analytics::new(body.as_ref()).to_settings(), + Some(&req), + ); let new_settings = Settings { $attr: match body { @@ -165,13 +167,7 @@ make_setting_route!( >, filterable_attributes, "filterableAttributes", - analytics, - |setting: &Option>, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::FilterableAttributesAnalytics::new(setting.as_ref()).to_settings(), - Some(req), - ); - } + FilterableAttributesAnalytics ); make_setting_route!( @@ -183,13 +179,7 @@ make_setting_route!( >, sortable_attributes, "sortableAttributes", - analytics, - |setting: &Option>, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::SortableAttributesAnalytics::new(setting.as_ref()).to_settings(), - Some(req), - ); - } + SortableAttributesAnalytics ); make_setting_route!( @@ -201,13 +191,7 @@ make_setting_route!( >, 
displayed_attributes, "displayedAttributes", - analytics, - |displayed: &Option>, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::DisplayedAttributesAnalytics::new(displayed.as_ref()).to_settings(), - Some(req), - ); - } + DisplayedAttributesAnalytics ); make_setting_route!( @@ -219,13 +203,7 @@ make_setting_route!( >, typo_tolerance, "typoTolerance", - analytics, - |setting: &Option, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::TypoToleranceAnalytics::new(setting.as_ref()).to_settings(), - Some(req), - ); - } + TypoToleranceAnalytics ); make_setting_route!( @@ -237,13 +215,7 @@ make_setting_route!( >, searchable_attributes, "searchableAttributes", - analytics, - |setting: &Option>, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::SearchableAttributesAnalytics::new(setting.as_ref()).to_settings(), - Some(req), - ); - } + SearchableAttributesAnalytics ); make_setting_route!( @@ -255,13 +227,7 @@ make_setting_route!( >, stop_words, "stopWords", - analytics, - |stop_words: &Option>, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::StopWordsAnalytics::new(stop_words.as_ref()).to_settings(), - Some(req), - ); - } + StopWordsAnalytics ); make_setting_route!( @@ -273,13 +239,7 @@ make_setting_route!( >, non_separator_tokens, "nonSeparatorTokens", - analytics, - |non_separator_tokens: &Option>, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::NonSeparatorTokensAnalytics::new(non_separator_tokens.as_ref()).to_settings(), - Some(req), - ); - } + NonSeparatorTokensAnalytics ); make_setting_route!( @@ -291,13 +251,7 @@ make_setting_route!( >, separator_tokens, "separatorTokens", - analytics, - |separator_tokens: &Option>, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::SeparatorTokensAnalytics::new(separator_tokens.as_ref()).to_settings(), - Some(req), - ); - } + SeparatorTokensAnalytics ); 
make_setting_route!( @@ -309,13 +263,7 @@ make_setting_route!( >, dictionary, "dictionary", - analytics, - |dictionary: &Option>, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::DictionaryAnalytics::new(dictionary.as_ref()).to_settings(), - Some(req), - ); - } + DictionaryAnalytics ); make_setting_route!( @@ -327,13 +275,7 @@ make_setting_route!( >, synonyms, "synonyms", - analytics, - |synonyms: &Option>>, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::SynonymsAnalytics::new(synonyms.as_ref()).to_settings(), - Some(req), - ); - } + SynonymsAnalytics ); make_setting_route!( @@ -345,13 +287,7 @@ make_setting_route!( >, distinct_attribute, "distinctAttribute", - analytics, - |distinct: &Option, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::DistinctAttributeAnalytics::new(distinct.as_ref()).to_settings(), - Some(req), - ); - } + DistinctAttributeAnalytics ); make_setting_route!( @@ -363,13 +299,7 @@ make_setting_route!( >, proximity_precision, "proximityPrecision", - analytics, - |precision: &Option, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::ProximityPrecisionAnalytics::new(precision.as_ref()).to_settings(), - Some(req), - ); - } + ProximityPrecisionAnalytics ); make_setting_route!( @@ -381,13 +311,7 @@ make_setting_route!( >, localized_attributes, "localizedAttributes", - analytics, - |rules: &Option>, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::LocalesAnalytics::new(rules.as_ref()).to_settings(), - Some(req), - ); - } + LocalesAnalytics ); make_setting_route!( @@ -399,13 +323,7 @@ make_setting_route!( >, ranking_rules, "rankingRules", - analytics, - |setting: &Option>, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::RankingRulesAnalytics::new(setting.as_ref()).to_settings(), - Some(req), - ); - } + RankingRulesAnalytics ); make_setting_route!( @@ -417,13 +335,7 @@ 
make_setting_route!( >, faceting, "faceting", - analytics, - |setting: &Option, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::FacetingAnalytics::new(setting.as_ref()).to_settings(), - Some(req), - ); - } + FacetingAnalytics ); make_setting_route!( @@ -435,13 +347,7 @@ make_setting_route!( >, pagination, "pagination", - analytics, - |setting: &Option, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::PaginationAnalytics::new(setting.as_ref()).to_settings(), - Some(req), - ); - } + PaginationAnalytics ); make_setting_route!( @@ -453,77 +359,9 @@ make_setting_route!( >, embedders, "embedders", - analytics, - |setting: &Option>>, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::EmbeddersAnalytics::new(setting.as_ref()).to_settings(), - Some(req), - ); - } + EmbeddersAnalytics ); -fn embedder_analytics( - setting: Option< - &std::collections::BTreeMap< - String, - Setting, - >, - >, -) -> serde_json::Value { - let mut sources = std::collections::HashSet::new(); - - if let Some(s) = &setting { - for source in s - .values() - .filter_map(|config| config.clone().set()) - .filter_map(|config| config.source.set()) - { - use meilisearch_types::milli::vector::settings::EmbedderSource; - match source { - EmbedderSource::OpenAi => sources.insert("openAi"), - EmbedderSource::HuggingFace => sources.insert("huggingFace"), - EmbedderSource::UserProvided => sources.insert("userProvided"), - EmbedderSource::Ollama => sources.insert("ollama"), - EmbedderSource::Rest => sources.insert("rest"), - }; - } - }; - - let document_template_used = setting.as_ref().map(|map| { - map.values() - .filter_map(|config| config.clone().set()) - .any(|config| config.document_template.set().is_some()) - }); - - let document_template_max_bytes = setting.as_ref().and_then(|map| { - map.values() - .filter_map(|config| config.clone().set()) - .filter_map(|config| config.document_template_max_bytes.set()) - .max() - }); - 
- let binary_quantization_used = setting.as_ref().map(|map| { - map.values() - .filter_map(|config| config.clone().set()) - .any(|config| config.binary_quantized.set().is_some()) - }); - - json!( - { - // last - "total": setting.as_ref().map(|s| s.len()), - // Merge the sources - "sources": sources, - // |= - "document_template_used": document_template_used, - // max - "document_template_max_bytes": document_template_max_bytes, - // |= - "binary_quantization_used": binary_quantization_used, - } - ) -} - make_setting_route!( "/search-cutoff-ms", put, @@ -533,13 +371,7 @@ make_setting_route!( >, search_cutoff_ms, "searchCutoffMs", - analytics, - |setting: &Option, req: &HttpRequest| { - analytics.publish( - crate::routes::indexes::settings::SearchCutoffMsAnalytics::new(setting.as_ref()).to_settings(), - Some(req), - ); - } + SearchCutoffMsAnalytics ); macro_rules! generate_configure { From fdeb47fb549a242d318a17195e1a804e50aef5dd Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 16 Oct 2024 17:16:33 +0200 Subject: [PATCH 53/92] implements all routes --- meilisearch/src/analytics/mod.rs | 14 +- .../src/analytics/segment_analytics.rs | 239 +++++++----------- meilisearch/src/routes/dump.rs | 2 +- meilisearch/src/routes/features.rs | 8 +- meilisearch/src/routes/indexes/documents.rs | 20 +- .../src/routes/indexes/facet_search.rs | 2 +- meilisearch/src/routes/indexes/mod.rs | 4 +- meilisearch/src/routes/indexes/search.rs | 4 +- meilisearch/src/routes/indexes/settings.rs | 152 ++++++----- meilisearch/src/routes/indexes/similar.rs | 13 +- meilisearch/src/routes/multi_search.rs | 6 +- meilisearch/src/routes/snapshot.rs | 7 +- meilisearch/src/routes/swap_indexes.rs | 32 ++- meilisearch/src/routes/tasks.rs | 129 +++++++--- 14 files changed, 337 insertions(+), 295 deletions(-) diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index a8658d830..a0ca47d8f 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -16,7 +16,9 
@@ use serde::Serialize; // if the feature analytics is enabled we use the real analytics pub type SegmentAnalytics = segment_analytics::SegmentAnalytics; pub use segment_analytics::SearchAggregator; -pub type SimilarAggregator = segment_analytics::SimilarAggregator; +pub use segment_analytics::SimilarAggregator; + +use self::segment_analytics::extract_user_agents; pub type MultiSearchAggregator = segment_analytics::MultiSearchAggregator; pub type FacetSearchAggregator = segment_analytics::FacetSearchAggregator; @@ -32,14 +34,11 @@ macro_rules! empty_analytics { $event_name } - fn aggregate(self, other: Self) -> Self - where - Self: Sized, - { + fn aggregate(self, _other: Self) -> Self { self } - fn into_event(self) -> serde_json::Value { + fn into_event(self) -> impl serde::Serialize { serde_json::json!({}) } } @@ -150,7 +149,8 @@ impl Analytics { } /// The method used to publish most analytics that do not need to be batched every hours - pub fn publish(&self, send: impl Aggregate, request: Option<&HttpRequest>) { + pub fn publish(&self, send: impl Aggregate, request: &HttpRequest) { let Some(segment) = self.inner else { return }; + let user_agents = extract_user_agents(request); } } diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 8a6dfd780..0572267e1 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -71,25 +71,8 @@ pub fn extract_user_agents(request: &HttpRequest) -> Vec { .collect() } -pub enum AnalyticsMsg { - BatchMessage(Track), - AggregateGetSearch(SearchAggregator), - AggregatePostSearch(SearchAggregator), - AggregateGetSimilar(SimilarAggregator), - AggregatePostSimilar(SimilarAggregator), - AggregatePostMultiSearch(MultiSearchAggregator), - AggregatePostFacetSearch(FacetSearchAggregator), - AggregateAddDocuments(DocumentsAggregator), - AggregateDeleteDocuments(DocumentsDeletionAggregator), - 
AggregateUpdateDocuments(DocumentsAggregator), - AggregateEditDocumentsByFunction(EditDocumentsByFunctionAggregator), - AggregateGetFetchDocuments(DocumentsFetchAggregator), - AggregatePostFetchDocuments(DocumentsFetchAggregator), -} - pub struct SegmentAnalytics { pub instance_uid: InstanceUid, - sender: Sender, pub user: User, } @@ -1083,8 +1066,6 @@ impl Aggregate for SearchAggregator { #[derive(Default)] pub struct MultiSearchAggregator { - timestamp: Option, - // requests total_received: usize, total_succeeded: usize, @@ -1103,9 +1084,6 @@ pub struct MultiSearchAggregator { // federation use_federation: bool, - - // context - user_agents: HashSet, } impl MultiSearchAggregator { @@ -1113,10 +1091,6 @@ impl MultiSearchAggregator { federated_search: &FederatedSearch, request: &HttpRequest, ) -> Self { - let timestamp = Some(OffsetDateTime::now_utc()); - - let user_agents = extract_user_agents(request).into_iter().collect(); - let use_federation = federated_search.federation.is_some(); let distinct_indexes: HashSet<_> = federated_search @@ -1166,7 +1140,6 @@ impl MultiSearchAggregator { federated_search.queries.iter().any(|query| query.show_ranking_score_details); Self { - timestamp, total_received: 1, total_succeeded: 0, total_distinct_index_count: distinct_indexes.len(), @@ -1174,7 +1147,6 @@ impl MultiSearchAggregator { total_search_count: federated_search.queries.len(), show_ranking_score, show_ranking_score_details, - user_agents, use_federation, } } @@ -1182,15 +1154,20 @@ impl MultiSearchAggregator { pub fn succeed(&mut self) { self.total_succeeded = self.total_succeeded.saturating_add(1); } +} + +impl Aggregate for MultiSearchAggregator { + fn event_name(&self) -> &'static str { + "Documents Searched by Multi-Search POST" + } /// Aggregate one [MultiSearchAggregator] into another. 
- pub fn aggregate(&mut self, other: Self) { + fn aggregate(mut self, other: Self) -> Self { // write the aggregate in a way that will cause a compilation error if a field is added. // get ownership of self, replacing it by a default value. - let this = std::mem::take(self); + let this = self; - let timestamp = this.timestamp.or(other.timestamp); let total_received = this.total_received.saturating_add(other.total_received); let total_succeeded = this.total_succeeded.saturating_add(other.total_succeeded); let total_distinct_index_count = @@ -1207,75 +1184,53 @@ impl MultiSearchAggregator { user_agents.insert(user_agent); } - // need all fields or compile error - let mut aggregated = Self { - timestamp, + Self { total_received, total_succeeded, total_distinct_index_count, total_single_index, total_search_count, - user_agents, show_ranking_score, show_ranking_score_details, use_federation, - // do not add _ or ..Default::default() here - }; - - // replace the default self with the aggregated value - std::mem::swap(self, &mut aggregated); + } } - pub fn into_event(self, user: &User, event_name: &str) -> Option { + fn into_event(self) -> impl Serialize { let Self { - timestamp, total_received, total_succeeded, total_distinct_index_count, total_single_index, total_search_count, - user_agents, show_ranking_score, show_ranking_score_details, use_federation, } = self; - if total_received == 0 { - None - } else { - let properties = json!({ - "user-agent": user_agents, - "requests": { - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - }, - "indexes": { - "total_single_index": total_single_index, - "total_distinct_index_count": total_distinct_index_count, - "avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // not 0 else returned early - }, - "searches": { - "total_search_count": total_search_count, - 
"avg_search_count": (total_search_count as f64) / (total_received as f64), - }, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - }, - "federation": { - "use_federation": use_federation, - } - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } + json!({ + "requests": { + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + }, + "indexes": { + "total_single_index": total_single_index, + "total_distinct_index_count": total_distinct_index_count, + "avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // not 0 else returned early + }, + "searches": { + "total_search_count": total_search_count, + "avg_search_count": (total_search_count as f64) / (total_received as f64), + }, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + }, + "federation": { + "use_federation": use_federation, + } + }) } } @@ -1752,13 +1707,13 @@ impl DocumentsFetchAggregator { } } +aggregate_methods!( + SimilarPOST => "Similar POST", + SimilarGET => "Similar GET", +); + #[derive(Default)] -pub struct SimilarAggregator { - timestamp: Option, - - // context - user_agents: HashSet, - +pub struct SimilarAggregator { // requests total_received: usize, total_succeeded: usize, @@ -1787,9 +1742,11 @@ pub struct SimilarAggregator { show_ranking_score: bool, show_ranking_score_details: bool, ranking_score_threshold: bool, + + marker: std::marker::PhantomData, } -impl SimilarAggregator { +impl SimilarAggregator { #[allow(clippy::field_reassign_with_default)] pub fn from_query(query: &SimilarQuery, request: &HttpRequest) -> Self { let SimilarQuery { @@ -1854,12 +1811,16 @@ impl SimilarAggregator { 
self.time_spent.push(*processing_time_ms as usize); } +} + +impl Aggregate for SimilarAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } /// Aggregate one [SimilarAggregator] into another. - pub fn aggregate(&mut self, mut other: Self) { + fn aggregate(mut self, mut other: Self) -> Self { let Self { - timestamp, - user_agents, total_received, total_succeeded, ref mut time_spent, @@ -1875,17 +1836,9 @@ impl SimilarAggregator { show_ranking_score_details, ranking_score_threshold, retrieve_vectors, + marker: _, } = other; - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // context - for user_agent in user_agents.into_iter() { - self.user_agents.insert(user_agent); - } - // request self.total_received = self.total_received.saturating_add(total_received); self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); @@ -1917,12 +1870,12 @@ impl SimilarAggregator { self.show_ranking_score |= show_ranking_score; self.show_ranking_score_details |= show_ranking_score_details; self.ranking_score_threshold |= ranking_score_threshold; + + self } - pub fn into_event(self, user: &User, event_name: &str) -> Option { + fn into_event(self) -> impl Serialize { let Self { - timestamp, - user_agents, total_received, total_succeeded, time_spent, @@ -1938,56 +1891,44 @@ impl SimilarAggregator { show_ranking_score_details, ranking_score_threshold, retrieve_vectors, + marker: _, } = self; - if total_received == 0 { - None - } else { - // we get all the values in a sorted manner - let time_spent = time_spent.into_sorted_vec(); - // the index of the 99th percentage of value - let percentile_99th = time_spent.len() * 99 / 100; - // We are only interested by the slowest value of the 99th fastest results - let time_spent = time_spent.get(percentile_99th); + // we get all the values in a sorted manner + let time_spent = time_spent.into_sorted_vec(); + // the index of the 99th percentage of value + let percentile_99th = 
time_spent.len() * 99 / 100; + // We are only interested by the slowest value of the 99th fastest results + let time_spent = time_spent.get(percentile_99th); - let properties = json!({ - "user-agent": user_agents, - "requests": { - "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - }, - "filter": { - "with_geoRadius": filter_with_geo_radius, - "with_geoBoundingBox": filter_with_geo_bounding_box, - "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), - "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "vector": { - "retrieve_vectors": retrieve_vectors, - }, - "pagination": { - "max_limit": max_limit, - "max_offset": max_offset, - }, - "formatting": { - "max_attributes_to_retrieve": max_attributes_to_retrieve, - }, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - "ranking_score_threshold": ranking_score_threshold, - }, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } + json!({ + "requests": { + "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + }, + "filter": { + "with_geoRadius": filter_with_geo_radius, + "with_geoBoundingBox": filter_with_geo_bounding_box, + "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), + "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + 
"vector": { + "retrieve_vectors": retrieve_vectors, + }, + "pagination": { + "max_limit": max_limit, + "max_offset": max_offset, + }, + "formatting": { + "max_attributes_to_retrieve": max_attributes_to_retrieve, + }, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, + } + }) } } diff --git a/meilisearch/src/routes/dump.rs b/meilisearch/src/routes/dump.rs index 0fdeef5ed..c78dc4dad 100644 --- a/meilisearch/src/routes/dump.rs +++ b/meilisearch/src/routes/dump.rs @@ -26,7 +26,7 @@ pub async fn create_dump( opt: web::Data, analytics: web::Data, ) -> Result { - analytics.publish(DumpAnalytics::default(), Some(&req)); + analytics.publish(DumpAnalytics::default(), &req); let task = KindWithContent::DumpCreation { keys: auth_controller.list_keys()?, diff --git a/meilisearch/src/routes/features.rs b/meilisearch/src/routes/features.rs index 24c89938d..4ee5b37b0 100644 --- a/meilisearch/src/routes/features.rs +++ b/meilisearch/src/routes/features.rs @@ -35,7 +35,7 @@ async fn get_features( ) -> HttpResponse { let features = index_scheduler.features(); - analytics.publish(GetExperimentalFeatureAnalytics::default(), Some(&req)); + analytics.publish(GetExperimentalFeatureAnalytics::default(), &req); let features = features.runtime_features(); debug!(returns = ?features, "Get features"); HttpResponse::Ok().json(features) @@ -83,8 +83,8 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { } } - fn into_event(self) -> serde_json::Value { - serde_json::to_value(self).unwrap() + fn into_event(self) -> impl Serialize { + self } } @@ -131,7 +131,7 @@ async fn patch_features( edit_documents_by_function, contains_filter, }, - Some(&req), + &req, ); index_scheduler.put_runtime_features(new_features)?; debug!(returns = ?new_features, "Patch features"); diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 
8f4cd026d..6dece61e6 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -194,7 +194,7 @@ pub async fn get_document( retrieve_vectors: param_retrieve_vectors.0, ..Default::default() }, - Some(&req), + &req, ); let index = index_scheduler.index(&index_uid)?; @@ -253,7 +253,7 @@ pub async fn delete_document( per_document_id: true, ..Default::default() }, - Some(&req), + &req, ); let task = KindWithContent::DocumentDeletion { @@ -319,7 +319,7 @@ pub async fn documents_by_query_post( max_offset: body.offset, ..Default::default() }, - Some(&req), + &req, ); documents_by_query(&index_scheduler, index_uid, body) @@ -361,7 +361,7 @@ pub async fn get_documents( max_offset: query.offset, ..Default::default() }, - Some(&req), + &req, ); documents_by_query(&index_scheduler, index_uid, query) @@ -486,7 +486,7 @@ pub async fn replace_documents( index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), method: PhantomData, }, - Some(&req), + &req, ); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); @@ -543,7 +543,7 @@ pub async fn update_documents( index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x), method: PhantomData, }, - Some(&req), + &req, ); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); @@ -718,7 +718,7 @@ pub async fn delete_documents_batch( analytics.publish( DocumentsDeletionAggregator { total_received: 1, per_batch: true, ..Default::default() }, - Some(&req), + &req, ); let ids = body @@ -761,7 +761,7 @@ pub async fn delete_documents_by_filter( analytics.publish( DocumentsDeletionAggregator { total_received: 1, per_filter: true, ..Default::default() }, - Some(&req), + &req, ); // we ensure the filter is well formed before enqueuing it @@ -847,7 +847,7 @@ pub async fn edit_documents_by_function( with_context: params.context.is_some(), index_creation: 
index_scheduler.index(&index_uid).is_err(), }, - Some(&req), + &req, ); let DocumentEditionByFunction { filter, context, function } = params; @@ -902,7 +902,7 @@ pub async fn clear_all_documents( let index_uid = IndexUid::try_from(index_uid.into_inner())?; analytics.publish( DocumentsDeletionAggregator { total_received: 1, clear_all: true, ..Default::default() }, - Some(&req), + &req, ); let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() }; diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 1e9d0e15e..f3c74a388 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -200,7 +200,7 @@ pub async fn search( if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); } - analytics.publish(aggregate, Some(&req)); + analytics.publish(aggregate, &req); let search_result = search_result?; diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index 483a48a16..f926f663c 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -160,7 +160,7 @@ pub async fn create_index( if allow_index_creation { analytics.publish( IndexCreatedAggregate { primary_key: primary_key.iter().cloned().collect() }, - Some(&req), + &req, ); let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key }; @@ -247,7 +247,7 @@ pub async fn update_index( let body = body.into_inner(); analytics.publish( IndexUpdatedAggregate { primary_key: body.primary_key.iter().cloned().collect() }, - Some(&req), + &req, ); let task = KindWithContent::IndexUpdate { diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index f833a57d2..538c46fd0 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -255,7 +255,7 @@ pub async fn search_with_url_query( if let Ok(ref 
search_result) = search_result { aggregate.succeed(search_result); } - analytics.publish(aggregate, Some(&req)); + analytics.publish(aggregate, &req); let search_result = search_result?; @@ -303,7 +303,7 @@ pub async fn search_with_post( MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc(); } } - analytics.publish(aggregate, Some(&req)); + analytics.publish(aggregate, &req); let search_result = search_result?; diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index db83cb39b..bb2f6792d 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -8,6 +8,7 @@ use meilisearch_types::deserr::DeserrJsonError; use meilisearch_types::error::ResponseError; use meilisearch_types::facet_values_sort::FacetValuesSort; use meilisearch_types::index_uid::IndexUid; +use meilisearch_types::locales::Locale; use meilisearch_types::milli::update::Setting; use meilisearch_types::settings::{ settings, ProximityPrecisionView, RankingRuleView, SecretPolicy, Settings, Unchecked, @@ -94,7 +95,7 @@ macro_rules! 
make_setting_route { #[allow(clippy::redundant_closure_call)] analytics.publish( $crate::routes::indexes::settings::$analytics::new(body.as_ref()).to_settings(), - Some(&req), + &req, ); let new_settings = Settings { @@ -491,11 +492,11 @@ impl Aggregate for SettingsAnalytics { has_geo: self.filterable_attributes.has_geo.or(other.filterable_attributes.has_geo), }, distinct_attribute: DistinctAttributeAnalytics { - set: self.distinct_attribute.set.or(other.distinct_attribute.set), + set: self.distinct_attribute.set | other.distinct_attribute.set, }, proximity_precision: ProximityPrecisionAnalytics { - set: self.proximity_precision.set(other.proximity_precision.set), - value: self.proximity_precision.value(other.proximity_precision.value), + set: self.proximity_precision.set | other.proximity_precision.set, + value: self.proximity_precision.value.or(other.proximity_precision.value), }, typo_tolerance: TypoToleranceAnalytics { enabled: self.typo_tolerance.enabled.or(other.typo_tolerance.enabled), @@ -542,7 +543,7 @@ impl Aggregate for SettingsAnalytics { sources: match (self.embedders.sources, other.embedders.sources) { (None, None) => None, (Some(sources), None) | (None, Some(sources)) => Some(sources), - (Some(this), Some(other)) => Some(this.union(&other).collect()), + (Some(this), Some(other)) => Some(this.union(&other).cloned().collect()), }, document_template_used: match ( self.embedders.document_template_used, @@ -598,45 +599,70 @@ impl Aggregate for SettingsAnalytics { #[derive(Serialize, Default)] struct RankingRulesAnalytics { - words_position: Option, - typo_position: Option, - proximity_position: Option, - attribute_position: Option, - sort_position: Option, - exactness_position: Option, - values: Option, + words_position: Option, + typo_position: Option, + proximity_position: Option, + attribute_position: Option, + sort_position: Option, + exactness_position: Option, + values: Option, } impl RankingRulesAnalytics { pub fn new(rr: Option<&Vec>) -> Self { 
RankingRulesAnalytics { - words_position: rr.as_ref().map(|rr| { - rr.iter() - .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words)) - }), - typo_position: rr.as_ref().map(|rr| { - rr.iter() - .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo)) - }), - proximity_position: rr.as_ref().map(|rr| { - rr.iter().position(|s| { - matches!(s, meilisearch_types::settings::RankingRuleView::Proximity) + words_position: rr + .as_ref() + .map(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Words) + }) }) - }), - attribute_position: rr.as_ref().map(|rr| { - rr.iter().position(|s| { - matches!(s, meilisearch_types::settings::RankingRuleView::Attribute) + .flatten(), + + typo_position: rr + .as_ref() + .map(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Typo) + }) }) - }), - sort_position: rr.as_ref().map(|rr| { - rr.iter() - .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort)) - }), - exactness_position: rr.as_ref().map(|rr| { - rr.iter().position(|s| { - matches!(s, meilisearch_types::settings::RankingRuleView::Exactness) + .flatten(), + + proximity_position: rr + .as_ref() + .map(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Proximity) + }) }) - }), + .flatten(), + + attribute_position: rr + .as_ref() + .map(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Attribute) + }) + }) + .flatten(), + sort_position: rr + .as_ref() + .map(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Sort) + }) + }) + .flatten(), + exactness_position: rr + .as_ref() + .map(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Exactness) + }) + }) + .flatten(), + values: rr.as_ref().map(|rr| { rr.iter() .filter(|s| { @@ -661,7 +687,7 @@ impl 
RankingRulesAnalytics { #[derive(Serialize, Default)] struct SearchableAttributesAnalytics { total: Option, - with_wildcard: bool, + with_wildcard: Option, } impl SearchableAttributesAnalytics { @@ -681,8 +707,8 @@ impl SearchableAttributesAnalytics { #[derive(Serialize, Default)] struct DisplayedAttributesAnalytics { - total: usize, - with_wildcard: bool, + total: Option, + with_wildcard: Option, } impl DisplayedAttributesAnalytics { @@ -702,8 +728,8 @@ impl DisplayedAttributesAnalytics { #[derive(Serialize, Default)] struct SortableAttributesAnalytics { - total: usize, - has_geo: bool, + total: Option, + has_geo: Option, } impl SortableAttributesAnalytics { @@ -721,15 +747,15 @@ impl SortableAttributesAnalytics { #[derive(Serialize, Default)] struct FilterableAttributesAnalytics { - total: usize, - has_geo: bool, + total: Option, + has_geo: Option, } impl FilterableAttributesAnalytics { pub fn new(setting: Option<&std::collections::BTreeSet>) -> Self { Self { - total: setting.as_ref().map(|filter| filter.len()).unwrap_or(0), - has_geo: setting.as_ref().map(|filter| filter.contains("_geo")).unwrap_or(false), + total: setting.as_ref().map(|filter| filter.len()), + has_geo: setting.as_ref().map(|filter| filter.contains("_geo")), } } @@ -761,7 +787,7 @@ struct ProximityPrecisionAnalytics { impl ProximityPrecisionAnalytics { pub fn new(precision: Option<&meilisearch_types::settings::ProximityPrecisionView>) -> Self { - Self { set: precision.is_some(), value: precision.unwrap_or_default() } + Self { set: precision.is_some(), value: precision.cloned() } } pub fn to_settings(self) -> SettingsAnalytics { @@ -774,8 +800,8 @@ struct TypoToleranceAnalytics { enabled: Option, disable_on_attributes: Option, disable_on_words: Option, - min_word_size_for_one_typo: Option, - min_word_size_for_two_typos: Option, + min_word_size_for_one_typo: Option, + min_word_size_for_two_typos: Option, } impl TypoToleranceAnalytics { @@ -805,9 +831,9 @@ impl TypoToleranceAnalytics { 
#[derive(Serialize, Default)] struct FacetingAnalytics { - max_values_per_facet: Option, + max_values_per_facet: Option, sort_facet_values_by_star_count: Option, - sort_facet_values_by_total: Option, + sort_facet_values_by_total: Option, } impl FacetingAnalytics { @@ -833,7 +859,7 @@ impl FacetingAnalytics { #[derive(Serialize, Default)] struct PaginationAnalytics { - max_total_hits: Option, + max_total_hits: Option, } impl PaginationAnalytics { @@ -909,18 +935,18 @@ impl EmbeddersAnalytics { { use meilisearch_types::milli::vector::settings::EmbedderSource; match source { - EmbedderSource::OpenAi => sources.insert("openAi"), - EmbedderSource::HuggingFace => sources.insert("huggingFace"), - EmbedderSource::UserProvided => sources.insert("userProvided"), - EmbedderSource::Ollama => sources.insert("ollama"), - EmbedderSource::Rest => sources.insert("rest"), + EmbedderSource::OpenAi => sources.insert("openAi".to_string()), + EmbedderSource::HuggingFace => sources.insert("huggingFace".to_string()), + EmbedderSource::UserProvided => sources.insert("userProvided".to_string()), + EmbedderSource::Ollama => sources.insert("ollama".to_string()), + EmbedderSource::Rest => sources.insert("rest".to_string()), }; } }; Self { total: setting.as_ref().map(|s| s.len()), - sources, + sources: Some(sources), document_template_used: setting.as_ref().map(|map| { map.values() .filter_map(|config| config.clone().set()) @@ -953,7 +979,7 @@ struct SearchCutoffMsAnalytics { impl SearchCutoffMsAnalytics { pub fn new(setting: Option<&u64>) -> Self { - Self { search_cutoff_ms: setting } + Self { search_cutoff_ms: setting.copied() } } pub fn to_settings(self) -> SettingsAnalytics { @@ -964,7 +990,7 @@ impl SearchCutoffMsAnalytics { #[derive(Serialize, Default)] #[serde(transparent)] struct LocalesAnalytics { - locales: BTreeSet, + locales: Option>, } impl LocalesAnalytics { @@ -988,7 +1014,7 @@ impl LocalesAnalytics { #[derive(Serialize, Default)] struct DictionaryAnalytics { - total: usize, + 
total: Option, } impl DictionaryAnalytics { @@ -1003,7 +1029,7 @@ impl DictionaryAnalytics { #[derive(Serialize, Default)] struct SeparatorTokensAnalytics { - total: usize, + total: Option, } impl SeparatorTokensAnalytics { @@ -1018,7 +1044,7 @@ impl SeparatorTokensAnalytics { #[derive(Serialize, Default)] struct NonSeparatorTokensAnalytics { - total: usize, + total: Option, } impl NonSeparatorTokensAnalytics { @@ -1088,7 +1114,7 @@ pub async fn update_all( new_settings.non_separator_tokens.as_ref().set(), ), }, - Some(&req), + &req, ); let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid); diff --git a/meilisearch/src/routes/indexes/similar.rs b/meilisearch/src/routes/indexes/similar.rs index f94a02987..91c435254 100644 --- a/meilisearch/src/routes/indexes/similar.rs +++ b/meilisearch/src/routes/indexes/similar.rs @@ -13,6 +13,7 @@ use serde_json::Value; use tracing::debug; use super::ActionPolicy; +use crate::analytics::segment_analytics::{SimilarGET, SimilarPOST}; use crate::analytics::{Analytics, SimilarAggregator}; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; @@ -34,13 +35,13 @@ pub async fn similar_get( index_uid: web::Path, params: AwebQueryParameter, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; let query = params.0.try_into()?; - let mut aggregate = SimilarAggregator::from_query(&query, &req); + let mut aggregate = SimilarAggregator::::from_query(&query, &req); debug!(parameters = ?query, "Similar get"); @@ -49,7 +50,7 @@ pub async fn similar_get( if let Ok(similar) = &similar { aggregate.succeed(similar); } - analytics.get_similar(aggregate); + analytics.publish(aggregate, &req); let similar = similar?; @@ -62,21 +63,21 @@ pub async fn similar_post( index_uid: web::Path, params: AwebJson, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { 
let index_uid = IndexUid::try_from(index_uid.into_inner())?; let query = params.into_inner(); debug!(parameters = ?query, "Similar post"); - let mut aggregate = SimilarAggregator::from_query(&query, &req); + let mut aggregate = SimilarAggregator::::from_query(&query, &req); let similar = similar(index_scheduler, index_uid, query).await; if let Ok(similar) = &similar { aggregate.succeed(similar); } - analytics.post_similar(aggregate); + analytics.publish(aggregate, &req); let similar = similar?; diff --git a/meilisearch/src/routes/multi_search.rs b/meilisearch/src/routes/multi_search.rs index 5fcb868c6..994c256d2 100644 --- a/meilisearch/src/routes/multi_search.rs +++ b/meilisearch/src/routes/multi_search.rs @@ -35,7 +35,7 @@ pub async fn multi_search_with_post( search_queue: Data, params: AwebJson, req: HttpRequest, - analytics: web::Data, + analytics: web::Data, ) -> Result { // Since we don't want to process half of the search requests and then get a permit refused // we're going to get one permit for the whole duration of the multi-search request. @@ -87,7 +87,7 @@ pub async fn multi_search_with_post( multi_aggregate.succeed(); } - analytics.post_multi_search(multi_aggregate); + analytics.publish(multi_aggregate, &req); HttpResponse::Ok().json(search_result??) } None => { @@ -149,7 +149,7 @@ pub async fn multi_search_with_post( if search_results.is_ok() { multi_aggregate.succeed(); } - analytics.post_multi_search(multi_aggregate); + analytics.publish(multi_aggregate, &req); let search_results = search_results.map_err(|(mut err, query_index)| { // Add the query index that failed as context for the error message. 
diff --git a/meilisearch/src/routes/snapshot.rs b/meilisearch/src/routes/snapshot.rs index 84673729f..cacbc41af 100644 --- a/meilisearch/src/routes/snapshot.rs +++ b/meilisearch/src/routes/snapshot.rs @@ -3,7 +3,6 @@ use actix_web::{web, HttpRequest, HttpResponse}; use index_scheduler::IndexScheduler; use meilisearch_types::error::ResponseError; use meilisearch_types::tasks::KindWithContent; -use serde_json::json; use tracing::debug; use crate::analytics::Analytics; @@ -17,13 +16,15 @@ pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot)))); } +crate::empty_analytics!(SnapshotAnalytics, "Snapshot Created"); + pub async fn create_snapshot( index_scheduler: GuardedData, Data>, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { - analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req)); + analytics.publish(SnapshotAnalytics::default(), &req); let task = KindWithContent::SnapshotCreation; let uid = get_task_id(&req, &opt)?; diff --git a/meilisearch/src/routes/swap_indexes.rs b/meilisearch/src/routes/swap_indexes.rs index 34e904230..42ebd7858 100644 --- a/meilisearch/src/routes/swap_indexes.rs +++ b/meilisearch/src/routes/swap_indexes.rs @@ -8,10 +8,11 @@ use meilisearch_types::error::deserr_codes::InvalidSwapIndexes; use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::tasks::{IndexSwap, KindWithContent}; +use serde::Serialize; use serde_json::json; use super::{get_task_id, is_dry_run, SummarizedTaskView}; -use crate::analytics::Analytics; +use crate::analytics::{Aggregate, Analytics}; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::{AuthenticationError, GuardedData}; @@ -29,21 +30,34 @@ pub struct SwapIndexesPayload { indexes: Vec, } +#[derive(Serialize)] +struct IndexSwappedAnalytics { + 
swap_operation_number: usize, +} + +impl Aggregate for IndexSwappedAnalytics { + fn event_name(&self) -> &'static str { + "Indexes Swapped" + } + + fn aggregate(self, other: Self) -> Self { + Self { swap_operation_number: self.swap_operation_number.max(other.swap_operation_number) } + } + + fn into_event(self) -> impl Serialize { + self + } +} + pub async fn swap_indexes( index_scheduler: GuardedData, Data>, params: AwebJson, DeserrJsonError>, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let params = params.into_inner(); - analytics.publish( - "Indexes Swapped".to_string(), - json!({ - "swap_operation_number": params.len(), // Return the max ever encountered - }), - Some(&req), - ); + analytics.publish(IndexSwappedAnalytics { swap_operation_number: params.len() }, &req); let filters = index_scheduler.filters(); let mut swaps = vec![]; diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index 3dc6520af..162d19ca1 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -12,18 +12,17 @@ use meilisearch_types::star_or::{OptionStarOr, OptionStarOrList}; use meilisearch_types::task_view::TaskView; use meilisearch_types::tasks::{Kind, KindWithContent, Status}; use serde::Serialize; -use serde_json::json; use time::format_description::well_known::Rfc3339; use time::macros::format_description; use time::{Date, Duration, OffsetDateTime, Time}; use tokio::task; use super::{get_task_id, is_dry_run, SummarizedTaskView}; -use crate::analytics::Analytics; +use crate::analytics::{Aggregate, AggregateMethod, Analytics}; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; -use crate::Opt; +use crate::{aggregate_methods, Opt}; const DEFAULT_LIMIT: u32 = 20; @@ -158,12 +157,69 @@ impl TaskDeletionOrCancelationQuery { } } +aggregate_methods!( + CancelTasks => "Tasks 
Canceled", + DeleteTasks => "Tasks Deleted", +); + +#[derive(Serialize)] +struct TaskFilterAnalytics { + filtered_by_uid: bool, + filtered_by_index_uid: bool, + filtered_by_type: bool, + filtered_by_status: bool, + filtered_by_canceled_by: bool, + filtered_by_before_enqueued_at: bool, + filtered_by_after_enqueued_at: bool, + filtered_by_before_started_at: bool, + filtered_by_after_started_at: bool, + filtered_by_before_finished_at: bool, + filtered_by_after_finished_at: bool, + + #[serde(skip)] + marker: std::marker::PhantomData, +} + +impl Aggregate for TaskFilterAnalytics { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + fn aggregate(self, other: Self) -> Self { + Self { + filtered_by_uid: self.filtered_by_uid | other.filtered_by_uid, + filtered_by_index_uid: self.filtered_by_index_uid | other.filtered_by_index_uid, + filtered_by_type: self.filtered_by_type | other.filtered_by_type, + filtered_by_status: self.filtered_by_status | other.filtered_by_status, + filtered_by_canceled_by: self.filtered_by_canceled_by | other.filtered_by_canceled_by, + filtered_by_before_enqueued_at: self.filtered_by_before_enqueued_at + | other.filtered_by_before_enqueued_at, + filtered_by_after_enqueued_at: self.filtered_by_after_enqueued_at + | other.filtered_by_after_enqueued_at, + filtered_by_before_started_at: self.filtered_by_before_started_at + | other.filtered_by_before_started_at, + filtered_by_after_started_at: self.filtered_by_after_started_at + | other.filtered_by_after_started_at, + filtered_by_before_finished_at: self.filtered_by_before_finished_at + | other.filtered_by_before_finished_at, + filtered_by_after_finished_at: self.filtered_by_after_finished_at + | other.filtered_by_after_finished_at, + + marker: std::marker::PhantomData, + } + } + + fn into_event(self) -> impl Serialize { + self + } +} + async fn cancel_tasks( index_scheduler: GuardedData, Data>, params: AwebQueryParameter, req: HttpRequest, opt: web::Data, - analytics: web::Data, + 
analytics: web::Data, ) -> Result { let params = params.into_inner(); @@ -172,21 +228,22 @@ async fn cancel_tasks( } analytics.publish( - "Tasks Canceled".to_string(), - json!({ - "filtered_by_uid": params.uids.is_some(), - "filtered_by_index_uid": params.index_uids.is_some(), - "filtered_by_type": params.types.is_some(), - "filtered_by_status": params.statuses.is_some(), - "filtered_by_canceled_by": params.canceled_by.is_some(), - "filtered_by_before_enqueued_at": params.before_enqueued_at.is_some(), - "filtered_by_after_enqueued_at": params.after_enqueued_at.is_some(), - "filtered_by_before_started_at": params.before_started_at.is_some(), - "filtered_by_after_started_at": params.after_started_at.is_some(), - "filtered_by_before_finished_at": params.before_finished_at.is_some(), - "filtered_by_after_finished_at": params.after_finished_at.is_some(), - }), - Some(&req), + TaskFilterAnalytics:: { + filtered_by_uid: params.uids.is_some(), + filtered_by_index_uid: params.index_uids.is_some(), + filtered_by_type: params.types.is_some(), + filtered_by_status: params.statuses.is_some(), + filtered_by_canceled_by: params.canceled_by.is_some(), + filtered_by_before_enqueued_at: params.before_enqueued_at.is_some(), + filtered_by_after_enqueued_at: params.after_enqueued_at.is_some(), + filtered_by_before_started_at: params.before_started_at.is_some(), + filtered_by_after_started_at: params.after_started_at.is_some(), + filtered_by_before_finished_at: params.before_finished_at.is_some(), + filtered_by_after_finished_at: params.after_finished_at.is_some(), + + marker: std::marker::PhantomData, + }, + &req, ); let query = params.into_query(); @@ -214,7 +271,7 @@ async fn delete_tasks( params: AwebQueryParameter, req: HttpRequest, opt: web::Data, - analytics: web::Data, + analytics: web::Data, ) -> Result { let params = params.into_inner(); @@ -223,22 +280,24 @@ async fn delete_tasks( } analytics.publish( - "Tasks Deleted".to_string(), - json!({ - "filtered_by_uid": 
params.uids.is_some(), - "filtered_by_index_uid": params.index_uids.is_some(), - "filtered_by_type": params.types.is_some(), - "filtered_by_status": params.statuses.is_some(), - "filtered_by_canceled_by": params.canceled_by.is_some(), - "filtered_by_before_enqueued_at": params.before_enqueued_at.is_some(), - "filtered_by_after_enqueued_at": params.after_enqueued_at.is_some(), - "filtered_by_before_started_at": params.before_started_at.is_some(), - "filtered_by_after_started_at": params.after_started_at.is_some(), - "filtered_by_before_finished_at": params.before_finished_at.is_some(), - "filtered_by_after_finished_at": params.after_finished_at.is_some(), - }), - Some(&req), + TaskFilterAnalytics:: { + filtered_by_uid: params.uids.is_some(), + filtered_by_index_uid: params.index_uids.is_some(), + filtered_by_type: params.types.is_some(), + filtered_by_status: params.statuses.is_some(), + filtered_by_canceled_by: params.canceled_by.is_some(), + filtered_by_before_enqueued_at: params.before_enqueued_at.is_some(), + filtered_by_after_enqueued_at: params.after_enqueued_at.is_some(), + filtered_by_before_started_at: params.before_started_at.is_some(), + filtered_by_after_started_at: params.after_started_at.is_some(), + filtered_by_before_finished_at: params.before_finished_at.is_some(), + filtered_by_after_finished_at: params.after_finished_at.is_some(), + + marker: std::marker::PhantomData, + }, + &req, ); + let query = params.into_query(); let (tasks, _) = index_scheduler.get_task_ids_from_authorized_indexes( From ea6883189ef73429b748473d436b71ea4a7a5a52 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 16 Oct 2024 21:17:06 +0200 Subject: [PATCH 54/92] finish the analytics in all the routes --- meilisearch/src/analytics/mod.rs | 33 ++-- .../src/analytics/segment_analytics.rs | 153 +++--------------- meilisearch/src/routes/features.rs | 1 - meilisearch/src/routes/indexes/documents.rs | 58 +++---- .../src/routes/indexes/facet_search.rs | 24 +-- 
meilisearch/src/routes/indexes/mod.rs | 5 +- meilisearch/src/routes/indexes/similar.rs | 4 +- meilisearch/src/routes/multi_search.rs | 2 +- meilisearch/src/routes/swap_indexes.rs | 1 - meilisearch/src/routes/tasks.rs | 2 +- 10 files changed, 84 insertions(+), 199 deletions(-) diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index a0ca47d8f..ab6fd9993 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -1,7 +1,5 @@ pub mod segment_analytics; -use std::any::TypeId; -use std::collections::HashMap; use std::fs; use std::path::{Path, PathBuf}; use std::str::FromStr; @@ -10,7 +8,6 @@ use actix_web::HttpRequest; use meilisearch_types::InstanceUid; use once_cell::sync::Lazy; use platform_dirs::AppDirs; -use segment::message::User; use serde::Serialize; // if the feature analytics is enabled we use the real analytics @@ -83,7 +80,7 @@ pub enum DocumentFetchKind { Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, } -pub trait Aggregate { +pub trait Aggregate: 'static { fn event_name(&self) -> &'static str; fn aggregate(self, other: Self) -> Self @@ -97,7 +94,7 @@ pub trait Aggregate { /// Helper trait to define multiple aggregate with the same content but a different name. /// Commonly used when you must aggregate a search with POST or with GET for example. -pub trait AggregateMethod { +pub trait AggregateMethod: 'static + Default { fn event_name() -> &'static str; } @@ -105,7 +102,8 @@ pub trait AggregateMethod { #[macro_export] macro_rules! aggregate_methods { ($method:ident => $event_name:literal) => { - pub enum $method {} + #[derive(Default)] + pub struct $method {} impl $crate::analytics::AggregateMethod for $method { fn event_name() -> &'static str { @@ -122,35 +120,26 @@ macro_rules! 
aggregate_methods { } pub struct Analytics { - // TODO: TAMO: remove - inner: Option, - - instance_uid: Option, - user: Option, - events: HashMap>, + segment: Option, } impl Analytics { fn no_analytics() -> Self { - Self { inner: None, events: HashMap::new(), instance_uid: None, user: None } + Self { segment: None } } fn segment_analytics(segment: SegmentAnalytics) -> Self { - Self { - instance_uid: Some(segment.instance_uid), - user: Some(segment.user), - inner: Some(segment), - events: HashMap::new(), - } + Self { segment: Some(segment) } } pub fn instance_uid(&self) -> Option<&InstanceUid> { - self.instance_uid + self.segment.as_ref().map(|segment| segment.instance_uid.as_ref()) } /// The method used to publish most analytics that do not need to be batched every hours - pub fn publish(&self, send: impl Aggregate, request: &HttpRequest) { - let Some(segment) = self.inner else { return }; + pub fn publish(&self, event: impl Aggregate, request: &HttpRequest) { + let Some(ref segment) = self.segment else { return }; let user_agents = extract_user_agents(request); + let _ = segment.sender.try_send(Box::new(event)); } } diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 0572267e1..601fefa1e 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -1,3 +1,4 @@ +use std::any::{Any, TypeId}; use std::collections::{BTreeSet, BinaryHeap, HashMap, HashSet}; use std::fs; use std::mem::take; @@ -74,6 +75,7 @@ pub fn extract_user_agents(request: &HttpRequest) -> Vec { pub struct SegmentAnalytics { pub instance_uid: InstanceUid, pub user: User, + pub sender: Sender>, } impl SegmentAnalytics { @@ -128,18 +130,7 @@ impl SegmentAnalytics { user: user.clone(), opt: opt.clone(), batcher, - post_search_aggregator: SearchAggregator::default(), - post_multi_search_aggregator: MultiSearchAggregator::default(), - post_facet_search_aggregator: 
FacetSearchAggregator::default(), - get_search_aggregator: SearchAggregator::default(), - add_documents_aggregator: DocumentsAggregator::default(), - delete_documents_aggregator: DocumentsDeletionAggregator::default(), - update_documents_aggregator: DocumentsAggregator::default(), - edit_documents_by_function_aggregator: EditDocumentsByFunctionAggregator::default(), - get_fetch_documents_aggregator: DocumentsFetchAggregator::default(), - post_fetch_documents_aggregator: DocumentsFetchAggregator::default(), - get_similar_aggregator: SimilarAggregator::default(), - post_similar_aggregator: SimilarAggregator::default(), + events: todo!(), }); tokio::spawn(segment.run(index_scheduler.clone(), auth_controller.clone())); @@ -387,22 +378,11 @@ impl From for Infos { } pub struct Segment { - inbox: Receiver, + inbox: Receiver>, user: User, opt: Opt, batcher: AutoBatcher, - get_search_aggregator: SearchAggregator, - post_search_aggregator: SearchAggregator, - post_multi_search_aggregator: MultiSearchAggregator, - post_facet_search_aggregator: FacetSearchAggregator, - add_documents_aggregator: DocumentsAggregator, - delete_documents_aggregator: DocumentsDeletionAggregator, - update_documents_aggregator: DocumentsAggregator, - edit_documents_by_function_aggregator: EditDocumentsByFunctionAggregator, - get_fetch_documents_aggregator: DocumentsFetchAggregator, - post_fetch_documents_aggregator: DocumentsFetchAggregator, - get_similar_aggregator: SimilarAggregator, - post_similar_aggregator: SimilarAggregator, + events: HashMap>, } impl Segment { @@ -455,19 +435,8 @@ impl Segment { }, msg = self.inbox.recv() => { match msg { - Some(AnalyticsMsg::BatchMessage(msg)) => drop(self.batcher.push(msg).await), - Some(AnalyticsMsg::AggregateGetSearch(agreg)) => self.get_search_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostSearch(agreg)) => self.post_search_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostMultiSearch(agreg)) => 
self.post_multi_search_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostFacetSearch(agreg)) => self.post_facet_search_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateAddDocuments(agreg)) => self.add_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateDeleteDocuments(agreg)) => self.delete_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateEditDocumentsByFunction(agreg)) => self.edit_documents_by_function_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateGetFetchDocuments(agreg)) => self.get_fetch_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostFetchDocuments(agreg)) => self.post_fetch_documents_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregateGetSimilar(agreg)) => self.get_similar_aggregator.aggregate(agreg), - Some(AnalyticsMsg::AggregatePostSimilar(agreg)) => self.post_similar_aggregator.aggregate(agreg), + // Some(AnalyticsMsg::BatchMessage(msg)) => drop(self.batcher.push(msg).await), + Some(_) => todo!(), None => (), } } @@ -507,87 +476,19 @@ impl Segment { .await; } - let Segment { - inbox: _, - opt: _, - batcher: _, - user, - get_search_aggregator, - post_search_aggregator, - post_multi_search_aggregator, - post_facet_search_aggregator, - add_documents_aggregator, - delete_documents_aggregator, - update_documents_aggregator, - edit_documents_by_function_aggregator, - get_fetch_documents_aggregator, - post_fetch_documents_aggregator, - get_similar_aggregator, - post_similar_aggregator, - } = self; + // We empty the list of events + let events = std::mem::take(&mut self.events); - if let Some(get_search) = - take(get_search_aggregator).into_event(user, "Documents Searched GET") - { - let _ = self.batcher.push(get_search).await; - } - if let Some(post_search) = - take(post_search_aggregator).into_event(user, "Documents Searched POST") - { - let _ = 
self.batcher.push(post_search).await; - } - if let Some(post_multi_search) = take(post_multi_search_aggregator) - .into_event(user, "Documents Searched by Multi-Search POST") - { - let _ = self.batcher.push(post_multi_search).await; - } - if let Some(post_facet_search) = - take(post_facet_search_aggregator).into_event(user, "Facet Searched POST") - { - let _ = self.batcher.push(post_facet_search).await; - } - if let Some(add_documents) = - take(add_documents_aggregator).into_event(user, "Documents Added") - { - let _ = self.batcher.push(add_documents).await; - } - if let Some(delete_documents) = - take(delete_documents_aggregator).into_event(user, "Documents Deleted") - { - let _ = self.batcher.push(delete_documents).await; - } - if let Some(update_documents) = - take(update_documents_aggregator).into_event(user, "Documents Updated") - { - let _ = self.batcher.push(update_documents).await; - } - if let Some(edit_documents_by_function) = take(edit_documents_by_function_aggregator) - .into_event(user, "Documents Edited By Function") - { - let _ = self.batcher.push(edit_documents_by_function).await; - } - if let Some(get_fetch_documents) = - take(get_fetch_documents_aggregator).into_event(user, "Documents Fetched GET") - { - let _ = self.batcher.push(get_fetch_documents).await; - } - if let Some(post_fetch_documents) = - take(post_fetch_documents_aggregator).into_event(user, "Documents Fetched POST") - { - let _ = self.batcher.push(post_fetch_documents).await; + for (_, mut event) in events { + self.batcher.push(Track { + user: self.user, + event: event.event_name().to_string(), + properties: event.into_event(), + timestamp: todo!(), + ..Default::default() + }); } - if let Some(get_similar_documents) = - take(get_similar_aggregator).into_event(user, "Similar GET") - { - let _ = self.batcher.push(get_similar_documents).await; - } - - if let Some(post_similar_documents) = - take(post_similar_aggregator).into_event(user, "Similar POST") - { - let _ = 
self.batcher.push(post_similar_documents).await; - } let _ = self.batcher.flush().await; } } @@ -702,10 +603,8 @@ impl SearchAggregator { } = query; let mut ret = Self::default(); - ret.timestamp = Some(OffsetDateTime::now_utc()); ret.total_received = 1; - ret.user_agents = extract_user_agents(request).into_iter().collect(); if let Some(ref sort) = sort { ret.sort_total_number_of_criteria = 1; @@ -949,7 +848,7 @@ impl Aggregate for SearchAggregator { self } - fn into_event(self) -> Option { + fn into_event(self) -> impl Serialize { let Self { total_received, total_succeeded, @@ -1087,10 +986,7 @@ pub struct MultiSearchAggregator { } impl MultiSearchAggregator { - pub fn from_federated_search( - federated_search: &FederatedSearch, - request: &HttpRequest, - ) -> Self { + pub fn from_federated_search(federated_search: &FederatedSearch) -> Self { let use_federation = federated_search.federation.is_some(); let distinct_indexes: HashSet<_> = federated_search @@ -1162,7 +1058,7 @@ impl Aggregate for MultiSearchAggregator { } /// Aggregate one [MultiSearchAggregator] into another. - fn aggregate(mut self, other: Self) -> Self { + fn aggregate(self, other: Self) -> Self { // write the aggregate in a way that will cause a compilation error if a field is added. // get ownership of self, replacing it by a default value. 
@@ -1177,13 +1073,8 @@ impl Aggregate for MultiSearchAggregator { let show_ranking_score = this.show_ranking_score || other.show_ranking_score; let show_ranking_score_details = this.show_ranking_score_details || other.show_ranking_score_details; - let mut user_agents = this.user_agents; let use_federation = this.use_federation || other.use_federation; - for user_agent in other.user_agents.into_iter() { - user_agents.insert(user_agent); - } - Self { total_received, total_succeeded, @@ -1748,7 +1639,7 @@ pub struct SimilarAggregator { impl SimilarAggregator { #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &SimilarQuery, request: &HttpRequest) -> Self { + pub fn from_query(query: &SimilarQuery) -> Self { let SimilarQuery { id: _, embedder: _, @@ -1763,10 +1654,8 @@ impl SimilarAggregator { } = query; let mut ret = Self::default(); - ret.timestamp = Some(OffsetDateTime::now_utc()); ret.total_received = 1; - ret.user_agents = extract_user_agents(request).into_iter().collect(); if let Some(ref filter) = filter { static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); diff --git a/meilisearch/src/routes/features.rs b/meilisearch/src/routes/features.rs index 4ee5b37b0..0b43c3f13 100644 --- a/meilisearch/src/routes/features.rs +++ b/meilisearch/src/routes/features.rs @@ -7,7 +7,6 @@ use meilisearch_types::deserr::DeserrJsonError; use meilisearch_types::error::ResponseError; use meilisearch_types::keys::actions; use serde::Serialize; -use serde_json::json; use tracing::debug; use crate::analytics::{Aggregate, Analytics}; diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 6dece61e6..1573b768b 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -32,7 +32,7 @@ use tokio::fs::File; use tokio::io::{AsyncSeekExt, AsyncWriteExt, BufWriter}; use tracing::debug; -use crate::analytics::{Aggregate, AggregateMethod, Analytics, 
DocumentDeletionKind}; +use crate::analytics::{Aggregate, AggregateMethod, Analytics}; use crate::error::MeilisearchHttpError; use crate::error::PayloadError::ReceivePayload; use crate::extractors::authentication::policies::*; @@ -102,8 +102,13 @@ pub struct GetDocument { retrieve_vectors: Param, } +aggregate_methods!( + DocumentsGET => "Documents Fetched GET", + DocumentsPOST => "Documents Fetched POST", +); + #[derive(Default, Serialize)] -pub struct DocumentsFetchAggregator { +pub struct DocumentsFetchAggregator { #[serde(rename = "requests.total_received")] total_received: usize, @@ -120,6 +125,8 @@ pub struct DocumentsFetchAggregator { max_limit: usize, #[serde(rename = "pagination.max_offset")] max_offset: usize, + + marker: std::marker::PhantomData, } #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -128,7 +135,7 @@ pub enum DocumentFetchKind { Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, } -impl DocumentsFetchAggregator { +impl DocumentsFetchAggregator { pub fn from_query(query: &DocumentFetchKind) -> Self { let (limit, offset, retrieve_vectors) = match query { DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), @@ -136,6 +143,7 @@ impl DocumentsFetchAggregator { (*limit, *offset, *retrieve_vectors) } }; + Self { total_received: 1, per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. 
}), @@ -143,20 +151,18 @@ impl DocumentsFetchAggregator { max_limit: limit, max_offset: offset, retrieve_vectors, + + marker: PhantomData, } } } -impl Aggregate for DocumentsFetchAggregator { - // TODO: TAMO: Should we do the same event for the GET requests +impl Aggregate for DocumentsFetchAggregator { fn event_name(&self) -> &'static str { - "Documents Fetched POST" + Method::event_name() } - fn aggregate(self, other: Self) -> Self - where - Self: Sized, - { + fn aggregate(self, other: Self) -> Self { Self { total_received: self.total_received.saturating_add(other.total_received), per_document_id: self.per_document_id | other.per_document_id, @@ -164,11 +170,12 @@ impl Aggregate for DocumentsFetchAggregator { retrieve_vectors: self.retrieve_vectors | other.retrieve_vectors, max_limit: self.max_limit.max(other.max_limit), max_offset: self.max_offset.max(other.max_offset), + marker: PhantomData, } } - fn into_event(self) -> Value { - serde_json::to_value(self).unwrap() + fn into_event(self) -> impl Serialize { + self } } @@ -190,7 +197,7 @@ pub async fn get_document( let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?; analytics.publish( - DocumentsFetchAggregator { + DocumentsFetchAggregator:: { retrieve_vectors: param_retrieve_vectors.0, ..Default::default() }, @@ -232,8 +239,8 @@ impl Aggregate for DocumentsDeletionAggregator { } } - fn into_event(self) -> Value { - serde_json::to_value(self).unwrap() + fn into_event(self) -> impl Serialize { + self } } @@ -311,7 +318,7 @@ pub async fn documents_by_query_post( debug!(parameters = ?body, "Get documents POST"); analytics.publish( - DocumentsFetchAggregator { + DocumentsFetchAggregator:: { total_received: 1, per_filter: body.filter.is_some(), retrieve_vectors: body.retrieve_vectors, @@ -353,7 +360,7 @@ pub async fn get_documents( }; analytics.publish( - DocumentsFetchAggregator { + DocumentsFetchAggregator:: { total_received: 1, per_filter: query.filter.is_some(), retrieve_vectors: 
query.retrieve_vectors, @@ -436,20 +443,17 @@ impl Aggregate for DocumentsAggregator { Method::event_name() } - fn aggregate(mut self, other: Self) -> Self - where - Self: Sized, - { + fn aggregate(self, other: Self) -> Self { Self { - payload_types: self.payload_types.union(&other.payload_types).collect(), - primary_key: self.primary_key.union(&other.primary_key).collect(), + payload_types: self.payload_types.union(&other.payload_types).cloned().collect(), + primary_key: self.primary_key.union(&other.primary_key).cloned().collect(), index_creation: self.index_creation | other.index_creation, method: PhantomData, } } - fn into_event(self) -> Value { - serde_json::to_value(self).unwrap() + fn into_event(self) -> impl Serialize { + self } } @@ -818,8 +822,8 @@ impl Aggregate for EditDocumentsByFunctionAggregator { } } - fn into_event(self) -> Value { - serde_json::to_value(self).unwrap() + fn into_event(self) -> impl Serialize { + self } } diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index f3c74a388..08618970d 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -9,6 +9,7 @@ use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::locales::Locale; +use serde::Serialize; use serde_json::Value; use tracing::debug; @@ -72,7 +73,7 @@ pub struct FacetSearchAggregator { impl FacetSearchAggregator { #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &FacetSearchQuery, request: &HttpRequest) -> Self { + pub fn from_query(query: &FacetSearchQuery) -> Self { let FacetSearchQuery { facet_query: _, facet_name, @@ -113,23 +114,22 @@ impl Aggregate for FacetSearchAggregator { "Facet Searched POST" } - fn aggregate(mut self, other: Self) -> Self - where - Self: Sized, - { - self.time_spent.insert(other.time_spent); + fn aggregate(mut 
self, other: Self) -> Self { + for time in other.time_spent { + self.time_spent.push(time); + } Self { total_received: self.total_received.saturating_add(other.total_received), total_succeeded: self.total_succeeded.saturating_add(other.total_succeeded), time_spent: self.time_spent, - facet_names: self.facet_names.union(&other.facet_names).collect(), + facet_names: self.facet_names.union(&other.facet_names).cloned().collect(), additional_search_parameters_provided: self.additional_search_parameters_provided | other.additional_search_parameters_provided, } } - fn into_event(self) -> Value { + fn into_event(self) -> impl Serialize { let Self { total_received, total_succeeded, @@ -137,6 +137,12 @@ impl Aggregate for FacetSearchAggregator { facet_names, additional_search_parameters_provided, } = self; + // the index of the 99th percentage of value + let percentile_99th = 0.99 * (total_succeeded as f64 - 1.) + 1.; + // we get all the values in a sorted manner + let time_spent = time_spent.into_sorted_vec(); + // We are only interested by the slowest value of the 99th fastest results + let time_spent = time_spent.get(percentile_99th as usize); serde_json::json!({ "requests": { @@ -166,7 +172,7 @@ pub async fn search( let query = params.into_inner(); debug!(parameters = ?query, "Facet search"); - let mut aggregate = FacetSearchAggregator::from_query(&query, &req); + let mut aggregate = FacetSearchAggregator::from_query(&query); let facet_query = query.facet_query.clone(); let facet_name = query.facet_name.clone(); diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index f926f663c..3c41f36fe 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -14,7 +14,6 @@ use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::{self, FieldDistribution, Index}; use meilisearch_types::tasks::KindWithContent; use serde::Serialize; -use serde_json::json; use time::OffsetDateTime; use 
tracing::debug; @@ -138,7 +137,7 @@ impl Aggregate for IndexCreatedAggregate { where Self: Sized, { - Self { primary_key: self.primary_key.union(&other.primary_key).collect() } + Self { primary_key: self.primary_key.union(&other.primary_key).cloned().collect() } } fn into_event(self) -> impl Serialize { @@ -227,7 +226,7 @@ impl Aggregate for IndexUpdatedAggregate { } fn aggregate(self, other: Self) -> Self { - Self { primary_key: self.primary_key.union(&other.primary_key).collect() } + Self { primary_key: self.primary_key.union(&other.primary_key).cloned().collect() } } fn into_event(self) -> impl Serialize { diff --git a/meilisearch/src/routes/indexes/similar.rs b/meilisearch/src/routes/indexes/similar.rs index 91c435254..33df6bdad 100644 --- a/meilisearch/src/routes/indexes/similar.rs +++ b/meilisearch/src/routes/indexes/similar.rs @@ -41,7 +41,7 @@ pub async fn similar_get( let query = params.0.try_into()?; - let mut aggregate = SimilarAggregator::::from_query(&query, &req); + let mut aggregate = SimilarAggregator::::from_query(&query); debug!(parameters = ?query, "Similar get"); @@ -70,7 +70,7 @@ pub async fn similar_post( let query = params.into_inner(); debug!(parameters = ?query, "Similar post"); - let mut aggregate = SimilarAggregator::::from_query(&query, &req); + let mut aggregate = SimilarAggregator::::from_query(&query); let similar = similar(index_scheduler, index_uid, query).await; diff --git a/meilisearch/src/routes/multi_search.rs b/meilisearch/src/routes/multi_search.rs index 994c256d2..13a39cb44 100644 --- a/meilisearch/src/routes/multi_search.rs +++ b/meilisearch/src/routes/multi_search.rs @@ -43,7 +43,7 @@ pub async fn multi_search_with_post( let federated_search = params.into_inner(); - let mut multi_aggregate = MultiSearchAggregator::from_federated_search(&federated_search, &req); + let mut multi_aggregate = MultiSearchAggregator::from_federated_search(&federated_search); let FederatedSearch { mut queries, federation } = federated_search; diff 
--git a/meilisearch/src/routes/swap_indexes.rs b/meilisearch/src/routes/swap_indexes.rs index 42ebd7858..abdffbb73 100644 --- a/meilisearch/src/routes/swap_indexes.rs +++ b/meilisearch/src/routes/swap_indexes.rs @@ -9,7 +9,6 @@ use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::tasks::{IndexSwap, KindWithContent}; use serde::Serialize; -use serde_json::json; use super::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::analytics::{Aggregate, Analytics}; diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index 162d19ca1..f04e2ead2 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -180,7 +180,7 @@ struct TaskFilterAnalytics { marker: std::marker::PhantomData, } -impl Aggregate for TaskFilterAnalytics { +impl Aggregate for TaskFilterAnalytics { fn event_name(&self) -> &'static str { Method::event_name() } From 6728cfbfac2a1b3e56b7bb7f13687dc610b48ca3 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Oct 2024 00:38:18 +0200 Subject: [PATCH 55/92] fix the analytics --- Cargo.lock | 7 ++ meilisearch/Cargo.toml | 1 + meilisearch/src/analytics/mod.rs | 34 ++++++--- .../src/analytics/segment_analytics.rs | 76 ++++++++++++------- 4 files changed, 81 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c85a59952..733470384 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3415,6 +3415,7 @@ dependencies = [ "meilisearch-types", "mimalloc", "mime", + "mopa", "num_cpus", "obkv", "once_cell", @@ -3681,6 +3682,12 @@ dependencies = [ "syn 2.0.60", ] +[[package]] +name = "mopa" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a785740271256c230f57462d3b83e52f998433a7062fc18f96d5999474a9f915" + [[package]] name = "mutually_exclusive_features" version = "0.0.3" diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 6c2fb4060..322b333ac 100644 --- a/meilisearch/Cargo.toml +++ 
b/meilisearch/Cargo.toml @@ -104,6 +104,7 @@ tracing-trace = { version = "0.1.0", path = "../tracing-trace" } tracing-actix-web = "0.7.11" build-info = { version = "1.7.0", path = "../build-info" } roaring = "0.10.2" +mopa = "0.2.2" [dev-dependencies] actix-rt = "2.10.0" diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index ab6fd9993..8a0a68bad 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -6,9 +6,9 @@ use std::str::FromStr; use actix_web::HttpRequest; use meilisearch_types::InstanceUid; +use mopa::mopafy; use once_cell::sync::Lazy; use platform_dirs::AppDirs; -use serde::Serialize; // if the feature analytics is enabled we use the real analytics pub type SegmentAnalytics = segment_analytics::SegmentAnalytics; @@ -31,11 +31,11 @@ macro_rules! empty_analytics { $event_name } - fn aggregate(self, _other: Self) -> Self { + fn aggregate(self: Box, _other: Box) -> Box { self } - fn into_event(self) -> impl serde::Serialize { + fn into_event(self: Box) -> serde_json::Value { serde_json::json!({}) } } @@ -80,18 +80,34 @@ pub enum DocumentFetchKind { Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, } -pub trait Aggregate: 'static { +pub trait Aggregate: 'static + mopa::Any + Send { fn event_name(&self) -> &'static str; - fn aggregate(self, other: Self) -> Self + fn aggregate(self: Box, other: Box) -> Box where Self: Sized; - fn into_event(self) -> impl Serialize + fn downcast_aggregate( + this: Box, + other: Box, + ) -> Option> where - Self: Sized; + Self: Sized, + { + if this.is::() && other.is::() { + let this = this.downcast::().ok()?; + let other = other.downcast::().ok()?; + Some(Self::aggregate(this, other)) + } else { + None + } + } + + fn into_event(self: Box) -> serde_json::Value; } +mopafy!(Aggregate); + /// Helper trait to define multiple aggregate with the same content but a different name. 
/// Commonly used when you must aggregate a search with POST or with GET for example. pub trait AggregateMethod: 'static + Default { @@ -137,9 +153,9 @@ impl Analytics { } /// The method used to publish most analytics that do not need to be batched every hours - pub fn publish(&self, event: impl Aggregate, request: &HttpRequest) { + pub fn publish(&self, event: T, request: &HttpRequest) { let Some(ref segment) = self.segment else { return }; let user_agents = extract_user_agents(request); - let _ = segment.sender.try_send(Box::new(event)); + let _ = segment.sender.try_send(segment_analytics::Message::new(event)); } } diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 601fefa1e..1a1bb9226 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -1,7 +1,6 @@ -use std::any::{Any, TypeId}; +use std::any::TypeId; use std::collections::{BTreeSet, BinaryHeap, HashMap, HashSet}; use std::fs; -use std::mem::take; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -72,10 +71,26 @@ pub fn extract_user_agents(request: &HttpRequest) -> Vec { .collect() } +pub struct Message { + type_id: TypeId, + event: Box, + aggregator_function: fn(Box, Box) -> Option>, +} + +impl Message { + pub fn new(event: T) -> Self { + Self { + type_id: TypeId::of::(), + event: Box::new(event), + aggregator_function: T::downcast_aggregate, + } + } +} + pub struct SegmentAnalytics { pub instance_uid: InstanceUid, pub user: User, - pub sender: Sender>, + pub sender: Sender, } impl SegmentAnalytics { @@ -378,7 +393,7 @@ impl From for Infos { } pub struct Segment { - inbox: Receiver>, + inbox: Receiver, user: User, opt: Opt, batcher: AutoBatcher, @@ -435,8 +450,13 @@ impl Segment { }, msg = self.inbox.recv() => { match msg { - // Some(AnalyticsMsg::BatchMessage(msg)) => drop(self.batcher.push(msg).await), - Some(_) => todo!(), + Some(Message { 
type_id, event, aggregator_function }) => { + let new_event = match self.events.remove(&type_id) { + Some(old) => (aggregator_function)(old, event).unwrap(), + None => event, + }; + self.events.insert(type_id, new_event); + }, None => (), } } @@ -479,9 +499,9 @@ impl Segment { // We empty the list of events let events = std::mem::take(&mut self.events); - for (_, mut event) in events { + for (_, event) in events { self.batcher.push(Track { - user: self.user, + user: self.user.clone(), event: event.event_name().to_string(), properties: event.into_event(), timestamp: todo!(), @@ -722,11 +742,11 @@ impl Aggregate for SearchAggregator { Method::event_name() } - fn aggregate(mut self, mut other: Self) -> Self { + fn aggregate(mut self: Box, other: Box) -> Box { let Self { total_received, total_succeeded, - ref mut time_spent, + mut time_spent, sort_with_geo_point, sort_sum_of_criteria_terms, sort_total_number_of_criteria, @@ -761,9 +781,9 @@ impl Aggregate for SearchAggregator { total_degraded, total_used_negative_operator, ranking_score_threshold, - ref mut locales, + mut locales, marker: _, - } = other; + } = *other; // request self.total_received = self.total_received.saturating_add(total_received); @@ -771,7 +791,7 @@ impl Aggregate for SearchAggregator { self.total_degraded = self.total_degraded.saturating_add(total_degraded); self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(total_used_negative_operator); - self.time_spent.append(time_spent); + self.time_spent.append(&mut time_spent); // sort self.sort_with_geo_point |= sort_with_geo_point; @@ -843,12 +863,12 @@ impl Aggregate for SearchAggregator { self.ranking_score_threshold |= ranking_score_threshold; // locales - self.locales.append(locales); + self.locales.append(&mut locales); self } - fn into_event(self) -> impl Serialize { + fn into_event(self: Box) -> serde_json::Value { let Self { total_received, total_succeeded, @@ -889,7 +909,7 @@ impl Aggregate for SearchAggregator 
{ ranking_score_threshold, locales, marker: _, - } = self; + } = *self; // we get all the values in a sorted manner let time_spent = time_spent.into_sorted_vec(); @@ -1058,11 +1078,11 @@ impl Aggregate for MultiSearchAggregator { } /// Aggregate one [MultiSearchAggregator] into another. - fn aggregate(self, other: Self) -> Self { + fn aggregate(self: Box, other: Box) -> Box { // write the aggregate in a way that will cause a compilation error if a field is added. // get ownership of self, replacing it by a default value. - let this = self; + let this = *self; let total_received = this.total_received.saturating_add(other.total_received); let total_succeeded = this.total_succeeded.saturating_add(other.total_succeeded); @@ -1075,7 +1095,7 @@ impl Aggregate for MultiSearchAggregator { this.show_ranking_score_details || other.show_ranking_score_details; let use_federation = this.use_federation || other.use_federation; - Self { + Box::new(Self { total_received, total_succeeded, total_distinct_index_count, @@ -1084,10 +1104,10 @@ impl Aggregate for MultiSearchAggregator { show_ranking_score, show_ranking_score_details, use_federation, - } + }) } - fn into_event(self) -> impl Serialize { + fn into_event(self: Box) -> serde_json::Value { let Self { total_received, total_succeeded, @@ -1097,7 +1117,7 @@ impl Aggregate for MultiSearchAggregator { show_ranking_score, show_ranking_score_details, use_federation, - } = self; + } = *self; json!({ "requests": { @@ -1708,11 +1728,11 @@ impl Aggregate for SimilarAggregator { } /// Aggregate one [SimilarAggregator] into another. 
- fn aggregate(mut self, mut other: Self) -> Self { + fn aggregate(mut self: Box, other: Box) -> Box { let Self { total_received, total_succeeded, - ref mut time_spent, + mut time_spent, filter_with_geo_radius, filter_with_geo_bounding_box, filter_sum_of_criteria_terms, @@ -1726,12 +1746,12 @@ impl Aggregate for SimilarAggregator { ranking_score_threshold, retrieve_vectors, marker: _, - } = other; + } = *other; // request self.total_received = self.total_received.saturating_add(total_received); self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); - self.time_spent.append(time_spent); + self.time_spent.append(&mut time_spent); // filter self.filter_with_geo_radius |= filter_with_geo_radius; @@ -1763,7 +1783,7 @@ impl Aggregate for SimilarAggregator { self } - fn into_event(self) -> impl Serialize { + fn into_event(self: Box) -> serde_json::Value { let Self { total_received, total_succeeded, @@ -1781,7 +1801,7 @@ impl Aggregate for SimilarAggregator { ranking_score_threshold, retrieve_vectors, marker: _, - } = self; + } = *self; // we get all the values in a sorted manner let time_spent = time_spent.into_sorted_vec(); From aa7a34ffe8b9572c44b4bd36c30f7cf3805a9ed7 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Oct 2024 00:43:34 +0200 Subject: [PATCH 56/92] make the aggregate method send --- meilisearch/src/analytics/mod.rs | 2 +- meilisearch/src/analytics/segment_analytics.rs | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index 8a0a68bad..f8a589901 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -110,7 +110,7 @@ mopafy!(Aggregate); /// Helper trait to define multiple aggregate with the same content but a different name. /// Commonly used when you must aggregate a search with POST or with GET for example. 
-pub trait AggregateMethod: 'static + Default { +pub trait AggregateMethod: 'static + Default + Send { fn event_name() -> &'static str; } diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 1a1bb9226..92f03e48e 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -72,9 +72,12 @@ pub fn extract_user_agents(request: &HttpRequest) -> Vec { } pub struct Message { + // Since the type_id is solved statically we cannot retrieve it from the Box. + // Thus we have to send it in the message directly. type_id: TypeId, - event: Box, + // Same for the aggregate function. aggregator_function: fn(Box, Box) -> Option>, + event: Box, } impl Message { From e4ace98004fff86e35fe8dd4a2cdccfa8b03ce9f Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Oct 2024 01:04:25 +0200 Subject: [PATCH 57/92] fix all the routes + move to a better version of mopa --- Cargo.lock | 8 ++-- meilisearch/Cargo.toml | 2 +- meilisearch/src/analytics/mod.rs | 2 + meilisearch/src/routes/features.rs | 13 ++---- meilisearch/src/routes/indexes/documents.rs | 46 ++++++++----------- .../src/routes/indexes/facet_search.rs | 10 ++-- meilisearch/src/routes/indexes/mod.rs | 23 +++++----- meilisearch/src/routes/indexes/settings.rs | 16 ++----- meilisearch/src/routes/swap_indexes.rs | 10 ++-- meilisearch/src/routes/tasks.rs | 10 ++-- 10 files changed, 65 insertions(+), 75 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 733470384..500f28454 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3415,7 +3415,7 @@ dependencies = [ "meilisearch-types", "mimalloc", "mime", - "mopa", + "mopa-maintained", "num_cpus", "obkv", "once_cell", @@ -3683,10 +3683,10 @@ dependencies = [ ] [[package]] -name = "mopa" -version = "0.2.2" +name = "mopa-maintained" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a785740271256c230f57462d3b83e52f998433a7062fc18f96d5999474a9f915" +checksum = "79b7f3e22167862cc7c95b21a6f326c22e4bf40da59cbf000b368a310173ba11" [[package]] name = "mutually_exclusive_features" diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 322b333ac..07357e724 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -104,7 +104,7 @@ tracing-trace = { version = "0.1.0", path = "../tracing-trace" } tracing-actix-web = "0.7.11" build-info = { version = "1.7.0", path = "../build-info" } roaring = "0.10.2" -mopa = "0.2.2" +mopa-maintained = "0.2.3" [dev-dependencies] actix-rt = "2.10.0" diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index f8a589901..b3e8109a3 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -1,3 +1,5 @@ +#![allow(clippy::transmute_ptr_to_ref)] // mopa isn't updated with the latest version of clippy yet + pub mod segment_analytics; use std::fs; diff --git a/meilisearch/src/routes/features.rs b/meilisearch/src/routes/features.rs index 0b43c3f13..1de00717d 100644 --- a/meilisearch/src/routes/features.rs +++ b/meilisearch/src/routes/features.rs @@ -69,21 +69,18 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { "Experimental features Updated" } - fn aggregate(self, other: Self) -> Self - where - Self: Sized, - { - Self { + fn aggregate(self: Box, other: Box) -> Box { + Box::new(Self { vector_store: other.vector_store, metrics: other.metrics, logs_route: other.logs_route, edit_documents_by_function: other.edit_documents_by_function, contains_filter: other.contains_filter, - } + }) } - fn into_event(self) -> impl Serialize { - self + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() } } diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 1573b768b..854fa5b69 100644 ---
b/meilisearch/src/routes/indexes/documents.rs @@ -162,8 +162,8 @@ impl Aggregate for DocumentsFetchAggregator { Method::event_name() } - fn aggregate(self, other: Self) -> Self { - Self { + fn aggregate(self: Box, other: Box) -> Box { + Box::new(Self { total_received: self.total_received.saturating_add(other.total_received), per_document_id: self.per_document_id | other.per_document_id, per_filter: self.per_filter | other.per_filter, @@ -171,11 +171,11 @@ impl Aggregate for DocumentsFetchAggregator { max_limit: self.max_limit.max(other.max_limit), max_offset: self.max_offset.max(other.max_offset), marker: PhantomData, - } + }) } - fn into_event(self) -> impl Serialize { - self + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() } } @@ -226,21 +226,18 @@ impl Aggregate for DocumentsDeletionAggregator { "Documents Deleted" } - fn aggregate(self, other: Self) -> Self - where - Self: Sized, - { - Self { + fn aggregate(self: Box, other: Box) -> Box { + Box::new(Self { total_received: self.total_received.saturating_add(other.total_received), per_document_id: self.per_document_id | other.per_document_id, clear_all: self.clear_all | other.clear_all, per_batch: self.per_batch | other.per_batch, per_filter: self.per_filter | other.per_filter, - } + }) } - fn into_event(self) -> impl Serialize { - self + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() } } @@ -443,17 +440,17 @@ impl Aggregate for DocumentsAggregator { Method::event_name() } - fn aggregate(self, other: Self) -> Self { - Self { + fn aggregate(self: Box, other: Box) -> Box { + Box::new(Self { payload_types: self.payload_types.union(&other.payload_types).cloned().collect(), primary_key: self.primary_key.union(&other.primary_key).cloned().collect(), index_creation: self.index_creation | other.index_creation, method: PhantomData, - } + }) } - fn into_event(self) -> impl Serialize { - self + fn into_event(self: Box) -> 
serde_json::Value { + serde_json::to_value(self).unwrap_or_default() } } @@ -811,19 +808,16 @@ impl Aggregate for EditDocumentsByFunctionAggregator { "Documents Edited By Function" } - fn aggregate(self, other: Self) -> Self - where - Self: Sized, - { - Self { + fn aggregate(self: Box, other: Box) -> Box { + Box::new(Self { filtered: self.filtered | other.filtered, with_context: self.with_context | other.with_context, index_creation: self.index_creation | other.index_creation, - } + }) } - fn into_event(self) -> impl Serialize { - self + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() } } diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 08618970d..715eaaaa7 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -114,29 +114,29 @@ impl Aggregate for FacetSearchAggregator { "Facet Searched POST" } - fn aggregate(mut self, other: Self) -> Self { + fn aggregate(mut self: Box, other: Box) -> Box { for time in other.time_spent { self.time_spent.push(time); } - Self { + Box::new(Self { total_received: self.total_received.saturating_add(other.total_received), total_succeeded: self.total_succeeded.saturating_add(other.total_succeeded), time_spent: self.time_spent, facet_names: self.facet_names.union(&other.facet_names).cloned().collect(), additional_search_parameters_provided: self.additional_search_parameters_provided | other.additional_search_parameters_provided, - } + }) } - fn into_event(self) -> impl Serialize { + fn into_event(self: Box) -> serde_json::Value { let Self { total_received, total_succeeded, time_spent, facet_names, additional_search_parameters_provided, - } = self; + } = *self; // the index of the 99th percentage of value let percentile_99th = 0.99 * (total_succeeded as f64 - 1.) 
+ 1.; // we get all the values in a sorted manner diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index 3c41f36fe..8972119d7 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -133,15 +133,14 @@ impl Aggregate for IndexCreatedAggregate { "Index Created" } - fn aggregate(self, other: Self) -> Self - where - Self: Sized, - { - Self { primary_key: self.primary_key.union(&other.primary_key).cloned().collect() } + fn aggregate(self: Box, other: Box) -> Box { + Box::new(Self { + primary_key: self.primary_key.union(&other.primary_key).cloned().collect(), + }) } - fn into_event(self) -> impl Serialize { - self + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() } } @@ -225,12 +224,14 @@ impl Aggregate for IndexUpdatedAggregate { "Index Updated" } - fn aggregate(self, other: Self) -> Self { - Self { primary_key: self.primary_key.union(&other.primary_key).cloned().collect() } + fn aggregate(self: Box, other: Box) -> Box { + Box::new(Self { + primary_key: self.primary_key.union(&other.primary_key).cloned().collect(), + }) } - fn into_event(self) -> impl Serialize { - self + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() } } pub async fn update_index( diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index bb2f6792d..f31f52dc1 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -437,11 +437,8 @@ impl Aggregate for SettingsAnalytics { "Settings Updated" } - fn aggregate(self, other: Self) -> Self - where - Self: Sized, - { - Self { + fn aggregate(self: Box, other: Box) -> Box { + Box::new(Self { ranking_rules: RankingRulesAnalytics { words_position: self .ranking_rules @@ -586,14 +583,11 @@ impl Aggregate for SettingsAnalytics { non_separator_tokens: NonSeparatorTokensAnalytics { total: 
self.non_separator_tokens.total.or(other.non_separator_tokens.total), }, - } + }) } - fn into_event(self) -> impl Serialize - where - Self: Sized, - { - self + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() } } diff --git a/meilisearch/src/routes/swap_indexes.rs b/meilisearch/src/routes/swap_indexes.rs index abdffbb73..f7d8f4eff 100644 --- a/meilisearch/src/routes/swap_indexes.rs +++ b/meilisearch/src/routes/swap_indexes.rs @@ -39,12 +39,14 @@ impl Aggregate for IndexSwappedAnalytics { "Indexes Swapped" } - fn aggregate(self, other: Self) -> Self { - Self { swap_operation_number: self.swap_operation_number.max(other.swap_operation_number) } + fn aggregate(self: Box, other: Box) -> Box { + Box::new(Self { + swap_operation_number: self.swap_operation_number.max(other.swap_operation_number), + }) } - fn into_event(self) -> impl Serialize { - self + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() } } diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index f04e2ead2..ff4aee998 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -185,8 +185,8 @@ impl Aggregate for TaskFilterAnalytics Self { - Self { + fn aggregate(self: Box, other: Box) -> Box { + Box::new(Self { filtered_by_uid: self.filtered_by_uid | other.filtered_by_uid, filtered_by_index_uid: self.filtered_by_index_uid | other.filtered_by_index_uid, filtered_by_type: self.filtered_by_type | other.filtered_by_type, @@ -206,11 +206,11 @@ impl Aggregate for TaskFilterAnalytics impl Serialize { - self + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() } } From 7382fb21e41719a6be6dbf5f25b6c47ad7afc581 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Oct 2024 08:38:11 +0200 Subject: [PATCH 58/92] fix the main --- meilisearch/src/analytics/mod.rs | 24 +++++++++++++------ .../src/analytics/segment_analytics.rs | 
10 ++++---- meilisearch/src/lib.rs | 6 ++--- meilisearch/src/main.rs | 22 +++++------------ meilisearch/src/routes/indexes/search.rs | 4 ++-- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index b3e8109a3..91139e1dd 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -5,8 +5,11 @@ pub mod segment_analytics; use std::fs; use std::path::{Path, PathBuf}; use std::str::FromStr; +use std::sync::Arc; use actix_web::HttpRequest; +use index_scheduler::IndexScheduler; +use meilisearch_auth::AuthController; use meilisearch_types::InstanceUid; use mopa::mopafy; use once_cell::sync::Lazy; @@ -17,6 +20,8 @@ pub type SegmentAnalytics = segment_analytics::SegmentAnalytics; pub use segment_analytics::SearchAggregator; pub use segment_analytics::SimilarAggregator; +use crate::Opt; + use self::segment_analytics::extract_user_agents; pub type MultiSearchAggregator = segment_analytics::MultiSearchAggregator; pub type FacetSearchAggregator = segment_analytics::FacetSearchAggregator; @@ -137,17 +142,22 @@ macro_rules! 
aggregate_methods { }; } +#[derive(Clone)] pub struct Analytics { - segment: Option, + segment: Option>, } impl Analytics { - fn no_analytics() -> Self { - Self { segment: None } - } - - fn segment_analytics(segment: SegmentAnalytics) -> Self { - Self { segment: Some(segment) } + pub async fn new( + opt: &Opt, + index_scheduler: Arc, + auth_controller: Arc, + ) -> Self { + if opt.no_analytics { + Self { segment: None } + } else { + Self { segment: SegmentAnalytics::new(opt, index_scheduler, auth_controller).await } + } } pub fn instance_uid(&self) -> Option<&InstanceUid> { diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 92f03e48e..3496853ff 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -102,7 +102,7 @@ impl SegmentAnalytics { opt: &Opt, index_scheduler: Arc, auth_controller: Arc, - ) -> Arc { + ) -> Option> { let instance_uid = super::find_user_id(&opt.db_path); let first_time_run = instance_uid.is_none(); let instance_uid = instance_uid.unwrap_or_else(Uuid::new_v4); @@ -112,7 +112,7 @@ impl SegmentAnalytics { // if reqwest throws an error we won't be able to send analytics if client.is_err() { - return Arc::new(Analytics::no_analytics()); + return None; } let client = @@ -148,13 +148,13 @@ impl SegmentAnalytics { user: user.clone(), opt: opt.clone(), batcher, - events: todo!(), + events: HashMap::new(), }); tokio::spawn(segment.run(index_scheduler.clone(), auth_controller.clone())); let this = Self { instance_uid, sender, user: user.clone() }; - Arc::new(Analytics::segment_analytics(this)) + Some(Arc::new(this)) } } @@ -595,7 +595,7 @@ pub struct SearchAggregator { impl SearchAggregator { #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &SearchQuery, request: &HttpRequest) -> Self { + pub fn from_query(query: &SearchQuery) -> Self { let SearchQuery { q, vector, diff --git a/meilisearch/src/lib.rs 
b/meilisearch/src/lib.rs index 80177876a..633ad2776 100644 --- a/meilisearch/src/lib.rs +++ b/meilisearch/src/lib.rs @@ -120,7 +120,7 @@ pub fn create_app( search_queue: Data, opt: Opt, logs: (LogRouteHandle, LogStderrHandle), - analytics: Arc, + analytics: Data, enable_dashboard: bool, ) -> actix_web::App< impl ServiceFactory< @@ -473,14 +473,14 @@ pub fn configure_data( search_queue: Data, opt: &Opt, (logs_route, logs_stderr): (LogRouteHandle, LogStderrHandle), - analytics: Arc, + analytics: Data, ) { let http_payload_size_limit = opt.http_payload_size_limit.as_u64() as usize; config .app_data(index_scheduler) .app_data(auth) .app_data(search_queue) - .app_data(web::Data::from(analytics)) + .app_data(analytics) .app_data(web::Data::new(logs_route)) .app_data(web::Data::new(logs_stderr)) .app_data(web::Data::new(opt.clone())) diff --git a/meilisearch/src/main.rs b/meilisearch/src/main.rs index de9784d15..eebea3b6d 100644 --- a/meilisearch/src/main.rs +++ b/meilisearch/src/main.rs @@ -124,19 +124,12 @@ async fn try_main() -> anyhow::Result<()> { let (index_scheduler, auth_controller) = setup_meilisearch(&opt)?; - #[cfg(all(not(debug_assertions), feature = "analytics"))] - let analytics = if !opt.no_analytics { - analytics::SegmentAnalytics::new(&opt, index_scheduler.clone(), auth_controller.clone()) - .await - } else { - analytics::MockAnalytics::new(&opt) - }; - #[cfg(any(debug_assertions, not(feature = "analytics")))] - let analytics = analytics::MockAnalytics::new(&opt); + let analytics = + analytics::Analytics::new(&opt, index_scheduler.clone(), auth_controller.clone()).await; print_launch_resume(&opt, analytics.clone(), config_read_from); - run_http(index_scheduler, auth_controller, opt, log_handle, analytics).await?; + run_http(index_scheduler, auth_controller, opt, log_handle, Arc::new(analytics)).await?; Ok(()) } @@ -146,12 +139,13 @@ async fn run_http( auth_controller: Arc, opt: Opt, logs: (LogRouteHandle, LogStderrHandle), - analytics: Arc, + analytics: 
Arc, ) -> anyhow::Result<()> { let enable_dashboard = &opt.env == "development"; let opt_clone = opt.clone(); let index_scheduler = Data::from(index_scheduler); let auth_controller = Data::from(auth_controller); + let analytics = Data::from(analytics); let search_queue = SearchQueue::new( opt.experimental_search_queue_size, available_parallelism() @@ -187,11 +181,7 @@ async fn run_http( Ok(()) } -pub fn print_launch_resume( - opt: &Opt, - analytics: Arc, - config_read_from: Option, -) { +pub fn print_launch_resume(opt: &Opt, analytics: Analytics, config_read_from: Option) { let build_info = build_info::BuildInfo::from_build(); let protocol = diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 538c46fd0..ac6e23c8f 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -238,7 +238,7 @@ pub async fn search_with_url_query( add_search_rules(&mut query.filter, search_rules); } - let mut aggregate = SearchAggregator::::from_query(&query, &req); + let mut aggregate = SearchAggregator::::from_query(&query); let index = index_scheduler.index(&index_uid)?; let features = index_scheduler.features(); @@ -281,7 +281,7 @@ pub async fn search_with_post( add_search_rules(&mut query.filter, search_rules); } - let mut aggregate = SearchAggregator::::from_query(&query, &req); + let mut aggregate = SearchAggregator::::from_query(&query); let index = index_scheduler.index(&index_uid)?; From ef77c7699b21422b4857878d072494e1bfc49d6b Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Oct 2024 09:06:23 +0200 Subject: [PATCH 59/92] add the required shared values between all the events and fix the timestamp --- meilisearch/src/analytics/mod.rs | 6 +- .../src/analytics/segment_analytics.rs | 75 +++++++++++++------ 2 files changed, 57 insertions(+), 24 deletions(-) diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index 91139e1dd..a3b8d6d1d 100644 --- 
a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -166,8 +166,8 @@ impl Analytics { /// The method used to publish most analytics that do not need to be batched every hours pub fn publish(&self, event: T, request: &HttpRequest) { - let Some(ref segment) = self.segment else { return }; - let user_agents = extract_user_agents(request); - let _ = segment.sender.try_send(segment_analytics::Message::new(event)); + if let Some(ref segment) = self.segment { + let _ = segment.sender.try_send(segment_analytics::Message::new(event, request)); + } } } diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 3496853ff..00a3adaaf 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -28,7 +28,6 @@ use super::{ config_user_id_path, Aggregate, AggregateMethod, DocumentDeletionKind, DocumentFetchKind, MEILISEARCH_CONFIG_PATH, }; -use crate::analytics::Analytics; use crate::option::{ default_http_addr, IndexerOpts, LogMode, MaxMemory, MaxThreads, ScheduleSnapshot, }; @@ -58,7 +57,7 @@ fn write_user_id(db_path: &Path, user_id: &InstanceUid) { const SEGMENT_API_KEY: &str = "P3FWhhEsJiEDCuEHpmcN9DHcK4hVfBvb"; -pub fn extract_user_agents(request: &HttpRequest) -> Vec { +pub fn extract_user_agents(request: &HttpRequest) -> HashSet { request .headers() .get(ANALYTICS_HEADER) @@ -77,14 +76,26 @@ pub struct Message { type_id: TypeId, // Same for the aggregate function. 
aggregator_function: fn(Box, Box) -> Option>, - event: Box, + event: Event, +} + +pub struct Event { + original: Box, + timestamp: OffsetDateTime, + user_agents: HashSet, + total: usize, } impl Message { - pub fn new(event: T) -> Self { + pub fn new(event: T, request: &HttpRequest) -> Self { Self { type_id: TypeId::of::(), - event: Box::new(event), + event: Event { + original: Box::new(event), + timestamp: OffsetDateTime::now_utc(), + user_agents: extract_user_agents(request), + total: 1, + }, aggregator_function: T::downcast_aggregate, } } @@ -400,7 +411,7 @@ pub struct Segment { user: User, opt: Opt, batcher: AutoBatcher, - events: HashMap>, + events: HashMap, } impl Segment { @@ -451,22 +462,34 @@ impl Segment { _ = interval.tick() => { self.tick(index_scheduler.clone(), auth_controller.clone()).await; }, - msg = self.inbox.recv() => { - match msg { - Some(Message { type_id, event, aggregator_function }) => { - let new_event = match self.events.remove(&type_id) { - Some(old) => (aggregator_function)(old, event).unwrap(), - None => event, - }; - self.events.insert(type_id, new_event); - }, - None => (), - } - } + Some(msg) = self.inbox.recv() => { + self.handle_msg(msg); + } } } } + fn handle_msg(&mut self, Message { type_id, aggregator_function, event }: Message) { + let new_event = match self.events.remove(&type_id) { + Some(old) => { + // The function should never fail since we retrieved the corresponding TypeId in the map. 
But in the unfortunate + // case it could happens we're going to silently ignore the error + let Some(original) = (aggregator_function)(old.original, event.original) else { + return; + }; + Event { + original, + // We always want to return the FIRST timestamp ever encountered + timestamp: old.timestamp, + user_agents: old.user_agents.union(&event.user_agents).cloned().collect(), + total: old.total.saturating_add(event.total), + } + } + None => event, + }; + self.events.insert(type_id, new_event); + } + async fn tick( &mut self, index_scheduler: Arc, @@ -503,11 +526,21 @@ impl Segment { let events = std::mem::take(&mut self.events); for (_, event) in events { + let Event { original, timestamp, user_agents, total } = event; + let name = original.event_name(); + let mut properties = original.into_event(); + if properties["user-agent"].is_null() { + properties["user-agent"] = json!(user_agents); + }; + if properties["requests"]["total_received"].is_null() { + properties["requests"]["total_received"] = total.into(); + }; + self.batcher.push(Track { user: self.user.clone(), - event: event.event_name().to_string(), - properties: event.into_event(), - timestamp: todo!(), + event: name.to_string(), + properties, + timestamp: Some(timestamp), ..Default::default() }); } From 4ee65d870eab55f0c5098aaad659aa98fbd9d500 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Oct 2024 09:14:34 +0200 Subject: [PATCH 60/92] remove a lot of ununsed code --- meilisearch/src/analytics/mod.rs | 4 +- .../src/analytics/segment_analytics.rs | 598 +----------------- .../src/routes/indexes/facet_search.rs | 1 - 3 files changed, 17 insertions(+), 586 deletions(-) diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index a3b8d6d1d..d08f3307c 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -22,9 +22,7 @@ pub use segment_analytics::SimilarAggregator; use crate::Opt; -use self::segment_analytics::extract_user_agents; -pub type 
MultiSearchAggregator = segment_analytics::MultiSearchAggregator; -pub type FacetSearchAggregator = segment_analytics::FacetSearchAggregator; +pub use self::segment_analytics::MultiSearchAggregator; /// A macro used to quickly define events that don't aggregate or send anything besides an empty event with its name. #[macro_export] diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 00a3adaaf..1edfa1bdd 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -5,7 +5,7 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::{Duration, Instant}; -use actix_web::http::header::{CONTENT_TYPE, USER_AGENT}; +use actix_web::http::header::USER_AGENT; use actix_web::HttpRequest; use byte_unit::Byte; use index_scheduler::IndexScheduler; @@ -24,21 +24,15 @@ use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; use uuid::Uuid; -use super::{ - config_user_id_path, Aggregate, AggregateMethod, DocumentDeletionKind, DocumentFetchKind, - MEILISEARCH_CONFIG_PATH, -}; +use super::{config_user_id_path, Aggregate, AggregateMethod, MEILISEARCH_CONFIG_PATH}; use crate::option::{ default_http_addr, IndexerOpts, LogMode, MaxMemory, MaxThreads, ScheduleSnapshot, }; -use crate::routes::indexes::documents::{DocumentEditionByFunction, UpdateDocumentsQuery}; -use crate::routes::indexes::facet_search::FacetSearchQuery; use crate::routes::{create_all_stats, Stats}; use crate::search::{ - FacetSearchResult, FederatedSearch, MatchingStrategy, SearchQuery, SearchQueryWithIndex, - SearchResult, SimilarQuery, SimilarResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, - DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, - DEFAULT_SEMANTIC_RATIO, + FederatedSearch, SearchQuery, SearchQueryWithIndex, SearchResult, SimilarQuery, SimilarResult, + DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, + 
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEMANTIC_RATIO, }; use crate::{aggregate_methods, Opt}; @@ -75,6 +69,7 @@ pub struct Message { // Thus we have to send it in the message directly. type_id: TypeId, // Same for the aggregate function. + #[allow(clippy::type_complexity)] aggregator_function: fn(Box, Box) -> Option>, event: Event, } @@ -169,97 +164,6 @@ impl SegmentAnalytics { } } -/* -impl super::Analytics for SegmentAnalytics { - fn instance_uid(&self) -> Option<&InstanceUid> { - Some(&self.instance_uid) - } - - fn publish(&self, event_name: String, mut send: Value, request: Option<&HttpRequest>) { - let user_agent = request.map(extract_user_agents); - - send["user-agent"] = json!(user_agent); - let event = Track { - user: self.user.clone(), - event: event_name.clone(), - properties: send, - ..Default::default() - }; - let _ = self.sender.try_send(AnalyticsMsg::BatchMessage(event)); - } - - fn get_search(&self, aggregate: SearchAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregateGetSearch(aggregate)); - } - - fn post_search(&self, aggregate: SearchAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostSearch(aggregate)); - } - - fn get_similar(&self, aggregate: SimilarAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregateGetSimilar(aggregate)); - } - - fn post_similar(&self, aggregate: SimilarAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostSimilar(aggregate)); - } - - fn post_facet_search(&self, aggregate: FacetSearchAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFacetSearch(aggregate)); - } - - fn post_multi_search(&self, aggregate: MultiSearchAggregator) { - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostMultiSearch(aggregate)); - } - - fn add_documents( - &self, - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ) { - let aggregate = DocumentsAggregator::from_query(documents_query, 
index_creation, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateAddDocuments(aggregate)); - } - - fn delete_documents(&self, kind: DocumentDeletionKind, request: &HttpRequest) { - let aggregate = DocumentsDeletionAggregator::from_query(kind, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateDeleteDocuments(aggregate)); - } - - fn update_documents( - &self, - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ) { - let aggregate = DocumentsAggregator::from_query(documents_query, index_creation, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateUpdateDocuments(aggregate)); - } - - fn update_documents_by_function( - &self, - documents_query: &DocumentEditionByFunction, - index_creation: bool, - request: &HttpRequest, - ) { - let aggregate = - EditDocumentsByFunctionAggregator::from_query(documents_query, index_creation, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateEditDocumentsByFunction(aggregate)); - } - - fn get_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest) { - let aggregate = DocumentsFetchAggregator::from_query(documents_query, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregateGetFetchDocuments(aggregate)); - } - - fn post_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest) { - let aggregate = DocumentsFetchAggregator::from_query(documents_query, request); - let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate)); - } -} -*/ - /// This structure represent the `infos` field we send in the analytics. /// It's quite close to the `Opt` structure except all sensitive informations /// have been simplified to a boolean. 
@@ -536,13 +440,16 @@ impl Segment { properties["requests"]["total_received"] = total.into(); }; - self.batcher.push(Track { - user: self.user.clone(), - event: name.to_string(), - properties, - timestamp: Some(timestamp), - ..Default::default() - }); + let _ = self + .batcher + .push(Track { + user: self.user.clone(), + event: name.to_string(), + properties, + timestamp: Some(timestamp), + ..Default::default() + }) + .await; } let _ = self.batcher.flush().await; @@ -1181,479 +1088,6 @@ impl Aggregate for MultiSearchAggregator { } } -#[derive(Default)] -pub struct FacetSearchAggregator { - timestamp: Option, - - // context - user_agents: HashSet, - - // requests - total_received: usize, - total_succeeded: usize, - time_spent: BinaryHeap, - - // The set of all facetNames that were used - facet_names: HashSet, - - // As there been any other parameter than the facetName or facetQuery ones? - additional_search_parameters_provided: bool, -} - -impl FacetSearchAggregator { - #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &FacetSearchQuery, request: &HttpRequest) -> Self { - let FacetSearchQuery { - facet_query: _, - facet_name, - vector, - q, - filter, - matching_strategy, - attributes_to_search_on, - hybrid, - ranking_score_threshold, - locales, - } = query; - - let mut ret = Self::default(); - ret.timestamp = Some(OffsetDateTime::now_utc()); - - ret.total_received = 1; - ret.user_agents = extract_user_agents(request).into_iter().collect(); - ret.facet_names = Some(facet_name.clone()).into_iter().collect(); - - ret.additional_search_parameters_provided = q.is_some() - || vector.is_some() - || filter.is_some() - || *matching_strategy != MatchingStrategy::default() - || attributes_to_search_on.is_some() - || hybrid.is_some() - || ranking_score_threshold.is_some() - || locales.is_some(); - - ret - } - - pub fn succeed(&mut self, result: &FacetSearchResult) { - let FacetSearchResult { facet_hits: _, facet_query: _, processing_time_ms } = result; - 
self.total_succeeded = self.total_succeeded.saturating_add(1); - self.time_spent.push(*processing_time_ms as usize); - } - - /// Aggregate one [FacetSearchAggregator] into another. - pub fn aggregate(&mut self, mut other: Self) { - let Self { - timestamp, - user_agents, - total_received, - total_succeeded, - ref mut time_spent, - facet_names, - additional_search_parameters_provided, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // context - for user_agent in user_agents.into_iter() { - self.user_agents.insert(user_agent); - } - - // request - self.total_received = self.total_received.saturating_add(total_received); - self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); - self.time_spent.append(time_spent); - - // facet_names - for facet_name in facet_names.into_iter() { - self.facet_names.insert(facet_name); - } - - // additional_search_parameters_provided - self.additional_search_parameters_provided |= additional_search_parameters_provided; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { - timestamp, - user_agents, - total_received, - total_succeeded, - time_spent, - facet_names, - additional_search_parameters_provided, - } = self; - - if total_received == 0 { - None - } else { - // the index of the 99th percentage of value - let percentile_99th = 0.99 * (total_succeeded as f64 - 1.) 
+ 1.; - // we get all the values in a sorted manner - let time_spent = time_spent.into_sorted_vec(); - // We are only interested by the slowest value of the 99th fastest results - let time_spent = time_spent.get(percentile_99th as usize); - - let properties = json!({ - "user-agent": user_agents, - "requests": { - "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - }, - "facets": { - "total_distinct_facet_count": facet_names.len(), - "additional_search_parameters_provided": additional_search_parameters_provided, - }, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } - } -} - -#[derive(Default)] -pub struct DocumentsAggregator { - timestamp: Option, - - // set to true when at least one request was received - updated: bool, - - // context - user_agents: HashSet, - - content_types: HashSet, - primary_keys: HashSet, - index_creation: bool, -} - -impl DocumentsAggregator { - pub fn from_query( - documents_query: &UpdateDocumentsQuery, - index_creation: bool, - request: &HttpRequest, - ) -> Self { - let UpdateDocumentsQuery { primary_key, csv_delimiter: _ } = documents_query; - - let mut primary_keys = HashSet::new(); - if let Some(primary_key) = primary_key.clone() { - primary_keys.insert(primary_key); - } - - let mut content_types = HashSet::new(); - let content_type = request - .headers() - .get(CONTENT_TYPE) - .and_then(|s| s.to_str().ok()) - .unwrap_or("unknown") - .to_string(); - content_types.insert(content_type); - - Self { - timestamp: Some(OffsetDateTime::now_utc()), - updated: true, - user_agents: extract_user_agents(request).into_iter().collect(), - content_types, - primary_keys, - index_creation, - } - } - - /// Aggregate one [DocumentsAggregator] into another. 
- pub fn aggregate(&mut self, other: Self) { - let Self { timestamp, user_agents, primary_keys, content_types, index_creation, updated } = - other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - self.updated |= updated; - // we can't create a union because there is no `into_union` method - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - for primary_key in primary_keys { - self.primary_keys.insert(primary_key); - } - for content_type in content_types { - self.content_types.insert(content_type); - } - self.index_creation |= index_creation; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { timestamp, user_agents, primary_keys, content_types, index_creation, updated } = - self; - - if !updated { - None - } else { - let properties = json!({ - "user-agent": user_agents, - "payload_type": content_types, - "primary_key": primary_keys, - "index_creation": index_creation, - }); - - Some(Track { - timestamp, - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } - } -} - -#[derive(Default)] -pub struct EditDocumentsByFunctionAggregator { - timestamp: Option, - - // Set to true if at least one request was filtered - filtered: bool, - // Set to true if at least one request contained a context - with_context: bool, - - // context - user_agents: HashSet, - - index_creation: bool, -} - -impl EditDocumentsByFunctionAggregator { - pub fn from_query( - documents_query: &DocumentEditionByFunction, - index_creation: bool, - request: &HttpRequest, - ) -> Self { - let DocumentEditionByFunction { filter, context, function: _ } = documents_query; - - Self { - timestamp: Some(OffsetDateTime::now_utc()), - user_agents: extract_user_agents(request).into_iter().collect(), - filtered: filter.is_some(), - with_context: context.is_some(), - index_creation, - } - } - - /// Aggregate one [DocumentsAggregator] into another. 
- pub fn aggregate(&mut self, other: Self) { - let Self { timestamp, user_agents, index_creation, filtered, with_context } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // we can't create a union because there is no `into_union` method - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - self.index_creation |= index_creation; - self.filtered |= filtered; - self.with_context |= with_context; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - let Self { timestamp, user_agents, index_creation, filtered, with_context } = self; - - // if we had no timestamp it means we never encountered any events and - // thus we don't need to send this event. - let timestamp = timestamp?; - - let properties = json!({ - "user-agent": user_agents, - "filtered": filtered, - "with_context": with_context, - "index_creation": index_creation, - }); - - Some(Track { - timestamp: Some(timestamp), - user: user.clone(), - event: event_name.to_string(), - properties, - ..Default::default() - }) - } -} - -#[derive(Default, Serialize)] -pub struct DocumentsDeletionAggregator { - #[serde(skip)] - timestamp: Option, - - // context - #[serde(rename = "user-agent")] - user_agents: HashSet, - - #[serde(rename = "requests.total_received")] - total_received: usize, - per_document_id: bool, - clear_all: bool, - per_batch: bool, - per_filter: bool, -} - -impl DocumentsDeletionAggregator { - pub fn from_query(kind: DocumentDeletionKind, request: &HttpRequest) -> Self { - Self { - timestamp: Some(OffsetDateTime::now_utc()), - user_agents: extract_user_agents(request).into_iter().collect(), - total_received: 1, - per_document_id: matches!(kind, DocumentDeletionKind::PerDocumentId), - clear_all: matches!(kind, DocumentDeletionKind::ClearAll), - per_batch: matches!(kind, DocumentDeletionKind::PerBatch), - per_filter: matches!(kind, DocumentDeletionKind::PerFilter), - } - } - - /// Aggregate one [DocumentsAggregator] into 
another. - pub fn aggregate(&mut self, other: Self) { - let Self { - timestamp, - user_agents, - total_received, - per_document_id, - clear_all, - per_batch, - per_filter, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - - // we can't create a union because there is no `into_union` method - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - self.total_received = self.total_received.saturating_add(total_received); - self.per_document_id |= per_document_id; - self.clear_all |= clear_all; - self.per_batch |= per_batch; - self.per_filter |= per_filter; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - // if we had no timestamp it means we never encountered any events and - // thus we don't need to send this event. - let timestamp = self.timestamp?; - - Some(Track { - timestamp: Some(timestamp), - user: user.clone(), - event: event_name.to_string(), - properties: serde_json::to_value(self).ok()?, - ..Default::default() - }) - } -} - -#[derive(Default, Serialize)] -pub struct DocumentsFetchAggregator { - #[serde(skip)] - timestamp: Option, - - // context - #[serde(rename = "user-agent")] - user_agents: HashSet, - - #[serde(rename = "requests.total_received")] - total_received: usize, - - // a call on ../documents/:doc_id - per_document_id: bool, - // if a filter was used - per_filter: bool, - - #[serde(rename = "vector.retrieve_vectors")] - retrieve_vectors: bool, - - // pagination - #[serde(rename = "pagination.max_limit")] - max_limit: usize, - #[serde(rename = "pagination.max_offset")] - max_offset: usize, -} - -impl DocumentsFetchAggregator { - pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self { - let (limit, offset, retrieve_vectors) = match query { - DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), - DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. 
} => { - (*limit, *offset, *retrieve_vectors) - } - }; - Self { - timestamp: Some(OffsetDateTime::now_utc()), - user_agents: extract_user_agents(request).into_iter().collect(), - total_received: 1, - per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), - per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter), - max_limit: limit, - max_offset: offset, - retrieve_vectors, - } - } - - /// Aggregate one [DocumentsFetchAggregator] into another. - pub fn aggregate(&mut self, other: Self) { - let Self { - timestamp, - user_agents, - total_received, - per_document_id, - per_filter, - max_limit, - max_offset, - retrieve_vectors, - } = other; - - if self.timestamp.is_none() { - self.timestamp = timestamp; - } - for user_agent in user_agents { - self.user_agents.insert(user_agent); - } - - self.total_received = self.total_received.saturating_add(total_received); - self.per_document_id |= per_document_id; - self.per_filter |= per_filter; - - self.max_limit = self.max_limit.max(max_limit); - self.max_offset = self.max_offset.max(max_offset); - - self.retrieve_vectors |= retrieve_vectors; - } - - pub fn into_event(self, user: &User, event_name: &str) -> Option { - // if we had no timestamp it means we never encountered any events and - // thus we don't need to send this event. 
- let timestamp = self.timestamp?; - - Some(Track { - timestamp: Some(timestamp), - user: user.clone(), - event: event_name.to_string(), - properties: serde_json::to_value(self).ok()?, - ..Default::default() - }) - } -} - aggregate_methods!( SimilarPOST => "Similar POST", SimilarGET => "Similar GET", diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 715eaaaa7..8e40397c7 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -9,7 +9,6 @@ use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::locales::Locale; -use serde::Serialize; use serde_json::Value; use tracing::debug; From 0fde49640a3f76cce57414e88b6690aa90ff8523 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Oct 2024 09:18:25 +0200 Subject: [PATCH 61/92] make clippy happy --- meilisearch/src/main.rs | 1 - meilisearch/src/routes/indexes/settings.rs | 111 ++++++++------------- 2 files changed, 43 insertions(+), 69 deletions(-) diff --git a/meilisearch/src/main.rs b/meilisearch/src/main.rs index eebea3b6d..c0652bf1e 100644 --- a/meilisearch/src/main.rs +++ b/meilisearch/src/main.rs @@ -223,7 +223,6 @@ pub fn print_launch_resume(opt: &Opt, analytics: Analytics, config_read_from: Op eprintln!("Prototype:\t\t{:?}", prototype); } - #[cfg(all(not(debug_assertions), feature = "analytics"))] { if !opt.no_analytics { eprintln!( diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index f31f52dc1..745ad5c78 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -94,7 +94,7 @@ macro_rules! 
make_setting_route { #[allow(clippy::redundant_closure_call)] analytics.publish( - $crate::routes::indexes::settings::$analytics::new(body.as_ref()).to_settings(), + $crate::routes::indexes::settings::$analytics::new(body.as_ref()).into_settings(), &req, ); @@ -605,58 +605,33 @@ struct RankingRulesAnalytics { impl RankingRulesAnalytics { pub fn new(rr: Option<&Vec>) -> Self { RankingRulesAnalytics { - words_position: rr - .as_ref() - .map(|rr| { - rr.iter().position(|s| { - matches!(s, meilisearch_types::settings::RankingRuleView::Words) - }) + words_position: rr.as_ref().and_then(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words)) + }), + typo_position: rr.as_ref().and_then(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo)) + }), + proximity_position: rr.as_ref().and_then(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Proximity) }) - .flatten(), - - typo_position: rr - .as_ref() - .map(|rr| { - rr.iter().position(|s| { - matches!(s, meilisearch_types::settings::RankingRuleView::Typo) - }) + }), + attribute_position: rr.as_ref().and_then(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Attribute) }) - .flatten(), - - proximity_position: rr - .as_ref() - .map(|rr| { - rr.iter().position(|s| { - matches!(s, meilisearch_types::settings::RankingRuleView::Proximity) - }) + }), + sort_position: rr.as_ref().and_then(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort)) + }), + exactness_position: rr.as_ref().and_then(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Exactness) }) - .flatten(), - - attribute_position: rr - .as_ref() - .map(|rr| { - rr.iter().position(|s| { - matches!(s, meilisearch_types::settings::RankingRuleView::Attribute) - }) - }) - .flatten(), - sort_position: rr - .as_ref() 
- .map(|rr| { - rr.iter().position(|s| { - matches!(s, meilisearch_types::settings::RankingRuleView::Sort) - }) - }) - .flatten(), - exactness_position: rr - .as_ref() - .map(|rr| { - rr.iter().position(|s| { - matches!(s, meilisearch_types::settings::RankingRuleView::Exactness) - }) - }) - .flatten(), - + }), values: rr.as_ref().map(|rr| { rr.iter() .filter(|s| { @@ -673,7 +648,7 @@ impl RankingRulesAnalytics { } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { ranking_rules: self, ..Default::default() } } } @@ -694,7 +669,7 @@ impl SearchableAttributesAnalytics { } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { searchable_attributes: self, ..Default::default() } } } @@ -715,7 +690,7 @@ impl DisplayedAttributesAnalytics { } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { displayed_attributes: self, ..Default::default() } } } @@ -734,7 +709,7 @@ impl SortableAttributesAnalytics { } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { sortable_attributes: self, ..Default::default() } } } @@ -753,7 +728,7 @@ impl FilterableAttributesAnalytics { } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { filterable_attributes: self, ..Default::default() } } } @@ -768,7 +743,7 @@ impl DistinctAttributeAnalytics { Self { set: distinct.is_some() } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { distinct_attribute: self, ..Default::default() } } } @@ -784,7 +759,7 @@ impl ProximityPrecisionAnalytics { Self { set: precision.is_some(), value: precision.cloned() } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn 
into_settings(self) -> SettingsAnalytics { SettingsAnalytics { proximity_precision: self, ..Default::default() } } } @@ -818,7 +793,7 @@ impl TypoToleranceAnalytics { .flatten(), } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { typo_tolerance: self, ..Default::default() } } } @@ -846,7 +821,7 @@ impl FacetingAnalytics { } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { faceting: self, ..Default::default() } } } @@ -861,7 +836,7 @@ impl PaginationAnalytics { Self { max_total_hits: setting.as_ref().and_then(|s| s.max_total_hits.set()) } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { pagination: self, ..Default::default() } } } @@ -876,7 +851,7 @@ impl StopWordsAnalytics { Self { total: stop_words.as_ref().map(|stop_words| stop_words.len()) } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { stop_words: self, ..Default::default() } } } @@ -891,7 +866,7 @@ impl SynonymsAnalytics { Self { total: synonyms.as_ref().map(|synonyms| synonyms.len()) } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { synonyms: self, ..Default::default() } } } @@ -960,7 +935,7 @@ impl EmbeddersAnalytics { } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { embedders: self, ..Default::default() } } } @@ -976,7 +951,7 @@ impl SearchCutoffMsAnalytics { Self { search_cutoff_ms: setting.copied() } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { search_cutoff_ms: self, ..Default::default() } } } @@ -1001,7 +976,7 @@ impl LocalesAnalytics { } } - pub fn to_settings(self) -> 
SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { locales: self, ..Default::default() } } } @@ -1016,7 +991,7 @@ impl DictionaryAnalytics { Self { total: dictionary.as_ref().map(|dictionary| dictionary.len()) } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { dictionary: self, ..Default::default() } } } @@ -1031,7 +1006,7 @@ impl SeparatorTokensAnalytics { Self { total: separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()) } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { separator_tokens: self, ..Default::default() } } } @@ -1050,7 +1025,7 @@ impl NonSeparatorTokensAnalytics { } } - pub fn to_settings(self) -> SettingsAnalytics { + pub fn into_settings(self) -> SettingsAnalytics { SettingsAnalytics { non_separator_tokens: self, ..Default::default() } } } From d9115b74f09118b3bc687f9c0853bb74469b0d87 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Oct 2024 09:32:54 +0200 Subject: [PATCH 62/92] move the analytics settings code to a dedicated file --- meilisearch/src/routes/indexes/mod.rs | 1 + meilisearch/src/routes/indexes/settings.rs | 634 +----------------- .../src/routes/indexes/settings_analytics.rs | 627 +++++++++++++++++ 3 files changed, 632 insertions(+), 630 deletions(-) create mode 100644 meilisearch/src/routes/indexes/settings_analytics.rs diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index 8972119d7..65c81a57e 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -29,6 +29,7 @@ pub mod documents; pub mod facet_search; pub mod search; pub mod settings; +mod settings_analytics; pub mod similar; pub fn configure(cfg: &mut web::ServiceConfig) { diff --git a/meilisearch/src/routes/indexes/settings.rs b/meilisearch/src/routes/indexes/settings.rs index 
745ad5c78..bca763a99 100644 --- a/meilisearch/src/routes/indexes/settings.rs +++ b/meilisearch/src/routes/indexes/settings.rs @@ -1,23 +1,17 @@ -use std::collections::{BTreeSet, HashSet}; - +use super::settings_analytics::*; use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use deserr::actix_web::AwebJson; use index_scheduler::IndexScheduler; use meilisearch_types::deserr::DeserrJsonError; use meilisearch_types::error::ResponseError; -use meilisearch_types::facet_values_sort::FacetValuesSort; use meilisearch_types::index_uid::IndexUid; -use meilisearch_types::locales::Locale; use meilisearch_types::milli::update::Setting; -use meilisearch_types::settings::{ - settings, ProximityPrecisionView, RankingRuleView, SecretPolicy, Settings, Unchecked, -}; +use meilisearch_types::settings::{settings, SecretPolicy, Settings, Unchecked}; use meilisearch_types::tasks::KindWithContent; -use serde::Serialize; use tracing::debug; -use crate::analytics::{Aggregate, Analytics}; +use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; @@ -94,7 +88,7 @@ macro_rules! 
make_setting_route { #[allow(clippy::redundant_closure_call)] analytics.publish( - $crate::routes::indexes::settings::$analytics::new(body.as_ref()).into_settings(), + $crate::routes::indexes::settings_analytics::$analytics::new(body.as_ref()).into_settings(), &req, ); @@ -410,626 +404,6 @@ generate_configure!( search_cutoff_ms ); -#[derive(Serialize, Default)] -struct SettingsAnalytics { - ranking_rules: RankingRulesAnalytics, - searchable_attributes: SearchableAttributesAnalytics, - displayed_attributes: DisplayedAttributesAnalytics, - sortable_attributes: SortableAttributesAnalytics, - filterable_attributes: FilterableAttributesAnalytics, - distinct_attribute: DistinctAttributeAnalytics, - proximity_precision: ProximityPrecisionAnalytics, - typo_tolerance: TypoToleranceAnalytics, - faceting: FacetingAnalytics, - pagination: PaginationAnalytics, - stop_words: StopWordsAnalytics, - synonyms: SynonymsAnalytics, - embedders: EmbeddersAnalytics, - search_cutoff_ms: SearchCutoffMsAnalytics, - locales: LocalesAnalytics, - dictionary: DictionaryAnalytics, - separator_tokens: SeparatorTokensAnalytics, - non_separator_tokens: NonSeparatorTokensAnalytics, -} - -impl Aggregate for SettingsAnalytics { - fn event_name(&self) -> &'static str { - "Settings Updated" - } - - fn aggregate(self: Box, other: Box) -> Box { - Box::new(Self { - ranking_rules: RankingRulesAnalytics { - words_position: self - .ranking_rules - .words_position - .or(other.ranking_rules.words_position), - typo_position: self - .ranking_rules - .typo_position - .or(other.ranking_rules.typo_position), - proximity_position: self - .ranking_rules - .proximity_position - .or(other.ranking_rules.proximity_position), - attribute_position: self - .ranking_rules - .attribute_position - .or(other.ranking_rules.attribute_position), - sort_position: self - .ranking_rules - .sort_position - .or(other.ranking_rules.sort_position), - exactness_position: self - .ranking_rules - .exactness_position - 
.or(other.ranking_rules.exactness_position), - values: self.ranking_rules.values.or(other.ranking_rules.values), - }, - searchable_attributes: SearchableAttributesAnalytics { - total: self.searchable_attributes.total.or(other.searchable_attributes.total), - with_wildcard: self - .searchable_attributes - .with_wildcard - .or(other.searchable_attributes.with_wildcard), - }, - displayed_attributes: DisplayedAttributesAnalytics { - total: self.displayed_attributes.total.or(other.displayed_attributes.total), - with_wildcard: self - .displayed_attributes - .with_wildcard - .or(other.displayed_attributes.with_wildcard), - }, - sortable_attributes: SortableAttributesAnalytics { - total: self.sortable_attributes.total.or(other.sortable_attributes.total), - has_geo: self.sortable_attributes.has_geo.or(other.sortable_attributes.has_geo), - }, - filterable_attributes: FilterableAttributesAnalytics { - total: self.filterable_attributes.total.or(other.filterable_attributes.total), - has_geo: self.filterable_attributes.has_geo.or(other.filterable_attributes.has_geo), - }, - distinct_attribute: DistinctAttributeAnalytics { - set: self.distinct_attribute.set | other.distinct_attribute.set, - }, - proximity_precision: ProximityPrecisionAnalytics { - set: self.proximity_precision.set | other.proximity_precision.set, - value: self.proximity_precision.value.or(other.proximity_precision.value), - }, - typo_tolerance: TypoToleranceAnalytics { - enabled: self.typo_tolerance.enabled.or(other.typo_tolerance.enabled), - disable_on_attributes: self - .typo_tolerance - .disable_on_attributes - .or(other.typo_tolerance.disable_on_attributes), - disable_on_words: self - .typo_tolerance - .disable_on_words - .or(other.typo_tolerance.disable_on_words), - min_word_size_for_one_typo: self - .typo_tolerance - .min_word_size_for_one_typo - .or(other.typo_tolerance.min_word_size_for_one_typo), - min_word_size_for_two_typos: self - .typo_tolerance - .min_word_size_for_two_typos - 
.or(other.typo_tolerance.min_word_size_for_two_typos), - }, - faceting: FacetingAnalytics { - max_values_per_facet: self - .faceting - .max_values_per_facet - .or(other.faceting.max_values_per_facet), - sort_facet_values_by_star_count: self - .faceting - .sort_facet_values_by_star_count - .or(other.faceting.sort_facet_values_by_star_count), - sort_facet_values_by_total: self - .faceting - .sort_facet_values_by_total - .or(other.faceting.sort_facet_values_by_total), - }, - pagination: PaginationAnalytics { - max_total_hits: self.pagination.max_total_hits.or(other.pagination.max_total_hits), - }, - stop_words: StopWordsAnalytics { - total: self.stop_words.total.or(other.stop_words.total), - }, - synonyms: SynonymsAnalytics { total: self.synonyms.total.or(other.synonyms.total) }, - embedders: EmbeddersAnalytics { - total: self.embedders.total.or(other.embedders.total), - sources: match (self.embedders.sources, other.embedders.sources) { - (None, None) => None, - (Some(sources), None) | (None, Some(sources)) => Some(sources), - (Some(this), Some(other)) => Some(this.union(&other).cloned().collect()), - }, - document_template_used: match ( - self.embedders.document_template_used, - other.embedders.document_template_used, - ) { - (None, None) => None, - (Some(used), None) | (None, Some(used)) => Some(used), - (Some(this), Some(other)) => Some(this | other), - }, - document_template_max_bytes: match ( - self.embedders.document_template_max_bytes, - other.embedders.document_template_max_bytes, - ) { - (None, None) => None, - (Some(bytes), None) | (None, Some(bytes)) => Some(bytes), - (Some(this), Some(other)) => Some(this.max(other)), - }, - binary_quantization_used: match ( - self.embedders.binary_quantization_used, - other.embedders.binary_quantization_used, - ) { - (None, None) => None, - (Some(bq), None) | (None, Some(bq)) => Some(bq), - (Some(this), Some(other)) => Some(this | other), - }, - }, - search_cutoff_ms: SearchCutoffMsAnalytics { - search_cutoff_ms: self - 
.search_cutoff_ms - .search_cutoff_ms - .or(other.search_cutoff_ms.search_cutoff_ms), - }, - locales: LocalesAnalytics { locales: self.locales.locales.or(other.locales.locales) }, - dictionary: DictionaryAnalytics { - total: self.dictionary.total.or(other.dictionary.total), - }, - separator_tokens: SeparatorTokensAnalytics { - total: self.separator_tokens.total.or(other.non_separator_tokens.total), - }, - non_separator_tokens: NonSeparatorTokensAnalytics { - total: self.non_separator_tokens.total.or(other.non_separator_tokens.total), - }, - }) - } - - fn into_event(self: Box) -> serde_json::Value { - serde_json::to_value(*self).unwrap_or_default() - } -} - -#[derive(Serialize, Default)] -struct RankingRulesAnalytics { - words_position: Option, - typo_position: Option, - proximity_position: Option, - attribute_position: Option, - sort_position: Option, - exactness_position: Option, - values: Option, -} - -impl RankingRulesAnalytics { - pub fn new(rr: Option<&Vec>) -> Self { - RankingRulesAnalytics { - words_position: rr.as_ref().and_then(|rr| { - rr.iter() - .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words)) - }), - typo_position: rr.as_ref().and_then(|rr| { - rr.iter() - .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo)) - }), - proximity_position: rr.as_ref().and_then(|rr| { - rr.iter().position(|s| { - matches!(s, meilisearch_types::settings::RankingRuleView::Proximity) - }) - }), - attribute_position: rr.as_ref().and_then(|rr| { - rr.iter().position(|s| { - matches!(s, meilisearch_types::settings::RankingRuleView::Attribute) - }) - }), - sort_position: rr.as_ref().and_then(|rr| { - rr.iter() - .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort)) - }), - exactness_position: rr.as_ref().and_then(|rr| { - rr.iter().position(|s| { - matches!(s, meilisearch_types::settings::RankingRuleView::Exactness) - }) - }), - values: rr.as_ref().map(|rr| { - rr.iter() - .filter(|s| { - matches!( - 
s, - meilisearch_types::settings::RankingRuleView::Asc(_) - | meilisearch_types::settings::RankingRuleView::Desc(_) - ) - }) - .map(|x| x.to_string()) - .collect::>() - .join(", ") - }), - } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { ranking_rules: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct SearchableAttributesAnalytics { - total: Option, - with_wildcard: Option, -} - -impl SearchableAttributesAnalytics { - pub fn new(setting: Option<&Vec>) -> Self { - Self { - total: setting.as_ref().map(|searchable| searchable.len()), - with_wildcard: setting - .as_ref() - .map(|searchable| searchable.iter().any(|searchable| searchable == "*")), - } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { searchable_attributes: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct DisplayedAttributesAnalytics { - total: Option, - with_wildcard: Option, -} - -impl DisplayedAttributesAnalytics { - pub fn new(displayed: Option<&Vec>) -> Self { - Self { - total: displayed.as_ref().map(|displayed| displayed.len()), - with_wildcard: displayed - .as_ref() - .map(|displayed| displayed.iter().any(|displayed| displayed == "*")), - } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { displayed_attributes: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct SortableAttributesAnalytics { - total: Option, - has_geo: Option, -} - -impl SortableAttributesAnalytics { - pub fn new(setting: Option<&std::collections::BTreeSet>) -> Self { - Self { - total: setting.as_ref().map(|sort| sort.len()), - has_geo: setting.as_ref().map(|sort| sort.contains("_geo")), - } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { sortable_attributes: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct FilterableAttributesAnalytics { - total: Option, - has_geo: Option, -} - -impl 
FilterableAttributesAnalytics { - pub fn new(setting: Option<&std::collections::BTreeSet>) -> Self { - Self { - total: setting.as_ref().map(|filter| filter.len()), - has_geo: setting.as_ref().map(|filter| filter.contains("_geo")), - } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { filterable_attributes: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct DistinctAttributeAnalytics { - set: bool, -} - -impl DistinctAttributeAnalytics { - pub fn new(distinct: Option<&String>) -> Self { - Self { set: distinct.is_some() } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { distinct_attribute: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct ProximityPrecisionAnalytics { - set: bool, - value: Option, -} - -impl ProximityPrecisionAnalytics { - pub fn new(precision: Option<&meilisearch_types::settings::ProximityPrecisionView>) -> Self { - Self { set: precision.is_some(), value: precision.cloned() } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { proximity_precision: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct TypoToleranceAnalytics { - enabled: Option, - disable_on_attributes: Option, - disable_on_words: Option, - min_word_size_for_one_typo: Option, - min_word_size_for_two_typos: Option, -} - -impl TypoToleranceAnalytics { - pub fn new(setting: Option<&meilisearch_types::settings::TypoSettings>) -> Self { - Self { - enabled: setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))), - disable_on_attributes: setting - .as_ref() - .and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())), - disable_on_words: setting - .as_ref() - .and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())), - min_word_size_for_one_typo: setting - .as_ref() - .and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.one_typo.set())) - .flatten(), - 
min_word_size_for_two_typos: setting - .as_ref() - .and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.two_typos.set())) - .flatten(), - } - } - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { typo_tolerance: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct FacetingAnalytics { - max_values_per_facet: Option, - sort_facet_values_by_star_count: Option, - sort_facet_values_by_total: Option, -} - -impl FacetingAnalytics { - pub fn new(setting: Option<&meilisearch_types::settings::FacetingSettings>) -> Self { - Self { - max_values_per_facet: setting.as_ref().and_then(|s| s.max_values_per_facet.set()), - sort_facet_values_by_star_count: setting.as_ref().and_then(|s| { - s.sort_facet_values_by - .as_ref() - .set() - .map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count)) - }), - sort_facet_values_by_total: setting - .as_ref() - .and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())), - } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { faceting: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct PaginationAnalytics { - max_total_hits: Option, -} - -impl PaginationAnalytics { - pub fn new(setting: Option<&meilisearch_types::settings::PaginationSettings>) -> Self { - Self { max_total_hits: setting.as_ref().and_then(|s| s.max_total_hits.set()) } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { pagination: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct StopWordsAnalytics { - total: Option, -} - -impl StopWordsAnalytics { - pub fn new(stop_words: Option<&BTreeSet>) -> Self { - Self { total: stop_words.as_ref().map(|stop_words| stop_words.len()) } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { stop_words: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct SynonymsAnalytics { - total: Option, -} - -impl 
SynonymsAnalytics { - pub fn new(synonyms: Option<&std::collections::BTreeMap>>) -> Self { - Self { total: synonyms.as_ref().map(|synonyms| synonyms.len()) } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { synonyms: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct EmbeddersAnalytics { - // last - total: Option, - // Merge the sources - sources: Option>, - // |= - document_template_used: Option, - // max - document_template_max_bytes: Option, - // |= - binary_quantization_used: Option, -} - -impl EmbeddersAnalytics { - pub fn new( - setting: Option< - &std::collections::BTreeMap< - String, - Setting, - >, - >, - ) -> Self { - let mut sources = std::collections::HashSet::new(); - - if let Some(s) = &setting { - for source in s - .values() - .filter_map(|config| config.clone().set()) - .filter_map(|config| config.source.set()) - { - use meilisearch_types::milli::vector::settings::EmbedderSource; - match source { - EmbedderSource::OpenAi => sources.insert("openAi".to_string()), - EmbedderSource::HuggingFace => sources.insert("huggingFace".to_string()), - EmbedderSource::UserProvided => sources.insert("userProvided".to_string()), - EmbedderSource::Ollama => sources.insert("ollama".to_string()), - EmbedderSource::Rest => sources.insert("rest".to_string()), - }; - } - }; - - Self { - total: setting.as_ref().map(|s| s.len()), - sources: Some(sources), - document_template_used: setting.as_ref().map(|map| { - map.values() - .filter_map(|config| config.clone().set()) - .any(|config| config.document_template.set().is_some()) - }), - document_template_max_bytes: setting.as_ref().and_then(|map| { - map.values() - .filter_map(|config| config.clone().set()) - .filter_map(|config| config.document_template_max_bytes.set()) - .max() - }), - binary_quantization_used: setting.as_ref().map(|map| { - map.values() - .filter_map(|config| config.clone().set()) - .any(|config| config.binary_quantized.set().is_some()) - }), - } - } - 
- pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { embedders: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -#[serde(transparent)] -struct SearchCutoffMsAnalytics { - search_cutoff_ms: Option, -} - -impl SearchCutoffMsAnalytics { - pub fn new(setting: Option<&u64>) -> Self { - Self { search_cutoff_ms: setting.copied() } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { search_cutoff_ms: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -#[serde(transparent)] -struct LocalesAnalytics { - locales: Option>, -} - -impl LocalesAnalytics { - pub fn new( - rules: Option<&Vec>, - ) -> Self { - LocalesAnalytics { - locales: rules.as_ref().map(|rules| { - rules - .iter() - .flat_map(|rule| rule.locales.iter().cloned()) - .collect::>() - }), - } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { locales: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct DictionaryAnalytics { - total: Option, -} - -impl DictionaryAnalytics { - pub fn new(dictionary: Option<&std::collections::BTreeSet>) -> Self { - Self { total: dictionary.as_ref().map(|dictionary| dictionary.len()) } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { dictionary: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct SeparatorTokensAnalytics { - total: Option, -} - -impl SeparatorTokensAnalytics { - pub fn new(separator_tokens: Option<&std::collections::BTreeSet>) -> Self { - Self { total: separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()) } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { separator_tokens: self, ..Default::default() } - } -} - -#[derive(Serialize, Default)] -struct NonSeparatorTokensAnalytics { - total: Option, -} - -impl NonSeparatorTokensAnalytics { - pub fn new(non_separator_tokens: Option<&std::collections::BTreeSet>) -> Self { - Self { - 
total: non_separator_tokens - .as_ref() - .map(|non_separator_tokens| non_separator_tokens.len()), - } - } - - pub fn into_settings(self) -> SettingsAnalytics { - SettingsAnalytics { non_separator_tokens: self, ..Default::default() } - } -} - pub async fn update_all( index_scheduler: GuardedData, Data>, index_uid: web::Path, diff --git a/meilisearch/src/routes/indexes/settings_analytics.rs b/meilisearch/src/routes/indexes/settings_analytics.rs new file mode 100644 index 000000000..636ef3c57 --- /dev/null +++ b/meilisearch/src/routes/indexes/settings_analytics.rs @@ -0,0 +1,627 @@ +//! All the structures used to make the analytics on the settings works. +//! The signatures of the `new` functions are not very rust idiomatic because they must match the types received +//! through the sub-settings route directly without any manipulation. +//! This is why we often use a `Option<&Vec<_>>` instead of a `Option<&[_]>`. + +use meilisearch_types::locales::{Locale, LocalizedAttributesRuleView}; +use meilisearch_types::milli::update::Setting; +use meilisearch_types::milli::vector::settings::EmbeddingSettings; +use meilisearch_types::settings::{ + FacetingSettings, PaginationSettings, ProximityPrecisionView, TypoSettings, +}; +use meilisearch_types::{facet_values_sort::FacetValuesSort, settings::RankingRuleView}; +use serde::Serialize; +use std::collections::{BTreeMap, BTreeSet, HashSet}; + +use crate::analytics::Aggregate; + +#[derive(Serialize, Default)] +pub struct SettingsAnalytics { + pub ranking_rules: RankingRulesAnalytics, + pub searchable_attributes: SearchableAttributesAnalytics, + pub displayed_attributes: DisplayedAttributesAnalytics, + pub sortable_attributes: SortableAttributesAnalytics, + pub filterable_attributes: FilterableAttributesAnalytics, + pub distinct_attribute: DistinctAttributeAnalytics, + pub proximity_precision: ProximityPrecisionAnalytics, + pub typo_tolerance: TypoToleranceAnalytics, + pub faceting: FacetingAnalytics, + pub pagination: 
PaginationAnalytics, + pub stop_words: StopWordsAnalytics, + pub synonyms: SynonymsAnalytics, + pub embedders: EmbeddersAnalytics, + pub search_cutoff_ms: SearchCutoffMsAnalytics, + pub locales: LocalesAnalytics, + pub dictionary: DictionaryAnalytics, + pub separator_tokens: SeparatorTokensAnalytics, + pub non_separator_tokens: NonSeparatorTokensAnalytics, +} + +impl Aggregate for SettingsAnalytics { + fn event_name(&self) -> &'static str { + "Settings Updated" + } + + fn aggregate(self: Box, other: Box) -> Box { + Box::new(Self { + ranking_rules: RankingRulesAnalytics { + words_position: self + .ranking_rules + .words_position + .or(other.ranking_rules.words_position), + typo_position: self + .ranking_rules + .typo_position + .or(other.ranking_rules.typo_position), + proximity_position: self + .ranking_rules + .proximity_position + .or(other.ranking_rules.proximity_position), + attribute_position: self + .ranking_rules + .attribute_position + .or(other.ranking_rules.attribute_position), + sort_position: self + .ranking_rules + .sort_position + .or(other.ranking_rules.sort_position), + exactness_position: self + .ranking_rules + .exactness_position + .or(other.ranking_rules.exactness_position), + values: self.ranking_rules.values.or(other.ranking_rules.values), + }, + searchable_attributes: SearchableAttributesAnalytics { + total: self.searchable_attributes.total.or(other.searchable_attributes.total), + with_wildcard: self + .searchable_attributes + .with_wildcard + .or(other.searchable_attributes.with_wildcard), + }, + displayed_attributes: DisplayedAttributesAnalytics { + total: self.displayed_attributes.total.or(other.displayed_attributes.total), + with_wildcard: self + .displayed_attributes + .with_wildcard + .or(other.displayed_attributes.with_wildcard), + }, + sortable_attributes: SortableAttributesAnalytics { + total: self.sortable_attributes.total.or(other.sortable_attributes.total), + has_geo: 
self.sortable_attributes.has_geo.or(other.sortable_attributes.has_geo), + }, + filterable_attributes: FilterableAttributesAnalytics { + total: self.filterable_attributes.total.or(other.filterable_attributes.total), + has_geo: self.filterable_attributes.has_geo.or(other.filterable_attributes.has_geo), + }, + distinct_attribute: DistinctAttributeAnalytics { + set: self.distinct_attribute.set | other.distinct_attribute.set, + }, + proximity_precision: ProximityPrecisionAnalytics { + set: self.proximity_precision.set | other.proximity_precision.set, + value: self.proximity_precision.value.or(other.proximity_precision.value), + }, + typo_tolerance: TypoToleranceAnalytics { + enabled: self.typo_tolerance.enabled.or(other.typo_tolerance.enabled), + disable_on_attributes: self + .typo_tolerance + .disable_on_attributes + .or(other.typo_tolerance.disable_on_attributes), + disable_on_words: self + .typo_tolerance + .disable_on_words + .or(other.typo_tolerance.disable_on_words), + min_word_size_for_one_typo: self + .typo_tolerance + .min_word_size_for_one_typo + .or(other.typo_tolerance.min_word_size_for_one_typo), + min_word_size_for_two_typos: self + .typo_tolerance + .min_word_size_for_two_typos + .or(other.typo_tolerance.min_word_size_for_two_typos), + }, + faceting: FacetingAnalytics { + max_values_per_facet: self + .faceting + .max_values_per_facet + .or(other.faceting.max_values_per_facet), + sort_facet_values_by_star_count: self + .faceting + .sort_facet_values_by_star_count + .or(other.faceting.sort_facet_values_by_star_count), + sort_facet_values_by_total: self + .faceting + .sort_facet_values_by_total + .or(other.faceting.sort_facet_values_by_total), + }, + pagination: PaginationAnalytics { + max_total_hits: self.pagination.max_total_hits.or(other.pagination.max_total_hits), + }, + stop_words: StopWordsAnalytics { + total: self.stop_words.total.or(other.stop_words.total), + }, + synonyms: SynonymsAnalytics { total: self.synonyms.total.or(other.synonyms.total) }, + 
embedders: EmbeddersAnalytics { + total: self.embedders.total.or(other.embedders.total), + sources: match (self.embedders.sources, other.embedders.sources) { + (None, None) => None, + (Some(sources), None) | (None, Some(sources)) => Some(sources), + (Some(this), Some(other)) => Some(this.union(&other).cloned().collect()), + }, + document_template_used: match ( + self.embedders.document_template_used, + other.embedders.document_template_used, + ) { + (None, None) => None, + (Some(used), None) | (None, Some(used)) => Some(used), + (Some(this), Some(other)) => Some(this | other), + }, + document_template_max_bytes: match ( + self.embedders.document_template_max_bytes, + other.embedders.document_template_max_bytes, + ) { + (None, None) => None, + (Some(bytes), None) | (None, Some(bytes)) => Some(bytes), + (Some(this), Some(other)) => Some(this.max(other)), + }, + binary_quantization_used: match ( + self.embedders.binary_quantization_used, + other.embedders.binary_quantization_used, + ) { + (None, None) => None, + (Some(bq), None) | (None, Some(bq)) => Some(bq), + (Some(this), Some(other)) => Some(this | other), + }, + }, + search_cutoff_ms: SearchCutoffMsAnalytics { + search_cutoff_ms: self + .search_cutoff_ms + .search_cutoff_ms + .or(other.search_cutoff_ms.search_cutoff_ms), + }, + locales: LocalesAnalytics { locales: self.locales.locales.or(other.locales.locales) }, + dictionary: DictionaryAnalytics { + total: self.dictionary.total.or(other.dictionary.total), + }, + separator_tokens: SeparatorTokensAnalytics { + total: self.separator_tokens.total.or(other.non_separator_tokens.total), + }, + non_separator_tokens: NonSeparatorTokensAnalytics { + total: self.non_separator_tokens.total.or(other.non_separator_tokens.total), + }, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + serde_json::to_value(*self).unwrap_or_default() + } +} + +#[derive(Serialize, Default)] +pub struct RankingRulesAnalytics { + pub words_position: Option, + pub typo_position: Option, 
+ pub proximity_position: Option, + pub attribute_position: Option, + pub sort_position: Option, + pub exactness_position: Option, + pub values: Option, +} + +impl RankingRulesAnalytics { + pub fn new(rr: Option<&Vec>) -> Self { + RankingRulesAnalytics { + words_position: rr.as_ref().and_then(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words)) + }), + typo_position: rr.as_ref().and_then(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo)) + }), + proximity_position: rr.as_ref().and_then(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Proximity) + }) + }), + attribute_position: rr.as_ref().and_then(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Attribute) + }) + }), + sort_position: rr.as_ref().and_then(|rr| { + rr.iter() + .position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort)) + }), + exactness_position: rr.as_ref().and_then(|rr| { + rr.iter().position(|s| { + matches!(s, meilisearch_types::settings::RankingRuleView::Exactness) + }) + }), + values: rr.as_ref().map(|rr| { + rr.iter() + .filter(|s| { + matches!( + s, + meilisearch_types::settings::RankingRuleView::Asc(_) + | meilisearch_types::settings::RankingRuleView::Desc(_) + ) + }) + .map(|x| x.to_string()) + .collect::>() + .join(", ") + }), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { ranking_rules: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct SearchableAttributesAnalytics { + pub total: Option, + pub with_wildcard: Option, +} + +impl SearchableAttributesAnalytics { + pub fn new(setting: Option<&Vec>) -> Self { + Self { + total: setting.as_ref().map(|searchable| searchable.len()), + with_wildcard: setting + .as_ref() + .map(|searchable| searchable.iter().any(|searchable| searchable == "*")), + } + } + + pub fn 
into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { searchable_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct DisplayedAttributesAnalytics { + pub total: Option, + pub with_wildcard: Option, +} + +impl DisplayedAttributesAnalytics { + pub fn new(displayed: Option<&Vec>) -> Self { + Self { + total: displayed.as_ref().map(|displayed| displayed.len()), + with_wildcard: displayed + .as_ref() + .map(|displayed| displayed.iter().any(|displayed| displayed == "*")), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { displayed_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct SortableAttributesAnalytics { + pub total: Option, + pub has_geo: Option, +} + +impl SortableAttributesAnalytics { + pub fn new(setting: Option<&BTreeSet>) -> Self { + Self { + total: setting.as_ref().map(|sort| sort.len()), + has_geo: setting.as_ref().map(|sort| sort.contains("_geo")), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { sortable_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct FilterableAttributesAnalytics { + pub total: Option, + pub has_geo: Option, +} + +impl FilterableAttributesAnalytics { + pub fn new(setting: Option<&BTreeSet>) -> Self { + Self { + total: setting.as_ref().map(|filter| filter.len()), + has_geo: setting.as_ref().map(|filter| filter.contains("_geo")), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { filterable_attributes: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct DistinctAttributeAnalytics { + pub set: bool, +} + +impl DistinctAttributeAnalytics { + pub fn new(distinct: Option<&String>) -> Self { + Self { set: distinct.is_some() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { distinct_attribute: self, ..Default::default() } + } +} + 
+#[derive(Serialize, Default)] +pub struct ProximityPrecisionAnalytics { + pub set: bool, + pub value: Option, +} + +impl ProximityPrecisionAnalytics { + pub fn new(precision: Option<&ProximityPrecisionView>) -> Self { + Self { set: precision.is_some(), value: precision.cloned() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { proximity_precision: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct TypoToleranceAnalytics { + pub enabled: Option, + pub disable_on_attributes: Option, + pub disable_on_words: Option, + pub min_word_size_for_one_typo: Option, + pub min_word_size_for_two_typos: Option, +} + +impl TypoToleranceAnalytics { + pub fn new(setting: Option<&TypoSettings>) -> Self { + Self { + enabled: setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))), + disable_on_attributes: setting + .as_ref() + .and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())), + disable_on_words: setting + .as_ref() + .and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())), + min_word_size_for_one_typo: setting + .as_ref() + .and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.one_typo.set())) + .flatten(), + min_word_size_for_two_typos: setting + .as_ref() + .and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.two_typos.set())) + .flatten(), + } + } + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { typo_tolerance: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct FacetingAnalytics { + pub max_values_per_facet: Option, + pub sort_facet_values_by_star_count: Option, + pub sort_facet_values_by_total: Option, +} + +impl FacetingAnalytics { + pub fn new(setting: Option<&FacetingSettings>) -> Self { + Self { + max_values_per_facet: setting.as_ref().and_then(|s| s.max_values_per_facet.set()), + sort_facet_values_by_star_count: setting.as_ref().and_then(|s| { + s.sort_facet_values_by + .as_ref() + 
.set() + .map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count)) + }), + sort_facet_values_by_total: setting + .as_ref() + .and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { faceting: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct PaginationAnalytics { + pub max_total_hits: Option, +} + +impl PaginationAnalytics { + pub fn new(setting: Option<&PaginationSettings>) -> Self { + Self { max_total_hits: setting.as_ref().and_then(|s| s.max_total_hits.set()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { pagination: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct StopWordsAnalytics { + pub total: Option, +} + +impl StopWordsAnalytics { + pub fn new(stop_words: Option<&BTreeSet>) -> Self { + Self { total: stop_words.as_ref().map(|stop_words| stop_words.len()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { stop_words: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct SynonymsAnalytics { + pub total: Option, +} + +impl SynonymsAnalytics { + pub fn new(synonyms: Option<&BTreeMap>>) -> Self { + Self { total: synonyms.as_ref().map(|synonyms| synonyms.len()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { synonyms: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct EmbeddersAnalytics { + // last + pub total: Option, + // Merge the sources + pub sources: Option>, + // |= + pub document_template_used: Option, + // max + pub document_template_max_bytes: Option, + // |= + pub binary_quantization_used: Option, +} + +impl EmbeddersAnalytics { + pub fn new(setting: Option<&BTreeMap>>) -> Self { + let mut sources = std::collections::HashSet::new(); + + if let Some(s) = &setting { + for source in s + .values() + .filter_map(|config| 
config.clone().set()) + .filter_map(|config| config.source.set()) + { + use meilisearch_types::milli::vector::settings::EmbedderSource; + match source { + EmbedderSource::OpenAi => sources.insert("openAi".to_string()), + EmbedderSource::HuggingFace => sources.insert("huggingFace".to_string()), + EmbedderSource::UserProvided => sources.insert("userProvided".to_string()), + EmbedderSource::Ollama => sources.insert("ollama".to_string()), + EmbedderSource::Rest => sources.insert("rest".to_string()), + }; + } + }; + + Self { + total: setting.as_ref().map(|s| s.len()), + sources: Some(sources), + document_template_used: setting.as_ref().map(|map| { + map.values() + .filter_map(|config| config.clone().set()) + .any(|config| config.document_template.set().is_some()) + }), + document_template_max_bytes: setting.as_ref().and_then(|map| { + map.values() + .filter_map(|config| config.clone().set()) + .filter_map(|config| config.document_template_max_bytes.set()) + .max() + }), + binary_quantization_used: setting.as_ref().map(|map| { + map.values() + .filter_map(|config| config.clone().set()) + .any(|config| config.binary_quantized.set().is_some()) + }), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { embedders: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +#[serde(transparent)] +pub struct SearchCutoffMsAnalytics { + pub search_cutoff_ms: Option, +} + +impl SearchCutoffMsAnalytics { + pub fn new(setting: Option<&u64>) -> Self { + Self { search_cutoff_ms: setting.copied() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { search_cutoff_ms: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +#[serde(transparent)] +pub struct LocalesAnalytics { + pub locales: Option>, +} + +impl LocalesAnalytics { + pub fn new(rules: Option<&Vec>) -> Self { + LocalesAnalytics { + locales: rules.as_ref().map(|rules| { + rules + .iter() + .flat_map(|rule| rule.locales.iter().cloned()) + 
.collect::>() + }), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { locales: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct DictionaryAnalytics { + pub total: Option, +} + +impl DictionaryAnalytics { + pub fn new(dictionary: Option<&BTreeSet>) -> Self { + Self { total: dictionary.as_ref().map(|dictionary| dictionary.len()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { dictionary: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct SeparatorTokensAnalytics { + pub total: Option, +} + +impl SeparatorTokensAnalytics { + pub fn new(separator_tokens: Option<&BTreeSet>) -> Self { + Self { total: separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()) } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { separator_tokens: self, ..Default::default() } + } +} + +#[derive(Serialize, Default)] +pub struct NonSeparatorTokensAnalytics { + pub total: Option, +} + +impl NonSeparatorTokensAnalytics { + pub fn new(non_separator_tokens: Option<&BTreeSet>) -> Self { + Self { + total: non_separator_tokens + .as_ref() + .map(|non_separator_tokens| non_separator_tokens.len()), + } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { non_separator_tokens: self, ..Default::default() } + } +} From 18ac4032aa5512c96b0068d0603f4db285f81bd9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Oct 2024 09:35:11 +0200 Subject: [PATCH 63/92] Remove the experimental feature seen --- meilisearch/src/routes/features.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/meilisearch/src/routes/features.rs b/meilisearch/src/routes/features.rs index 1de00717d..8bdb3ffb3 100644 --- a/meilisearch/src/routes/features.rs +++ b/meilisearch/src/routes/features.rs @@ -17,24 +17,19 @@ use crate::extractors::sequential_extractor::SeqHandler; pub fn configure(cfg: &mut web::ServiceConfig) { 
cfg.service( web::resource("") - .route(web::get().to(SeqHandler(get_features))) + .route(web::get().to(get_features)) .route(web::patch().to(SeqHandler(patch_features))), ); } -crate::empty_analytics!(GetExperimentalFeatureAnalytics, "Experimental features Seen"); - async fn get_features( index_scheduler: GuardedData< ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_GET }>, Data, >, - req: HttpRequest, - analytics: Data, ) -> HttpResponse { let features = index_scheduler.features(); - analytics.publish(GetExperimentalFeatureAnalytics::default(), &req); let features = features.runtime_features(); debug!(returns = ?features, "Get features"); HttpResponse::Ok().json(features) From 1ab6fec9030351956fd2462dc5afb3b2b317860c Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Oct 2024 09:49:21 +0200 Subject: [PATCH 64/92] send all experimental features in the info event including the runtime one --- .../src/analytics/segment_analytics.rs | 44 +++++++++++++------ 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 1edfa1bdd..c0c2b64d8 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -10,6 +10,7 @@ use actix_web::HttpRequest; use byte_unit::Byte; use index_scheduler::IndexScheduler; use meilisearch_auth::{AuthController, AuthFilter}; +use meilisearch_types::features::RuntimeTogglableFeatures; use meilisearch_types::locales::Locale; use meilisearch_types::InstanceUid; use once_cell::sync::Lazy; @@ -173,7 +174,9 @@ impl SegmentAnalytics { struct Infos { env: String, experimental_contains_filter: bool, + experimental_vector_store: bool, experimental_enable_metrics: bool, + experimental_edit_documents_by_function: bool, experimental_search_queue_size: usize, experimental_drop_search_after: usize, experimental_nb_searches_per_core: usize, @@ -210,8 +213,8 @@ struct Infos { ssl_tickets: bool, } -impl From 
for Infos { - fn from(options: Opt) -> Self { +impl Infos { + pub fn new(options: Opt, features: RuntimeTogglableFeatures) -> Self { // We wants to decompose this whole struct by hand to be sure we don't forget // to add analytics when we add a field in the Opt. // Thus we must not insert `..` at the end. @@ -254,8 +257,7 @@ impl From for Infos { log_level, indexer_options, config_file_path, - #[cfg(feature = "analytics")] - no_analytics: _, + no_analytics: _, } = options; let schedule_snapshot = match schedule_snapshot { @@ -266,18 +268,28 @@ impl From for Infos { let IndexerOpts { max_indexing_memory, max_indexing_threads, skip_index_budget: _ } = indexer_options; + let RuntimeTogglableFeatures { + vector_store, + metrics, + logs_route, + edit_documents_by_function, + contains_filter, + } = features; + // We're going to override every sensible information. // We consider information sensible if it contains a path, an address, or a key. Self { env, - experimental_contains_filter, - experimental_enable_metrics, + experimental_contains_filter: experimental_contains_filter | contains_filter, + experimental_vector_store: vector_store, + experimental_edit_documents_by_function: edit_documents_by_function, + experimental_enable_metrics: experimental_enable_metrics | metrics, experimental_search_queue_size, experimental_drop_search_after: experimental_drop_search_after.into(), experimental_nb_searches_per_core: experimental_nb_searches_per_core.into(), experimental_logs_mode, experimental_replication_parameters, - experimental_enable_logs_route, + experimental_enable_logs_route: experimental_enable_logs_route | logs_route, experimental_reduce_indexing_memory_usage, gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(), db_path: db_path != PathBuf::from("./data.ms"), @@ -319,7 +331,7 @@ pub struct Segment { } impl Segment { - fn compute_traits(opt: &Opt, stats: Stats) -> Value { + fn compute_traits(opt: &Opt, stats: Stats, features: RuntimeTogglableFeatures) -> 
Value { static FIRST_START_TIMESTAMP: Lazy = Lazy::new(Instant::now); static SYSTEM: Lazy = Lazy::new(|| { let disks = Disks::new_with_refreshed_list(); @@ -347,7 +359,7 @@ impl Segment { "indexes_number": stats.indexes.len(), "documents_number": number_of_documents, }, - "infos": Infos::from(opt.clone()), + "infos": Infos::new(opt.clone(), features), }) } @@ -399,9 +411,11 @@ impl Segment { index_scheduler: Arc, auth_controller: Arc, ) { - if let Ok(stats) = - create_all_stats(index_scheduler.into(), auth_controller.into(), &AuthFilter::default()) - { + if let Ok(stats) = create_all_stats( + index_scheduler.clone().into(), + auth_controller.into(), + &AuthFilter::default(), + ) { // Replace the version number with the prototype name if any. let version = if let Some(prototype) = build_info::DescribeResult::from_build() .and_then(|describe| describe.as_prototype()) @@ -420,7 +434,11 @@ impl Segment { }, })), user: self.user.clone(), - traits: Self::compute_traits(&self.opt, stats), + traits: Self::compute_traits( + &self.opt, + stats, + index_scheduler.features().runtime_features(), + ), ..Default::default() }) .await; From fa1db6b7216fce5e9727dfacbcdccc770ef80f16 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Oct 2024 09:55:30 +0200 Subject: [PATCH 65/92] fix the tests --- meilisearch/src/analytics/mod.rs | 4 ++++ meilisearch/tests/common/service.rs | 5 +++-- meilisearch/tests/logs/mod.rs | 5 +++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index d08f3307c..75e8083c5 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -158,6 +158,10 @@ impl Analytics { } } + pub fn no_analytics() -> Self { + Self { segment: None } + } + pub fn instance_uid(&self) -> Option<&InstanceUid> { self.segment.as_ref().map(|segment| segment.instance_uid.as_ref()) } diff --git a/meilisearch/tests/common/service.rs b/meilisearch/tests/common/service.rs index 
8addbacf8..c0b07c217 100644 --- a/meilisearch/tests/common/service.rs +++ b/meilisearch/tests/common/service.rs @@ -9,8 +9,9 @@ use actix_web::test; use actix_web::test::TestRequest; use actix_web::web::Data; use index_scheduler::IndexScheduler; +use meilisearch::analytics::Analytics; use meilisearch::search_queue::SearchQueue; -use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer}; +use meilisearch::{create_app, Opt, SubscriberForSecondLayer}; use meilisearch_auth::AuthController; use tracing::level_filters::LevelFilter; use tracing_subscriber::Layer; @@ -141,7 +142,7 @@ impl Service { Data::new(search_queue), self.options.clone(), (route_layer_handle, stderr_layer_handle), - analytics::MockAnalytics::new(&self.options), + Data::new(Analytics::no_analytics()), true, )) .await diff --git a/meilisearch/tests/logs/mod.rs b/meilisearch/tests/logs/mod.rs index 9f4649dca..26482b561 100644 --- a/meilisearch/tests/logs/mod.rs +++ b/meilisearch/tests/logs/mod.rs @@ -7,8 +7,9 @@ use std::str::FromStr; use actix_web::http::header::ContentType; use actix_web::web::Data; use meili_snap::snapshot; +use meilisearch::analytics::Analytics; use meilisearch::search_queue::SearchQueue; -use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer}; +use meilisearch::{create_app, Opt, SubscriberForSecondLayer}; use tracing::level_filters::LevelFilter; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::Layer; @@ -54,7 +55,7 @@ async fn basic_test_log_stream_route() { Data::new(search_queue), server.service.options.clone(), (route_layer_handle, stderr_layer_handle), - analytics::MockAnalytics::new(&server.service.options), + Data::new(Analytics::no_analytics()), true, )) .await; From 3a7a20c7162b728a99327eb32b012f6651e7186b Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Oct 2024 11:14:33 +0200 Subject: [PATCH 66/92] remove the segment feature and always import segment --- meilisearch/Cargo.toml | 5 ++--- meilisearch/src/analytics/mod.rs 
| 21 +++++++++++++++---- .../src/analytics/segment_analytics.rs | 1 - meilisearch/src/option.rs | 9 +------- meilisearch/tests/common/server.rs | 1 - 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/meilisearch/Cargo.toml b/meilisearch/Cargo.toml index 07357e724..57202f59f 100644 --- a/meilisearch/Cargo.toml +++ b/meilisearch/Cargo.toml @@ -75,7 +75,7 @@ reqwest = { version = "0.12.5", features = [ rustls = { version = "0.23.11", features = ["ring"], default-features = false } rustls-pki-types = { version = "1.7.0", features = ["alloc"] } rustls-pemfile = "2.1.2" -segment = { version = "0.2.4", optional = true } +segment = { version = "0.2.4" } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order"] } sha2 = "0.10.8" @@ -132,8 +132,7 @@ tempfile = { version = "3.10.1", optional = true } zip = { version = "2.1.3", optional = true } [features] -default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"] -analytics = ["segment"] +default = ["meilisearch-types/all-tokenizations", "mini-dashboard"] mini-dashboard = [ "static-files", "anyhow", diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index 75e8083c5..67b830204 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -1,5 +1,3 @@ -#![allow(clippy::transmute_ptr_to_ref)] // mopify isn't updated with the latest version of clippy yet - pub mod segment_analytics; use std::fs; @@ -85,13 +83,19 @@ pub enum DocumentFetchKind { Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, } +/// To send an event to segment, your event must be able to aggregate itself with another event of the same type. pub trait Aggregate: 'static + mopa::Any + Send { + /// The name of the event that will be sent to segment. fn event_name(&self) -> &'static str; + /// Will be called every time an event has been used twice before segment flushed its buffer. 
fn aggregate(self: Box, other: Box) -> Box where Self: Sized; + /// An internal helper function, you shouldn't implement it yourself. + /// This function should always be called on the same type. If `this` and `other` + /// aren't the same type behind the function will do nothing and return `None`. fn downcast_aggregate( this: Box, other: Box, @@ -100,6 +104,7 @@ pub trait Aggregate: 'static + mopa::Any + Send { Self: Sized, { if this.is::() && other.is::() { + // Both the two following lines cannot fail, but just to be sure we don't crash, we're still avoiding unwrapping let this = this.downcast::().ok()?; let other = other.downcast::().ok()?; Some(Self::aggregate(this, other)) @@ -108,18 +113,26 @@ pub trait Aggregate: 'static + mopa::Any + Send { } } + /// Converts your structure to the final event that'll be sent to segment. fn into_event(self: Box) -> serde_json::Value; } mopafy!(Aggregate); -/// Helper trait to define multiple aggregate with the same content but a different name. -/// Commonly used when you must aggregate a search with POST or with GET for example. +/// Helper trait to define multiple aggregates with the same content but a different name. +/// Commonly used when you must aggregate a search with POST or with GET, for example. pub trait AggregateMethod: 'static + Default + Send { fn event_name() -> &'static str; } /// A macro used to quickly define multiple aggregate method with their name +/// Usage: +/// ```rust +/// aggregate_methods!( +/// SearchGET => "Documents Searched GET", +/// SearchPOST => "Documents Searched POST", +/// ); +/// ``` #[macro_export] macro_rules! 
aggregate_methods { ($method:ident => $event_name:literal) => { diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index c0c2b64d8..10927f49b 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -695,7 +695,6 @@ impl SearchAggregator { aggregate_methods!( SearchGET => "Documents Searched GET", SearchPOST => "Documents Searched POST", - ); impl Aggregate for SearchAggregator { diff --git a/meilisearch/src/option.rs b/meilisearch/src/option.rs index 02dc660a4..7e87a5a2c 100644 --- a/meilisearch/src/option.rs +++ b/meilisearch/src/option.rs @@ -29,7 +29,6 @@ const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY"; const MEILI_ENV: &str = "MEILI_ENV"; const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL"; const MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER: &str = "MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER"; -#[cfg(feature = "analytics")] const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS"; const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT"; const MEILI_SSL_CERT_PATH: &str = "MEILI_SSL_CERT_PATH"; @@ -210,7 +209,6 @@ pub struct Opt { /// Meilisearch automatically collects data from all instances that do not opt out using this flag. /// All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted /// at any time. - #[cfg(feature = "analytics")] #[serde(default)] // we can't send true #[clap(long, env = MEILI_NO_ANALYTICS)] pub no_analytics: bool, @@ -425,7 +423,6 @@ pub struct Opt { impl Opt { /// Whether analytics should be enabled or not. 
- #[cfg(all(not(debug_assertions), feature = "analytics"))] pub fn analytics(&self) -> bool { !self.no_analytics } @@ -505,7 +502,6 @@ impl Opt { ignore_missing_dump: _, ignore_dump_if_db_exists: _, config_file_path: _, - #[cfg(feature = "analytics")] no_analytics, experimental_contains_filter, experimental_enable_metrics, @@ -533,10 +529,7 @@ impl Opt { ); } - #[cfg(feature = "analytics")] - { - export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string()); - } + export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string()); export_to_env_if_not_present( MEILI_HTTP_PAYLOAD_SIZE_LIMIT, http_payload_size_limit.to_string(), diff --git a/meilisearch/tests/common/server.rs b/meilisearch/tests/common/server.rs index 6d331ebbc..92f181398 100644 --- a/meilisearch/tests/common/server.rs +++ b/meilisearch/tests/common/server.rs @@ -381,7 +381,6 @@ pub fn default_settings(dir: impl AsRef) -> Opt { db_path: dir.as_ref().join("db"), dump_dir: dir.as_ref().join("dumps"), env: "development".to_owned(), - #[cfg(feature = "analytics")] no_analytics: true, max_index_size: Byte::from_u64_with_unit(100, Unit::MiB).unwrap(), max_task_db_size: Byte::from_u64_with_unit(1, Unit::GiB).unwrap(), From 89e2d2b2b9b83a44e2a2af8e2d13020be72c1260 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 17 Oct 2024 13:55:49 +0200 Subject: [PATCH 67/92] fix the doctest --- meilisearch/src/analytics/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index 67b830204..48ac13fc0 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -128,6 +128,8 @@ pub trait AggregateMethod: 'static + Default + Send { /// A macro used to quickly define multiple aggregate method with their name /// Usage: /// ```rust +/// use meilisearch::aggregate_methods; +/// /// aggregate_methods!( /// SearchGET => "Documents Searched GET", /// SearchPOST => "Documents Searched POST", From 
e51e6f902a13525610c4d0a81125c7292da3de36 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Sat, 19 Oct 2024 13:42:02 +0300 Subject: [PATCH 68/92] Highlight partially cropped matches too --- milli/src/search/new/matches/match.rs | 2 +- .../src/search/new/matches/matching_words.rs | 25 +++-- milli/src/search/new/matches/mod.rs | 94 ++++++++++--------- 3 files changed, 67 insertions(+), 54 deletions(-) diff --git a/milli/src/search/new/matches/match.rs b/milli/src/search/new/matches/match.rs index cc08b006c..2eef4d5a6 100644 --- a/milli/src/search/new/matches/match.rs +++ b/milli/src/search/new/matches/match.rs @@ -18,7 +18,7 @@ pub enum MatchPosition { #[derive(Clone, Debug)] pub struct Match { - pub match_len: usize, + pub char_count: usize, // ids of the query words that matches. pub ids: Vec, pub position: MatchPosition, diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index e4d2785ca..1f30a17ad 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -86,14 +86,17 @@ impl MatchingWords { continue; }; let prefix_length = char_index + c.len_utf8(); - let char_len = token.original_lengths(prefix_length).0; + let (char_count, byte_len) = token.original_lengths(prefix_length); let ids = &located_words.positions; - return Some(MatchType::Full { char_len, ids }); + return Some(MatchType::Full { ids, char_count, byte_len }); // else we exact match the token. } else if token.lemma() == word { - let char_len = token.char_end - token.char_start; let ids = &located_words.positions; - return Some(MatchType::Full { char_len, ids }); + return Some(MatchType::Full { + char_count: token.char_end - token.char_start, + byte_len: token.byte_end - token.byte_start, + ids, + }); } } } @@ -149,7 +152,7 @@ pub type WordId = u16; /// In these cases we need to match consecutively several tokens to consider that the match is full. 
#[derive(Debug, PartialEq)] pub enum MatchType<'a> { - Full { char_len: usize, ids: &'a RangeInclusive }, + Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive }, Partial(PartialMatch<'a>), } @@ -183,7 +186,11 @@ impl<'a> PartialMatch<'a> { // if there is no remaining word to match in the phrase and the current token is matching, // return a Full match. } else if is_matching { - Some(MatchType::Full { char_len: token.char_end - token.char_start, ids }) + Some(MatchType::Full { + char_count: token.char_end - token.char_start, + byte_len: token.byte_end - token.byte_start, + ids, + }) // if the current token doesn't match, return None to break the match sequence. } else { None @@ -270,7 +277,7 @@ pub(crate) mod tests { ..Default::default() }) .next(), - Some(MatchType::Full { char_len: 5, ids: &(0..=0) }) + Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) }) ); assert_eq!( matching_words @@ -294,7 +301,7 @@ pub(crate) mod tests { ..Default::default() }) .next(), - Some(MatchType::Full { char_len: 5, ids: &(2..=2) }) + Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) }) ); assert_eq!( matching_words @@ -306,7 +313,7 @@ pub(crate) mod tests { ..Default::default() }) .next(), - Some(MatchType::Full { char_len: 5, ids: &(2..=2) }) + Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) }) ); assert_eq!( matching_words diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index ac0fb7e7b..80e3ec7b2 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -10,7 +10,10 @@ use matching_words::{MatchType, PartialMatch}; use r#match::{Match, MatchPosition}; use serde::Serialize; use simple_token_kind::SimpleTokenKind; -use std::borrow::Cow; +use std::{ + borrow::Cow, + cmp::{max, min}, +}; const DEFAULT_CROP_MARKER: &str = "…"; const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; @@ -139,7 +142,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { 
Some(MatchType::Full { ids, .. }) => { // save the token that closes the partial match as a match. matches.push(Match { - match_len: word.char_end - *first_word_char_start, + char_count: word.char_end - *first_word_char_start, ids: ids.clone().collect(), position: MatchPosition::Phrase { word_positions: [first_word_position, word_position], @@ -182,10 +185,10 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { match match_type { // we match, we save the current token as a match, // then we continue the rest of the tokens. - MatchType::Full { char_len, ids } => { + MatchType::Full { ids, char_count, .. } => { let ids: Vec<_> = ids.clone().collect(); matches.push(Match { - match_len: char_len, + char_count, ids, position: MatchPosition::Word { word_position, token_position }, }); @@ -224,19 +227,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { .iter() .map(|m| MatchBounds { start: tokens[m.get_first_token_pos()].byte_start, - length: m.match_len, + // TODO: Why is this in chars, while start is in bytes? + length: m.char_count, }) .collect(), } } /// Returns the bounds in byte index of the crop window. - fn crop_bounds( - &self, - tokens: &[Token<'_>], - matches: &[Match], - crop_size: usize, - ) -> (usize, usize) { + fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] { let ( mut remaining_words, is_iterating_forward, @@ -371,7 +370,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); - (crop_byte_start, crop_byte_end) + [crop_byte_start, crop_byte_end] } // Returns the formatted version of the original text. @@ -382,78 +381,87 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } else { match &self.matches { Some((tokens, matches)) => { - // If the text has to be cropped, - // crop around the best interval. 
- let (byte_start, byte_end) = match format_options.crop { + // If the text has to be cropped, crop around the best interval. + let [crop_byte_start, crop_byte_end] = match format_options.crop { Some(crop_size) if crop_size > 0 => { self.crop_bounds(tokens, matches, crop_size) } - _ => (0, self.text.len()), + _ => [0, self.text.len()], }; let mut formatted = Vec::new(); // push crop marker if it's not the start of the text. - if byte_start > 0 && !self.crop_marker.is_empty() { + if crop_byte_start > 0 && !self.crop_marker.is_empty() { formatted.push(self.crop_marker); } - let mut byte_index = byte_start; + let mut byte_index = crop_byte_start; if format_options.highlight { // insert highlight markers around matches. for m in matches { - let (current_byte_start, current_byte_end) = match m.position { + let [m_byte_start, m_byte_end] = match m.position { MatchPosition::Word { token_position, .. } => { let token = &tokens[token_position]; - (&token.byte_start, &token.byte_end) + [&token.byte_start, &token.byte_end] } MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => { - (&tokens[ftp].byte_start, &tokens[ltp].byte_end) + [&tokens[ftp].byte_start, &tokens[ltp].byte_end] } }; - // skip matches out of the crop window. - if *current_byte_start < byte_start || *current_byte_end > byte_end { + // skip matches out of the crop window + if *m_byte_end < crop_byte_start || *m_byte_start > crop_byte_end { continue; } - if byte_index < *current_byte_start { - formatted.push(&self.text[byte_index..*current_byte_start]); + // adjust start and end to the crop window size + let [m_byte_start, m_byte_end] = [ + max(m_byte_start, &crop_byte_start), + min(m_byte_end, &crop_byte_end), + ]; + + // push text that is positioned before our matches + if byte_index < *m_byte_start { + formatted.push(&self.text[byte_index..*m_byte_start]); } - let highlight_byte_index = self.text[*current_byte_start..] 
- .char_indices() - .enumerate() - .find(|(i, _)| *i == m.match_len) - .map_or(*current_byte_end, |(_, (i, _))| i + *current_byte_start); - formatted.push(self.highlight_prefix); - formatted.push(&self.text[*current_byte_start..highlight_byte_index]); + + // TODO: This is additional work done, charabia::token::Token byte_len + // should already get us the original byte length, however, that doesn't work as + // it's supposed to, investigate why + let highlight_byte_index = self.text[*m_byte_start..] + .char_indices() + .nth(m.char_count) + .map_or(*m_byte_end, |(i, _)| min(i + *m_byte_start, *m_byte_end)); + formatted.push(&self.text[*m_byte_start..highlight_byte_index]); + formatted.push(self.highlight_suffix); // if it's a prefix highlight, we put the end of the word after the highlight marker. - if highlight_byte_index < *current_byte_end { - formatted.push(&self.text[highlight_byte_index..*current_byte_end]); + if highlight_byte_index < *m_byte_end { + formatted.push(&self.text[highlight_byte_index..*m_byte_end]); } - byte_index = *current_byte_end; + byte_index = *m_byte_end; } } // push the rest of the text between last match and the end of crop. - if byte_index < byte_end { - formatted.push(&self.text[byte_index..byte_end]); + if byte_index < crop_byte_end { + formatted.push(&self.text[byte_index..crop_byte_end]); } // push crop marker if it's not the end of the text. - if byte_end < self.text.len() && !self.crop_marker.is_empty() { + if crop_byte_end < self.text.len() && !self.crop_marker.is_empty() { formatted.push(self.crop_marker); } if formatted.len() == 1 { // avoid concatenating if there is already 1 slice. 
- Cow::Borrowed(&self.text[byte_start..byte_end]) + Cow::Borrowed(&self.text[crop_byte_start..crop_byte_end]) } else { Cow::Owned(formatted.concat()) } @@ -825,8 +833,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), - // @TODO: Should probably highlight it all, even if it didn't fit the whole phrase - @"The groundbreaking invention had the power to split the world…" + @"The groundbreaking invention had the power to split the world…" ); let builder = MatcherBuilder::new_test( @@ -837,7 +844,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), - // @TODO: Should probably include end of string in this case? + // TODO: Should include exclamation mark without crop markers @"…between those who embraced progress and those who resisted change…" ); @@ -860,8 +867,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), - // @TODO: "invention" should be highlighted as well - @"…invention had the power to split the world between those…" + @"…invention had the power to split the world between those…" ); } From c94679bde6993f91418e4113852ce9c667a198f8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Sun, 20 Oct 2024 17:24:12 +0200 Subject: [PATCH 69/92] apply review comments --- meilisearch/src/routes/indexes/documents.rs | 56 +++++++++++++-------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 854fa5b69..60014bae4 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -107,11 +107,8 @@ aggregate_methods!( DocumentsPOST => "Documents Fetched POST", ); -#[derive(Default, Serialize)] +#[derive(Serialize)] pub struct DocumentsFetchAggregator { - #[serde(rename = "requests.total_received")] - total_received: usize, - // a call on 
../documents/:doc_id per_document_id: bool, // if a filter was used @@ -145,7 +142,6 @@ impl DocumentsFetchAggregator { }; Self { - total_received: 1, per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter), max_limit: limit, @@ -164,7 +160,6 @@ impl Aggregate for DocumentsFetchAggregator { fn aggregate(self: Box, other: Box) -> Box { Box::new(Self { - total_received: self.total_received.saturating_add(other.total_received), per_document_id: self.per_document_id | other.per_document_id, per_filter: self.per_filter | other.per_filter, retrieve_vectors: self.retrieve_vectors | other.retrieve_vectors, @@ -199,7 +194,11 @@ pub async fn get_document( analytics.publish( DocumentsFetchAggregator:: { retrieve_vectors: param_retrieve_vectors.0, - ..Default::default() + per_document_id: true, + per_filter: false, + max_limit: 0, + max_offset: 0, + marker: PhantomData, }, &req, ); @@ -211,10 +210,8 @@ pub async fn get_document( Ok(HttpResponse::Ok().json(document)) } -#[derive(Default, Serialize)] +#[derive(Serialize)] pub struct DocumentsDeletionAggregator { - #[serde(rename = "requests.total_received")] - total_received: usize, per_document_id: bool, clear_all: bool, per_batch: bool, @@ -228,7 +225,6 @@ impl Aggregate for DocumentsDeletionAggregator { fn aggregate(self: Box, other: Box) -> Box { Box::new(Self { - total_received: self.total_received.saturating_add(other.total_received), per_document_id: self.per_document_id | other.per_document_id, clear_all: self.clear_all | other.clear_all, per_batch: self.per_batch | other.per_batch, @@ -253,9 +249,10 @@ pub async fn delete_document( analytics.publish( DocumentsDeletionAggregator { - total_received: 1, per_document_id: true, - ..Default::default() + clear_all: false, + per_batch: false, + per_filter: false, }, &req, ); @@ -316,12 +313,12 @@ pub async fn documents_by_query_post( analytics.publish( 
DocumentsFetchAggregator:: { - total_received: 1, per_filter: body.filter.is_some(), retrieve_vectors: body.retrieve_vectors, max_limit: body.limit, max_offset: body.offset, - ..Default::default() + per_document_id: false, + marker: PhantomData, }, &req, ); @@ -358,12 +355,12 @@ pub async fn get_documents( analytics.publish( DocumentsFetchAggregator:: { - total_received: 1, per_filter: query.filter.is_some(), retrieve_vectors: query.retrieve_vectors, max_limit: query.limit, max_offset: query.offset, - ..Default::default() + per_document_id: false, + marker: PhantomData, }, &req, ); @@ -426,7 +423,7 @@ aggregate_methods!( Updated => "Documents Updated", ); -#[derive(Default, Serialize)] +#[derive(Serialize)] pub struct DocumentsAggregator { payload_types: HashSet, primary_key: HashSet, @@ -718,7 +715,12 @@ pub async fn delete_documents_batch( let index_uid = IndexUid::try_from(index_uid.into_inner())?; analytics.publish( - DocumentsDeletionAggregator { total_received: 1, per_batch: true, ..Default::default() }, + DocumentsDeletionAggregator { + per_batch: true, + per_document_id: false, + clear_all: false, + per_filter: false, + }, &req, ); @@ -761,7 +763,12 @@ pub async fn delete_documents_by_filter( let filter = body.into_inner().filter; analytics.publish( - DocumentsDeletionAggregator { total_received: 1, per_filter: true, ..Default::default() }, + DocumentsDeletionAggregator { + per_filter: true, + per_document_id: false, + clear_all: false, + per_batch: false, + }, &req, ); @@ -793,7 +800,7 @@ pub struct DocumentEditionByFunction { pub function: String, } -#[derive(Default, Serialize)] +#[derive(Serialize)] struct EditDocumentsByFunctionAggregator { // Set to true if at least one request was filtered filtered: bool, @@ -899,7 +906,12 @@ pub async fn clear_all_documents( ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; analytics.publish( - DocumentsDeletionAggregator { total_received: 1, clear_all: true, ..Default::default() }, + 
DocumentsDeletionAggregator { + clear_all: true, + per_document_id: false, + per_batch: false, + per_filter: false, + }, &req, ); From 73b57228967dffe4a3da7214f2f6bc3ebb15cf5c Mon Sep 17 00:00:00 2001 From: Tamo Date: Sun, 20 Oct 2024 17:31:21 +0200 Subject: [PATCH 70/92] rename the other parameter of the aggregate method to new to avoid confusion --- meilisearch/src/analytics/mod.rs | 12 +-- .../src/analytics/segment_analytics.rs | 26 +++--- meilisearch/src/routes/features.rs | 12 +-- meilisearch/src/routes/indexes/documents.rs | 38 ++++---- .../src/routes/indexes/facet_search.rs | 12 +-- meilisearch/src/routes/indexes/mod.rs | 12 +-- .../src/routes/indexes/settings_analytics.rs | 86 +++++++++---------- meilisearch/src/routes/swap_indexes.rs | 4 +- meilisearch/src/routes/tasks.rs | 24 +++--- 9 files changed, 108 insertions(+), 118 deletions(-) diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index 48ac13fc0..27203ea71 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -89,7 +89,7 @@ pub trait Aggregate: 'static + mopa::Any + Send { fn event_name(&self) -> &'static str; /// Will be called every time an event has been used twice before segment flushed its buffer. - fn aggregate(self: Box, other: Box) -> Box + fn aggregate(self: Box, new: Box) -> Box where Self: Sized; @@ -97,16 +97,16 @@ pub trait Aggregate: 'static + mopa::Any + Send { /// This function should always be called on the same type. If `this` and `other` /// aren't the same type behind the function will do nothing and return `None`. 
fn downcast_aggregate( - this: Box, - other: Box, + old: Box, + new: Box, ) -> Option> where Self: Sized, { - if this.is::() && other.is::() { + if old.is::() && new.is::() { // Both the two following lines cannot fail, but just to be sure we don't crash, we're still avoiding unwrapping - let this = this.downcast::().ok()?; - let other = other.downcast::().ok()?; + let this = old.downcast::().ok()?; + let other = new.downcast::().ok()?; Some(Self::aggregate(this, other)) } else { None diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 10927f49b..328a3a048 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -702,7 +702,7 @@ impl Aggregate for SearchAggregator { Method::event_name() } - fn aggregate(mut self: Box, other: Box) -> Box { + fn aggregate(mut self: Box, new: Box) -> Box { let Self { total_received, total_succeeded, @@ -743,7 +743,7 @@ impl Aggregate for SearchAggregator { ranking_score_threshold, mut locales, marker: _, - } = *other; + } = *new; // request self.total_received = self.total_received.saturating_add(total_received); @@ -1038,22 +1038,22 @@ impl Aggregate for MultiSearchAggregator { } /// Aggregate one [MultiSearchAggregator] into another. - fn aggregate(self: Box, other: Box) -> Box { + fn aggregate(self: Box, new: Box) -> Box { // write the aggregate in a way that will cause a compilation error if a field is added. // get ownership of self, replacing it by a default value. 
let this = *self; - let total_received = this.total_received.saturating_add(other.total_received); - let total_succeeded = this.total_succeeded.saturating_add(other.total_succeeded); + let total_received = this.total_received.saturating_add(new.total_received); + let total_succeeded = this.total_succeeded.saturating_add(new.total_succeeded); let total_distinct_index_count = - this.total_distinct_index_count.saturating_add(other.total_distinct_index_count); - let total_single_index = this.total_single_index.saturating_add(other.total_single_index); - let total_search_count = this.total_search_count.saturating_add(other.total_search_count); - let show_ranking_score = this.show_ranking_score || other.show_ranking_score; + this.total_distinct_index_count.saturating_add(new.total_distinct_index_count); + let total_single_index = this.total_single_index.saturating_add(new.total_single_index); + let total_search_count = this.total_search_count.saturating_add(new.total_search_count); + let show_ranking_score = this.show_ranking_score || new.show_ranking_score; let show_ranking_score_details = - this.show_ranking_score_details || other.show_ranking_score_details; - let use_federation = this.use_federation || other.use_federation; + this.show_ranking_score_details || new.show_ranking_score_details; + let use_federation = this.use_federation || new.use_federation; Box::new(Self { total_received, @@ -1215,7 +1215,7 @@ impl Aggregate for SimilarAggregator { } /// Aggregate one [SimilarAggregator] into another. 
- fn aggregate(mut self: Box, other: Box) -> Box { + fn aggregate(mut self: Box, new: Box) -> Box { let Self { total_received, total_succeeded, @@ -1233,7 +1233,7 @@ impl Aggregate for SimilarAggregator { ranking_score_threshold, retrieve_vectors, marker: _, - } = *other; + } = *new; // request self.total_received = self.total_received.saturating_add(total_received); diff --git a/meilisearch/src/routes/features.rs b/meilisearch/src/routes/features.rs index 8bdb3ffb3..5d93adc02 100644 --- a/meilisearch/src/routes/features.rs +++ b/meilisearch/src/routes/features.rs @@ -64,13 +64,13 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { "Experimental features Updated" } - fn aggregate(self: Box, other: Box) -> Box { + fn aggregate(self: Box, new: Box) -> Box { Box::new(Self { - vector_store: other.vector_store, - metrics: other.metrics, - logs_route: other.logs_route, - edit_documents_by_function: other.edit_documents_by_function, - contains_filter: other.contains_filter, + vector_store: new.vector_store, + metrics: new.metrics, + logs_route: new.logs_route, + edit_documents_by_function: new.edit_documents_by_function, + contains_filter: new.contains_filter, }) } diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 60014bae4..47f73ef42 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -158,13 +158,13 @@ impl Aggregate for DocumentsFetchAggregator { Method::event_name() } - fn aggregate(self: Box, other: Box) -> Box { + fn aggregate(self: Box, new: Box) -> Box { Box::new(Self { - per_document_id: self.per_document_id | other.per_document_id, - per_filter: self.per_filter | other.per_filter, - retrieve_vectors: self.retrieve_vectors | other.retrieve_vectors, - max_limit: self.max_limit.max(other.max_limit), - max_offset: self.max_offset.max(other.max_offset), + per_document_id: self.per_document_id | new.per_document_id, + per_filter: self.per_filter | 
new.per_filter, + retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors, + max_limit: self.max_limit.max(new.max_limit), + max_offset: self.max_offset.max(new.max_offset), marker: PhantomData, }) } @@ -223,12 +223,12 @@ impl Aggregate for DocumentsDeletionAggregator { "Documents Deleted" } - fn aggregate(self: Box, other: Box) -> Box { + fn aggregate(self: Box, new: Box) -> Box { Box::new(Self { - per_document_id: self.per_document_id | other.per_document_id, - clear_all: self.clear_all | other.clear_all, - per_batch: self.per_batch | other.per_batch, - per_filter: self.per_filter | other.per_filter, + per_document_id: self.per_document_id | new.per_document_id, + clear_all: self.clear_all | new.clear_all, + per_batch: self.per_batch | new.per_batch, + per_filter: self.per_filter | new.per_filter, }) } @@ -437,11 +437,11 @@ impl Aggregate for DocumentsAggregator { Method::event_name() } - fn aggregate(self: Box, other: Box) -> Box { + fn aggregate(self: Box, new: Box) -> Box { Box::new(Self { - payload_types: self.payload_types.union(&other.payload_types).cloned().collect(), - primary_key: self.primary_key.union(&other.primary_key).cloned().collect(), - index_creation: self.index_creation | other.index_creation, + payload_types: self.payload_types.union(&new.payload_types).cloned().collect(), + primary_key: self.primary_key.union(&new.primary_key).cloned().collect(), + index_creation: self.index_creation | new.index_creation, method: PhantomData, }) } @@ -815,11 +815,11 @@ impl Aggregate for EditDocumentsByFunctionAggregator { "Documents Edited By Function" } - fn aggregate(self: Box, other: Box) -> Box { + fn aggregate(self: Box, new: Box) -> Box { Box::new(Self { - filtered: self.filtered | other.filtered, - with_context: self.with_context | other.with_context, - index_creation: self.index_creation | other.index_creation, + filtered: self.filtered | new.filtered, + with_context: self.with_context | new.with_context, + index_creation: self.index_creation 
| new.index_creation, }) } diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 8e40397c7..99a4a4f28 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -113,18 +113,18 @@ impl Aggregate for FacetSearchAggregator { "Facet Searched POST" } - fn aggregate(mut self: Box, other: Box) -> Box { - for time in other.time_spent { + fn aggregate(mut self: Box, new: Box) -> Box { + for time in new.time_spent { self.time_spent.push(time); } Box::new(Self { - total_received: self.total_received.saturating_add(other.total_received), - total_succeeded: self.total_succeeded.saturating_add(other.total_succeeded), + total_received: self.total_received.saturating_add(new.total_received), + total_succeeded: self.total_succeeded.saturating_add(new.total_succeeded), time_spent: self.time_spent, - facet_names: self.facet_names.union(&other.facet_names).cloned().collect(), + facet_names: self.facet_names.union(&new.facet_names).cloned().collect(), additional_search_parameters_provided: self.additional_search_parameters_provided - | other.additional_search_parameters_provided, + | new.additional_search_parameters_provided, }) } diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index 65c81a57e..c8183186d 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -134,10 +134,8 @@ impl Aggregate for IndexCreatedAggregate { "Index Created" } - fn aggregate(self: Box, other: Box) -> Box { - Box::new(Self { - primary_key: self.primary_key.union(&other.primary_key).cloned().collect(), - }) + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { primary_key: self.primary_key.union(&new.primary_key).cloned().collect() }) } fn into_event(self: Box) -> serde_json::Value { @@ -225,10 +223,8 @@ impl Aggregate for IndexUpdatedAggregate { "Index Updated" } - fn aggregate(self: Box, other: Box) 
-> Box { - Box::new(Self { - primary_key: self.primary_key.union(&other.primary_key).cloned().collect(), - }) + fn aggregate(self: Box, new: Box) -> Box { + Box::new(Self { primary_key: self.primary_key.union(&new.primary_key).cloned().collect() }) } fn into_event(self: Box) -> serde_json::Value { diff --git a/meilisearch/src/routes/indexes/settings_analytics.rs b/meilisearch/src/routes/indexes/settings_analytics.rs index 636ef3c57..e7d44fa20 100644 --- a/meilisearch/src/routes/indexes/settings_analytics.rs +++ b/meilisearch/src/routes/indexes/settings_analytics.rs @@ -42,114 +42,108 @@ impl Aggregate for SettingsAnalytics { "Settings Updated" } - fn aggregate(self: Box, other: Box) -> Box { + fn aggregate(self: Box, new: Box) -> Box { Box::new(Self { ranking_rules: RankingRulesAnalytics { words_position: self .ranking_rules .words_position - .or(other.ranking_rules.words_position), - typo_position: self - .ranking_rules - .typo_position - .or(other.ranking_rules.typo_position), + .or(new.ranking_rules.words_position), + typo_position: self.ranking_rules.typo_position.or(new.ranking_rules.typo_position), proximity_position: self .ranking_rules .proximity_position - .or(other.ranking_rules.proximity_position), + .or(new.ranking_rules.proximity_position), attribute_position: self .ranking_rules .attribute_position - .or(other.ranking_rules.attribute_position), - sort_position: self - .ranking_rules - .sort_position - .or(other.ranking_rules.sort_position), + .or(new.ranking_rules.attribute_position), + sort_position: self.ranking_rules.sort_position.or(new.ranking_rules.sort_position), exactness_position: self .ranking_rules .exactness_position - .or(other.ranking_rules.exactness_position), - values: self.ranking_rules.values.or(other.ranking_rules.values), + .or(new.ranking_rules.exactness_position), + values: self.ranking_rules.values.or(new.ranking_rules.values), }, searchable_attributes: SearchableAttributesAnalytics { - total: 
self.searchable_attributes.total.or(other.searchable_attributes.total), + total: self.searchable_attributes.total.or(new.searchable_attributes.total), with_wildcard: self .searchable_attributes .with_wildcard - .or(other.searchable_attributes.with_wildcard), + .or(new.searchable_attributes.with_wildcard), }, displayed_attributes: DisplayedAttributesAnalytics { - total: self.displayed_attributes.total.or(other.displayed_attributes.total), + total: self.displayed_attributes.total.or(new.displayed_attributes.total), with_wildcard: self .displayed_attributes .with_wildcard - .or(other.displayed_attributes.with_wildcard), + .or(new.displayed_attributes.with_wildcard), }, sortable_attributes: SortableAttributesAnalytics { - total: self.sortable_attributes.total.or(other.sortable_attributes.total), - has_geo: self.sortable_attributes.has_geo.or(other.sortable_attributes.has_geo), + total: self.sortable_attributes.total.or(new.sortable_attributes.total), + has_geo: self.sortable_attributes.has_geo.or(new.sortable_attributes.has_geo), }, filterable_attributes: FilterableAttributesAnalytics { - total: self.filterable_attributes.total.or(other.filterable_attributes.total), - has_geo: self.filterable_attributes.has_geo.or(other.filterable_attributes.has_geo), + total: self.filterable_attributes.total.or(new.filterable_attributes.total), + has_geo: self.filterable_attributes.has_geo.or(new.filterable_attributes.has_geo), }, distinct_attribute: DistinctAttributeAnalytics { - set: self.distinct_attribute.set | other.distinct_attribute.set, + set: self.distinct_attribute.set | new.distinct_attribute.set, }, proximity_precision: ProximityPrecisionAnalytics { - set: self.proximity_precision.set | other.proximity_precision.set, - value: self.proximity_precision.value.or(other.proximity_precision.value), + set: self.proximity_precision.set | new.proximity_precision.set, + value: self.proximity_precision.value.or(new.proximity_precision.value), }, typo_tolerance: TypoToleranceAnalytics 
{ - enabled: self.typo_tolerance.enabled.or(other.typo_tolerance.enabled), + enabled: self.typo_tolerance.enabled.or(new.typo_tolerance.enabled), disable_on_attributes: self .typo_tolerance .disable_on_attributes - .or(other.typo_tolerance.disable_on_attributes), + .or(new.typo_tolerance.disable_on_attributes), disable_on_words: self .typo_tolerance .disable_on_words - .or(other.typo_tolerance.disable_on_words), + .or(new.typo_tolerance.disable_on_words), min_word_size_for_one_typo: self .typo_tolerance .min_word_size_for_one_typo - .or(other.typo_tolerance.min_word_size_for_one_typo), + .or(new.typo_tolerance.min_word_size_for_one_typo), min_word_size_for_two_typos: self .typo_tolerance .min_word_size_for_two_typos - .or(other.typo_tolerance.min_word_size_for_two_typos), + .or(new.typo_tolerance.min_word_size_for_two_typos), }, faceting: FacetingAnalytics { max_values_per_facet: self .faceting .max_values_per_facet - .or(other.faceting.max_values_per_facet), + .or(new.faceting.max_values_per_facet), sort_facet_values_by_star_count: self .faceting .sort_facet_values_by_star_count - .or(other.faceting.sort_facet_values_by_star_count), + .or(new.faceting.sort_facet_values_by_star_count), sort_facet_values_by_total: self .faceting .sort_facet_values_by_total - .or(other.faceting.sort_facet_values_by_total), + .or(new.faceting.sort_facet_values_by_total), }, pagination: PaginationAnalytics { - max_total_hits: self.pagination.max_total_hits.or(other.pagination.max_total_hits), + max_total_hits: self.pagination.max_total_hits.or(new.pagination.max_total_hits), }, stop_words: StopWordsAnalytics { - total: self.stop_words.total.or(other.stop_words.total), + total: self.stop_words.total.or(new.stop_words.total), }, - synonyms: SynonymsAnalytics { total: self.synonyms.total.or(other.synonyms.total) }, + synonyms: SynonymsAnalytics { total: self.synonyms.total.or(new.synonyms.total) }, embedders: EmbeddersAnalytics { - total: self.embedders.total.or(other.embedders.total), - 
sources: match (self.embedders.sources, other.embedders.sources) { + total: self.embedders.total.or(new.embedders.total), + sources: match (self.embedders.sources, new.embedders.sources) { (None, None) => None, (Some(sources), None) | (None, Some(sources)) => Some(sources), (Some(this), Some(other)) => Some(this.union(&other).cloned().collect()), }, document_template_used: match ( self.embedders.document_template_used, - other.embedders.document_template_used, + new.embedders.document_template_used, ) { (None, None) => None, (Some(used), None) | (None, Some(used)) => Some(used), @@ -157,7 +151,7 @@ impl Aggregate for SettingsAnalytics { }, document_template_max_bytes: match ( self.embedders.document_template_max_bytes, - other.embedders.document_template_max_bytes, + new.embedders.document_template_max_bytes, ) { (None, None) => None, (Some(bytes), None) | (None, Some(bytes)) => Some(bytes), @@ -165,7 +159,7 @@ impl Aggregate for SettingsAnalytics { }, binary_quantization_used: match ( self.embedders.binary_quantization_used, - other.embedders.binary_quantization_used, + new.embedders.binary_quantization_used, ) { (None, None) => None, (Some(bq), None) | (None, Some(bq)) => Some(bq), @@ -176,17 +170,17 @@ impl Aggregate for SettingsAnalytics { search_cutoff_ms: self .search_cutoff_ms .search_cutoff_ms - .or(other.search_cutoff_ms.search_cutoff_ms), + .or(new.search_cutoff_ms.search_cutoff_ms), }, - locales: LocalesAnalytics { locales: self.locales.locales.or(other.locales.locales) }, + locales: LocalesAnalytics { locales: self.locales.locales.or(new.locales.locales) }, dictionary: DictionaryAnalytics { - total: self.dictionary.total.or(other.dictionary.total), + total: self.dictionary.total.or(new.dictionary.total), }, separator_tokens: SeparatorTokensAnalytics { - total: self.separator_tokens.total.or(other.non_separator_tokens.total), + total: self.separator_tokens.total.or(new.non_separator_tokens.total), }, non_separator_tokens: NonSeparatorTokensAnalytics { - 
total: self.non_separator_tokens.total.or(other.non_separator_tokens.total), + total: self.non_separator_tokens.total.or(new.non_separator_tokens.total), }, }) } diff --git a/meilisearch/src/routes/swap_indexes.rs b/meilisearch/src/routes/swap_indexes.rs index f7d8f4eff..9b8b67e63 100644 --- a/meilisearch/src/routes/swap_indexes.rs +++ b/meilisearch/src/routes/swap_indexes.rs @@ -39,9 +39,9 @@ impl Aggregate for IndexSwappedAnalytics { "Indexes Swapped" } - fn aggregate(self: Box, other: Box) -> Box { + fn aggregate(self: Box, new: Box) -> Box { Box::new(Self { - swap_operation_number: self.swap_operation_number.max(other.swap_operation_number), + swap_operation_number: self.swap_operation_number.max(new.swap_operation_number), }) } diff --git a/meilisearch/src/routes/tasks.rs b/meilisearch/src/routes/tasks.rs index ff4aee998..712b8ecde 100644 --- a/meilisearch/src/routes/tasks.rs +++ b/meilisearch/src/routes/tasks.rs @@ -185,25 +185,25 @@ impl Aggregate for TaskFilterAnalytics, other: Box) -> Box { + fn aggregate(self: Box, new: Box) -> Box { Box::new(Self { - filtered_by_uid: self.filtered_by_uid | other.filtered_by_uid, - filtered_by_index_uid: self.filtered_by_index_uid | other.filtered_by_index_uid, - filtered_by_type: self.filtered_by_type | other.filtered_by_type, - filtered_by_status: self.filtered_by_status | other.filtered_by_status, - filtered_by_canceled_by: self.filtered_by_canceled_by | other.filtered_by_canceled_by, + filtered_by_uid: self.filtered_by_uid | new.filtered_by_uid, + filtered_by_index_uid: self.filtered_by_index_uid | new.filtered_by_index_uid, + filtered_by_type: self.filtered_by_type | new.filtered_by_type, + filtered_by_status: self.filtered_by_status | new.filtered_by_status, + filtered_by_canceled_by: self.filtered_by_canceled_by | new.filtered_by_canceled_by, filtered_by_before_enqueued_at: self.filtered_by_before_enqueued_at - | other.filtered_by_before_enqueued_at, + | new.filtered_by_before_enqueued_at, 
filtered_by_after_enqueued_at: self.filtered_by_after_enqueued_at - | other.filtered_by_after_enqueued_at, + | new.filtered_by_after_enqueued_at, filtered_by_before_started_at: self.filtered_by_before_started_at - | other.filtered_by_before_started_at, + | new.filtered_by_before_started_at, filtered_by_after_started_at: self.filtered_by_after_started_at - | other.filtered_by_after_started_at, + | new.filtered_by_after_started_at, filtered_by_before_finished_at: self.filtered_by_before_finished_at - | other.filtered_by_before_finished_at, + | new.filtered_by_before_finished_at, filtered_by_after_finished_at: self.filtered_by_after_finished_at - | other.filtered_by_after_finished_at, + | new.filtered_by_after_finished_at, marker: std::marker::PhantomData, }) From ac919df37dff4dda34ae2687517bb4b1a6b2b4cf Mon Sep 17 00:00:00 2001 From: Tamo Date: Sun, 20 Oct 2024 17:36:29 +0200 Subject: [PATCH 71/92] simplify the trait a bit more by getting rids of the downcast_aggregate method --- meilisearch/src/analytics/mod.rs | 20 ------------------- .../src/analytics/segment_analytics.rs | 18 ++++++++++++++++- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index 27203ea71..d72ab9d01 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -93,26 +93,6 @@ pub trait Aggregate: 'static + mopa::Any + Send { where Self: Sized; - /// An internal helper function, you shouldn't implement it yourself. - /// This function should always be called on the same type. If `this` and `other` - /// aren't the same type behind the function will do nothing and return `None`. 
- fn downcast_aggregate( - old: Box, - new: Box, - ) -> Option> - where - Self: Sized, - { - if old.is::() && new.is::() { - // Both the two following lines cannot fail, but just to be sure we don't crash, we're still avoiding unwrapping - let this = old.downcast::().ok()?; - let other = new.downcast::().ok()?; - Some(Self::aggregate(this, other)) - } else { - None - } - } - /// Converts your structure to the final event that'll be sent to segment. fn into_event(self: Box) -> serde_json::Value; } diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 328a3a048..96a0a676c 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -82,6 +82,22 @@ pub struct Event { total: usize, } +/// This function should always be called on the same type. If `this` and `other` +/// aren't the same type the function will do nothing and return `None`. +fn downcast_aggregate( + old: Box, + new: Box, +) -> Option> { + if old.is::() && new.is::() { + // Both the two following lines cannot fail, but just to be sure we don't crash, we're still avoiding unwrapping + let this = old.downcast::().ok()?; + let other = new.downcast::().ok()?; + Some(ConcreteType::aggregate(this, other)) + } else { + None + } +} + impl Message { pub fn new(event: T, request: &HttpRequest) -> Self { Self { @@ -92,7 +108,7 @@ impl Message { user_agents: extract_user_agents(request), total: 1, }, - aggregator_function: T::downcast_aggregate, + aggregator_function: downcast_aggregate::, } } } From af589c85ec4746ef38a38420e0b6d433b1dc86d2 Mon Sep 17 00:00:00 2001 From: Tamo Date: Sun, 20 Oct 2024 17:40:31 +0200 Subject: [PATCH 72/92] reverse all the settings to keep the last one received instead of the first one received in case we receive the same setting multiple times --- .../src/routes/indexes/settings_analytics.rs | 94 +++++++++---------- 1 file changed, 47 insertions(+), 47 deletions(-) diff 
--git a/meilisearch/src/routes/indexes/settings_analytics.rs b/meilisearch/src/routes/indexes/settings_analytics.rs index e7d44fa20..de01b72e8 100644 --- a/meilisearch/src/routes/indexes/settings_analytics.rs +++ b/meilisearch/src/routes/indexes/settings_analytics.rs @@ -45,97 +45,97 @@ impl Aggregate for SettingsAnalytics { fn aggregate(self: Box, new: Box) -> Box { Box::new(Self { ranking_rules: RankingRulesAnalytics { - words_position: self + words_position: new .ranking_rules .words_position - .or(new.ranking_rules.words_position), - typo_position: self.ranking_rules.typo_position.or(new.ranking_rules.typo_position), - proximity_position: self + .or(self.ranking_rules.words_position), + typo_position: new.ranking_rules.typo_position.or(self.ranking_rules.typo_position), + proximity_position: new .ranking_rules .proximity_position - .or(new.ranking_rules.proximity_position), - attribute_position: self + .or(self.ranking_rules.proximity_position), + attribute_position: new .ranking_rules .attribute_position - .or(new.ranking_rules.attribute_position), - sort_position: self.ranking_rules.sort_position.or(new.ranking_rules.sort_position), - exactness_position: self + .or(self.ranking_rules.attribute_position), + sort_position: new.ranking_rules.sort_position.or(self.ranking_rules.sort_position), + exactness_position: new .ranking_rules .exactness_position - .or(new.ranking_rules.exactness_position), - values: self.ranking_rules.values.or(new.ranking_rules.values), + .or(self.ranking_rules.exactness_position), + values: new.ranking_rules.values.or(self.ranking_rules.values), }, searchable_attributes: SearchableAttributesAnalytics { - total: self.searchable_attributes.total.or(new.searchable_attributes.total), - with_wildcard: self + total: new.searchable_attributes.total.or(self.searchable_attributes.total), + with_wildcard: new .searchable_attributes .with_wildcard - .or(new.searchable_attributes.with_wildcard), + .or(self.searchable_attributes.with_wildcard), }, 
displayed_attributes: DisplayedAttributesAnalytics { - total: self.displayed_attributes.total.or(new.displayed_attributes.total), - with_wildcard: self + total: new.displayed_attributes.total.or(self.displayed_attributes.total), + with_wildcard: new .displayed_attributes .with_wildcard - .or(new.displayed_attributes.with_wildcard), + .or(self.displayed_attributes.with_wildcard), }, sortable_attributes: SortableAttributesAnalytics { - total: self.sortable_attributes.total.or(new.sortable_attributes.total), - has_geo: self.sortable_attributes.has_geo.or(new.sortable_attributes.has_geo), + total: new.sortable_attributes.total.or(self.sortable_attributes.total), + has_geo: new.sortable_attributes.has_geo.or(self.sortable_attributes.has_geo), }, filterable_attributes: FilterableAttributesAnalytics { - total: self.filterable_attributes.total.or(new.filterable_attributes.total), - has_geo: self.filterable_attributes.has_geo.or(new.filterable_attributes.has_geo), + total: new.filterable_attributes.total.or(self.filterable_attributes.total), + has_geo: new.filterable_attributes.has_geo.or(self.filterable_attributes.has_geo), }, distinct_attribute: DistinctAttributeAnalytics { set: self.distinct_attribute.set | new.distinct_attribute.set, }, proximity_precision: ProximityPrecisionAnalytics { set: self.proximity_precision.set | new.proximity_precision.set, - value: self.proximity_precision.value.or(new.proximity_precision.value), + value: new.proximity_precision.value.or(self.proximity_precision.value), }, typo_tolerance: TypoToleranceAnalytics { - enabled: self.typo_tolerance.enabled.or(new.typo_tolerance.enabled), - disable_on_attributes: self + enabled: new.typo_tolerance.enabled.or(self.typo_tolerance.enabled), + disable_on_attributes: new .typo_tolerance .disable_on_attributes - .or(new.typo_tolerance.disable_on_attributes), - disable_on_words: self + .or(self.typo_tolerance.disable_on_attributes), + disable_on_words: new .typo_tolerance .disable_on_words - 
.or(new.typo_tolerance.disable_on_words), - min_word_size_for_one_typo: self + .or(self.typo_tolerance.disable_on_words), + min_word_size_for_one_typo: new .typo_tolerance .min_word_size_for_one_typo - .or(new.typo_tolerance.min_word_size_for_one_typo), - min_word_size_for_two_typos: self + .or(self.typo_tolerance.min_word_size_for_one_typo), + min_word_size_for_two_typos: new .typo_tolerance .min_word_size_for_two_typos - .or(new.typo_tolerance.min_word_size_for_two_typos), + .or(self.typo_tolerance.min_word_size_for_two_typos), }, faceting: FacetingAnalytics { - max_values_per_facet: self + max_values_per_facet: new .faceting .max_values_per_facet - .or(new.faceting.max_values_per_facet), - sort_facet_values_by_star_count: self + .or(self.faceting.max_values_per_facet), + sort_facet_values_by_star_count: new .faceting .sort_facet_values_by_star_count - .or(new.faceting.sort_facet_values_by_star_count), - sort_facet_values_by_total: self + .or(self.faceting.sort_facet_values_by_star_count), + sort_facet_values_by_total: new .faceting .sort_facet_values_by_total - .or(new.faceting.sort_facet_values_by_total), + .or(self.faceting.sort_facet_values_by_total), }, pagination: PaginationAnalytics { - max_total_hits: self.pagination.max_total_hits.or(new.pagination.max_total_hits), + max_total_hits: new.pagination.max_total_hits.or(self.pagination.max_total_hits), }, stop_words: StopWordsAnalytics { - total: self.stop_words.total.or(new.stop_words.total), + total: new.stop_words.total.or(self.stop_words.total), }, - synonyms: SynonymsAnalytics { total: self.synonyms.total.or(new.synonyms.total) }, + synonyms: SynonymsAnalytics { total: new.synonyms.total.or(self.synonyms.total) }, embedders: EmbeddersAnalytics { - total: self.embedders.total.or(new.embedders.total), + total: new.embedders.total.or(self.embedders.total), sources: match (self.embedders.sources, new.embedders.sources) { (None, None) => None, (Some(sources), None) | (None, Some(sources)) => Some(sources), @@ 
-167,20 +167,20 @@ impl Aggregate for SettingsAnalytics { }, }, search_cutoff_ms: SearchCutoffMsAnalytics { - search_cutoff_ms: self + search_cutoff_ms: new .search_cutoff_ms .search_cutoff_ms - .or(new.search_cutoff_ms.search_cutoff_ms), + .or(self.search_cutoff_ms.search_cutoff_ms), }, - locales: LocalesAnalytics { locales: self.locales.locales.or(new.locales.locales) }, + locales: LocalesAnalytics { locales: new.locales.locales.or(self.locales.locales) }, dictionary: DictionaryAnalytics { - total: self.dictionary.total.or(new.dictionary.total), + total: new.dictionary.total.or(self.dictionary.total), }, separator_tokens: SeparatorTokensAnalytics { - total: self.separator_tokens.total.or(new.non_separator_tokens.total), + total: new.non_separator_tokens.total.or(self.separator_tokens.total), }, non_separator_tokens: NonSeparatorTokensAnalytics { - total: self.non_separator_tokens.total.or(new.non_separator_tokens.total), + total: new.non_separator_tokens.total.or(self.non_separator_tokens.total), }, }) } From 5675585fe8b4f51eed7b08bb30e1fed0f711e340 Mon Sep 17 00:00:00 2001 From: Tamo Date: Sun, 20 Oct 2024 17:54:43 +0200 Subject: [PATCH 73/92] move all the searches structures to new modules --- meilisearch/src/analytics/mod.rs | 4 - .../src/analytics/segment_analytics.rs | 868 +----------------- meilisearch/src/routes/indexes/mod.rs | 2 + meilisearch/src/routes/indexes/search.rs | 4 +- .../src/routes/indexes/search_analytics.rs | 485 ++++++++++ meilisearch/src/routes/indexes/similar.rs | 4 +- .../src/routes/indexes/similar_analytics.rs | 235 +++++ meilisearch/src/routes/mod.rs | 1 + meilisearch/src/routes/multi_search.rs | 4 +- .../src/routes/multi_search_analytics.rs | 170 ++++ 10 files changed, 903 insertions(+), 874 deletions(-) create mode 100644 meilisearch/src/routes/indexes/search_analytics.rs create mode 100644 meilisearch/src/routes/indexes/similar_analytics.rs create mode 100644 meilisearch/src/routes/multi_search_analytics.rs diff --git 
a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index d72ab9d01..bd14b0bfa 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -15,13 +15,9 @@ use platform_dirs::AppDirs; // if the feature analytics is enabled we use the real analytics pub type SegmentAnalytics = segment_analytics::SegmentAnalytics; -pub use segment_analytics::SearchAggregator; -pub use segment_analytics::SimilarAggregator; use crate::Opt; -pub use self::segment_analytics::MultiSearchAggregator; - /// A macro used to quickly define events that don't aggregate or send anything besides an empty event with its name. #[macro_export] macro_rules! empty_analytics { diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 96a0a676c..7dc746b14 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -1,5 +1,5 @@ use std::any::TypeId; -use std::collections::{BTreeSet, BinaryHeap, HashMap, HashSet}; +use std::collections::{HashMap, HashSet}; use std::fs; use std::path::{Path, PathBuf}; use std::sync::Arc; @@ -11,10 +11,8 @@ use byte_unit::Byte; use index_scheduler::IndexScheduler; use meilisearch_auth::{AuthController, AuthFilter}; use meilisearch_types::features::RuntimeTogglableFeatures; -use meilisearch_types::locales::Locale; use meilisearch_types::InstanceUid; use once_cell::sync::Lazy; -use regex::Regex; use segment::message::{Identify, Track, User}; use segment::{AutoBatcher, Batcher, HttpClient}; use serde::Serialize; @@ -25,17 +23,12 @@ use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; use uuid::Uuid; -use super::{config_user_id_path, Aggregate, AggregateMethod, MEILISEARCH_CONFIG_PATH}; +use super::{config_user_id_path, Aggregate, MEILISEARCH_CONFIG_PATH}; use crate::option::{ default_http_addr, IndexerOpts, LogMode, MaxMemory, MaxThreads, ScheduleSnapshot, }; use crate::routes::{create_all_stats, Stats}; 
-use crate::search::{ - FederatedSearch, SearchQuery, SearchQueryWithIndex, SearchResult, SimilarQuery, SimilarResult, - DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, - DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEMANTIC_RATIO, -}; -use crate::{aggregate_methods, Opt}; +use crate::Opt; const ANALYTICS_HEADER: &str = "X-Meilisearch-Client"; @@ -489,858 +482,3 @@ impl Segment { let _ = self.batcher.flush().await; } } - -#[derive(Default)] -pub struct SearchAggregator { - // requests - total_received: usize, - total_succeeded: usize, - total_degraded: usize, - total_used_negative_operator: usize, - time_spent: BinaryHeap, - - // sort - sort_with_geo_point: bool, - // every time a request has a filter, this field must be incremented by the number of terms it contains - sort_sum_of_criteria_terms: usize, - // every time a request has a filter, this field must be incremented by one - sort_total_number_of_criteria: usize, - - // distinct - distinct: bool, - - // filter - filter_with_geo_radius: bool, - filter_with_geo_bounding_box: bool, - // every time a request has a filter, this field must be incremented by the number of terms it contains - filter_sum_of_criteria_terms: usize, - // every time a request has a filter, this field must be incremented by one - filter_total_number_of_criteria: usize, - used_syntax: HashMap, - - // attributes_to_search_on - // every time a search is done using attributes_to_search_on - attributes_to_search_on_total_number_of_uses: usize, - - // q - // The maximum number of terms in a q request - max_terms_number: usize, - - // vector - // The maximum number of floats in a vector request - max_vector_size: usize, - // Whether the semantic ratio passed to a hybrid search equals the default ratio. 
- semantic_ratio: bool, - hybrid: bool, - retrieve_vectors: bool, - - // every time a search is done, we increment the counter linked to the used settings - matching_strategy: HashMap, - - // List of the unique Locales passed as parameter - locales: BTreeSet, - - // pagination - max_limit: usize, - max_offset: usize, - finite_pagination: usize, - - // formatting - max_attributes_to_retrieve: usize, - max_attributes_to_highlight: usize, - highlight_pre_tag: bool, - highlight_post_tag: bool, - max_attributes_to_crop: usize, - crop_marker: bool, - show_matches_position: bool, - crop_length: bool, - - // facets - facets_sum_of_terms: usize, - facets_total_number_of_facets: usize, - - // scoring - show_ranking_score: bool, - show_ranking_score_details: bool, - ranking_score_threshold: bool, - - marker: std::marker::PhantomData, -} - -impl SearchAggregator { - #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &SearchQuery) -> Self { - let SearchQuery { - q, - vector, - offset, - limit, - page, - hits_per_page, - attributes_to_retrieve: _, - retrieve_vectors, - attributes_to_crop: _, - crop_length, - attributes_to_highlight: _, - show_matches_position, - show_ranking_score, - show_ranking_score_details, - filter, - sort, - distinct, - facets: _, - highlight_pre_tag, - highlight_post_tag, - crop_marker, - matching_strategy, - attributes_to_search_on, - hybrid, - ranking_score_threshold, - locales, - } = query; - - let mut ret = Self::default(); - - ret.total_received = 1; - - if let Some(ref sort) = sort { - ret.sort_total_number_of_criteria = 1; - ret.sort_with_geo_point = sort.iter().any(|s| s.contains("_geoPoint(")); - ret.sort_sum_of_criteria_terms = sort.len(); - } - - ret.distinct = distinct.is_some(); - - if let Some(ref filter) = filter { - static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); - ret.filter_total_number_of_criteria = 1; - - let syntax = match filter { - Value::String(_) => "string".to_string(), - 
Value::Array(values) => { - if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { - "mixed".to_string() - } else { - "array".to_string() - } - } - _ => "none".to_string(), - }; - // convert the string to a HashMap - ret.used_syntax.insert(syntax, 1); - - let stringified_filters = filter.to_string(); - ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); - ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); - ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); - } - - // attributes_to_search_on - if attributes_to_search_on.is_some() { - ret.attributes_to_search_on_total_number_of_uses = 1; - } - - if let Some(ref q) = q { - ret.max_terms_number = q.split_whitespace().count(); - } - - if let Some(ref vector) = vector { - ret.max_vector_size = vector.len(); - } - ret.retrieve_vectors |= retrieve_vectors; - - if query.is_finite_pagination() { - let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); - ret.max_limit = limit; - ret.max_offset = page.unwrap_or(1).saturating_sub(1) * limit; - ret.finite_pagination = 1; - } else { - ret.max_limit = *limit; - ret.max_offset = *offset; - ret.finite_pagination = 0; - } - - ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1); - - if let Some(locales) = locales { - ret.locales = locales.iter().copied().collect(); - } - - ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG(); - ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG(); - ret.crop_marker = *crop_marker != DEFAULT_CROP_MARKER(); - ret.crop_length = *crop_length != DEFAULT_CROP_LENGTH(); - ret.show_matches_position = *show_matches_position; - - ret.show_ranking_score = *show_ranking_score; - ret.show_ranking_score_details = *show_ranking_score_details; - ret.ranking_score_threshold = ranking_score_threshold.is_some(); - - if let Some(hybrid) = hybrid { - ret.semantic_ratio = hybrid.semantic_ratio != 
DEFAULT_SEMANTIC_RATIO(); - ret.hybrid = true; - } - - ret - } - - pub fn succeed(&mut self, result: &SearchResult) { - let SearchResult { - hits: _, - query: _, - processing_time_ms, - hits_info: _, - semantic_hit_count: _, - facet_distribution: _, - facet_stats: _, - degraded, - used_negative_operator, - } = result; - - self.total_succeeded = self.total_succeeded.saturating_add(1); - if *degraded { - self.total_degraded = self.total_degraded.saturating_add(1); - } - if *used_negative_operator { - self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1); - } - self.time_spent.push(*processing_time_ms as usize); - } -} - -aggregate_methods!( - SearchGET => "Documents Searched GET", - SearchPOST => "Documents Searched POST", -); - -impl Aggregate for SearchAggregator { - fn event_name(&self) -> &'static str { - Method::event_name() - } - - fn aggregate(mut self: Box, new: Box) -> Box { - let Self { - total_received, - total_succeeded, - mut time_spent, - sort_with_geo_point, - sort_sum_of_criteria_terms, - sort_total_number_of_criteria, - distinct, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - attributes_to_search_on_total_number_of_uses, - max_terms_number, - max_vector_size, - retrieve_vectors, - matching_strategy, - max_limit, - max_offset, - finite_pagination, - max_attributes_to_retrieve, - max_attributes_to_highlight, - highlight_pre_tag, - highlight_post_tag, - max_attributes_to_crop, - crop_marker, - show_matches_position, - crop_length, - facets_sum_of_terms, - facets_total_number_of_facets, - show_ranking_score, - show_ranking_score_details, - semantic_ratio, - hybrid, - total_degraded, - total_used_negative_operator, - ranking_score_threshold, - mut locales, - marker: _, - } = *new; - - // request - self.total_received = self.total_received.saturating_add(total_received); - self.total_succeeded = 
self.total_succeeded.saturating_add(total_succeeded); - self.total_degraded = self.total_degraded.saturating_add(total_degraded); - self.total_used_negative_operator = - self.total_used_negative_operator.saturating_add(total_used_negative_operator); - self.time_spent.append(&mut time_spent); - - // sort - self.sort_with_geo_point |= sort_with_geo_point; - self.sort_sum_of_criteria_terms = - self.sort_sum_of_criteria_terms.saturating_add(sort_sum_of_criteria_terms); - self.sort_total_number_of_criteria = - self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria); - - // distinct - self.distinct |= distinct; - - // filter - self.filter_with_geo_radius |= filter_with_geo_radius; - self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; - self.filter_sum_of_criteria_terms = - self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); - self.filter_total_number_of_criteria = - self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); - for (key, value) in used_syntax.into_iter() { - let used_syntax = self.used_syntax.entry(key).or_insert(0); - *used_syntax = used_syntax.saturating_add(value); - } - - // attributes_to_search_on - self.attributes_to_search_on_total_number_of_uses = self - .attributes_to_search_on_total_number_of_uses - .saturating_add(attributes_to_search_on_total_number_of_uses); - - // q - self.max_terms_number = self.max_terms_number.max(max_terms_number); - - // vector - self.max_vector_size = self.max_vector_size.max(max_vector_size); - self.retrieve_vectors |= retrieve_vectors; - self.semantic_ratio |= semantic_ratio; - self.hybrid |= hybrid; - - // pagination - self.max_limit = self.max_limit.max(max_limit); - self.max_offset = self.max_offset.max(max_offset); - self.finite_pagination += finite_pagination; - - // formatting - self.max_attributes_to_retrieve = - self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); - self.max_attributes_to_highlight = - 
self.max_attributes_to_highlight.max(max_attributes_to_highlight); - self.highlight_pre_tag |= highlight_pre_tag; - self.highlight_post_tag |= highlight_post_tag; - self.max_attributes_to_crop = self.max_attributes_to_crop.max(max_attributes_to_crop); - self.crop_marker |= crop_marker; - self.show_matches_position |= show_matches_position; - self.crop_length |= crop_length; - - // facets - self.facets_sum_of_terms = self.facets_sum_of_terms.saturating_add(facets_sum_of_terms); - self.facets_total_number_of_facets = - self.facets_total_number_of_facets.saturating_add(facets_total_number_of_facets); - - // matching strategy - for (key, value) in matching_strategy.into_iter() { - let matching_strategy = self.matching_strategy.entry(key).or_insert(0); - *matching_strategy = matching_strategy.saturating_add(value); - } - - // scoring - self.show_ranking_score |= show_ranking_score; - self.show_ranking_score_details |= show_ranking_score_details; - self.ranking_score_threshold |= ranking_score_threshold; - - // locales - self.locales.append(&mut locales); - - self - } - - fn into_event(self: Box) -> serde_json::Value { - let Self { - total_received, - total_succeeded, - time_spent, - sort_with_geo_point, - sort_sum_of_criteria_terms, - sort_total_number_of_criteria, - distinct, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - attributes_to_search_on_total_number_of_uses, - max_terms_number, - max_vector_size, - retrieve_vectors, - matching_strategy, - max_limit, - max_offset, - finite_pagination, - max_attributes_to_retrieve, - max_attributes_to_highlight, - highlight_pre_tag, - highlight_post_tag, - max_attributes_to_crop, - crop_marker, - show_matches_position, - crop_length, - facets_sum_of_terms, - facets_total_number_of_facets, - show_ranking_score, - show_ranking_score_details, - semantic_ratio, - hybrid, - total_degraded, - total_used_negative_operator, - 
ranking_score_threshold, - locales, - marker: _, - } = *self; - - // we get all the values in a sorted manner - let time_spent = time_spent.into_sorted_vec(); - // the index of the 99th percentage of value - let percentile_99th = time_spent.len() * 99 / 100; - // We are only interested by the slowest value of the 99th fastest results - let time_spent = time_spent.get(percentile_99th); - - json!({ - "requests": { - "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - "total_degraded": total_degraded, - "total_used_negative_operator": total_used_negative_operator, - }, - "sort": { - "with_geoPoint": sort_with_geo_point, - "avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), - }, - "distinct": distinct, - "filter": { - "with_geoRadius": filter_with_geo_radius, - "with_geoBoundingBox": filter_with_geo_bounding_box, - "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), - "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "attributes_to_search_on": { - "total_number_of_uses": attributes_to_search_on_total_number_of_uses, - }, - "q": { - "max_terms_number": max_terms_number, - }, - "vector": { - "max_vector_size": max_vector_size, - "retrieve_vectors": retrieve_vectors, - }, - "hybrid": { - "enabled": hybrid, - "semantic_ratio": semantic_ratio, - }, - "pagination": { - "max_limit": max_limit, - "max_offset": max_offset, - "most_used_navigation": if finite_pagination > (total_received / 2) { "exhaustive" } else { "estimated" }, - }, - "formatting": { - "max_attributes_to_retrieve": max_attributes_to_retrieve, - "max_attributes_to_highlight": max_attributes_to_highlight, - 
"highlight_pre_tag": highlight_pre_tag, - "highlight_post_tag": highlight_post_tag, - "max_attributes_to_crop": max_attributes_to_crop, - "crop_marker": crop_marker, - "show_matches_position": show_matches_position, - "crop_length": crop_length, - }, - "facets": { - "avg_facets_number": format!("{:.2}", facets_sum_of_terms as f64 / facets_total_number_of_facets as f64), - }, - "matching_strategy": { - "most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - "locales": locales, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - "ranking_score_threshold": ranking_score_threshold, - }, - }) - } -} - -#[derive(Default)] -pub struct MultiSearchAggregator { - // requests - total_received: usize, - total_succeeded: usize, - - // sum of the number of distinct indexes in each single request, use with total_received to compute an avg - total_distinct_index_count: usize, - // number of queries with a single index, use with total_received to compute a proportion - total_single_index: usize, - - // sum of the number of search queries in the requests, use with total_received to compute an average - total_search_count: usize, - - // scoring - show_ranking_score: bool, - show_ranking_score_details: bool, - - // federation - use_federation: bool, -} - -impl MultiSearchAggregator { - pub fn from_federated_search(federated_search: &FederatedSearch) -> Self { - let use_federation = federated_search.federation.is_some(); - - let distinct_indexes: HashSet<_> = federated_search - .queries - .iter() - .map(|query| { - let query = &query; - // make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex - let SearchQueryWithIndex { - index_uid, - federation_options: _, - q: _, - vector: _, - offset: _, - limit: _, - page: _, - hits_per_page: _, - attributes_to_retrieve: _, - retrieve_vectors: _, - 
attributes_to_crop: _, - crop_length: _, - attributes_to_highlight: _, - show_ranking_score: _, - show_ranking_score_details: _, - show_matches_position: _, - filter: _, - sort: _, - distinct: _, - facets: _, - highlight_pre_tag: _, - highlight_post_tag: _, - crop_marker: _, - matching_strategy: _, - attributes_to_search_on: _, - hybrid: _, - ranking_score_threshold: _, - locales: _, - } = query; - - index_uid.as_str() - }) - .collect(); - - let show_ranking_score = - federated_search.queries.iter().any(|query| query.show_ranking_score); - let show_ranking_score_details = - federated_search.queries.iter().any(|query| query.show_ranking_score_details); - - Self { - total_received: 1, - total_succeeded: 0, - total_distinct_index_count: distinct_indexes.len(), - total_single_index: if distinct_indexes.len() == 1 { 1 } else { 0 }, - total_search_count: federated_search.queries.len(), - show_ranking_score, - show_ranking_score_details, - use_federation, - } - } - - pub fn succeed(&mut self) { - self.total_succeeded = self.total_succeeded.saturating_add(1); - } -} - -impl Aggregate for MultiSearchAggregator { - fn event_name(&self) -> &'static str { - "Documents Searched by Multi-Search POST" - } - - /// Aggregate one [MultiSearchAggregator] into another. - fn aggregate(self: Box, new: Box) -> Box { - // write the aggregate in a way that will cause a compilation error if a field is added. - - // get ownership of self, replacing it by a default value. 
- let this = *self; - - let total_received = this.total_received.saturating_add(new.total_received); - let total_succeeded = this.total_succeeded.saturating_add(new.total_succeeded); - let total_distinct_index_count = - this.total_distinct_index_count.saturating_add(new.total_distinct_index_count); - let total_single_index = this.total_single_index.saturating_add(new.total_single_index); - let total_search_count = this.total_search_count.saturating_add(new.total_search_count); - let show_ranking_score = this.show_ranking_score || new.show_ranking_score; - let show_ranking_score_details = - this.show_ranking_score_details || new.show_ranking_score_details; - let use_federation = this.use_federation || new.use_federation; - - Box::new(Self { - total_received, - total_succeeded, - total_distinct_index_count, - total_single_index, - total_search_count, - show_ranking_score, - show_ranking_score_details, - use_federation, - }) - } - - fn into_event(self: Box) -> serde_json::Value { - let Self { - total_received, - total_succeeded, - total_distinct_index_count, - total_single_index, - total_search_count, - show_ranking_score, - show_ranking_score_details, - use_federation, - } = *self; - - json!({ - "requests": { - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - }, - "indexes": { - "total_single_index": total_single_index, - "total_distinct_index_count": total_distinct_index_count, - "avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // not 0 else returned early - }, - "searches": { - "total_search_count": total_search_count, - "avg_search_count": (total_search_count as f64) / (total_received as f64), - }, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - }, - "federation": { - "use_federation": use_federation, - } - }) - } -} - 
-aggregate_methods!( - SimilarPOST => "Similar POST", - SimilarGET => "Similar GET", -); - -#[derive(Default)] -pub struct SimilarAggregator { - // requests - total_received: usize, - total_succeeded: usize, - time_spent: BinaryHeap, - - // filter - filter_with_geo_radius: bool, - filter_with_geo_bounding_box: bool, - // every time a request has a filter, this field must be incremented by the number of terms it contains - filter_sum_of_criteria_terms: usize, - // every time a request has a filter, this field must be incremented by one - filter_total_number_of_criteria: usize, - used_syntax: HashMap, - - // Whether a non-default embedder was specified - retrieve_vectors: bool, - - // pagination - max_limit: usize, - max_offset: usize, - - // formatting - max_attributes_to_retrieve: usize, - - // scoring - show_ranking_score: bool, - show_ranking_score_details: bool, - ranking_score_threshold: bool, - - marker: std::marker::PhantomData, -} - -impl SimilarAggregator { - #[allow(clippy::field_reassign_with_default)] - pub fn from_query(query: &SimilarQuery) -> Self { - let SimilarQuery { - id: _, - embedder: _, - offset, - limit, - attributes_to_retrieve: _, - retrieve_vectors, - show_ranking_score, - show_ranking_score_details, - filter, - ranking_score_threshold, - } = query; - - let mut ret = Self::default(); - - ret.total_received = 1; - - if let Some(ref filter) = filter { - static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); - ret.filter_total_number_of_criteria = 1; - - let syntax = match filter { - Value::String(_) => "string".to_string(), - Value::Array(values) => { - if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { - "mixed".to_string() - } else { - "array".to_string() - } - } - _ => "none".to_string(), - }; - // convert the string to a HashMap - ret.used_syntax.insert(syntax, 1); - - let stringified_filters = filter.to_string(); - ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); - 
ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); - ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); - } - - ret.max_limit = *limit; - ret.max_offset = *offset; - - ret.show_ranking_score = *show_ranking_score; - ret.show_ranking_score_details = *show_ranking_score_details; - ret.ranking_score_threshold = ranking_score_threshold.is_some(); - - ret.retrieve_vectors = *retrieve_vectors; - - ret - } - - pub fn succeed(&mut self, result: &SimilarResult) { - let SimilarResult { id: _, hits: _, processing_time_ms, hits_info: _ } = result; - - self.total_succeeded = self.total_succeeded.saturating_add(1); - - self.time_spent.push(*processing_time_ms as usize); - } -} - -impl Aggregate for SimilarAggregator { - fn event_name(&self) -> &'static str { - Method::event_name() - } - - /// Aggregate one [SimilarAggregator] into another. - fn aggregate(mut self: Box, new: Box) -> Box { - let Self { - total_received, - total_succeeded, - mut time_spent, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - max_limit, - max_offset, - max_attributes_to_retrieve, - show_ranking_score, - show_ranking_score_details, - ranking_score_threshold, - retrieve_vectors, - marker: _, - } = *new; - - // request - self.total_received = self.total_received.saturating_add(total_received); - self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); - self.time_spent.append(&mut time_spent); - - // filter - self.filter_with_geo_radius |= filter_with_geo_radius; - self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; - self.filter_sum_of_criteria_terms = - self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); - self.filter_total_number_of_criteria = - self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); - for (key, value) in used_syntax.into_iter() { - let 
used_syntax = self.used_syntax.entry(key).or_insert(0); - *used_syntax = used_syntax.saturating_add(value); - } - - self.retrieve_vectors |= retrieve_vectors; - - // pagination - self.max_limit = self.max_limit.max(max_limit); - self.max_offset = self.max_offset.max(max_offset); - - // formatting - self.max_attributes_to_retrieve = - self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); - - // scoring - self.show_ranking_score |= show_ranking_score; - self.show_ranking_score_details |= show_ranking_score_details; - self.ranking_score_threshold |= ranking_score_threshold; - - self - } - - fn into_event(self: Box) -> serde_json::Value { - let Self { - total_received, - total_succeeded, - time_spent, - filter_with_geo_radius, - filter_with_geo_bounding_box, - filter_sum_of_criteria_terms, - filter_total_number_of_criteria, - used_syntax, - max_limit, - max_offset, - max_attributes_to_retrieve, - show_ranking_score, - show_ranking_score_details, - ranking_score_threshold, - retrieve_vectors, - marker: _, - } = *self; - - // we get all the values in a sorted manner - let time_spent = time_spent.into_sorted_vec(); - // the index of the 99th percentage of value - let percentile_99th = time_spent.len() * 99 / 100; - // We are only interested by the slowest value of the 99th fastest results - let time_spent = time_spent.get(percentile_99th); - - json!({ - "requests": { - "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), - "total_succeeded": total_succeeded, - "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics - "total_received": total_received, - }, - "filter": { - "with_geoRadius": filter_with_geo_radius, - "with_geoBoundingBox": filter_with_geo_bounding_box, - "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), - "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), - }, - 
"vector": { - "retrieve_vectors": retrieve_vectors, - }, - "pagination": { - "max_limit": max_limit, - "max_offset": max_offset, - }, - "formatting": { - "max_attributes_to_retrieve": max_attributes_to_retrieve, - }, - "scoring": { - "show_ranking_score": show_ranking_score, - "show_ranking_score_details": show_ranking_score_details, - "ranking_score_threshold": ranking_score_threshold, - } - }) - } -} diff --git a/meilisearch/src/routes/indexes/mod.rs b/meilisearch/src/routes/indexes/mod.rs index c8183186d..7d073ec5f 100644 --- a/meilisearch/src/routes/indexes/mod.rs +++ b/meilisearch/src/routes/indexes/mod.rs @@ -28,9 +28,11 @@ use crate::Opt; pub mod documents; pub mod facet_search; pub mod search; +mod search_analytics; pub mod settings; mod settings_analytics; pub mod similar; +mod similar_analytics; pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service( diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index ac6e23c8f..2f5cb4a36 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -13,13 +13,13 @@ use meilisearch_types::serde_cs::vec::CS; use serde_json::Value; use tracing::debug; -use crate::analytics::segment_analytics::{SearchGET, SearchPOST}; -use crate::analytics::{Analytics, SearchAggregator}; +use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; +use crate::routes::indexes::search_analytics::{SearchAggregator, SearchGET, SearchPOST}; use crate::search::{ add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, diff --git a/meilisearch/src/routes/indexes/search_analytics.rs 
b/meilisearch/src/routes/indexes/search_analytics.rs new file mode 100644 index 000000000..8bbb1781f --- /dev/null +++ b/meilisearch/src/routes/indexes/search_analytics.rs @@ -0,0 +1,485 @@ +use once_cell::sync::Lazy; +use regex::Regex; +use serde_json::{json, Value}; +use std::collections::{BTreeSet, BinaryHeap, HashMap}; + +use meilisearch_types::locales::Locale; + +use crate::{ + aggregate_methods, + analytics::{Aggregate, AggregateMethod}, + search::{ + SearchQuery, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, + DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, + DEFAULT_SEMANTIC_RATIO, + }, +}; + +aggregate_methods!( + SearchGET => "Documents Searched GET", + SearchPOST => "Documents Searched POST", +); + +#[derive(Default)] +pub struct SearchAggregator { + // requests + total_received: usize, + total_succeeded: usize, + total_degraded: usize, + total_used_negative_operator: usize, + time_spent: BinaryHeap, + + // sort + sort_with_geo_point: bool, + // every time a request has a filter, this field must be incremented by the number of terms it contains + sort_sum_of_criteria_terms: usize, + // every time a request has a filter, this field must be incremented by one + sort_total_number_of_criteria: usize, + + // distinct + distinct: bool, + + // filter + filter_with_geo_radius: bool, + filter_with_geo_bounding_box: bool, + // every time a request has a filter, this field must be incremented by the number of terms it contains + filter_sum_of_criteria_terms: usize, + // every time a request has a filter, this field must be incremented by one + filter_total_number_of_criteria: usize, + used_syntax: HashMap, + + // attributes_to_search_on + // every time a search is done using attributes_to_search_on + attributes_to_search_on_total_number_of_uses: usize, + + // q + // The maximum number of terms in a q request + max_terms_number: usize, + + // vector + // The maximum number of floats in a vector request + max_vector_size: usize, 
+ // Whether the semantic ratio passed to a hybrid search equals the default ratio. + semantic_ratio: bool, + hybrid: bool, + retrieve_vectors: bool, + + // every time a search is done, we increment the counter linked to the used settings + matching_strategy: HashMap, + + // List of the unique Locales passed as parameter + locales: BTreeSet, + + // pagination + max_limit: usize, + max_offset: usize, + finite_pagination: usize, + + // formatting + max_attributes_to_retrieve: usize, + max_attributes_to_highlight: usize, + highlight_pre_tag: bool, + highlight_post_tag: bool, + max_attributes_to_crop: usize, + crop_marker: bool, + show_matches_position: bool, + crop_length: bool, + + // facets + facets_sum_of_terms: usize, + facets_total_number_of_facets: usize, + + // scoring + show_ranking_score: bool, + show_ranking_score_details: bool, + ranking_score_threshold: bool, + + marker: std::marker::PhantomData, +} + +impl SearchAggregator { + #[allow(clippy::field_reassign_with_default)] + pub fn from_query(query: &SearchQuery) -> Self { + let SearchQuery { + q, + vector, + offset, + limit, + page, + hits_per_page, + attributes_to_retrieve: _, + retrieve_vectors, + attributes_to_crop: _, + crop_length, + attributes_to_highlight: _, + show_matches_position, + show_ranking_score, + show_ranking_score_details, + filter, + sort, + distinct, + facets: _, + highlight_pre_tag, + highlight_post_tag, + crop_marker, + matching_strategy, + attributes_to_search_on, + hybrid, + ranking_score_threshold, + locales, + } = query; + + let mut ret = Self::default(); + + ret.total_received = 1; + + if let Some(ref sort) = sort { + ret.sort_total_number_of_criteria = 1; + ret.sort_with_geo_point = sort.iter().any(|s| s.contains("_geoPoint(")); + ret.sort_sum_of_criteria_terms = sort.len(); + } + + ret.distinct = distinct.is_some(); + + if let Some(ref filter) = filter { + static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); + ret.filter_total_number_of_criteria = 1; + + let 
syntax = match filter { + Value::String(_) => "string".to_string(), + Value::Array(values) => { + if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { + "mixed".to_string() + } else { + "array".to_string() + } + } + _ => "none".to_string(), + }; + // convert the string to a HashMap + ret.used_syntax.insert(syntax, 1); + + let stringified_filters = filter.to_string(); + ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); + ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); + ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); + } + + // attributes_to_search_on + if attributes_to_search_on.is_some() { + ret.attributes_to_search_on_total_number_of_uses = 1; + } + + if let Some(ref q) = q { + ret.max_terms_number = q.split_whitespace().count(); + } + + if let Some(ref vector) = vector { + ret.max_vector_size = vector.len(); + } + ret.retrieve_vectors |= retrieve_vectors; + + if query.is_finite_pagination() { + let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); + ret.max_limit = limit; + ret.max_offset = page.unwrap_or(1).saturating_sub(1) * limit; + ret.finite_pagination = 1; + } else { + ret.max_limit = *limit; + ret.max_offset = *offset; + ret.finite_pagination = 0; + } + + ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1); + + if let Some(locales) = locales { + ret.locales = locales.iter().copied().collect(); + } + + ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG(); + ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG(); + ret.crop_marker = *crop_marker != DEFAULT_CROP_MARKER(); + ret.crop_length = *crop_length != DEFAULT_CROP_LENGTH(); + ret.show_matches_position = *show_matches_position; + + ret.show_ranking_score = *show_ranking_score; + ret.show_ranking_score_details = *show_ranking_score_details; + ret.ranking_score_threshold = ranking_score_threshold.is_some(); + + if let Some(hybrid) = 
hybrid { + ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO(); + ret.hybrid = true; + } + + ret + } + + pub fn succeed(&mut self, result: &SearchResult) { + let SearchResult { + hits: _, + query: _, + processing_time_ms, + hits_info: _, + semantic_hit_count: _, + facet_distribution: _, + facet_stats: _, + degraded, + used_negative_operator, + } = result; + + self.total_succeeded = self.total_succeeded.saturating_add(1); + if *degraded { + self.total_degraded = self.total_degraded.saturating_add(1); + } + if *used_negative_operator { + self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1); + } + self.time_spent.push(*processing_time_ms as usize); + } +} + +impl Aggregate for SearchAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + fn aggregate(mut self: Box, new: Box) -> Box { + let Self { + total_received, + total_succeeded, + mut time_spent, + sort_with_geo_point, + sort_sum_of_criteria_terms, + sort_total_number_of_criteria, + distinct, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + attributes_to_search_on_total_number_of_uses, + max_terms_number, + max_vector_size, + retrieve_vectors, + matching_strategy, + max_limit, + max_offset, + finite_pagination, + max_attributes_to_retrieve, + max_attributes_to_highlight, + highlight_pre_tag, + highlight_post_tag, + max_attributes_to_crop, + crop_marker, + show_matches_position, + crop_length, + facets_sum_of_terms, + facets_total_number_of_facets, + show_ranking_score, + show_ranking_score_details, + semantic_ratio, + hybrid, + total_degraded, + total_used_negative_operator, + ranking_score_threshold, + mut locales, + marker: _, + } = *new; + + // request + self.total_received = self.total_received.saturating_add(total_received); + self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); + self.total_degraded = 
self.total_degraded.saturating_add(total_degraded); + self.total_used_negative_operator = + self.total_used_negative_operator.saturating_add(total_used_negative_operator); + self.time_spent.append(&mut time_spent); + + // sort + self.sort_with_geo_point |= sort_with_geo_point; + self.sort_sum_of_criteria_terms = + self.sort_sum_of_criteria_terms.saturating_add(sort_sum_of_criteria_terms); + self.sort_total_number_of_criteria = + self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria); + + // distinct + self.distinct |= distinct; + + // filter + self.filter_with_geo_radius |= filter_with_geo_radius; + self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; + self.filter_sum_of_criteria_terms = + self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); + self.filter_total_number_of_criteria = + self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); + for (key, value) in used_syntax.into_iter() { + let used_syntax = self.used_syntax.entry(key).or_insert(0); + *used_syntax = used_syntax.saturating_add(value); + } + + // attributes_to_search_on + self.attributes_to_search_on_total_number_of_uses = self + .attributes_to_search_on_total_number_of_uses + .saturating_add(attributes_to_search_on_total_number_of_uses); + + // q + self.max_terms_number = self.max_terms_number.max(max_terms_number); + + // vector + self.max_vector_size = self.max_vector_size.max(max_vector_size); + self.retrieve_vectors |= retrieve_vectors; + self.semantic_ratio |= semantic_ratio; + self.hybrid |= hybrid; + + // pagination + self.max_limit = self.max_limit.max(max_limit); + self.max_offset = self.max_offset.max(max_offset); + self.finite_pagination += finite_pagination; + + // formatting + self.max_attributes_to_retrieve = + self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); + self.max_attributes_to_highlight = + self.max_attributes_to_highlight.max(max_attributes_to_highlight); + 
self.highlight_pre_tag |= highlight_pre_tag; + self.highlight_post_tag |= highlight_post_tag; + self.max_attributes_to_crop = self.max_attributes_to_crop.max(max_attributes_to_crop); + self.crop_marker |= crop_marker; + self.show_matches_position |= show_matches_position; + self.crop_length |= crop_length; + + // facets + self.facets_sum_of_terms = self.facets_sum_of_terms.saturating_add(facets_sum_of_terms); + self.facets_total_number_of_facets = + self.facets_total_number_of_facets.saturating_add(facets_total_number_of_facets); + + // matching strategy + for (key, value) in matching_strategy.into_iter() { + let matching_strategy = self.matching_strategy.entry(key).or_insert(0); + *matching_strategy = matching_strategy.saturating_add(value); + } + + // scoring + self.show_ranking_score |= show_ranking_score; + self.show_ranking_score_details |= show_ranking_score_details; + self.ranking_score_threshold |= ranking_score_threshold; + + // locales + self.locales.append(&mut locales); + + self + } + + fn into_event(self: Box) -> serde_json::Value { + let Self { + total_received, + total_succeeded, + time_spent, + sort_with_geo_point, + sort_sum_of_criteria_terms, + sort_total_number_of_criteria, + distinct, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + attributes_to_search_on_total_number_of_uses, + max_terms_number, + max_vector_size, + retrieve_vectors, + matching_strategy, + max_limit, + max_offset, + finite_pagination, + max_attributes_to_retrieve, + max_attributes_to_highlight, + highlight_pre_tag, + highlight_post_tag, + max_attributes_to_crop, + crop_marker, + show_matches_position, + crop_length, + facets_sum_of_terms, + facets_total_number_of_facets, + show_ranking_score, + show_ranking_score_details, + semantic_ratio, + hybrid, + total_degraded, + total_used_negative_operator, + ranking_score_threshold, + locales, + marker: _, + } = *self; + + // we get all the 
values in a sorted manner + let time_spent = time_spent.into_sorted_vec(); + // the index of the 99th percentage of value + let percentile_99th = time_spent.len() * 99 / 100; + // We are only interested by the slowest value of the 99th fastest results + let time_spent = time_spent.get(percentile_99th); + + json!({ + "requests": { + "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + "total_degraded": total_degraded, + "total_used_negative_operator": total_used_negative_operator, + }, + "sort": { + "with_geoPoint": sort_with_geo_point, + "avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64), + }, + "distinct": distinct, + "filter": { + "with_geoRadius": filter_with_geo_radius, + "with_geoBoundingBox": filter_with_geo_bounding_box, + "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), + "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + "attributes_to_search_on": { + "total_number_of_uses": attributes_to_search_on_total_number_of_uses, + }, + "q": { + "max_terms_number": max_terms_number, + }, + "vector": { + "max_vector_size": max_vector_size, + "retrieve_vectors": retrieve_vectors, + }, + "hybrid": { + "enabled": hybrid, + "semantic_ratio": semantic_ratio, + }, + "pagination": { + "max_limit": max_limit, + "max_offset": max_offset, + "most_used_navigation": if finite_pagination > (total_received / 2) { "exhaustive" } else { "estimated" }, + }, + "formatting": { + "max_attributes_to_retrieve": max_attributes_to_retrieve, + "max_attributes_to_highlight": max_attributes_to_highlight, + "highlight_pre_tag": highlight_pre_tag, + "highlight_post_tag": highlight_post_tag, + 
"max_attributes_to_crop": max_attributes_to_crop, + "crop_marker": crop_marker, + "show_matches_position": show_matches_position, + "crop_length": crop_length, + }, + "facets": { + "avg_facets_number": format!("{:.2}", facets_sum_of_terms as f64 / facets_total_number_of_facets as f64), + }, + "matching_strategy": { + "most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), + }, + "locales": locales, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, + }, + }) + } +} diff --git a/meilisearch/src/routes/indexes/similar.rs b/meilisearch/src/routes/indexes/similar.rs index 33df6bdad..79f42f0aa 100644 --- a/meilisearch/src/routes/indexes/similar.rs +++ b/meilisearch/src/routes/indexes/similar.rs @@ -13,10 +13,10 @@ use serde_json::Value; use tracing::debug; use super::ActionPolicy; -use crate::analytics::segment_analytics::{SimilarGET, SimilarPOST}; -use crate::analytics::{Analytics, SimilarAggregator}; +use crate::analytics::Analytics; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; +use crate::routes::indexes::similar_analytics::{SimilarAggregator, SimilarGET, SimilarPOST}; use crate::search::{ add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind, SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, diff --git a/meilisearch/src/routes/indexes/similar_analytics.rs b/meilisearch/src/routes/indexes/similar_analytics.rs new file mode 100644 index 000000000..69685a56c --- /dev/null +++ b/meilisearch/src/routes/indexes/similar_analytics.rs @@ -0,0 +1,235 @@ +use std::collections::{BinaryHeap, HashMap}; + +use once_cell::sync::Lazy; +use regex::Regex; +use serde_json::{json, Value}; + +use crate::{ + aggregate_methods, + analytics::{Aggregate, AggregateMethod}, + 
search::{SimilarQuery, SimilarResult}, +}; + +aggregate_methods!( + SimilarPOST => "Similar POST", + SimilarGET => "Similar GET", +); + +#[derive(Default)] +pub struct SimilarAggregator { + // requests + total_received: usize, + total_succeeded: usize, + time_spent: BinaryHeap, + + // filter + filter_with_geo_radius: bool, + filter_with_geo_bounding_box: bool, + // every time a request has a filter, this field must be incremented by the number of terms it contains + filter_sum_of_criteria_terms: usize, + // every time a request has a filter, this field must be incremented by one + filter_total_number_of_criteria: usize, + used_syntax: HashMap, + + // Whether a non-default embedder was specified + retrieve_vectors: bool, + + // pagination + max_limit: usize, + max_offset: usize, + + // formatting + max_attributes_to_retrieve: usize, + + // scoring + show_ranking_score: bool, + show_ranking_score_details: bool, + ranking_score_threshold: bool, + + marker: std::marker::PhantomData, +} + +impl SimilarAggregator { + #[allow(clippy::field_reassign_with_default)] + pub fn from_query(query: &SimilarQuery) -> Self { + let SimilarQuery { + id: _, + embedder: _, + offset, + limit, + attributes_to_retrieve: _, + retrieve_vectors, + show_ranking_score, + show_ranking_score_details, + filter, + ranking_score_threshold, + } = query; + + let mut ret = Self::default(); + + ret.total_received = 1; + + if let Some(ref filter) = filter { + static RE: Lazy = Lazy::new(|| Regex::new("AND | OR").unwrap()); + ret.filter_total_number_of_criteria = 1; + + let syntax = match filter { + Value::String(_) => "string".to_string(), + Value::Array(values) => { + if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) { + "mixed".to_string() + } else { + "array".to_string() + } + } + _ => "none".to_string(), + }; + // convert the string to a HashMap + ret.used_syntax.insert(syntax, 1); + + let stringified_filters = filter.to_string(); + ret.filter_with_geo_radius = 
stringified_filters.contains("_geoRadius("); + ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); + ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); + } + + ret.max_limit = *limit; + ret.max_offset = *offset; + + ret.show_ranking_score = *show_ranking_score; + ret.show_ranking_score_details = *show_ranking_score_details; + ret.ranking_score_threshold = ranking_score_threshold.is_some(); + + ret.retrieve_vectors = *retrieve_vectors; + + ret + } + + pub fn succeed(&mut self, result: &SimilarResult) { + let SimilarResult { id: _, hits: _, processing_time_ms, hits_info: _ } = result; + + self.total_succeeded = self.total_succeeded.saturating_add(1); + + self.time_spent.push(*processing_time_ms as usize); + } +} + +impl Aggregate for SimilarAggregator { + fn event_name(&self) -> &'static str { + Method::event_name() + } + + /// Aggregate one [SimilarAggregator] into another. + fn aggregate(mut self: Box, new: Box) -> Box { + let Self { + total_received, + total_succeeded, + mut time_spent, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + max_limit, + max_offset, + max_attributes_to_retrieve, + show_ranking_score, + show_ranking_score_details, + ranking_score_threshold, + retrieve_vectors, + marker: _, + } = *new; + + // request + self.total_received = self.total_received.saturating_add(total_received); + self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); + self.time_spent.append(&mut time_spent); + + // filter + self.filter_with_geo_radius |= filter_with_geo_radius; + self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; + self.filter_sum_of_criteria_terms = + self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); + self.filter_total_number_of_criteria = + self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria); + for (key, 
value) in used_syntax.into_iter() { + let used_syntax = self.used_syntax.entry(key).or_insert(0); + *used_syntax = used_syntax.saturating_add(value); + } + + self.retrieve_vectors |= retrieve_vectors; + + // pagination + self.max_limit = self.max_limit.max(max_limit); + self.max_offset = self.max_offset.max(max_offset); + + // formatting + self.max_attributes_to_retrieve = + self.max_attributes_to_retrieve.max(max_attributes_to_retrieve); + + // scoring + self.show_ranking_score |= show_ranking_score; + self.show_ranking_score_details |= show_ranking_score_details; + self.ranking_score_threshold |= ranking_score_threshold; + + self + } + + fn into_event(self: Box) -> serde_json::Value { + let Self { + total_received, + total_succeeded, + time_spent, + filter_with_geo_radius, + filter_with_geo_bounding_box, + filter_sum_of_criteria_terms, + filter_total_number_of_criteria, + used_syntax, + max_limit, + max_offset, + max_attributes_to_retrieve, + show_ranking_score, + show_ranking_score_details, + ranking_score_threshold, + retrieve_vectors, + marker: _, + } = *self; + + // we get all the values in a sorted manner + let time_spent = time_spent.into_sorted_vec(); + // the index of the 99th percentage of value + let percentile_99th = time_spent.len() * 99 / 100; + // We are only interested by the slowest value of the 99th fastest results + let time_spent = time_spent.get(percentile_99th); + + json!({ + "requests": { + "99th_response_time": time_spent.map(|t| format!("{:.2}", t)), + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + }, + "filter": { + "with_geoRadius": filter_with_geo_radius, + "with_geoBoundingBox": filter_with_geo_bounding_box, + "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), + "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| 
json!(k)).unwrap_or_else(|| json!(null)), + }, + "vector": { + "retrieve_vectors": retrieve_vectors, + }, + "pagination": { + "max_limit": max_limit, + "max_offset": max_offset, + }, + "formatting": { + "max_attributes_to_retrieve": max_attributes_to_retrieve, + }, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + "ranking_score_threshold": ranking_score_threshold, + } + }) + } +} diff --git a/meilisearch/src/routes/mod.rs b/meilisearch/src/routes/mod.rs index c25aeee70..b7260ea08 100644 --- a/meilisearch/src/routes/mod.rs +++ b/meilisearch/src/routes/mod.rs @@ -25,6 +25,7 @@ pub mod indexes; mod logs; mod metrics; mod multi_search; +mod multi_search_analytics; mod snapshot; mod swap_indexes; pub mod tasks; diff --git a/meilisearch/src/routes/multi_search.rs b/meilisearch/src/routes/multi_search.rs index 13a39cb44..b7bd31716 100644 --- a/meilisearch/src/routes/multi_search.rs +++ b/meilisearch/src/routes/multi_search.rs @@ -9,7 +9,7 @@ use meilisearch_types::keys::actions; use serde::Serialize; use tracing::debug; -use crate::analytics::{Analytics, MultiSearchAggregator}; +use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::{AuthenticationError, GuardedData}; @@ -21,6 +21,8 @@ use crate::search::{ }; use crate::search_queue::SearchQueue; +use super::multi_search_analytics::MultiSearchAggregator; + pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(multi_search_with_post)))); } diff --git a/meilisearch/src/routes/multi_search_analytics.rs b/meilisearch/src/routes/multi_search_analytics.rs new file mode 100644 index 000000000..be1218399 --- /dev/null +++ b/meilisearch/src/routes/multi_search_analytics.rs @@ -0,0 +1,170 @@ +use std::collections::HashSet; + +use serde_json::json; + +use crate::{ + analytics::Aggregate, 
+ search::{FederatedSearch, SearchQueryWithIndex}, +}; + +#[derive(Default)] +pub struct MultiSearchAggregator { + // requests + total_received: usize, + total_succeeded: usize, + + // sum of the number of distinct indexes in each single request, use with total_received to compute an avg + total_distinct_index_count: usize, + // number of queries with a single index, use with total_received to compute a proportion + total_single_index: usize, + + // sum of the number of search queries in the requests, use with total_received to compute an average + total_search_count: usize, + + // scoring + show_ranking_score: bool, + show_ranking_score_details: bool, + + // federation + use_federation: bool, +} + +impl MultiSearchAggregator { + pub fn from_federated_search(federated_search: &FederatedSearch) -> Self { + let use_federation = federated_search.federation.is_some(); + + let distinct_indexes: HashSet<_> = federated_search + .queries + .iter() + .map(|query| { + let query = &query; + // make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex + let SearchQueryWithIndex { + index_uid, + federation_options: _, + q: _, + vector: _, + offset: _, + limit: _, + page: _, + hits_per_page: _, + attributes_to_retrieve: _, + retrieve_vectors: _, + attributes_to_crop: _, + crop_length: _, + attributes_to_highlight: _, + show_ranking_score: _, + show_ranking_score_details: _, + show_matches_position: _, + filter: _, + sort: _, + distinct: _, + facets: _, + highlight_pre_tag: _, + highlight_post_tag: _, + crop_marker: _, + matching_strategy: _, + attributes_to_search_on: _, + hybrid: _, + ranking_score_threshold: _, + locales: _, + } = query; + + index_uid.as_str() + }) + .collect(); + + let show_ranking_score = + federated_search.queries.iter().any(|query| query.show_ranking_score); + let show_ranking_score_details = + federated_search.queries.iter().any(|query| query.show_ranking_score_details); + + Self { + total_received: 1, + 
total_succeeded: 0, + total_distinct_index_count: distinct_indexes.len(), + total_single_index: if distinct_indexes.len() == 1 { 1 } else { 0 }, + total_search_count: federated_search.queries.len(), + show_ranking_score, + show_ranking_score_details, + use_federation, + } + } + + pub fn succeed(&mut self) { + self.total_succeeded = self.total_succeeded.saturating_add(1); + } +} + +impl Aggregate for MultiSearchAggregator { + fn event_name(&self) -> &'static str { + "Documents Searched by Multi-Search POST" + } + + /// Aggregate one [MultiSearchAggregator] into another. + fn aggregate(self: Box, new: Box) -> Box { + // write the aggregate in a way that will cause a compilation error if a field is added. + + // get ownership of self, replacing it by a default value. + let this = *self; + + let total_received = this.total_received.saturating_add(new.total_received); + let total_succeeded = this.total_succeeded.saturating_add(new.total_succeeded); + let total_distinct_index_count = + this.total_distinct_index_count.saturating_add(new.total_distinct_index_count); + let total_single_index = this.total_single_index.saturating_add(new.total_single_index); + let total_search_count = this.total_search_count.saturating_add(new.total_search_count); + let show_ranking_score = this.show_ranking_score || new.show_ranking_score; + let show_ranking_score_details = + this.show_ranking_score_details || new.show_ranking_score_details; + let use_federation = this.use_federation || new.use_federation; + + Box::new(Self { + total_received, + total_succeeded, + total_distinct_index_count, + total_single_index, + total_search_count, + show_ranking_score, + show_ranking_score_details, + use_federation, + }) + } + + fn into_event(self: Box) -> serde_json::Value { + let Self { + total_received, + total_succeeded, + total_distinct_index_count, + total_single_index, + total_search_count, + show_ranking_score, + show_ranking_score_details, + use_federation, + } = *self; + + json!({ + "requests": 
{ + "total_succeeded": total_succeeded, + "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics + "total_received": total_received, + }, + "indexes": { + "total_single_index": total_single_index, + "total_distinct_index_count": total_distinct_index_count, + "avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // not 0 else returned early + }, + "searches": { + "total_search_count": total_search_count, + "avg_search_count": (total_search_count as f64) / (total_received as f64), + }, + "scoring": { + "show_ranking_score": show_ranking_score, + "show_ranking_score_details": show_ranking_score_details, + }, + "federation": { + "use_federation": use_federation, + } + }) + } +} From b02a72c0c0d68068c5c20e77b4f5c9d2e151375f Mon Sep 17 00:00:00 2001 From: Pedro Turik Firmino Date: Tue, 29 Oct 2024 19:30:11 -0300 Subject: [PATCH 74/92] Applies optimizations to some integration tests --- .../tests/documents/update_documents.rs | 73 +++++++++---------- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/crates/meilisearch/tests/documents/update_documents.rs b/crates/meilisearch/tests/documents/update_documents.rs index 195dca914..c0703e81b 100644 --- a/crates/meilisearch/tests/documents/update_documents.rs +++ b/crates/meilisearch/tests/documents/update_documents.rs @@ -23,8 +23,8 @@ async fn error_document_update_create_index_bad_uid() { #[actix_rt::test] async fn document_update_with_primary_key() { - let server = Server::new().await; - let index = server.index("test"); + let server = Server::new_shared(); + let index = server.unique_index(); let documents = json!([ { @@ -32,15 +32,14 @@ async fn document_update_with_primary_key() { "content": "foo", } ]); - let (_response, code) = index.update_documents(documents, Some("primary")).await; + let (response, code) = index.update_documents(documents, Some("primary")).await; assert_eq!(code, 202); - index.wait_task(0).await; + 
index.wait_task(response.uid()).await.succeeded(); - let (response, code) = index.get_task(0).await; + let (response, code) = index.get_task(response.uid()).await; assert_eq!(code, 200); assert_eq!(response["status"], "succeeded"); - assert_eq!(response["uid"], 0); assert_eq!(response["type"], "documentAdditionOrUpdate"); assert_eq!(response["details"]["indexedDocuments"], 1); assert_eq!(response["details"]["receivedDocuments"], 1); @@ -52,8 +51,8 @@ async fn document_update_with_primary_key() { #[actix_rt::test] async fn update_document() { - let server = Server::new().await; - let index = server.index("test"); + let server = Server::new_shared(); + let index = server.unique_index(); let documents = json!([ { @@ -62,10 +61,10 @@ async fn update_document() { } ]); - let (_response, code) = index.add_documents(documents, None).await; + let (response, code) = index.add_documents(documents, None).await; assert_eq!(code, 202); - index.wait_task(0).await; + index.wait_task(response.uid()).await.succeeded(); let documents = json!([ { @@ -77,9 +76,9 @@ async fn update_document() { let (response, code) = index.update_documents(documents, None).await; assert_eq!(code, 202, "response: {}", response); - index.wait_task(1).await; + index.wait_task(response.uid()).await.succeeded(); - let (response, code) = index.get_task(1).await; + let (response, code) = index.get_task(response.uid()).await; assert_eq!(code, 200); assert_eq!(response["status"], "succeeded"); @@ -96,8 +95,8 @@ async fn update_document() { #[actix_rt::test] async fn update_document_gzip_encoded() { - let server = Server::new().await; - let index = server.index_with_encoder("test", Encoder::Gzip); + let server = Server::new_shared(); + let index = server.unique_index_with_encoder(Encoder::Gzip); let documents = json!([ { @@ -106,10 +105,10 @@ async fn update_document_gzip_encoded() { } ]); - let (_response, code) = index.add_documents(documents, None).await; + let (response, code) = 
index.add_documents(documents, None).await; assert_eq!(code, 202); - index.wait_task(0).await; + index.wait_task(response.uid()).await.succeeded(); let documents = json!([ { @@ -121,9 +120,9 @@ async fn update_document_gzip_encoded() { let (response, code) = index.update_documents(documents, None).await; assert_eq!(code, 202, "response: {}", response); - index.wait_task(1).await; + index.wait_task(response.uid()).await.succeeded(); - let (response, code) = index.get_task(1).await; + let (response, code) = index.get_task(response.uid()).await; assert_eq!(code, 200); assert_eq!(response["status"], "succeeded"); @@ -140,12 +139,12 @@ async fn update_document_gzip_encoded() { #[actix_rt::test] async fn update_larger_dataset() { - let server = Server::new().await; - let index = server.index("test"); + let server = Server::new_shared(); + let index = server.unique_index(); let documents = serde_json::from_str(include_str!("../assets/test_set.json")).unwrap(); - index.update_documents(documents, None).await; - index.wait_task(0).await; - let (response, code) = index.get_task(0).await; + let (task, _code) = index.update_documents(documents, None).await; + index.wait_task(task.uid()).await.succeeded(); + let (response, code) = index.get_task(task.uid()).await; assert_eq!(code, 200); assert_eq!(response["type"], "documentAdditionOrUpdate"); assert_eq!(response["details"]["indexedDocuments"], 77); @@ -158,8 +157,8 @@ async fn update_larger_dataset() { #[actix_rt::test] async fn error_update_documents_bad_document_id() { - let server = Server::new().await; - let index = server.index("test"); + let server = Server::new_shared(); + let index = server.unique_index(); index.create(Some("docid")).await; let documents = json!([ { @@ -167,8 +166,8 @@ async fn error_update_documents_bad_document_id() { "content": "foobar" } ]); - index.update_documents(documents, None).await; - let response = index.wait_task(1).await; + let (task, _code) = index.update_documents(documents, 
None).await; + let response = index.wait_task(task.uid()).await; assert_eq!(response["status"], json!("failed")); assert_eq!( response["error"]["message"], @@ -186,8 +185,8 @@ async fn error_update_documents_bad_document_id() { #[actix_rt::test] async fn error_update_documents_missing_document_id() { - let server = Server::new().await; - let index = server.index("test"); + let server = Server::new_shared(); + let index = server.unique_index(); index.create(Some("docid")).await; let documents = json!([ { @@ -195,8 +194,8 @@ async fn error_update_documents_missing_document_id() { "content": "foobar" } ]); - index.update_documents(documents, None).await; - let response = index.wait_task(1).await; + let (task, _code) = index.update_documents(documents, None).await; + let response = index.wait_task(task.uid()).await; assert_eq!(response["status"], "failed"); assert_eq!( response["error"]["message"], @@ -212,8 +211,8 @@ async fn error_update_documents_missing_document_id() { #[actix_rt::test] async fn update_faceted_document() { - let server = Server::new().await; - let index = server.index("test"); + let server = Server::new_shared(); + let index = server.unique_index(); let (response, code) = index .update_settings(json!({ @@ -221,7 +220,7 @@ async fn update_faceted_document() { })) .await; assert_eq!("202", code.as_str(), "{:?}", response); - index.wait_task(0).await; + index.wait_task(response.uid()).await.succeeded(); let documents: Vec<_> = (0..1000) .map(|id| { @@ -232,10 +231,10 @@ async fn update_faceted_document() { }) .collect(); - let (_response, code) = index.add_documents(documents.into(), None).await; + let (response, code) = index.add_documents(documents.into(), None).await; assert_eq!(code, 202); - index.wait_task(1).await; + index.wait_task(response.uid()).await.succeeded(); let documents = json!([ { @@ -247,7 +246,7 @@ async fn update_faceted_document() { let (response, code) = index.update_documents(documents, None).await; assert_eq!(code, 202, 
"response: {}", response); - index.wait_task(2).await; + index.wait_task(response.uid()).await.succeeded(); index .search(json!({"limit": 10}), |response, code| { From 186326fe40af73956e520e294cedeaeb96093a78 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 4 Nov 2024 16:33:04 +0100 Subject: [PATCH 75/92] update the macos version --- .github/workflows/publish-binaries.yml | 6 +++--- .github/workflows/test-suite.yml | 2 +- bors.toml | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish-binaries.yml b/.github/workflows/publish-binaries.yml index 016a9d282..c53946fea 100644 --- a/.github/workflows/publish-binaries.yml +++ b/.github/workflows/publish-binaries.yml @@ -65,9 +65,9 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-12, windows-2022] + os: [macos-13, windows-2022] include: - - os: macos-12 + - os: macos-13 artifact_name: meilisearch asset_name: meilisearch-macos-amd64 - os: windows-2022 @@ -90,7 +90,7 @@ jobs: publish-macos-apple-silicon: name: Publish binary for macOS silicon - runs-on: macos-12 + runs-on: macos-13 needs: check-version strategy: matrix: diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index ce7fb30b6..90fb03538 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -51,7 +51,7 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-12, windows-2022] + os: [macos-13, windows-2022] steps: - uses: actions/checkout@v3 - name: Cache dependencies diff --git a/bors.toml b/bors.toml index 8750ed993..96e9ef65e 100644 --- a/bors.toml +++ b/bors.toml @@ -1,6 +1,6 @@ status = [ 'Tests on ubuntu-20.04', - 'Tests on macos-12', + 'Tests on macos-13', 'Tests on windows-2022', 'Run Clippy', 'Run Rustfmt', From 362836efb7d5924a485fa3e15171257f40214509 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 28 Oct 2024 11:57:02 +0100 Subject: [PATCH 76/92] make an upgrade module where we'll be able to shove each version instead of putting everything in the 
same file --- crates/meilitool/src/main.rs | 428 +-------------------------------- meilitool/src/upgrade/mod.rs | 46 ++++ meilitool/src/upgrade/v1_10.rs | 279 +++++++++++++++++++++ meilitool/src/upgrade/v1_9.rs | 100 ++++++++ 4 files changed, 430 insertions(+), 423 deletions(-) create mode 100644 meilitool/src/upgrade/mod.rs create mode 100644 meilitool/src/upgrade/v1_10.rs create mode 100644 meilitool/src/upgrade/v1_9.rs diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index 9dbff2486..ef137f746 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -2,7 +2,7 @@ use std::fs::{read_dir, read_to_string, remove_file, File}; use std::io::BufWriter; use std::path::PathBuf; -use anyhow::{bail, Context}; +use anyhow::Context; use clap::{Parser, Subcommand}; use dump::{DumpWriter, IndexMetadata}; use file_store::FileStore; @@ -10,15 +10,16 @@ use meilisearch_auth::AuthController; use meilisearch_types::heed::types::{SerdeJson, Str}; use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified}; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; -use meilisearch_types::milli::index::{db_name, main_key}; use meilisearch_types::milli::{obkv_to_json, BEU32}; use meilisearch_types::tasks::{Status, Task}; -use meilisearch_types::versioning::{create_version_file, get_version, parse_version}; +use meilisearch_types::versioning::{get_version, parse_version}; use meilisearch_types::Index; use time::macros::format_description; use time::OffsetDateTime; +use upgrade::OfflineUpgrade; use uuid_codec::UuidCodec; +mod upgrade; mod uuid_codec; #[derive(Parser)] @@ -72,7 +73,7 @@ enum Command { /// /// Supported upgrade paths: /// - /// - v1.9.0 -> v1.10.0 + /// - v1.9.0 -> v1.10.0 -> v1.11.0 OfflineUpgrade { #[arg(long)] target_version: String, @@ -96,425 +97,6 @@ fn main() -> anyhow::Result<()> { } } -struct OfflineUpgrade { - db_path: PathBuf, - current_version: (String, String, 
String), - target_version: (String, String, String), -} - -impl OfflineUpgrade { - fn upgrade(self) -> anyhow::Result<()> { - // TODO: if we make this process support more versions, introduce a more flexible way of checking for the version - // currently only supports v1.9 to v1.10 - let (current_major, current_minor, current_patch) = &self.current_version; - - match (current_major.as_str(), current_minor.as_str(), current_patch.as_str()) { - ("1", "9", _) => {} - _ => { - bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9") - } - } - - let (target_major, target_minor, target_patch) = &self.target_version; - - match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) { - ("1", "10", _) => {} - _ => { - bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10") - } - } - - println!("Upgrading from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}"); - - self.v1_9_to_v1_10()?; - - println!("Writing VERSION file"); - - create_version_file(&self.db_path, target_major, target_minor, target_patch) - .context("while writing VERSION file after the upgrade")?; - - println!("Success"); - - Ok(()) - } - - fn v1_9_to_v1_10(&self) -> anyhow::Result<()> { - // 2 changes here - - // 1. date format. needs to be done before opening the Index - // 2. REST embedders. 
We don't support this case right now, so bail - - let index_scheduler_path = self.db_path.join("tasks"); - let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } - .with_context(|| { - format!("While trying to open {:?}", index_scheduler_path.display()) - })?; - - let mut sched_wtxn = env.write_txn()?; - - let index_mapping: Database = - try_opening_database(&env, &sched_wtxn, "index-mapping")?; - - let index_stats: Database = - try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| { - format!("While trying to open {:?}", index_scheduler_path.display()) - })?; - - let index_count = - index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?; - - // FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn - // 1. immutably for the iteration - // 2. mutably for updating index stats - let indexes: Vec<_> = index_mapping - .iter(&sched_wtxn)? - .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid))) - .collect(); - - let mut rest_embedders = Vec::new(); - - let mut unwrapped_indexes = Vec::new(); - - // check that update can take place - for (index_index, result) in indexes.into_iter().enumerate() { - let (uid, uuid) = result?; - let index_path = self.db_path.join("indexes").join(uuid.to_string()); - - println!( - "[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`", - index_index + 1, - index_path.display() - ); - - let index_env = unsafe { - // FIXME: fetch the 25 magic number from the index file - EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { - format!("while opening index {uid} at '{}'", index_path.display()) - })? 
- }; - - let index_txn = index_env.read_txn().with_context(|| { - format!( - "while obtaining a write transaction for index {uid} at {}", - index_path.display() - ) - })?; - - println!("\t- Checking for incompatible embedders (REST embedders)"); - let rest_embedders_for_index = find_rest_embedders(&uid, &index_env, &index_txn)?; - - if rest_embedders_for_index.is_empty() { - unwrapped_indexes.push((uid, uuid)); - } else { - // no need to add to unwrapped indexes because we'll exit early - rest_embedders.push((uid, rest_embedders_for_index)); - } - } - - if !rest_embedders.is_empty() { - let rest_embedders = rest_embedders - .into_iter() - .flat_map(|(index, embedders)| std::iter::repeat(index.clone()).zip(embedders)) - .map(|(index, embedder)| format!("\t- embedder `{embedder}` in index `{index}`")) - .collect::>() - .join("\n"); - bail!("The update cannot take place because there are REST embedder(s). Remove them before proceeding with the update:\n{rest_embedders}\n\n\ - The database has not been modified and is still a valid v1.9 database."); - } - - println!("Update can take place, updating"); - - for (index_index, (uid, uuid)) in unwrapped_indexes.into_iter().enumerate() { - let index_path = self.db_path.join("indexes").join(uuid.to_string()); - - println!( - "[{}/{index_count}]Updating index `{uid}` at `{}`", - index_index + 1, - index_path.display() - ); - - let index_env = unsafe { - // FIXME: fetch the 25 magic number from the index file - EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { - format!("while opening index {uid} at '{}'", index_path.display()) - })? 
- }; - - let mut index_wtxn = index_env.write_txn().with_context(|| { - format!( - "while obtaining a write transaction for index `{uid}` at `{}`", - index_path.display() - ) - })?; - - println!("\t- Updating index stats"); - update_index_stats(index_stats, &uid, uuid, &mut sched_wtxn)?; - println!("\t- Updating date format"); - update_date_format(&uid, &index_env, &mut index_wtxn)?; - - index_wtxn.commit().with_context(|| { - format!( - "while committing the write txn for index `{uid}` at {}", - index_path.display() - ) - })?; - } - - sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?; - - println!("Upgrading database succeeded"); - - Ok(()) - } -} - -pub mod v1_9 { - pub type FieldDistribution = std::collections::BTreeMap; - - /// The statistics that can be computed from an `Index` object. - #[derive(serde::Serialize, serde::Deserialize, Debug)] - pub struct IndexStats { - /// Number of documents in the index. - pub number_of_documents: u64, - /// Size taken up by the index' DB, in bytes. - /// - /// This includes the size taken by both the used and free pages of the DB, and as the free pages - /// are not returned to the disk after a deletion, this number is typically larger than - /// `used_database_size` that only includes the size of the used pages. - pub database_size: u64, - /// Size taken by the used pages of the index' DB, in bytes. - /// - /// As the DB backend does not return to the disk the pages that are not currently used by the DB, - /// this value is typically smaller than `database_size`. - pub used_database_size: u64, - /// Association of every field name with the number of times it occurs in the documents. - pub field_distribution: FieldDistribution, - /// Creation date of the index. - pub created_at: time::OffsetDateTime, - /// Date of the last update of the index. 
- pub updated_at: time::OffsetDateTime, - } - - use serde::{Deserialize, Serialize}; - - #[derive(Debug, Deserialize, Serialize)] - pub struct IndexEmbeddingConfig { - pub name: String, - pub config: EmbeddingConfig, - } - - #[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)] - pub struct EmbeddingConfig { - /// Options of the embedder, specific to each kind of embedder - pub embedder_options: EmbedderOptions, - } - - /// Options of an embedder, specific to each kind of embedder. - #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] - pub enum EmbedderOptions { - HuggingFace(hf::EmbedderOptions), - OpenAi(openai::EmbedderOptions), - Ollama(ollama::EmbedderOptions), - UserProvided(manual::EmbedderOptions), - Rest(rest::EmbedderOptions), - } - - impl Default for EmbedderOptions { - fn default() -> Self { - Self::OpenAi(openai::EmbedderOptions { api_key: None, dimensions: None }) - } - } - - mod hf { - #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] - pub struct EmbedderOptions { - pub model: String, - pub revision: Option, - } - } - mod openai { - - #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] - pub struct EmbedderOptions { - pub api_key: Option, - pub dimensions: Option, - } - } - mod ollama { - #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] - pub struct EmbedderOptions { - pub embedding_model: String, - pub url: Option, - pub api_key: Option, - } - } - mod manual { - #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] - pub struct EmbedderOptions { - pub dimensions: usize, - } - } - mod rest { - #[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)] - pub struct EmbedderOptions { - pub api_key: Option, - pub dimensions: Option, - pub url: String, - pub input_field: Vec, - // path to the array of embeddings - pub path_to_embeddings: Vec, - // shape of 
a single embedding - pub embedding_object: Vec, - } - } - - pub type OffsetDateTime = time::OffsetDateTime; -} - -pub mod v1_10 { - use crate::v1_9; - - pub type FieldDistribution = std::collections::BTreeMap; - - /// The statistics that can be computed from an `Index` object. - #[derive(serde::Serialize, serde::Deserialize, Debug)] - pub struct IndexStats { - /// Number of documents in the index. - pub number_of_documents: u64, - /// Size taken up by the index' DB, in bytes. - /// - /// This includes the size taken by both the used and free pages of the DB, and as the free pages - /// are not returned to the disk after a deletion, this number is typically larger than - /// `used_database_size` that only includes the size of the used pages. - pub database_size: u64, - /// Size taken by the used pages of the index' DB, in bytes. - /// - /// As the DB backend does not return to the disk the pages that are not currently used by the DB, - /// this value is typically smaller than `database_size`. - pub used_database_size: u64, - /// Association of every field name with the number of times it occurs in the documents. - pub field_distribution: FieldDistribution, - /// Creation date of the index. - #[serde(with = "time::serde::rfc3339")] - pub created_at: time::OffsetDateTime, - /// Date of the last update of the index. 
- #[serde(with = "time::serde::rfc3339")] - pub updated_at: time::OffsetDateTime, - } - - impl From for IndexStats { - fn from( - v1_9::IndexStats { - number_of_documents, - database_size, - used_database_size, - field_distribution, - created_at, - updated_at, - }: v1_9::IndexStats, - ) -> Self { - IndexStats { - number_of_documents, - database_size, - used_database_size, - field_distribution, - created_at, - updated_at, - } - } - } - - #[derive(serde::Serialize, serde::Deserialize)] - #[serde(transparent)] - pub struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] pub time::OffsetDateTime); -} - -fn update_index_stats( - index_stats: Database, - index_uid: &str, - index_uuid: uuid::Uuid, - sched_wtxn: &mut RwTxn, -) -> anyhow::Result<()> { - let ctx = || format!("while updating index stats for index `{index_uid}`"); - - let stats: Option = index_stats - .remap_data_type::>() - .get(sched_wtxn, &index_uuid) - .with_context(ctx)?; - - if let Some(stats) = stats { - let stats: v1_10::IndexStats = stats.into(); - - index_stats - .remap_data_type::>() - .put(sched_wtxn, &index_uuid, &stats) - .with_context(ctx)?; - } - - Ok(()) -} - -fn update_date_format( - index_uid: &str, - index_env: &Env, - index_wtxn: &mut RwTxn, -) -> anyhow::Result<()> { - let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN) - .with_context(|| format!("while updating date format for index `{index_uid}`"))?; - - date_round_trip(index_wtxn, index_uid, main, main_key::CREATED_AT_KEY)?; - date_round_trip(index_wtxn, index_uid, main, main_key::UPDATED_AT_KEY)?; - - Ok(()) -} - -fn find_rest_embedders( - index_uid: &str, - index_env: &Env, - index_txn: &RoTxn, -) -> anyhow::Result> { - let main = try_opening_poly_database(index_env, index_txn, db_name::MAIN) - .with_context(|| format!("while checking REST embedders for index `{index_uid}`"))?; - - let mut rest_embedders = vec![]; - - for config in main - .remap_types::>>() - .get(index_txn, main_key::EMBEDDING_CONFIGS)? 
- .unwrap_or_default() - { - if let v1_9::EmbedderOptions::Rest(_) = config.config.embedder_options { - rest_embedders.push(config.name); - } - } - - Ok(rest_embedders) -} - -fn date_round_trip( - wtxn: &mut RwTxn, - index_uid: &str, - db: Database, - key: &str, -) -> anyhow::Result<()> { - let datetime = - db.remap_types::>().get(wtxn, key).with_context( - || format!("could not read `{key}` while updating date format for index `{index_uid}`"), - )?; - - if let Some(datetime) = datetime { - db.remap_types::>() - .put(wtxn, key, &v1_10::OffsetDateTime(datetime)) - .with_context(|| { - format!( - "could not write `{key}` while updating date format for index `{index_uid}`" - ) - })?; - } - - Ok(()) -} - /// Clears the task queue located at `db_path`. fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> { let path = db_path.join("tasks"); diff --git a/meilitool/src/upgrade/mod.rs b/meilitool/src/upgrade/mod.rs new file mode 100644 index 000000000..053c61c14 --- /dev/null +++ b/meilitool/src/upgrade/mod.rs @@ -0,0 +1,46 @@ +mod v1_10; +mod v1_9; + +use std::path::PathBuf; + +use anyhow::{bail, Context}; +use meilisearch_types::versioning::create_version_file; + +use v1_10::v1_9_to_v1_10; + +pub struct OfflineUpgrade { + pub db_path: PathBuf, + pub current_version: (String, String, String), + pub target_version: (String, String, String), +} + +impl OfflineUpgrade { + pub fn upgrade(self) -> anyhow::Result<()> { + let (current_major, current_minor, current_patch) = &self.current_version; + let (target_major, target_minor, target_patch) = &self.target_version; + + println!("Upgrading from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}"); + + match ( + (current_major.as_str(), current_minor.as_str(), current_patch.as_str()), + (target_major.as_str(), target_minor.as_str(), target_patch.as_str()), + ) { + (("1", "9", _), ("1", "10", _)) => v1_9_to_v1_10(&self.db_path)?, + ((major, minor, _), _) if major != "1" && minor 
!= "9" => + bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9"), + (_, (major, minor, _)) if major != "1" && minor != "10" => + bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10"), + _ => + bail!("Unsupported upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}. Can only upgrade from v1.9 to v1.10"), + } + + println!("Writing VERSION file"); + + create_version_file(&self.db_path, target_major, target_minor, target_patch) + .context("while writing VERSION file after the upgrade")?; + + println!("Success"); + + Ok(()) + } +} diff --git a/meilitool/src/upgrade/v1_10.rs b/meilitool/src/upgrade/v1_10.rs new file mode 100644 index 000000000..96af99c39 --- /dev/null +++ b/meilitool/src/upgrade/v1_10.rs @@ -0,0 +1,279 @@ +use anyhow::bail; +use std::path::Path; + +use anyhow::Context; +use meilisearch_types::{ + heed::{ + types::{SerdeJson, Str}, + Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified, + }, + milli::index::{db_name, main_key}, +}; + +use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; + +use super::v1_9; + +pub type FieldDistribution = std::collections::BTreeMap; + +/// The statistics that can be computed from an `Index` object. +#[derive(serde::Serialize, serde::Deserialize, Debug)] +pub struct IndexStats { + /// Number of documents in the index. + pub number_of_documents: u64, + /// Size taken up by the index' DB, in bytes. + /// + /// This includes the size taken by both the used and free pages of the DB, and as the free pages + /// are not returned to the disk after a deletion, this number is typically larger than + /// `used_database_size` that only includes the size of the used pages. + pub database_size: u64, + /// Size taken by the used pages of the index' DB, in bytes. 
+ /// + /// As the DB backend does not return to the disk the pages that are not currently used by the DB, + /// this value is typically smaller than `database_size`. + pub used_database_size: u64, + /// Association of every field name with the number of times it occurs in the documents. + pub field_distribution: FieldDistribution, + /// Creation date of the index. + #[serde(with = "time::serde::rfc3339")] + pub created_at: time::OffsetDateTime, + /// Date of the last update of the index. + #[serde(with = "time::serde::rfc3339")] + pub updated_at: time::OffsetDateTime, +} + +impl From for IndexStats { + fn from( + v1_9::IndexStats { + number_of_documents, + database_size, + used_database_size, + field_distribution, + created_at, + updated_at, + }: v1_9::IndexStats, + ) -> Self { + IndexStats { + number_of_documents, + database_size, + used_database_size, + field_distribution, + created_at, + updated_at, + } + } +} + +#[derive(serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] pub time::OffsetDateTime); + +fn update_index_stats( + index_stats: Database, + index_uid: &str, + index_uuid: uuid::Uuid, + sched_wtxn: &mut RwTxn, +) -> anyhow::Result<()> { + let ctx = || format!("while updating index stats for index `{index_uid}`"); + + let stats: Option = index_stats + .remap_data_type::>() + .get(sched_wtxn, &index_uuid) + .with_context(ctx)?; + + if let Some(stats) = stats { + let stats: self::IndexStats = stats.into(); + + index_stats + .remap_data_type::>() + .put(sched_wtxn, &index_uuid, &stats) + .with_context(ctx)?; + } + + Ok(()) +} + +fn update_date_format( + index_uid: &str, + index_env: &Env, + index_wtxn: &mut RwTxn, +) -> anyhow::Result<()> { + let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN) + .with_context(|| format!("while updating date format for index `{index_uid}`"))?; + + date_round_trip(index_wtxn, index_uid, main, main_key::CREATED_AT_KEY)?; + 
date_round_trip(index_wtxn, index_uid, main, main_key::UPDATED_AT_KEY)?; + + Ok(()) +} + +fn find_rest_embedders( + index_uid: &str, + index_env: &Env, + index_txn: &RoTxn, +) -> anyhow::Result> { + let main = try_opening_poly_database(index_env, index_txn, db_name::MAIN) + .with_context(|| format!("while checking REST embedders for index `{index_uid}`"))?; + + let mut rest_embedders = vec![]; + + for config in main + .remap_types::>>() + .get(index_txn, main_key::EMBEDDING_CONFIGS)? + .unwrap_or_default() + { + if let v1_9::EmbedderOptions::Rest(_) = config.config.embedder_options { + rest_embedders.push(config.name); + } + } + + Ok(rest_embedders) +} + +fn date_round_trip( + wtxn: &mut RwTxn, + index_uid: &str, + db: Database, + key: &str, +) -> anyhow::Result<()> { + let datetime = + db.remap_types::>().get(wtxn, key).with_context( + || format!("could not read `{key}` while updating date format for index `{index_uid}`"), + )?; + + if let Some(datetime) = datetime { + db.remap_types::>() + .put(wtxn, key, &self::OffsetDateTime(datetime)) + .with_context(|| { + format!( + "could not write `{key}` while updating date format for index `{index_uid}`" + ) + })?; + } + + Ok(()) +} + +pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> { + // 2 changes here + + // 1. date format. needs to be done before opening the Index + // 2. REST embedders. 
We don't support this case right now, so bail + + let index_scheduler_path = db_path.join("tasks"); + let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + + let mut sched_wtxn = env.write_txn()?; + + let index_mapping: Database = + try_opening_database(&env, &sched_wtxn, "index-mapping")?; + + let index_stats: Database = + try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| { + format!("While trying to open {:?}", index_scheduler_path.display()) + })?; + + let index_count = + index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?; + + // FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn + // 1. immutably for the iteration + // 2. mutably for updating index stats + let indexes: Vec<_> = index_mapping + .iter(&sched_wtxn)? + .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid))) + .collect(); + + let mut rest_embedders = Vec::new(); + + let mut unwrapped_indexes = Vec::new(); + + // check that update can take place + for (index_index, result) in indexes.into_iter().enumerate() { + let (uid, uuid) = result?; + let index_path = db_path.join("indexes").join(uuid.to_string()); + + println!( + "[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`", + index_index + 1, + index_path.display() + ); + + let index_env = unsafe { + // FIXME: fetch the 25 magic number from the index file + EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { + format!("while opening index {uid} at '{}'", index_path.display()) + })? 
+ }; + + let index_txn = index_env.read_txn().with_context(|| { + format!( + "while obtaining a write transaction for index {uid} at {}", + index_path.display() + ) + })?; + + println!("\t- Checking for incompatible embedders (REST embedders)"); + let rest_embedders_for_index = find_rest_embedders(&uid, &index_env, &index_txn)?; + + if rest_embedders_for_index.is_empty() { + unwrapped_indexes.push((uid, uuid)); + } else { + // no need to add to unwrapped indexes because we'll exit early + rest_embedders.push((uid, rest_embedders_for_index)); + } + } + + if !rest_embedders.is_empty() { + let rest_embedders = rest_embedders + .into_iter() + .flat_map(|(index, embedders)| std::iter::repeat(index.clone()).zip(embedders)) + .map(|(index, embedder)| format!("\t- embedder `{embedder}` in index `{index}`")) + .collect::>() + .join("\n"); + bail!("The update cannot take place because there are REST embedder(s). Remove them before proceeding with the update:\n{rest_embedders}\n\n\ + The database has not been modified and is still a valid v1.9 database."); + } + + println!("Update can take place, updating"); + + for (index_index, (uid, uuid)) in unwrapped_indexes.into_iter().enumerate() { + let index_path = db_path.join("indexes").join(uuid.to_string()); + + println!( + "[{}/{index_count}]Updating index `{uid}` at `{}`", + index_index + 1, + index_path.display() + ); + + let index_env = unsafe { + // FIXME: fetch the 25 magic number from the index file + EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { + format!("while opening index {uid} at '{}'", index_path.display()) + })? 
+ }; + + let mut index_wtxn = index_env.write_txn().with_context(|| { + format!( + "while obtaining a write transaction for index `{uid}` at `{}`", + index_path.display() + ) + })?; + + println!("\t- Updating index stats"); + update_index_stats(index_stats, &uid, uuid, &mut sched_wtxn)?; + println!("\t- Updating date format"); + update_date_format(&uid, &index_env, &mut index_wtxn)?; + + index_wtxn.commit().with_context(|| { + format!("while committing the write txn for index `{uid}` at {}", index_path.display()) + })?; + } + + sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?; + + println!("Upgrading database succeeded"); + + Ok(()) +} diff --git a/meilitool/src/upgrade/v1_9.rs b/meilitool/src/upgrade/v1_9.rs new file mode 100644 index 000000000..faa2d9814 --- /dev/null +++ b/meilitool/src/upgrade/v1_9.rs @@ -0,0 +1,100 @@ +use serde::{Deserialize, Serialize}; + +pub type FieldDistribution = std::collections::BTreeMap; + +/// The statistics that can be computed from an `Index` object. +#[derive(serde::Serialize, serde::Deserialize, Debug)] +pub struct IndexStats { + /// Number of documents in the index. + pub number_of_documents: u64, + /// Size taken up by the index' DB, in bytes. + /// + /// This includes the size taken by both the used and free pages of the DB, and as the free pages + /// are not returned to the disk after a deletion, this number is typically larger than + /// `used_database_size` that only includes the size of the used pages. + pub database_size: u64, + /// Size taken by the used pages of the index' DB, in bytes. + /// + /// As the DB backend does not return to the disk the pages that are not currently used by the DB, + /// this value is typically smaller than `database_size`. + pub used_database_size: u64, + /// Association of every field name with the number of times it occurs in the documents. + pub field_distribution: FieldDistribution, + /// Creation date of the index. 
+ pub created_at: time::OffsetDateTime, + /// Date of the last update of the index. + pub updated_at: time::OffsetDateTime, +} + +#[derive(Debug, Deserialize, Serialize)] +pub struct IndexEmbeddingConfig { + pub name: String, + pub config: EmbeddingConfig, +} + +#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)] +pub struct EmbeddingConfig { + /// Options of the embedder, specific to each kind of embedder + pub embedder_options: EmbedderOptions, +} + +/// Options of an embedder, specific to each kind of embedder. +#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +pub enum EmbedderOptions { + HuggingFace(hf::EmbedderOptions), + OpenAi(openai::EmbedderOptions), + Ollama(ollama::EmbedderOptions), + UserProvided(manual::EmbedderOptions), + Rest(rest::EmbedderOptions), +} + +impl Default for EmbedderOptions { + fn default() -> Self { + Self::OpenAi(openai::EmbedderOptions { api_key: None, dimensions: None }) + } +} + +mod hf { + #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] + pub struct EmbedderOptions { + pub model: String, + pub revision: Option, + } +} +mod openai { + + #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] + pub struct EmbedderOptions { + pub api_key: Option, + pub dimensions: Option, + } +} +mod ollama { + #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] + pub struct EmbedderOptions { + pub embedding_model: String, + pub url: Option, + pub api_key: Option, + } +} +mod manual { + #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] + pub struct EmbedderOptions { + pub dimensions: usize, + } +} +mod rest { + #[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)] + pub struct EmbedderOptions { + pub api_key: Option, + pub dimensions: Option, + pub url: String, + pub input_field: Vec, + // path to the array of embeddings + pub 
path_to_embeddings: Vec, + // shape of a single embedding + pub embedding_object: Vec, + } +} + +pub type OffsetDateTime = time::OffsetDateTime; From ddd03e9b370f145787bca447b8791aeff5485c94 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 29 Oct 2024 02:46:14 +0100 Subject: [PATCH 77/92] implement the upgrade from v1.10 to v1.11 in meilitool --- Cargo.lock | 28 +++++++++++-- crates/meilitool/Cargo.toml | 2 + crates/milli/Cargo.toml | 2 +- meilitool/src/upgrade/mod.rs | 60 +++++++++++++++++++-------- meilitool/src/upgrade/v1_10.rs | 7 +++- meilitool/src/upgrade/v1_11.rs | 76 ++++++++++++++++++++++++++++++++++ 6 files changed, 150 insertions(+), 25 deletions(-) create mode 100644 meilitool/src/upgrade/v1_11.rs diff --git a/Cargo.lock b/Cargo.lock index 500f28454..43a93bb05 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -404,6 +404,25 @@ dependencies = [ "thiserror", ] +[[package]] +name = "arroy" +version = "0.5.0" +source = "git+https://github.com/meilisearch/arroy/?rev=3908c9e#3908c9edfba77ba18cc50bda41c88166ba5ebd37" +dependencies = [ + "bytemuck", + "byteorder", + "heed", + "log", + "memmap2", + "nohash", + "ordered-float", + "rand", + "rayon", + "roaring", + "tempfile", + "thiserror", +] + [[package]] name = "assert-json-diff" version = "2.0.2" @@ -707,9 +726,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c" [[package]] name = "bytemuck" -version = "1.16.1" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e" +checksum = "8334215b81e418a0a7bdb8ef0849474f40bb10c8b71f1c4ed315cff49f32494d" dependencies = [ "bytemuck_derive", ] @@ -2556,7 +2575,7 @@ name = "index-scheduler" version = "1.11.0" dependencies = [ "anyhow", - "arroy", + "arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "big_s", "bincode", "crossbeam", @@ -3517,6 +3536,7 @@ name = "meilitool" version = "1.11.0" dependencies = [ "anyhow", + "arroy 
0.5.0 (git+https://github.com/meilisearch/arroy/?rev=3908c9e)", "clap", "dump", "file-store", @@ -3547,7 +3567,7 @@ dependencies = [ name = "milli" version = "1.11.0" dependencies = [ - "arroy", + "arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", "big_s", "bimap", "bincode", diff --git a/crates/meilitool/Cargo.toml b/crates/meilitool/Cargo.toml index ce6c1ad5b..937a484e2 100644 --- a/crates/meilitool/Cargo.toml +++ b/crates/meilitool/Cargo.toml @@ -18,3 +18,5 @@ meilisearch-types = { path = "../meilisearch-types" } serde = { version = "1.0.209", features = ["derive"] } time = { version = "0.3.36", features = ["formatting"] } uuid = { version = "1.10.0", features = ["v4"], default-features = false } +arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", rev = "3908c9e" } + diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index df0e59496..7b43fbf33 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -15,7 +15,7 @@ license.workspace = true bimap = { version = "0.6.3", features = ["serde"] } bincode = "1.3.3" bstr = "1.9.1" -bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] } +bytemuck = { version = "1.18.0", features = ["extern_crate_alloc"] } byteorder = "1.5.0" charabia = { version = "0.9.1", default-features = false } concat-arrays = "0.1.2" diff --git a/meilitool/src/upgrade/mod.rs b/meilitool/src/upgrade/mod.rs index 053c61c14..9a1e4286f 100644 --- a/meilitool/src/upgrade/mod.rs +++ b/meilitool/src/upgrade/mod.rs @@ -1,13 +1,16 @@ mod v1_10; +mod v1_11; mod v1_9; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use anyhow::{bail, Context}; use meilisearch_types::versioning::create_version_file; use v1_10::v1_9_to_v1_10; +use crate::upgrade::v1_11::v1_10_to_v1_11; + pub struct OfflineUpgrade { pub db_path: PathBuf, pub current_version: (String, String, String), @@ -16,29 +19,50 @@ pub struct OfflineUpgrade { impl OfflineUpgrade { pub fn upgrade(self) -> 
anyhow::Result<()> { + let upgrade_list = [ + (v1_9_to_v1_10 as fn(&Path) -> Result<(), anyhow::Error>, "1", "10", "0"), + (v1_10_to_v1_11, "1", "11", "0"), + ]; + let (current_major, current_minor, current_patch) = &self.current_version; + + let start_at = match ( + current_major.as_str(), + current_minor.as_str(), + current_patch.as_str(), + ) { + ("1", "9", _) => 0, + ("1", "10", _) => 1, + _ => { + bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9") + } + }; + let (target_major, target_minor, target_patch) = &self.target_version; - println!("Upgrading from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}"); + let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) { + ("v1", "10", _) => 0, + ("v1", "11", _) => 1, + _ => { + bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.11") + } + }; - match ( - (current_major.as_str(), current_minor.as_str(), current_patch.as_str()), - (target_major.as_str(), target_minor.as_str(), target_patch.as_str()), - ) { - (("1", "9", _), ("1", "10", _)) => v1_9_to_v1_10(&self.db_path)?, - ((major, minor, _), _) if major != "1" && minor != "9" => - bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9"), - (_, (major, minor, _)) if major != "1" && minor != "10" => - bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10"), - _ => - bail!("Unsupported upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}. 
Can only upgrade from v1.9 to v1.10"), + println!("Starting the upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}"); + + #[allow(clippy::needless_range_loop)] + for index in start_at..=ends_at { + let (func, major, minor, patch) = upgrade_list[index]; + (func)(&self.db_path)?; + println!("Done"); + // We're writing the version file just in case an issue arise _while_ upgrading. + // We don't want the DB to fail in an unknown state. + println!("Writing VERSION file"); + + create_version_file(&self.db_path, major, minor, patch) + .context("while writing VERSION file after the upgrade")?; } - println!("Writing VERSION file"); - - create_version_file(&self.db_path, target_major, target_minor, target_patch) - .context("while writing VERSION file after the upgrade")?; - println!("Success"); Ok(()) diff --git a/meilitool/src/upgrade/v1_10.rs b/meilitool/src/upgrade/v1_10.rs index 96af99c39..99fe104e3 100644 --- a/meilitool/src/upgrade/v1_10.rs +++ b/meilitool/src/upgrade/v1_10.rs @@ -79,7 +79,8 @@ fn update_index_stats( let stats: Option = index_stats .remap_data_type::>() .get(sched_wtxn, &index_uuid) - .with_context(ctx)?; + .with_context(ctx) + .with_context(|| "While reading value")?; if let Some(stats) = stats { let stats: self::IndexStats = stats.into(); @@ -87,7 +88,8 @@ fn update_index_stats( index_stats .remap_data_type::>() .put(sched_wtxn, &index_uuid, &stats) - .with_context(ctx)?; + .with_context(ctx) + .with_context(|| "While writing value")?; } Ok(()) @@ -155,6 +157,7 @@ fn date_round_trip( } pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> { + println!("Upgrading from v1.9.0 to v1.10.0"); // 2 changes here // 1. date format. needs to be done before opening the Index diff --git a/meilitool/src/upgrade/v1_11.rs b/meilitool/src/upgrade/v1_11.rs new file mode 100644 index 000000000..26c4234f6 --- /dev/null +++ b/meilitool/src/upgrade/v1_11.rs @@ -0,0 +1,76 @@ +//! 
The breaking changes that happened between the v1.10 and the v1.11 are: +//! - Arroy went from the v0.4.0 to the v0.5.0, see this release note to get the whole context: https://github.com/meilisearch/arroy/releases/tag/v0.5.0 +//! - The `angular` distance has been renamed to `cosine` => We only need to update the string in the metadata. +//! - Reorganize the `NodeId` to make the appending of vectors work => We'll have to update the keys of almost all items in the DB. +//! - Store the list of updated IDs directly in LMDB instead of a roaring bitmap => This shouldn't be an issue since we are never supposed to commit this roaring bitmap, but it's not forbidden by arroy so ensuring it works is probably better than anything. + +use std::path::Path; + +use anyhow::Context; +use meilisearch_types::{ + heed::{types::Str, Database, EnvOpenOptions}, + milli::index::db_name, +}; + +use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; + +pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> { + println!("Upgrading from v1.10.0 to v1.11.0"); + + let index_scheduler_path = db_path.join("tasks"); + let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + + let sched_rtxn = env.read_txn()?; + + let index_mapping: Database = + try_opening_database(&env, &sched_rtxn, "index-mapping")?; + + let index_count = + index_mapping.len(&sched_rtxn).context("while reading the number of indexes")?; + + let indexes: Vec<_> = index_mapping + .iter(&sched_rtxn)? 
+ .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid))) + .collect(); + + // check that update can take place + for (index_index, result) in indexes.into_iter().enumerate() { + let (uid, uuid) = result?; + let index_path = db_path.join("indexes").join(uuid.to_string()); + + println!( + "[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`", + index_index + 1, + index_path.display() + ); + + let index_env = unsafe { + EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { + format!("while opening index {uid} at '{}'", index_path.display()) + })? + }; + + let index_rtxn = index_env.read_txn().with_context(|| { + format!( + "while obtaining a read transaction for index {uid} at {}", + index_path.display() + ) + })?; + let mut index_wtxn = index_env.write_txn().with_context(|| { + format!( + "while obtaining a write transaction for index {uid} at {}", + index_path.display() + ) + })?; + + let database = try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY) + .with_context(|| format!("while updating date format for index `{uid}`"))?; + + arroy_v04_to_v05::ugrade_from_prev_version(&index_rtxn, &mut index_wtxn, database)?; + + index_wtxn.commit()?; + } + + Ok(()) +} From a9b61c84349e23cf34ce9ed342ec46339c36eb9a Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 29 Oct 2024 02:51:26 +0100 Subject: [PATCH 78/92] fix the version parsing and improve error handling --- meilitool/src/upgrade/mod.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/meilitool/src/upgrade/mod.rs b/meilitool/src/upgrade/mod.rs index 9a1e4286f..ae095b6bd 100644 --- a/meilitool/src/upgrade/mod.rs +++ b/meilitool/src/upgrade/mod.rs @@ -41,8 +41,11 @@ impl OfflineUpgrade { let (target_major, target_minor, target_patch) = &self.target_version; let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) { - ("v1", "10", _) => 0, - ("v1", "11", _) => 1, + ("1", "10", _) => 0, + ("1", "11", _) => 1, 
+ (major, _, _) if major.starts_with('v') => { + bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.") + } _ => { bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.11") } From 690eb42fc09db277d8426aeaa1d54e54001e1501 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 29 Oct 2024 03:27:26 +0100 Subject: [PATCH 79/92] update the version of arroy --- Cargo.lock | 4 ++-- crates/meilitool/Cargo.toml | 3 +-- meilitool/src/upgrade/v1_11.rs | 16 +++++++++++++--- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 43a93bb05..fd14a4a7d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -407,7 +407,7 @@ dependencies = [ [[package]] name = "arroy" version = "0.5.0" -source = "git+https://github.com/meilisearch/arroy/?rev=3908c9e#3908c9edfba77ba18cc50bda41c88166ba5ebd37" +source = "git+https://github.com/meilisearch/arroy/?rev=32670e7dd8b93640fcb53261ace89bda1c06497b#32670e7dd8b93640fcb53261ace89bda1c06497b" dependencies = [ "bytemuck", "byteorder", @@ -3536,7 +3536,7 @@ name = "meilitool" version = "1.11.0" dependencies = [ "anyhow", - "arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?rev=3908c9e)", + "arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?rev=32670e7dd8b93640fcb53261ace89bda1c06497b)", "clap", "dump", "file-store", diff --git a/crates/meilitool/Cargo.toml b/crates/meilitool/Cargo.toml index 937a484e2..693de6da8 100644 --- a/crates/meilitool/Cargo.toml +++ b/crates/meilitool/Cargo.toml @@ -18,5 +18,4 @@ meilisearch-types = { path = "../meilisearch-types" } serde = { version = "1.0.209", features = ["derive"] } time = { version = "0.3.36", features = ["formatting"] } uuid = { version = "1.10.0", features = ["v4"], default-features = false } -arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", rev = "3908c9e" } - +arroy_v04_to_v05 = { package = "arroy", git = 
"https://github.com/meilisearch/arroy/", rev = "32670e7dd8b93640fcb53261ace89bda1c06497b" } diff --git a/meilitool/src/upgrade/v1_11.rs b/meilitool/src/upgrade/v1_11.rs index 26c4234f6..4105879fd 100644 --- a/meilitool/src/upgrade/v1_11.rs +++ b/meilitool/src/upgrade/v1_11.rs @@ -57,6 +57,10 @@ pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> { index_path.display() ) })?; + let index_read_database = + try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY) + .with_context(|| format!("while updating date format for index `{uid}`"))?; + let mut index_wtxn = index_env.write_txn().with_context(|| { format!( "while obtaining a write transaction for index {uid} at {}", @@ -64,10 +68,16 @@ pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> { ) })?; - let database = try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY) - .with_context(|| format!("while updating date format for index `{uid}`"))?; + let index_write_database = + try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY) + .with_context(|| format!("while updating date format for index `{uid}`"))?; - arroy_v04_to_v05::ugrade_from_prev_version(&index_rtxn, &mut index_wtxn, database)?; + arroy_v04_to_v05::ugrade_from_prev_version( + &index_rtxn, + index_read_database, + &mut index_wtxn, + index_write_database, + )?; index_wtxn.commit()?; } From 5f57306858b86c4ca8755cffbb4e3d2dd36ffbfa Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 4 Nov 2024 11:46:36 +0100 Subject: [PATCH 80/92] update the arroy version in meilitool --- Cargo.lock | 4 ++-- crates/meilitool/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fd14a4a7d..04812fd1b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -407,7 +407,7 @@ dependencies = [ [[package]] name = "arroy" version = "0.5.0" -source = 
"git+https://github.com/meilisearch/arroy/?rev=32670e7dd8b93640fcb53261ace89bda1c06497b#32670e7dd8b93640fcb53261ace89bda1c06497b" +source = "git+https://github.com/meilisearch/arroy/?rev=053807bf38dc079f25b003f19fc30fbf3613f6e7#053807bf38dc079f25b003f19fc30fbf3613f6e7" dependencies = [ "bytemuck", "byteorder", @@ -3536,7 +3536,7 @@ name = "meilitool" version = "1.11.0" dependencies = [ "anyhow", - "arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?rev=32670e7dd8b93640fcb53261ace89bda1c06497b)", + "arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?rev=053807bf38dc079f25b003f19fc30fbf3613f6e7)", "clap", "dump", "file-store", diff --git a/crates/meilitool/Cargo.toml b/crates/meilitool/Cargo.toml index 693de6da8..f2c8920c9 100644 --- a/crates/meilitool/Cargo.toml +++ b/crates/meilitool/Cargo.toml @@ -18,4 +18,4 @@ meilisearch-types = { path = "../meilisearch-types" } serde = { version = "1.0.209", features = ["derive"] } time = { version = "0.3.36", features = ["formatting"] } uuid = { version = "1.10.0", features = ["v4"], default-features = false } -arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", rev = "32670e7dd8b93640fcb53261ace89bda1c06497b" } +arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", rev = "053807bf38dc079f25b003f19fc30fbf3613f6e7" } From 4eef0cd332168e60c38b9115560e1180d0a13d8e Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 4 Nov 2024 15:50:38 +0100 Subject: [PATCH 81/92] fix the update from v1_9 to v1_10 by providing a custom datetime formatter myself --- meilitool/src/upgrade/v1_10.rs | 19 +++++++++++++------ meilitool/src/upgrade/v1_9.rs | 12 +++++++++--- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/meilitool/src/upgrade/v1_10.rs b/meilitool/src/upgrade/v1_10.rs index 99fe104e3..671f4d6d2 100644 --- a/meilitool/src/upgrade/v1_10.rs +++ b/meilitool/src/upgrade/v1_10.rs @@ -58,8 +58,8 @@ impl From for IndexStats { database_size, used_database_size, 
field_distribution, - created_at, - updated_at, + created_at: created_at.0, + updated_at: updated_at.0, } } } @@ -76,6 +76,13 @@ fn update_index_stats( ) -> anyhow::Result<()> { let ctx = || format!("while updating index stats for index `{index_uid}`"); + let stats: Option<&str> = index_stats + .remap_data_type::() + .get(sched_wtxn, &index_uuid) + .with_context(ctx) + .with_context(|| "While reading value")?; + dbg!(stats); + let stats: Option = index_stats .remap_data_type::>() .get(sched_wtxn, &index_uuid) @@ -139,13 +146,13 @@ fn date_round_trip( key: &str, ) -> anyhow::Result<()> { let datetime = - db.remap_types::>().get(wtxn, key).with_context( - || format!("could not read `{key}` while updating date format for index `{index_uid}`"), - )?; + db.remap_types::>().get(wtxn, key).with_context(|| { + format!("could not read `{key}` while updating date format for index `{index_uid}`") + })?; if let Some(datetime) = datetime { db.remap_types::>() - .put(wtxn, key, &self::OffsetDateTime(datetime)) + .put(wtxn, key, &self::OffsetDateTime(datetime.0)) .with_context(|| { format!( "could not write `{key}` while updating date format for index `{index_uid}`" diff --git a/meilitool/src/upgrade/v1_9.rs b/meilitool/src/upgrade/v1_9.rs index faa2d9814..3e6cfde6c 100644 --- a/meilitool/src/upgrade/v1_9.rs +++ b/meilitool/src/upgrade/v1_9.rs @@ -1,4 +1,5 @@ use serde::{Deserialize, Serialize}; +use time::OffsetDateTime; pub type FieldDistribution = std::collections::BTreeMap; @@ -21,9 +22,9 @@ pub struct IndexStats { /// Association of every field name with the number of times it occurs in the documents. pub field_distribution: FieldDistribution, /// Creation date of the index. - pub created_at: time::OffsetDateTime, + pub created_at: LegacyTime, /// Date of the last update of the index. 
- pub updated_at: time::OffsetDateTime, + pub updated_at: LegacyTime, } #[derive(Debug, Deserialize, Serialize)] @@ -97,4 +98,9 @@ mod rest { } } -pub type OffsetDateTime = time::OffsetDateTime; +// 2024-11-04 13:32:08.48368 +00:00:00 +time::serde::format_description!(legacy_datetime, OffsetDateTime, "[year]-[month]-[day] [hour]:[minute]:[second].[subsecond] [offset_hour sign:mandatory]:[offset_minute]:[offset_second]"); + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct LegacyTime(#[serde(with = "legacy_datetime")] pub OffsetDateTime); From 106cc7fe3a8dd295b9230fd77c3a98c3d8f86ace Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 4 Nov 2024 17:51:40 +0100 Subject: [PATCH 82/92] fmt --- .../src/routes/indexes/search_analytics.rs | 20 +++++++++---------- .../src/routes/indexes/settings.rs | 2 +- .../src/routes/indexes/settings_analytics.rs | 7 ++++--- .../src/routes/indexes/similar_analytics.rs | 8 +++----- crates/meilisearch/src/routes/multi_search.rs | 3 +-- .../src/routes/multi_search_analytics.rs | 6 ++---- crates/meilisearch/tests/common/index.rs | 3 +-- 7 files changed, 21 insertions(+), 28 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/search_analytics.rs b/crates/meilisearch/src/routes/indexes/search_analytics.rs index 8bbb1781f..b16e2636e 100644 --- a/crates/meilisearch/src/routes/indexes/search_analytics.rs +++ b/crates/meilisearch/src/routes/indexes/search_analytics.rs @@ -1,18 +1,16 @@ -use once_cell::sync::Lazy; -use regex::Regex; -use serde_json::{json, Value}; use std::collections::{BTreeSet, BinaryHeap, HashMap}; use meilisearch_types::locales::Locale; +use once_cell::sync::Lazy; +use regex::Regex; +use serde_json::{json, Value}; -use crate::{ - aggregate_methods, - analytics::{Aggregate, AggregateMethod}, - search::{ - SearchQuery, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, - DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, - DEFAULT_SEMANTIC_RATIO, - }, 
+use crate::aggregate_methods; +use crate::analytics::{Aggregate, AggregateMethod}; +use crate::search::{ + SearchQuery, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, + DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, + DEFAULT_SEMANTIC_RATIO, }; aggregate_methods!( diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index bca763a99..a9d8d3053 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ b/crates/meilisearch/src/routes/indexes/settings.rs @@ -1,4 +1,3 @@ -use super::settings_analytics::*; use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; use deserr::actix_web::AwebJson; @@ -11,6 +10,7 @@ use meilisearch_types::settings::{settings, SecretPolicy, Settings, Unchecked}; use meilisearch_types::tasks::KindWithContent; use tracing::debug; +use super::settings_analytics::*; use crate::analytics::Analytics; use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; diff --git a/crates/meilisearch/src/routes/indexes/settings_analytics.rs b/crates/meilisearch/src/routes/indexes/settings_analytics.rs index de01b72e8..32bddcbdd 100644 --- a/crates/meilisearch/src/routes/indexes/settings_analytics.rs +++ b/crates/meilisearch/src/routes/indexes/settings_analytics.rs @@ -3,15 +3,16 @@ //! through the sub-settings route directly without any manipulation. //! This is why we often use a `Option<&Vec<_>>` instead of a `Option<&[_]>`. 
+use std::collections::{BTreeMap, BTreeSet, HashSet}; + +use meilisearch_types::facet_values_sort::FacetValuesSort; use meilisearch_types::locales::{Locale, LocalizedAttributesRuleView}; use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::settings::EmbeddingSettings; use meilisearch_types::settings::{ - FacetingSettings, PaginationSettings, ProximityPrecisionView, TypoSettings, + FacetingSettings, PaginationSettings, ProximityPrecisionView, RankingRuleView, TypoSettings, }; -use meilisearch_types::{facet_values_sort::FacetValuesSort, settings::RankingRuleView}; use serde::Serialize; -use std::collections::{BTreeMap, BTreeSet, HashSet}; use crate::analytics::Aggregate; diff --git a/crates/meilisearch/src/routes/indexes/similar_analytics.rs b/crates/meilisearch/src/routes/indexes/similar_analytics.rs index 69685a56c..726839c3a 100644 --- a/crates/meilisearch/src/routes/indexes/similar_analytics.rs +++ b/crates/meilisearch/src/routes/indexes/similar_analytics.rs @@ -4,11 +4,9 @@ use once_cell::sync::Lazy; use regex::Regex; use serde_json::{json, Value}; -use crate::{ - aggregate_methods, - analytics::{Aggregate, AggregateMethod}, - search::{SimilarQuery, SimilarResult}, -}; +use crate::aggregate_methods; +use crate::analytics::{Aggregate, AggregateMethod}; +use crate::search::{SimilarQuery, SimilarResult}; aggregate_methods!( SimilarPOST => "Similar POST", diff --git a/crates/meilisearch/src/routes/multi_search.rs b/crates/meilisearch/src/routes/multi_search.rs index b7bd31716..f8b1bc6ee 100644 --- a/crates/meilisearch/src/routes/multi_search.rs +++ b/crates/meilisearch/src/routes/multi_search.rs @@ -9,6 +9,7 @@ use meilisearch_types::keys::actions; use serde::Serialize; use tracing::debug; +use super::multi_search_analytics::MultiSearchAggregator; use crate::analytics::Analytics; use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::ActionPolicy; @@ -21,8 +22,6 @@ use crate::search::{ }; use 
crate::search_queue::SearchQueue; -use super::multi_search_analytics::MultiSearchAggregator; - pub fn configure(cfg: &mut web::ServiceConfig) { cfg.service(web::resource("").route(web::post().to(SeqHandler(multi_search_with_post)))); } diff --git a/crates/meilisearch/src/routes/multi_search_analytics.rs b/crates/meilisearch/src/routes/multi_search_analytics.rs index be1218399..3d07f471c 100644 --- a/crates/meilisearch/src/routes/multi_search_analytics.rs +++ b/crates/meilisearch/src/routes/multi_search_analytics.rs @@ -2,10 +2,8 @@ use std::collections::HashSet; use serde_json::json; -use crate::{ - analytics::Aggregate, - search::{FederatedSearch, SearchQueryWithIndex}, -}; +use crate::analytics::Aggregate; +use crate::search::{FederatedSearch, SearchQueryWithIndex}; #[derive(Default)] pub struct MultiSearchAggregator { diff --git a/crates/meilisearch/tests/common/index.rs b/crates/meilisearch/tests/common/index.rs index 784067c2d..221333fd7 100644 --- a/crates/meilisearch/tests/common/index.rs +++ b/crates/meilisearch/tests/common/index.rs @@ -9,8 +9,7 @@ use urlencoding::encode as urlencode; use super::encoder::Encoder; use super::service::Service; -use super::Value; -use super::{Owned, Shared}; +use super::{Owned, Shared, Value}; use crate::json; pub struct Index<'a, State = Owned> { From 99a9fde37f18b0498cdbc7b88a1510f8912d00b9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 4 Nov 2024 17:55:55 +0100 Subject: [PATCH 83/92] push back the removed files --- crates/meilitool/src/upgrade/mod.rs | 73 +++++++ crates/meilitool/src/upgrade/v1_10.rs | 289 ++++++++++++++++++++++++++ crates/meilitool/src/upgrade/v1_11.rs | 86 ++++++++ crates/meilitool/src/upgrade/v1_9.rs | 106 ++++++++++ 4 files changed, 554 insertions(+) create mode 100644 crates/meilitool/src/upgrade/mod.rs create mode 100644 crates/meilitool/src/upgrade/v1_10.rs create mode 100644 crates/meilitool/src/upgrade/v1_11.rs create mode 100644 crates/meilitool/src/upgrade/v1_9.rs diff --git 
a/crates/meilitool/src/upgrade/mod.rs b/crates/meilitool/src/upgrade/mod.rs new file mode 100644 index 000000000..ae095b6bd --- /dev/null +++ b/crates/meilitool/src/upgrade/mod.rs @@ -0,0 +1,73 @@ +mod v1_10; +mod v1_11; +mod v1_9; + +use std::path::{Path, PathBuf}; + +use anyhow::{bail, Context}; +use meilisearch_types::versioning::create_version_file; + +use v1_10::v1_9_to_v1_10; + +use crate::upgrade::v1_11::v1_10_to_v1_11; + +pub struct OfflineUpgrade { + pub db_path: PathBuf, + pub current_version: (String, String, String), + pub target_version: (String, String, String), +} + +impl OfflineUpgrade { + pub fn upgrade(self) -> anyhow::Result<()> { + let upgrade_list = [ + (v1_9_to_v1_10 as fn(&Path) -> Result<(), anyhow::Error>, "1", "10", "0"), + (v1_10_to_v1_11, "1", "11", "0"), + ]; + + let (current_major, current_minor, current_patch) = &self.current_version; + + let start_at = match ( + current_major.as_str(), + current_minor.as_str(), + current_patch.as_str(), + ) { + ("1", "9", _) => 0, + ("1", "10", _) => 1, + _ => { + bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9") + } + }; + + let (target_major, target_minor, target_patch) = &self.target_version; + + let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) { + ("1", "10", _) => 0, + ("1", "11", _) => 1, + (major, _, _) if major.starts_with('v') => { + bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.") + } + _ => { + bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. 
Can only upgrade to v1.11") + } + }; + + println!("Starting the upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}"); + + #[allow(clippy::needless_range_loop)] + for index in start_at..=ends_at { + let (func, major, minor, patch) = upgrade_list[index]; + (func)(&self.db_path)?; + println!("Done"); + // We're writing the version file just in case an issue arise _while_ upgrading. + // We don't want the DB to fail in an unknown state. + println!("Writing VERSION file"); + + create_version_file(&self.db_path, major, minor, patch) + .context("while writing VERSION file after the upgrade")?; + } + + println!("Success"); + + Ok(()) + } +} diff --git a/crates/meilitool/src/upgrade/v1_10.rs b/crates/meilitool/src/upgrade/v1_10.rs new file mode 100644 index 000000000..671f4d6d2 --- /dev/null +++ b/crates/meilitool/src/upgrade/v1_10.rs @@ -0,0 +1,289 @@ +use anyhow::bail; +use std::path::Path; + +use anyhow::Context; +use meilisearch_types::{ + heed::{ + types::{SerdeJson, Str}, + Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified, + }, + milli::index::{db_name, main_key}, +}; + +use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; + +use super::v1_9; + +pub type FieldDistribution = std::collections::BTreeMap; + +/// The statistics that can be computed from an `Index` object. +#[derive(serde::Serialize, serde::Deserialize, Debug)] +pub struct IndexStats { + /// Number of documents in the index. + pub number_of_documents: u64, + /// Size taken up by the index' DB, in bytes. + /// + /// This includes the size taken by both the used and free pages of the DB, and as the free pages + /// are not returned to the disk after a deletion, this number is typically larger than + /// `used_database_size` that only includes the size of the used pages. + pub database_size: u64, + /// Size taken by the used pages of the index' DB, in bytes. 
+ /// + /// As the DB backend does not return to the disk the pages that are not currently used by the DB, + /// this value is typically smaller than `database_size`. + pub used_database_size: u64, + /// Association of every field name with the number of times it occurs in the documents. + pub field_distribution: FieldDistribution, + /// Creation date of the index. + #[serde(with = "time::serde::rfc3339")] + pub created_at: time::OffsetDateTime, + /// Date of the last update of the index. + #[serde(with = "time::serde::rfc3339")] + pub updated_at: time::OffsetDateTime, +} + +impl From for IndexStats { + fn from( + v1_9::IndexStats { + number_of_documents, + database_size, + used_database_size, + field_distribution, + created_at, + updated_at, + }: v1_9::IndexStats, + ) -> Self { + IndexStats { + number_of_documents, + database_size, + used_database_size, + field_distribution, + created_at: created_at.0, + updated_at: updated_at.0, + } + } +} + +#[derive(serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] pub time::OffsetDateTime); + +fn update_index_stats( + index_stats: Database, + index_uid: &str, + index_uuid: uuid::Uuid, + sched_wtxn: &mut RwTxn, +) -> anyhow::Result<()> { + let ctx = || format!("while updating index stats for index `{index_uid}`"); + + let stats: Option<&str> = index_stats + .remap_data_type::() + .get(sched_wtxn, &index_uuid) + .with_context(ctx) + .with_context(|| "While reading value")?; + dbg!(stats); + + let stats: Option = index_stats + .remap_data_type::>() + .get(sched_wtxn, &index_uuid) + .with_context(ctx) + .with_context(|| "While reading value")?; + + if let Some(stats) = stats { + let stats: self::IndexStats = stats.into(); + + index_stats + .remap_data_type::>() + .put(sched_wtxn, &index_uuid, &stats) + .with_context(ctx) + .with_context(|| "While writing value")?; + } + + Ok(()) +} + +fn update_date_format( + index_uid: &str, + index_env: &Env, + 
index_wtxn: &mut RwTxn, +) -> anyhow::Result<()> { + let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN) + .with_context(|| format!("while updating date format for index `{index_uid}`"))?; + + date_round_trip(index_wtxn, index_uid, main, main_key::CREATED_AT_KEY)?; + date_round_trip(index_wtxn, index_uid, main, main_key::UPDATED_AT_KEY)?; + + Ok(()) +} + +fn find_rest_embedders( + index_uid: &str, + index_env: &Env, + index_txn: &RoTxn, +) -> anyhow::Result> { + let main = try_opening_poly_database(index_env, index_txn, db_name::MAIN) + .with_context(|| format!("while checking REST embedders for index `{index_uid}`"))?; + + let mut rest_embedders = vec![]; + + for config in main + .remap_types::>>() + .get(index_txn, main_key::EMBEDDING_CONFIGS)? + .unwrap_or_default() + { + if let v1_9::EmbedderOptions::Rest(_) = config.config.embedder_options { + rest_embedders.push(config.name); + } + } + + Ok(rest_embedders) +} + +fn date_round_trip( + wtxn: &mut RwTxn, + index_uid: &str, + db: Database, + key: &str, +) -> anyhow::Result<()> { + let datetime = + db.remap_types::>().get(wtxn, key).with_context(|| { + format!("could not read `{key}` while updating date format for index `{index_uid}`") + })?; + + if let Some(datetime) = datetime { + db.remap_types::>() + .put(wtxn, key, &self::OffsetDateTime(datetime.0)) + .with_context(|| { + format!( + "could not write `{key}` while updating date format for index `{index_uid}`" + ) + })?; + } + + Ok(()) +} + +pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> { + println!("Upgrading from v1.9.0 to v1.10.0"); + // 2 changes here + + // 1. date format. needs to be done before opening the Index + // 2. REST embedders. 
We don't support this case right now, so bail + + let index_scheduler_path = db_path.join("tasks"); + let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + + let mut sched_wtxn = env.write_txn()?; + + let index_mapping: Database = + try_opening_database(&env, &sched_wtxn, "index-mapping")?; + + let index_stats: Database = + try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| { + format!("While trying to open {:?}", index_scheduler_path.display()) + })?; + + let index_count = + index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?; + + // FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn + // 1. immutably for the iteration + // 2. mutably for updating index stats + let indexes: Vec<_> = index_mapping + .iter(&sched_wtxn)? + .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid))) + .collect(); + + let mut rest_embedders = Vec::new(); + + let mut unwrapped_indexes = Vec::new(); + + // check that update can take place + for (index_index, result) in indexes.into_iter().enumerate() { + let (uid, uuid) = result?; + let index_path = db_path.join("indexes").join(uuid.to_string()); + + println!( + "[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`", + index_index + 1, + index_path.display() + ); + + let index_env = unsafe { + // FIXME: fetch the 25 magic number from the index file + EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { + format!("while opening index {uid} at '{}'", index_path.display()) + })? 
+ }; + + let index_txn = index_env.read_txn().with_context(|| { + format!( + "while obtaining a write transaction for index {uid} at {}", + index_path.display() + ) + })?; + + println!("\t- Checking for incompatible embedders (REST embedders)"); + let rest_embedders_for_index = find_rest_embedders(&uid, &index_env, &index_txn)?; + + if rest_embedders_for_index.is_empty() { + unwrapped_indexes.push((uid, uuid)); + } else { + // no need to add to unwrapped indexes because we'll exit early + rest_embedders.push((uid, rest_embedders_for_index)); + } + } + + if !rest_embedders.is_empty() { + let rest_embedders = rest_embedders + .into_iter() + .flat_map(|(index, embedders)| std::iter::repeat(index.clone()).zip(embedders)) + .map(|(index, embedder)| format!("\t- embedder `{embedder}` in index `{index}`")) + .collect::>() + .join("\n"); + bail!("The update cannot take place because there are REST embedder(s). Remove them before proceeding with the update:\n{rest_embedders}\n\n\ + The database has not been modified and is still a valid v1.9 database."); + } + + println!("Update can take place, updating"); + + for (index_index, (uid, uuid)) in unwrapped_indexes.into_iter().enumerate() { + let index_path = db_path.join("indexes").join(uuid.to_string()); + + println!( + "[{}/{index_count}]Updating index `{uid}` at `{}`", + index_index + 1, + index_path.display() + ); + + let index_env = unsafe { + // FIXME: fetch the 25 magic number from the index file + EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { + format!("while opening index {uid} at '{}'", index_path.display()) + })? 
+ }; + + let mut index_wtxn = index_env.write_txn().with_context(|| { + format!( + "while obtaining a write transaction for index `{uid}` at `{}`", + index_path.display() + ) + })?; + + println!("\t- Updating index stats"); + update_index_stats(index_stats, &uid, uuid, &mut sched_wtxn)?; + println!("\t- Updating date format"); + update_date_format(&uid, &index_env, &mut index_wtxn)?; + + index_wtxn.commit().with_context(|| { + format!("while committing the write txn for index `{uid}` at {}", index_path.display()) + })?; + } + + sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?; + + println!("Upgrading database succeeded"); + + Ok(()) +} diff --git a/crates/meilitool/src/upgrade/v1_11.rs b/crates/meilitool/src/upgrade/v1_11.rs new file mode 100644 index 000000000..4105879fd --- /dev/null +++ b/crates/meilitool/src/upgrade/v1_11.rs @@ -0,0 +1,86 @@ +//! The breaking changes that happened between the v1.10 and the v1.11 are: +//! - Arroy went from the v0.4.0 to the v0.5.0, see this release note to get the whole context: https://github.com/meilisearch/arroy/releases/tag/v0.5.0 +//! - The `angular` distance has been renamed to `cosine` => We only need to update the string in the metadata. +//! - Reorganize the `NodeId` to make the appending of vectors work => We'll have to update the keys of almost all items in the DB. +//! - Store the list of updated IDs directly in LMDB instead of a roaring bitmap => This shouldn't be an issue since we are never supposed to commit this roaring bitmap, but it's not forbidden by arroy so ensuring it works is probably better than anything. 
+ +use std::path::Path; + +use anyhow::Context; +use meilisearch_types::{ + heed::{types::Str, Database, EnvOpenOptions}, + milli::index::db_name, +}; + +use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; + +pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> { + println!("Upgrading from v1.10.0 to v1.11.0"); + + let index_scheduler_path = db_path.join("tasks"); + let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } + .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; + + let sched_rtxn = env.read_txn()?; + + let index_mapping: Database = + try_opening_database(&env, &sched_rtxn, "index-mapping")?; + + let index_count = + index_mapping.len(&sched_rtxn).context("while reading the number of indexes")?; + + let indexes: Vec<_> = index_mapping + .iter(&sched_rtxn)? + .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid))) + .collect(); + + // check that update can take place + for (index_index, result) in indexes.into_iter().enumerate() { + let (uid, uuid) = result?; + let index_path = db_path.join("indexes").join(uuid.to_string()); + + println!( + "[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`", + index_index + 1, + index_path.display() + ); + + let index_env = unsafe { + EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { + format!("while opening index {uid} at '{}'", index_path.display()) + })? 
+ }; + + let index_rtxn = index_env.read_txn().with_context(|| { + format!( + "while obtaining a read transaction for index {uid} at {}", + index_path.display() + ) + })?; + let index_read_database = + try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY) + .with_context(|| format!("while updating date format for index `{uid}`"))?; + + let mut index_wtxn = index_env.write_txn().with_context(|| { + format!( + "while obtaining a write transaction for index {uid} at {}", + index_path.display() + ) + })?; + + let index_write_database = + try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY) + .with_context(|| format!("while updating date format for index `{uid}`"))?; + + arroy_v04_to_v05::ugrade_from_prev_version( + &index_rtxn, + index_read_database, + &mut index_wtxn, + index_write_database, + )?; + + index_wtxn.commit()?; + } + + Ok(()) +} diff --git a/crates/meilitool/src/upgrade/v1_9.rs b/crates/meilitool/src/upgrade/v1_9.rs new file mode 100644 index 000000000..3e6cfde6c --- /dev/null +++ b/crates/meilitool/src/upgrade/v1_9.rs @@ -0,0 +1,106 @@ +use serde::{Deserialize, Serialize}; +use time::OffsetDateTime; + +pub type FieldDistribution = std::collections::BTreeMap; + +/// The statistics that can be computed from an `Index` object. +#[derive(serde::Serialize, serde::Deserialize, Debug)] +pub struct IndexStats { + /// Number of documents in the index. + pub number_of_documents: u64, + /// Size taken up by the index' DB, in bytes. + /// + /// This includes the size taken by both the used and free pages of the DB, and as the free pages + /// are not returned to the disk after a deletion, this number is typically larger than + /// `used_database_size` that only includes the size of the used pages. + pub database_size: u64, + /// Size taken by the used pages of the index' DB, in bytes. 
+ /// + /// As the DB backend does not return to the disk the pages that are not currently used by the DB, + /// this value is typically smaller than `database_size`. + pub used_database_size: u64, + /// Association of every field name with the number of times it occurs in the documents. + pub field_distribution: FieldDistribution, + /// Creation date of the index. + pub created_at: LegacyTime, + /// Date of the last update of the index. + pub updated_at: LegacyTime, +} + +#[derive(Debug, Deserialize, Serialize)] +pub struct IndexEmbeddingConfig { + pub name: String, + pub config: EmbeddingConfig, +} + +#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)] +pub struct EmbeddingConfig { + /// Options of the embedder, specific to each kind of embedder + pub embedder_options: EmbedderOptions, +} + +/// Options of an embedder, specific to each kind of embedder. +#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +pub enum EmbedderOptions { + HuggingFace(hf::EmbedderOptions), + OpenAi(openai::EmbedderOptions), + Ollama(ollama::EmbedderOptions), + UserProvided(manual::EmbedderOptions), + Rest(rest::EmbedderOptions), +} + +impl Default for EmbedderOptions { + fn default() -> Self { + Self::OpenAi(openai::EmbedderOptions { api_key: None, dimensions: None }) + } +} + +mod hf { + #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] + pub struct EmbedderOptions { + pub model: String, + pub revision: Option, + } +} +mod openai { + + #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] + pub struct EmbedderOptions { + pub api_key: Option, + pub dimensions: Option, + } +} +mod ollama { + #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] + pub struct EmbedderOptions { + pub embedding_model: String, + pub url: Option, + pub api_key: Option, + } +} +mod manual { + #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, 
serde::Serialize)] + pub struct EmbedderOptions { + pub dimensions: usize, + } +} +mod rest { + #[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)] + pub struct EmbedderOptions { + pub api_key: Option, + pub dimensions: Option, + pub url: String, + pub input_field: Vec, + // path to the array of embeddings + pub path_to_embeddings: Vec, + // shape of a single embedding + pub embedding_object: Vec, + } +} + +// 2024-11-04 13:32:08.48368 +00:00:00 +time::serde::format_description!(legacy_datetime, OffsetDateTime, "[year]-[month]-[day] [hour]:[minute]:[second].[subsecond] [offset_hour sign:mandatory]:[offset_minute]:[offset_second]"); + +#[derive(Debug, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct LegacyTime(#[serde(with = "legacy_datetime")] pub OffsetDateTime); From a1f228f662f5fd76b15fab8acabcbf3b7f40080e Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 4 Nov 2024 18:19:36 +0100 Subject: [PATCH 84/92] remove the uneeded files after the rebase --- meilitool/src/upgrade/mod.rs | 73 --------- meilitool/src/upgrade/v1_10.rs | 289 --------------------------------- meilitool/src/upgrade/v1_11.rs | 86 ---------- meilitool/src/upgrade/v1_9.rs | 106 ------------ 4 files changed, 554 deletions(-) delete mode 100644 meilitool/src/upgrade/mod.rs delete mode 100644 meilitool/src/upgrade/v1_10.rs delete mode 100644 meilitool/src/upgrade/v1_11.rs delete mode 100644 meilitool/src/upgrade/v1_9.rs diff --git a/meilitool/src/upgrade/mod.rs b/meilitool/src/upgrade/mod.rs deleted file mode 100644 index ae095b6bd..000000000 --- a/meilitool/src/upgrade/mod.rs +++ /dev/null @@ -1,73 +0,0 @@ -mod v1_10; -mod v1_11; -mod v1_9; - -use std::path::{Path, PathBuf}; - -use anyhow::{bail, Context}; -use meilisearch_types::versioning::create_version_file; - -use v1_10::v1_9_to_v1_10; - -use crate::upgrade::v1_11::v1_10_to_v1_11; - -pub struct OfflineUpgrade { - pub db_path: PathBuf, - pub current_version: (String, String, String), - pub 
target_version: (String, String, String), -} - -impl OfflineUpgrade { - pub fn upgrade(self) -> anyhow::Result<()> { - let upgrade_list = [ - (v1_9_to_v1_10 as fn(&Path) -> Result<(), anyhow::Error>, "1", "10", "0"), - (v1_10_to_v1_11, "1", "11", "0"), - ]; - - let (current_major, current_minor, current_patch) = &self.current_version; - - let start_at = match ( - current_major.as_str(), - current_minor.as_str(), - current_patch.as_str(), - ) { - ("1", "9", _) => 0, - ("1", "10", _) => 1, - _ => { - bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9") - } - }; - - let (target_major, target_minor, target_patch) = &self.target_version; - - let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) { - ("1", "10", _) => 0, - ("1", "11", _) => 1, - (major, _, _) if major.starts_with('v') => { - bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.") - } - _ => { - bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.11") - } - }; - - println!("Starting the upgrade from {current_major}.{current_minor}.{current_patch} to {target_major}.{target_minor}.{target_patch}"); - - #[allow(clippy::needless_range_loop)] - for index in start_at..=ends_at { - let (func, major, minor, patch) = upgrade_list[index]; - (func)(&self.db_path)?; - println!("Done"); - // We're writing the version file just in case an issue arise _while_ upgrading. - // We don't want the DB to fail in an unknown state. 
- println!("Writing VERSION file"); - - create_version_file(&self.db_path, major, minor, patch) - .context("while writing VERSION file after the upgrade")?; - } - - println!("Success"); - - Ok(()) - } -} diff --git a/meilitool/src/upgrade/v1_10.rs b/meilitool/src/upgrade/v1_10.rs deleted file mode 100644 index 671f4d6d2..000000000 --- a/meilitool/src/upgrade/v1_10.rs +++ /dev/null @@ -1,289 +0,0 @@ -use anyhow::bail; -use std::path::Path; - -use anyhow::Context; -use meilisearch_types::{ - heed::{ - types::{SerdeJson, Str}, - Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified, - }, - milli::index::{db_name, main_key}, -}; - -use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; - -use super::v1_9; - -pub type FieldDistribution = std::collections::BTreeMap; - -/// The statistics that can be computed from an `Index` object. -#[derive(serde::Serialize, serde::Deserialize, Debug)] -pub struct IndexStats { - /// Number of documents in the index. - pub number_of_documents: u64, - /// Size taken up by the index' DB, in bytes. - /// - /// This includes the size taken by both the used and free pages of the DB, and as the free pages - /// are not returned to the disk after a deletion, this number is typically larger than - /// `used_database_size` that only includes the size of the used pages. - pub database_size: u64, - /// Size taken by the used pages of the index' DB, in bytes. - /// - /// As the DB backend does not return to the disk the pages that are not currently used by the DB, - /// this value is typically smaller than `database_size`. - pub used_database_size: u64, - /// Association of every field name with the number of times it occurs in the documents. - pub field_distribution: FieldDistribution, - /// Creation date of the index. - #[serde(with = "time::serde::rfc3339")] - pub created_at: time::OffsetDateTime, - /// Date of the last update of the index. 
- #[serde(with = "time::serde::rfc3339")] - pub updated_at: time::OffsetDateTime, -} - -impl From for IndexStats { - fn from( - v1_9::IndexStats { - number_of_documents, - database_size, - used_database_size, - field_distribution, - created_at, - updated_at, - }: v1_9::IndexStats, - ) -> Self { - IndexStats { - number_of_documents, - database_size, - used_database_size, - field_distribution, - created_at: created_at.0, - updated_at: updated_at.0, - } - } -} - -#[derive(serde::Serialize, serde::Deserialize)] -#[serde(transparent)] -pub struct OffsetDateTime(#[serde(with = "time::serde::rfc3339")] pub time::OffsetDateTime); - -fn update_index_stats( - index_stats: Database, - index_uid: &str, - index_uuid: uuid::Uuid, - sched_wtxn: &mut RwTxn, -) -> anyhow::Result<()> { - let ctx = || format!("while updating index stats for index `{index_uid}`"); - - let stats: Option<&str> = index_stats - .remap_data_type::() - .get(sched_wtxn, &index_uuid) - .with_context(ctx) - .with_context(|| "While reading value")?; - dbg!(stats); - - let stats: Option = index_stats - .remap_data_type::>() - .get(sched_wtxn, &index_uuid) - .with_context(ctx) - .with_context(|| "While reading value")?; - - if let Some(stats) = stats { - let stats: self::IndexStats = stats.into(); - - index_stats - .remap_data_type::>() - .put(sched_wtxn, &index_uuid, &stats) - .with_context(ctx) - .with_context(|| "While writing value")?; - } - - Ok(()) -} - -fn update_date_format( - index_uid: &str, - index_env: &Env, - index_wtxn: &mut RwTxn, -) -> anyhow::Result<()> { - let main = try_opening_poly_database(index_env, index_wtxn, db_name::MAIN) - .with_context(|| format!("while updating date format for index `{index_uid}`"))?; - - date_round_trip(index_wtxn, index_uid, main, main_key::CREATED_AT_KEY)?; - date_round_trip(index_wtxn, index_uid, main, main_key::UPDATED_AT_KEY)?; - - Ok(()) -} - -fn find_rest_embedders( - index_uid: &str, - index_env: &Env, - index_txn: &RoTxn, -) -> anyhow::Result> { - let main = 
try_opening_poly_database(index_env, index_txn, db_name::MAIN) - .with_context(|| format!("while checking REST embedders for index `{index_uid}`"))?; - - let mut rest_embedders = vec![]; - - for config in main - .remap_types::>>() - .get(index_txn, main_key::EMBEDDING_CONFIGS)? - .unwrap_or_default() - { - if let v1_9::EmbedderOptions::Rest(_) = config.config.embedder_options { - rest_embedders.push(config.name); - } - } - - Ok(rest_embedders) -} - -fn date_round_trip( - wtxn: &mut RwTxn, - index_uid: &str, - db: Database, - key: &str, -) -> anyhow::Result<()> { - let datetime = - db.remap_types::>().get(wtxn, key).with_context(|| { - format!("could not read `{key}` while updating date format for index `{index_uid}`") - })?; - - if let Some(datetime) = datetime { - db.remap_types::>() - .put(wtxn, key, &self::OffsetDateTime(datetime.0)) - .with_context(|| { - format!( - "could not write `{key}` while updating date format for index `{index_uid}`" - ) - })?; - } - - Ok(()) -} - -pub fn v1_9_to_v1_10(db_path: &Path) -> anyhow::Result<()> { - println!("Upgrading from v1.9.0 to v1.10.0"); - // 2 changes here - - // 1. date format. needs to be done before opening the Index - // 2. REST embedders. 
We don't support this case right now, so bail - - let index_scheduler_path = db_path.join("tasks"); - let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } - .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; - - let mut sched_wtxn = env.write_txn()?; - - let index_mapping: Database = - try_opening_database(&env, &sched_wtxn, "index-mapping")?; - - let index_stats: Database = - try_opening_database(&env, &sched_wtxn, "index-stats").with_context(|| { - format!("While trying to open {:?}", index_scheduler_path.display()) - })?; - - let index_count = - index_mapping.len(&sched_wtxn).context("while reading the number of indexes")?; - - // FIXME: not ideal, we have to pre-populate all indexes to prevent double borrow of sched_wtxn - // 1. immutably for the iteration - // 2. mutably for updating index stats - let indexes: Vec<_> = index_mapping - .iter(&sched_wtxn)? - .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid))) - .collect(); - - let mut rest_embedders = Vec::new(); - - let mut unwrapped_indexes = Vec::new(); - - // check that update can take place - for (index_index, result) in indexes.into_iter().enumerate() { - let (uid, uuid) = result?; - let index_path = db_path.join("indexes").join(uuid.to_string()); - - println!( - "[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`", - index_index + 1, - index_path.display() - ); - - let index_env = unsafe { - // FIXME: fetch the 25 magic number from the index file - EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { - format!("while opening index {uid} at '{}'", index_path.display()) - })? 
- }; - - let index_txn = index_env.read_txn().with_context(|| { - format!( - "while obtaining a write transaction for index {uid} at {}", - index_path.display() - ) - })?; - - println!("\t- Checking for incompatible embedders (REST embedders)"); - let rest_embedders_for_index = find_rest_embedders(&uid, &index_env, &index_txn)?; - - if rest_embedders_for_index.is_empty() { - unwrapped_indexes.push((uid, uuid)); - } else { - // no need to add to unwrapped indexes because we'll exit early - rest_embedders.push((uid, rest_embedders_for_index)); - } - } - - if !rest_embedders.is_empty() { - let rest_embedders = rest_embedders - .into_iter() - .flat_map(|(index, embedders)| std::iter::repeat(index.clone()).zip(embedders)) - .map(|(index, embedder)| format!("\t- embedder `{embedder}` in index `{index}`")) - .collect::>() - .join("\n"); - bail!("The update cannot take place because there are REST embedder(s). Remove them before proceeding with the update:\n{rest_embedders}\n\n\ - The database has not been modified and is still a valid v1.9 database."); - } - - println!("Update can take place, updating"); - - for (index_index, (uid, uuid)) in unwrapped_indexes.into_iter().enumerate() { - let index_path = db_path.join("indexes").join(uuid.to_string()); - - println!( - "[{}/{index_count}]Updating index `{uid}` at `{}`", - index_index + 1, - index_path.display() - ); - - let index_env = unsafe { - // FIXME: fetch the 25 magic number from the index file - EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { - format!("while opening index {uid} at '{}'", index_path.display()) - })? 
- }; - - let mut index_wtxn = index_env.write_txn().with_context(|| { - format!( - "while obtaining a write transaction for index `{uid}` at `{}`", - index_path.display() - ) - })?; - - println!("\t- Updating index stats"); - update_index_stats(index_stats, &uid, uuid, &mut sched_wtxn)?; - println!("\t- Updating date format"); - update_date_format(&uid, &index_env, &mut index_wtxn)?; - - index_wtxn.commit().with_context(|| { - format!("while committing the write txn for index `{uid}` at {}", index_path.display()) - })?; - } - - sched_wtxn.commit().context("while committing the write txn for the index-scheduler")?; - - println!("Upgrading database succeeded"); - - Ok(()) -} diff --git a/meilitool/src/upgrade/v1_11.rs b/meilitool/src/upgrade/v1_11.rs deleted file mode 100644 index 4105879fd..000000000 --- a/meilitool/src/upgrade/v1_11.rs +++ /dev/null @@ -1,86 +0,0 @@ -//! The breaking changes that happened between the v1.10 and the v1.11 are: -//! - Arroy went from the v0.4.0 to the v0.5.0, see this release note to get the whole context: https://github.com/meilisearch/arroy/releases/tag/v0.5.0 -//! - The `angular` distance has been renamed to `cosine` => We only need to update the string in the metadata. -//! - Reorganize the `NodeId` to make the appending of vectors work => We'll have to update the keys of almost all items in the DB. -//! - Store the list of updated IDs directly in LMDB instead of a roaring bitmap => This shouldn't be an issue since we are never supposed to commit this roaring bitmap, but it's not forbidden by arroy so ensuring it works is probably better than anything. 
- -use std::path::Path; - -use anyhow::Context; -use meilisearch_types::{ - heed::{types::Str, Database, EnvOpenOptions}, - milli::index::db_name, -}; - -use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec}; - -pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> { - println!("Upgrading from v1.10.0 to v1.11.0"); - - let index_scheduler_path = db_path.join("tasks"); - let env = unsafe { EnvOpenOptions::new().max_dbs(100).open(&index_scheduler_path) } - .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?; - - let sched_rtxn = env.read_txn()?; - - let index_mapping: Database = - try_opening_database(&env, &sched_rtxn, "index-mapping")?; - - let index_count = - index_mapping.len(&sched_rtxn).context("while reading the number of indexes")?; - - let indexes: Vec<_> = index_mapping - .iter(&sched_rtxn)? - .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid))) - .collect(); - - // check that update can take place - for (index_index, result) in indexes.into_iter().enumerate() { - let (uid, uuid) = result?; - let index_path = db_path.join("indexes").join(uuid.to_string()); - - println!( - "[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`", - index_index + 1, - index_path.display() - ); - - let index_env = unsafe { - EnvOpenOptions::new().max_dbs(25).open(&index_path).with_context(|| { - format!("while opening index {uid} at '{}'", index_path.display()) - })? 
- }; - - let index_rtxn = index_env.read_txn().with_context(|| { - format!( - "while obtaining a read transaction for index {uid} at {}", - index_path.display() - ) - })?; - let index_read_database = - try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY) - .with_context(|| format!("while updating date format for index `{uid}`"))?; - - let mut index_wtxn = index_env.write_txn().with_context(|| { - format!( - "while obtaining a write transaction for index {uid} at {}", - index_path.display() - ) - })?; - - let index_write_database = - try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY) - .with_context(|| format!("while updating date format for index `{uid}`"))?; - - arroy_v04_to_v05::ugrade_from_prev_version( - &index_rtxn, - index_read_database, - &mut index_wtxn, - index_write_database, - )?; - - index_wtxn.commit()?; - } - - Ok(()) -} diff --git a/meilitool/src/upgrade/v1_9.rs b/meilitool/src/upgrade/v1_9.rs deleted file mode 100644 index 3e6cfde6c..000000000 --- a/meilitool/src/upgrade/v1_9.rs +++ /dev/null @@ -1,106 +0,0 @@ -use serde::{Deserialize, Serialize}; -use time::OffsetDateTime; - -pub type FieldDistribution = std::collections::BTreeMap; - -/// The statistics that can be computed from an `Index` object. -#[derive(serde::Serialize, serde::Deserialize, Debug)] -pub struct IndexStats { - /// Number of documents in the index. - pub number_of_documents: u64, - /// Size taken up by the index' DB, in bytes. - /// - /// This includes the size taken by both the used and free pages of the DB, and as the free pages - /// are not returned to the disk after a deletion, this number is typically larger than - /// `used_database_size` that only includes the size of the used pages. - pub database_size: u64, - /// Size taken by the used pages of the index' DB, in bytes. 
- /// - /// As the DB backend does not return to the disk the pages that are not currently used by the DB, - /// this value is typically smaller than `database_size`. - pub used_database_size: u64, - /// Association of every field name with the number of times it occurs in the documents. - pub field_distribution: FieldDistribution, - /// Creation date of the index. - pub created_at: LegacyTime, - /// Date of the last update of the index. - pub updated_at: LegacyTime, -} - -#[derive(Debug, Deserialize, Serialize)] -pub struct IndexEmbeddingConfig { - pub name: String, - pub config: EmbeddingConfig, -} - -#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)] -pub struct EmbeddingConfig { - /// Options of the embedder, specific to each kind of embedder - pub embedder_options: EmbedderOptions, -} - -/// Options of an embedder, specific to each kind of embedder. -#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] -pub enum EmbedderOptions { - HuggingFace(hf::EmbedderOptions), - OpenAi(openai::EmbedderOptions), - Ollama(ollama::EmbedderOptions), - UserProvided(manual::EmbedderOptions), - Rest(rest::EmbedderOptions), -} - -impl Default for EmbedderOptions { - fn default() -> Self { - Self::OpenAi(openai::EmbedderOptions { api_key: None, dimensions: None }) - } -} - -mod hf { - #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] - pub struct EmbedderOptions { - pub model: String, - pub revision: Option, - } -} -mod openai { - - #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] - pub struct EmbedderOptions { - pub api_key: Option, - pub dimensions: Option, - } -} -mod ollama { - #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] - pub struct EmbedderOptions { - pub embedding_model: String, - pub url: Option, - pub api_key: Option, - } -} -mod manual { - #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, 
serde::Serialize)] - pub struct EmbedderOptions { - pub dimensions: usize, - } -} -mod rest { - #[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize, Hash)] - pub struct EmbedderOptions { - pub api_key: Option, - pub dimensions: Option, - pub url: String, - pub input_field: Vec, - // path to the array of embeddings - pub path_to_embeddings: Vec, - // shape of a single embedding - pub embedding_object: Vec, - } -} - -// 2024-11-04 13:32:08.48368 +00:00:00 -time::serde::format_description!(legacy_datetime, OffsetDateTime, "[year]-[month]-[day] [hour]:[minute]:[second].[subsecond] [offset_hour sign:mandatory]:[offset_minute]:[offset_second]"); - -#[derive(Debug, serde::Serialize, serde::Deserialize)] -#[serde(transparent)] -pub struct LegacyTime(#[serde(with = "legacy_datetime")] pub OffsetDateTime); From 48ab898ca2d8cd125458aac1ea500ecf324b7bc8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Nov 2024 10:30:53 +0100 Subject: [PATCH 85/92] fix the datetime of v1.9 --- crates/meilitool/Cargo.toml | 2 +- crates/meilitool/src/upgrade/v1_10.rs | 6 +-- crates/meilitool/src/upgrade/v1_9.rs | 70 +++++++++++++++++++++++---- 3 files changed, 65 insertions(+), 13 deletions(-) diff --git a/crates/meilitool/Cargo.toml b/crates/meilitool/Cargo.toml index f2c8920c9..353d44e9a 100644 --- a/crates/meilitool/Cargo.toml +++ b/crates/meilitool/Cargo.toml @@ -16,6 +16,6 @@ file-store = { path = "../file-store" } meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } serde = { version = "1.0.209", features = ["derive"] } -time = { version = "0.3.36", features = ["formatting"] } +time = { version = "0.3.36", features = ["formatting", "parsing", "alloc"] } uuid = { version = "1.10.0", features = ["v4"], default-features = false } arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", rev = "053807bf38dc079f25b003f19fc30fbf3613f6e7" } diff --git a/crates/meilitool/src/upgrade/v1_10.rs 
b/crates/meilitool/src/upgrade/v1_10.rs index 671f4d6d2..3dd7c72a2 100644 --- a/crates/meilitool/src/upgrade/v1_10.rs +++ b/crates/meilitool/src/upgrade/v1_10.rs @@ -146,9 +146,9 @@ fn date_round_trip( key: &str, ) -> anyhow::Result<()> { let datetime = - db.remap_types::>().get(wtxn, key).with_context(|| { - format!("could not read `{key}` while updating date format for index `{index_uid}`") - })?; + db.remap_types::>().get(wtxn, key).with_context( + || format!("could not read `{key}` while updating date format for index `{index_uid}`"), + )?; if let Some(datetime) = datetime { db.remap_types::>() diff --git a/crates/meilitool/src/upgrade/v1_9.rs b/crates/meilitool/src/upgrade/v1_9.rs index 3e6cfde6c..96cbfe68c 100644 --- a/crates/meilitool/src/upgrade/v1_9.rs +++ b/crates/meilitool/src/upgrade/v1_9.rs @@ -1,10 +1,10 @@ use serde::{Deserialize, Serialize}; -use time::OffsetDateTime; +use time::{Date, OffsetDateTime, Time, UtcOffset}; pub type FieldDistribution = std::collections::BTreeMap; /// The statistics that can be computed from an `Index` object. -#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[derive(serde::Deserialize, Debug)] pub struct IndexStats { /// Number of documents in the index. pub number_of_documents: u64, @@ -22,9 +22,9 @@ pub struct IndexStats { /// Association of every field name with the number of times it occurs in the documents. pub field_distribution: FieldDistribution, /// Creation date of the index. - pub created_at: LegacyTime, + pub created_at: LegacyDateTime, /// Date of the last update of the index. 
- pub updated_at: LegacyTime, + pub updated_at: LegacyDateTime, } #[derive(Debug, Deserialize, Serialize)] @@ -98,9 +98,61 @@ mod rest { } } -// 2024-11-04 13:32:08.48368 +00:00:00 -time::serde::format_description!(legacy_datetime, OffsetDateTime, "[year]-[month]-[day] [hour]:[minute]:[second].[subsecond] [offset_hour sign:mandatory]:[offset_minute]:[offset_second]"); +/// A datetime from Meilisearch v1.9 with an unspecified format. +#[derive(Debug)] +pub struct LegacyDateTime(pub OffsetDateTime); -#[derive(Debug, serde::Serialize, serde::Deserialize)] -#[serde(transparent)] -pub struct LegacyTime(#[serde(with = "legacy_datetime")] pub OffsetDateTime); +impl<'de> Deserialize<'de> for LegacyDateTime { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct Visitor; + impl<'de> serde::de::Visitor<'de> for Visitor { + type Value = OffsetDateTime; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(formatter, "a valid datetime") + } + + // Comes from a binary. The legacy format is: + // 2024-11-04 13:32:08.48368 +00:00:00 + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + let format = time::macros::format_description!("[year]-[month]-[day] [hour]:[minute]:[second].[subsecond] [offset_hour sign:mandatory]:[offset_minute]:[offset_second]"); + OffsetDateTime::parse(v, format).map_err(E::custom) + } + + // Comes from the docker image, the legacy format is: + // [2024, 309, 17, 15, 1, 698184971, 0,0,0] + // year, day in year, hour, minute, sec, subsec , offset stuff + fn visit_seq(self, mut seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let mut vec = Vec::new(); + // We must deserialize the value as `i64` because the largest values are `u32` and `i32` + while let Some(el) = seq.next_element::()? 
{ + vec.push(el); + } + if vec.len() != 9 { + return Err(serde::de::Error::custom(format!( + "Invalid datetime, received an array of {} elements instead of 9", + vec.len() + ))); + } + Ok(OffsetDateTime::new_in_offset( + Date::from_ordinal_date(vec[0] as i32, vec[1] as u16) + .map_err(serde::de::Error::custom)?, + Time::from_hms_nano(vec[2] as u8, vec[3] as u8, vec[4] as u8, vec[5] as u32) + .map_err(serde::de::Error::custom)?, + UtcOffset::from_hms(vec[6] as i8, vec[7] as i8, vec[8] as i8) + .map_err(serde::de::Error::custom)?, + )) + } + } + deserializer.deserialize_any(Visitor).map(LegacyDateTime) + } +} From 9799812b27b0fee47b969a1e3bdba771f29b93bc Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Nov 2024 15:08:01 +0100 Subject: [PATCH 86/92] fix the benchmarks --- .github/workflows/benchmarks-manual.yml | 2 +- .github/workflows/benchmarks-pr.yml | 2 +- .github/workflows/benchmarks-push-indexing.yml | 2 +- .github/workflows/benchmarks-push-search-songs.yml | 2 +- .github/workflows/benchmarks-push-search-wiki.yml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmarks-manual.yml b/.github/workflows/benchmarks-manual.yml index da33bf803..14b77c83d 100644 --- a/.github/workflows/benchmarks-manual.yml +++ b/.github/workflows/benchmarks-manual.yml @@ -43,7 +43,7 @@ jobs: # Run benchmarks - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} run: | - cd benchmarks + cd crates/benchmarks cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} # Generate critcmp files diff --git a/.github/workflows/benchmarks-pr.yml b/.github/workflows/benchmarks-pr.yml index f9d609d6e..a083baa3c 100644 --- a/.github/workflows/benchmarks-pr.yml +++ b/.github/workflows/benchmarks-pr.yml @@ -88,7 +88,7 @@ jobs: # Run benchmarks - name: Run benchmarks - Dataset ${{ steps.command.outputs.command-arguments }} - Branch ${{ 
steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} run: | - cd benchmarks + cd crates/benchmarks cargo bench --bench ${{ steps.command.outputs.command-arguments }} -- --save-baseline ${{ steps.file.outputs.basename }} # Generate critcmp files diff --git a/.github/workflows/benchmarks-push-indexing.yml b/.github/workflows/benchmarks-push-indexing.yml index 1fdd5fd67..4495b4b9d 100644 --- a/.github/workflows/benchmarks-push-indexing.yml +++ b/.github/workflows/benchmarks-push-indexing.yml @@ -41,7 +41,7 @@ jobs: # Run benchmarks - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} run: | - cd benchmarks + cd crates/benchmarks cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} # Generate critcmp files diff --git a/.github/workflows/benchmarks-push-search-songs.yml b/.github/workflows/benchmarks-push-search-songs.yml index b6169ddf7..e9744a434 100644 --- a/.github/workflows/benchmarks-push-search-songs.yml +++ b/.github/workflows/benchmarks-push-search-songs.yml @@ -40,7 +40,7 @@ jobs: # Run benchmarks - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} run: | - cd benchmarks + cd crates/benchmarks cargo bench --bench ${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} # Generate critcmp files diff --git a/.github/workflows/benchmarks-push-search-wiki.yml b/.github/workflows/benchmarks-push-search-wiki.yml index dd3146a14..bc9e1bcd0 100644 --- a/.github/workflows/benchmarks-push-search-wiki.yml +++ b/.github/workflows/benchmarks-push-search-wiki.yml @@ -40,7 +40,7 @@ jobs: # Run benchmarks - name: Run benchmarks - Dataset ${BENCH_NAME} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }} run: | - cd benchmarks + cd crates/benchmarks cargo bench --bench 
${BENCH_NAME} -- --save-baseline ${{ steps.file.outputs.basename }} # Generate critcmp files From f193c3a67c5d0a39d94e8437ef683aaa27b0e377 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Nov 2024 15:13:32 +0100 Subject: [PATCH 87/92] Update crates/meilitool/src/main.rs Co-authored-by: Louis Dureuil --- crates/meilitool/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index ef137f746..978824356 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -73,7 +73,7 @@ enum Command { /// /// Supported upgrade paths: /// - /// - v1.9.0 -> v1.10.0 -> v1.11.0 + /// - v1.9.x -> v1.10.x -> v1.11.x OfflineUpgrade { #[arg(long)] target_version: String, From 66b7e0824efd310b335be45b12f461695f99e1b4 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Nov 2024 15:13:40 +0100 Subject: [PATCH 88/92] Update crates/meilitool/src/upgrade/mod.rs Co-authored-by: Louis Dureuil --- crates/meilitool/src/upgrade/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilitool/src/upgrade/mod.rs b/crates/meilitool/src/upgrade/mod.rs index ae095b6bd..0fd903ffe 100644 --- a/crates/meilitool/src/upgrade/mod.rs +++ b/crates/meilitool/src/upgrade/mod.rs @@ -34,7 +34,7 @@ impl OfflineUpgrade { ("1", "9", _) => 0, ("1", "10", _) => 1, _ => { - bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9") + bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. 
Can only upgrade from v1.9 and v1.10") } }; From e4993aa705a8e8a3a870a4616c845bfd143fd5f9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Nov 2024 15:13:50 +0100 Subject: [PATCH 89/92] Update crates/meilitool/src/upgrade/mod.rs Co-authored-by: Louis Dureuil --- crates/meilitool/src/upgrade/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilitool/src/upgrade/mod.rs b/crates/meilitool/src/upgrade/mod.rs index 0fd903ffe..36630c3b3 100644 --- a/crates/meilitool/src/upgrade/mod.rs +++ b/crates/meilitool/src/upgrade/mod.rs @@ -47,7 +47,7 @@ impl OfflineUpgrade { bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.") } _ => { - bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.11") + bail!("Unsupported target version {target_major}.{target_minor}.{target_patch}. Can only upgrade to v1.10 and v1.11") } }; From 0f74a933467b0e372898975fa18a69cc3d1dd5b9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Nov 2024 15:14:02 +0100 Subject: [PATCH 90/92] Update crates/meilitool/src/upgrade/v1_11.rs Co-authored-by: Louis Dureuil --- crates/meilitool/src/upgrade/v1_11.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/meilitool/src/upgrade/v1_11.rs b/crates/meilitool/src/upgrade/v1_11.rs index 4105879fd..de852f3dc 100644 --- a/crates/meilitool/src/upgrade/v1_11.rs +++ b/crates/meilitool/src/upgrade/v1_11.rs @@ -34,7 +34,6 @@ pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> { .map(|res| res.map(|(uid, uuid)| (uid.to_owned(), uuid))) .collect(); - // check that update can take place for (index_index, result) in indexes.into_iter().enumerate() { let (uid, uuid) = result?; let index_path = db_path.join("indexes").join(uuid.to_string()); From a5d138ac34448c7fc2410dee1e16ebca91b1a248 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Nov 2024 15:23:27 +0100 Subject: [PATCH 91/92] use a tag while importing arroy instead of a loose branch 
or rev --- Cargo.lock | 4 ++-- crates/meilitool/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 04812fd1b..cef8e9c8a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -407,7 +407,7 @@ dependencies = [ [[package]] name = "arroy" version = "0.5.0" -source = "git+https://github.com/meilisearch/arroy/?rev=053807bf38dc079f25b003f19fc30fbf3613f6e7#053807bf38dc079f25b003f19fc30fbf3613f6e7" +source = "git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05#053807bf38dc079f25b003f19fc30fbf3613f6e7" dependencies = [ "bytemuck", "byteorder", @@ -3536,7 +3536,7 @@ name = "meilitool" version = "1.11.0" dependencies = [ "anyhow", - "arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?rev=053807bf38dc079f25b003f19fc30fbf3613f6e7)", + "arroy 0.5.0 (git+https://github.com/meilisearch/arroy/?tag=DO-NOT-DELETE-upgrade-v04-to-v05)", "clap", "dump", "file-store", diff --git a/crates/meilitool/Cargo.toml b/crates/meilitool/Cargo.toml index 353d44e9a..048da6232 100644 --- a/crates/meilitool/Cargo.toml +++ b/crates/meilitool/Cargo.toml @@ -18,4 +18,4 @@ meilisearch-types = { path = "../meilisearch-types" } serde = { version = "1.0.209", features = ["derive"] } time = { version = "0.3.36", features = ["formatting", "parsing", "alloc"] } uuid = { version = "1.10.0", features = ["v4"], default-features = false } -arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", rev = "053807bf38dc079f25b003f19fc30fbf3613f6e7" } +arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", tag = "DO-NOT-DELETE-upgrade-v04-to-v05" } From 7415ef7ff5498bdc93ef835713f865df80c4b144 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Nov 2024 15:37:59 +0100 Subject: [PATCH 92/92] Update crates/meilitool/src/upgrade/v1_11.rs Co-authored-by: Louis Dureuil --- crates/meilitool/src/upgrade/v1_11.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/crates/meilitool/src/upgrade/v1_11.rs b/crates/meilitool/src/upgrade/v1_11.rs index de852f3dc..0c84d3842 100644 --- a/crates/meilitool/src/upgrade/v1_11.rs +++ b/crates/meilitool/src/upgrade/v1_11.rs @@ -39,7 +39,7 @@ pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> { let index_path = db_path.join("indexes").join(uuid.to_string()); println!( - "[{}/{index_count}]Checking that update can take place for `{uid}` at `{}`", + "[{}/{index_count}]Updating embeddings for `{uid}` at `{}`", index_index + 1, index_path.display() );