3056: refactor the way we send the cli informations + add the analytics for the config file and ssl usage r=Kerollmops a=irevoire

Partially fix #2955

Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
bors[bot] 2022-11-15 17:03:30 +00:00 committed by GitHub
commit 51be75a264
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 130 additions and 42 deletions

View File

@ -6,6 +6,7 @@ use std::time::{Duration, Instant};
use actix_web::http::header::USER_AGENT;
use actix_web::HttpRequest;
use byte_unit::Byte;
use http::header::CONTENT_TYPE;
use index_scheduler::IndexScheduler;
use meilisearch_auth::SearchRules;
@ -14,6 +15,7 @@ use once_cell::sync::Lazy;
use regex::Regex;
use segment::message::{Identify, Track, User};
use segment::{AutoBatcher, Batcher, HttpClient};
use serde::Serialize;
use serde_json::{json, Value};
use sysinfo::{DiskExt, System, SystemExt};
use time::OffsetDateTime;
@ -23,7 +25,7 @@ use uuid::Uuid;
use super::{config_user_id_path, MEILISEARCH_CONFIG_PATH};
use crate::analytics::Analytics;
use crate::option::default_http_addr;
use crate::option::{default_http_addr, IndexerOpts, MaxMemory, MaxThreads, SchedulerConfig};
use crate::routes::indexes::documents::UpdateDocumentsQuery;
use crate::routes::{create_all_stats, Stats};
use crate::search::{
@ -182,6 +184,124 @@ impl super::Analytics for SegmentAnalytics {
}
}
/// This structure represents the `infos` field we send in the analytics.
/// It's quite close to the `Opt` structure, except that all sensitive information
/// has been simplified to a boolean.
/// It's sent as-is to amplitude, thus you should never rename anything in this
/// struct without the approval of the PM.
///
/// Instances are built through the `From<Opt>` implementation; fields without a
/// comment below are copied from the corresponding option unchanged.
#[derive(Debug, Clone, Serialize)]
struct Infos {
env: String,
/// `true` when the database path differs from `./data.ms`.
db_path: bool,
/// `true` when a dump to import was provided.
import_dump: bool,
/// `true` when the dumps directory differs from `dumps/`.
dumps_dir: bool,
ignore_missing_dump: bool,
ignore_dump_if_db_exists: bool,
/// `true` when a snapshot to import was provided.
import_snapshot: bool,
schedule_snapshot: bool,
/// `true` when the snapshot directory differs from `snapshots/`.
snapshot_dir: bool,
snapshot_interval_sec: u64,
ignore_missing_snapshot: bool,
ignore_snapshot_if_db_exists: bool,
/// `true` when the HTTP address differs from `default_http_addr()`.
http_addr: bool,
max_index_size: Byte,
max_task_db_size: Byte,
http_payload_size_limit: Byte,
disable_auto_batching: bool,
log_level: String,
max_indexing_memory: MaxMemory,
max_indexing_threads: MaxThreads,
/// `true` when a configuration file path was provided.
with_configuration_file: bool,
// The four `ssl_*_path` fields below only record whether the matching path
// option was provided, never the path itself.
ssl_auth_path: bool,
ssl_cert_path: bool,
ssl_key_path: bool,
ssl_ocsp_path: bool,
ssl_require_auth: bool,
ssl_resumption: bool,
ssl_tickets: bool,
}
impl From<Opt> for Infos {
    /// Turns the launch options into the anonymized payload sent under the
    /// `infos` analytics field.
    ///
    /// Any option holding a path, an address or a key is considered sensitive:
    /// it is either left out entirely or reduced to a boolean telling whether
    /// it was provided (or moved away from its default value).
    fn from(options: Opt) -> Self {
        use std::path::Path;

        // Every field of `Opt` and of its nested option structs is spelled out
        // on purpose, with no trailing `..`: adding a field to one of them then
        // breaks the build here until someone decides how analytics handles it.
        let Opt {
            env,
            log_level,
            master_key: _,
            db_path,
            dumps_dir,
            snapshot_dir,
            http_addr,
            import_dump,
            ignore_missing_dump,
            ignore_dump_if_db_exists,
            import_snapshot,
            ignore_missing_snapshot,
            ignore_snapshot_if_db_exists,
            schedule_snapshot,
            snapshot_interval_sec,
            max_index_size,
            max_task_db_size,
            http_payload_size_limit,
            ssl_auth_path,
            ssl_cert_path,
            ssl_key_path,
            ssl_ocsp_path,
            ssl_require_auth,
            ssl_resumption,
            ssl_tickets,
            config_file_path,
            scheduler_options: SchedulerConfig { disable_auto_batching },
            indexer_options:
                IndexerOpts {
                    log_every_n: _,
                    max_nb_chunks: _,
                    max_indexing_memory,
                    max_indexing_threads,
                },
            #[cfg(all(not(debug_assertions), feature = "analytics"))]
            no_analytics: _,
        } = options;

        // Locations and addresses: only report whether the default was overridden.
        let custom_db_path = db_path.as_path() != Path::new("./data.ms");
        let custom_dumps_dir = dumps_dir.as_path() != Path::new("dumps/");
        let custom_snapshot_dir = snapshot_dir.as_path() != Path::new("snapshots/");
        let custom_http_addr = http_addr != default_http_addr();

        // Optional sensitive values: only report whether they were provided.
        let has_import_dump = import_dump.is_some();
        let has_import_snapshot = import_snapshot.is_some();
        let has_config_file = config_file_path.is_some();
        let has_ssl_auth = ssl_auth_path.is_some();
        let has_ssl_cert = ssl_cert_path.is_some();
        let has_ssl_key = ssl_key_path.is_some();
        let has_ssl_ocsp = ssl_ocsp_path.is_some();

        Self {
            env,
            db_path: custom_db_path,
            import_dump: has_import_dump,
            dumps_dir: custom_dumps_dir,
            ignore_missing_dump,
            ignore_dump_if_db_exists,
            import_snapshot: has_import_snapshot,
            schedule_snapshot,
            snapshot_dir: custom_snapshot_dir,
            snapshot_interval_sec,
            ignore_missing_snapshot,
            ignore_snapshot_if_db_exists,
            http_addr: custom_http_addr,
            max_index_size,
            max_task_db_size,
            http_payload_size_limit,
            disable_auto_batching,
            log_level,
            max_indexing_memory,
            max_indexing_threads,
            with_configuration_file: has_config_file,
            ssl_auth_path: has_ssl_auth,
            ssl_cert_path: has_ssl_cert,
            ssl_key_path: has_ssl_key,
            ssl_ocsp_path: has_ssl_ocsp,
            ssl_require_auth,
            ssl_resumption,
            ssl_tickets,
        }
    }
}
pub struct Segment {
inbox: Receiver<AnalyticsMsg>,
user: User,
@ -212,31 +332,6 @@ impl Segment {
"server_provider": std::env::var("MEILI_SERVER_PROVIDER").ok(),
})
});
// The infos are all cli option except every option containing sensitive information.
// We consider an information as sensible if it contains a path, an address or a key.
let infos = {
// First we see if any sensitive fields were used.
let db_path = opt.db_path != PathBuf::from("./data.ms");
let import_dump = opt.import_dump.is_some();
let dumps_dir = opt.dumps_dir != PathBuf::from("dumps/");
let import_snapshot = opt.import_snapshot.is_some();
let snapshots_dir = opt.snapshot_dir != PathBuf::from("snapshots/");
let http_addr = opt.http_addr != default_http_addr();
let mut infos = serde_json::to_value(opt).unwrap();
// Then we overwrite all sensitive field with a boolean representing if
// the feature was used or not.
infos["db_path"] = json!(db_path);
infos["import_dump"] = json!(import_dump);
infos["dumps_dir"] = json!(dumps_dir);
infos["import_snapshot"] = json!(import_snapshot);
infos["snapshot_dir"] = json!(snapshots_dir);
infos["http_addr"] = json!(http_addr);
infos
};
let number_of_documents =
stats.indexes.values().map(|index| index.number_of_documents).collect::<Vec<u64>>();
@ -248,7 +343,7 @@ impl Segment {
"indexes_number": stats.indexes.len(),
"documents_number": number_of_documents,
},
"infos": infos,
"infos": Infos::from(opt.clone()),
})
}

View File

@ -69,7 +69,7 @@ const MEILI_MAX_INDEXING_THREADS: &str = "MEILI_MAX_INDEXING_THREADS";
const DISABLE_AUTO_BATCHING: &str = "DISABLE_AUTO_BATCHING";
const DEFAULT_LOG_EVERY_N: usize = 100000;
#[derive(Debug, Clone, Parser, Serialize, Deserialize)]
#[derive(Debug, Clone, Parser, Deserialize)]
#[clap(version, next_display_order = None)]
#[serde(rename_all = "snake_case", deny_unknown_fields)]
pub struct Opt {
@ -84,7 +84,6 @@ pub struct Opt {
pub http_addr: String,
/// Sets the instance's master key, automatically protecting all routes except `GET /health`.
#[serde(skip_serializing)]
#[clap(long, env = MEILI_MASTER_KEY)]
pub master_key: Option<String>,
@ -99,7 +98,7 @@ pub struct Opt {
/// All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted
/// at any time.
#[cfg(all(not(debug_assertions), feature = "analytics"))]
#[serde(skip_serializing, default)] // we can't send true
#[serde(default)] // we can't send true
#[clap(long, env = MEILI_NO_ANALYTICS)]
pub no_analytics: bool,
@ -121,39 +120,35 @@ pub struct Opt {
pub http_payload_size_limit: Byte,
/// Sets the server's SSL certificates.
#[serde(skip_serializing)]
#[clap(long, env = MEILI_SSL_CERT_PATH, value_parser)]
pub ssl_cert_path: Option<PathBuf>,
/// Sets the server's SSL key files.
#[serde(skip_serializing)]
#[clap(long, env = MEILI_SSL_KEY_PATH, value_parser)]
pub ssl_key_path: Option<PathBuf>,
/// Enables client authentication in the specified path.
#[serde(skip_serializing)]
#[clap(long, env = MEILI_SSL_AUTH_PATH, value_parser)]
pub ssl_auth_path: Option<PathBuf>,
/// Sets the server's OCSP file. *Optional*
///
/// Reads DER-encoded OCSP response from OCSPFILE and staple to certificate.
#[serde(skip_serializing)]
#[clap(long, env = MEILI_SSL_OCSP_PATH, value_parser)]
pub ssl_ocsp_path: Option<PathBuf>,
/// Makes SSL authentication mandatory.
#[serde(skip_serializing, default)]
#[serde(default)]
#[clap(long, env = MEILI_SSL_REQUIRE_AUTH)]
pub ssl_require_auth: bool,
/// Activates SSL session resumption.
#[serde(skip_serializing, default)]
#[serde(default)]
#[clap(long, env = MEILI_SSL_RESUMPTION)]
pub ssl_resumption: bool,
/// Activates SSL tickets.
#[serde(skip_serializing, default)]
#[serde(default)]
#[clap(long, env = MEILI_SSL_TICKETS)]
pub ssl_tickets: bool,
@ -251,7 +246,6 @@ pub struct Opt {
/// Set the path to a configuration file that should be used to setup the engine.
/// Format must be TOML.
#[serde(skip_serializing)]
#[clap(long)]
pub config_file_path: Option<PathBuf>,
}
@ -439,16 +433,15 @@ impl Opt {
}
}
#[derive(Debug, Clone, Parser, Deserialize, Serialize)]
#[derive(Debug, Clone, Parser, Deserialize)]
pub struct IndexerOpts {
/// Sets the amount of documents to skip before printing
/// a log regarding the indexing advancement.
#[serde(skip_serializing, default = "default_log_every_n")]
#[serde(default = "default_log_every_n")]
#[clap(long, default_value_t = default_log_every_n(), hide = true)] // 100k
pub log_every_n: usize,
/// Grenad max number of chunks in bytes.
#[serde(skip_serializing)]
#[clap(long, hide = true)]
pub max_nb_chunks: Option<usize>,
@ -488,7 +481,7 @@ impl IndexerOpts {
}
}
#[derive(Debug, Clone, Parser, Default, Deserialize, Serialize)]
#[derive(Debug, Clone, Parser, Default, Deserialize)]
#[serde(rename_all = "snake_case", deny_unknown_fields)]
pub struct SchedulerConfig {
/// Deactivates auto-batching when provided.