From 93f8786da19e8de3dc5b1e8f076e55a7678a18b2 Mon Sep 17 00:00:00 2001 From: alyssa Date: Thu, 14 Nov 2024 13:39:47 +0900 Subject: [PATCH] feat: add basic api/gateway metrics --- Cargo.lock | 70 ++++++++++++--------- services/api/src/main.rs | 2 +- services/api/src/middleware/authnz.rs | 12 +++- services/api/src/middleware/logger.rs | 56 +++++++++++++---- services/gateway/Cargo.toml | 2 + services/gateway/src/discord/gateway.rs | 7 +++ services/gateway/src/discord/shard_state.rs | 9 +++ services/gateway/src/logger.rs | 36 ++++++++--- 8 files changed, 141 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b103416c..de637542 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -89,7 +89,7 @@ dependencies = [ "anyhow", "axum 0.7.5", "fred", - "hyper 1.3.1", + "hyper 1.5.0", "hyper-util", "lazy_static", "libpk", @@ -256,7 +256,7 @@ dependencies = [ "http 1.1.0", "http-body 1.0.0", "http-body-util", - "hyper 1.3.1", + "hyper 1.5.0", "hyper-util", "itoa", "matchit", @@ -398,9 +398,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.6.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" +checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" [[package]] name = "bytes-utils" @@ -1022,6 +1022,7 @@ dependencies = [ "metrics", "prost", "serde_json", + "serde_variant", "signal-hook", "tokio", "tracing", @@ -1350,9 +1351,9 @@ dependencies = [ [[package]] name = "hyper" -version = "1.3.1" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe575dd17d0862a9a33781c8c4696a55c320909004a67a00fb286ba8b1bc496d" +checksum = "bbbff0a806a4728c99295b254c8838933b5b082d75e3cb70c8dab21fdfbcfa9a" dependencies = [ "bytes", "futures-channel", @@ -1391,7 +1392,7 @@ checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c" dependencies = [ "futures-util", "http 1.1.0", - "hyper 1.3.1", + "hyper 1.5.0", "hyper-util", "rustls 0.22.4", "rustls-native-certs", @@ -1409,7 +1410,7 @@ checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" dependencies = [ "futures-util", "http 1.1.0", - "hyper 1.3.1", + "hyper 1.5.0", "hyper-util", "rustls 0.23.10", "rustls-pki-types", @@ -1421,20 +1422,19 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.5" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b875924a60b96e5d7b9ae7b066540b1dd1cbd90d1828f54c92e02a283351c56" +checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" dependencies = [ "bytes", "futures-channel", "futures-util", "http 1.1.0", "http-body 1.0.0", - "hyper 1.3.1", + "hyper 1.5.0", "pin-project-lite", "socket2 0.5.7", "tokio", - "tower", "tower-service", "tracing", ] @@ -1753,7 +1753,7 @@ checksum = "b4f0c8427b39666bf970460908b213ec09b3b350f20c0c2eabcbba51704a08e6" dependencies = [ "base64 0.22.1", "http-body-util", - "hyper 1.3.1", + "hyper 1.5.0", "hyper-util", "indexmap", "ipnet", @@ -2523,7 +2523,7 @@ dependencies = [ "http 1.1.0", "http-body 1.0.0", "http-body-util", - "hyper 1.3.1", + "hyper 1.5.0", "hyper-rustls 0.27.3", "hyper-util", "ipnet", @@ -2963,6 +2963,15 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_variant" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a0068df419f9d9b6488fdded3f1c818522cdea328e02ce9d9f147380265a432" +dependencies = [ + "serde", +] + [[package]] name = "sha-1" version = "0.10.1" @@ -3584,9 +3593,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.11" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d660770404473ccd7bc9f8b28494a811bc18542b915c0855c51e8f419d5223ce" +checksum = "4f4e6ce100d0eb49a2734f8c0812bcd324cf357d21810932c5df6b96ef2b86f1" dependencies = [ "futures-core", "pin-project-lite", @@ -3764,12 +3773,12 @@ dependencies = [ [[package]] name = "tracing-log" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" dependencies = [ - "lazy_static", "log", + "once_cell", "tracing-core", ] @@ -3785,9 +3794,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.16" +version = "0.3.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6176eae26dd70d0c919749377897b54a9276bd7061339665dd68777926b5a70" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" dependencies = [ "matchers", "nu-ansi-term", @@ -3813,11 +3822,12 @@ checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" [[package]] name = "twilight-cache-inmemory" version = "0.16.0-rc.1" -source = "git+https://github.com/pluralkit/twilight#41a71cc74441b54ed10e813e76ad040ce4c366e2" +source = "git+https://github.com/pluralkit/twilight#b346757b3054d461f5652b5ba0148a70eda5697e" dependencies = [ "bitflags 2.5.0", "dashmap", "serde", + "tracing", "twilight-model", "twilight-util", ] @@ -3825,7 +3835,7 @@ dependencies = [ [[package]] name = "twilight-gateway" version = "0.16.0-rc.1" -source = "git+https://github.com/pluralkit/twilight#41a71cc74441b54ed10e813e76ad040ce4c366e2" +source = "git+https://github.com/pluralkit/twilight#b346757b3054d461f5652b5ba0148a70eda5697e" dependencies = [ "bitflags 2.5.0", "fastrand", @@ -3845,7 +3855,7 @@ dependencies = [ [[package]] name = "twilight-gateway-queue" version = "0.16.0-rc.1" -source = "git+https://github.com/pluralkit/twilight#41a71cc74441b54ed10e813e76ad040ce4c366e2" +source = "git+https://github.com/pluralkit/twilight#b346757b3054d461f5652b5ba0148a70eda5697e" dependencies = [ "tokio", "tracing", @@ -3854,12 +3864,12 @@ dependencies = [ [[package]] name = "twilight-http" version = "0.16.0-rc.1" -source = "git+https://github.com/pluralkit/twilight#41a71cc74441b54ed10e813e76ad040ce4c366e2" +source = "git+https://github.com/pluralkit/twilight#b346757b3054d461f5652b5ba0148a70eda5697e" dependencies = [ "fastrand", "http 1.1.0", "http-body-util", - "hyper 1.3.1", + "hyper 1.5.0", "hyper-rustls 0.26.0", "hyper-util", "percent-encoding", @@ -3875,7 +3885,7 @@ dependencies = [ [[package]] name = "twilight-http-ratelimiting" version = "0.16.0-rc.1" -source = "git+https://github.com/pluralkit/twilight#41a71cc74441b54ed10e813e76ad040ce4c366e2" +source = "git+https://github.com/pluralkit/twilight#b346757b3054d461f5652b5ba0148a70eda5697e" dependencies = [ "tokio", "tracing", @@ -3884,7 +3894,7 @@ dependencies = [ [[package]] name = "twilight-model" version = "0.16.0-rc.1" -source = "git+https://github.com/pluralkit/twilight#41a71cc74441b54ed10e813e76ad040ce4c366e2" +source = "git+https://github.com/pluralkit/twilight#b346757b3054d461f5652b5ba0148a70eda5697e" dependencies = [ "bitflags 2.5.0", "serde", @@ -3896,7 +3906,7 @@ dependencies = [ [[package]] name = "twilight-util" version = "0.16.0-rc.1" -source = "git+https://github.com/pluralkit/twilight#41a71cc74441b54ed10e813e76ad040ce4c366e2" +source = "git+https://github.com/pluralkit/twilight#b346757b3054d461f5652b5ba0148a70eda5697e" dependencies = [ "twilight-model", ] @@ -3904,7 +3914,7 @@ dependencies = [ [[package]] name = "twilight-validate" version = "0.16.0-rc.1" -source = "git+https://github.com/pluralkit/twilight#41a71cc74441b54ed10e813e76ad040ce4c366e2" +source = "git+https://github.com/pluralkit/twilight#b346757b3054d461f5652b5ba0148a70eda5697e" dependencies = [ "twilight-model", ] diff --git a/services/api/src/main.rs b/services/api/src/main.rs index dbd2d46f..e9e89815 100644 --- a/services/api/src/main.rs +++ b/services/api/src/main.rs @@ -133,11 +133,11 @@ async fn main() -> anyhow::Result<()> { .route("/v2/members/:member_id/oembed.json", get(rproxy)) .route("/v2/groups/:group_id/oembed.json", get(rproxy)) - .layer(axum::middleware::from_fn(middleware::logger)) .layer(middleware::ratelimit::ratelimiter(middleware::ratelimit::do_request_ratelimited)) // this sucks .layer(axum::middleware::from_fn_with_state(ctx.clone(), middleware::authnz)) .layer(axum::middleware::from_fn(middleware::ignore_invalid_routes)) .layer(axum::middleware::from_fn(middleware::cors)) + .layer(axum::middleware::from_fn(middleware::logger)) .layer(tower_http::catch_panic::CatchPanicLayer::custom(util::handle_panic)) diff --git a/services/api/src/middleware/authnz.rs b/services/api/src/middleware/authnz.rs index e1b44941..4544e6bf 100644 --- a/services/api/src/middleware/authnz.rs +++ b/services/api/src/middleware/authnz.rs @@ -8,6 +8,8 @@ use tracing::error; use crate::ApiContext; +use super::logger::DID_AUTHENTICATE_HEADER; + pub async fn authnz(State(ctx): State, mut request: Request, next: Next) -> Response { let headers = request.headers_mut(); headers.remove("x-pluralkit-systemid"); @@ -15,6 +17,7 @@ pub async fn authnz(State(ctx): State, mut request: Request, next: N .get("authorization") .map(|h| h.to_str().ok()) .flatten(); + let mut authenticated = false; if let Some(auth_header) = auth_header { if let Some(system_id) = match libpk::db::repository::legacy_token_auth(&ctx.db, auth_header).await { @@ -29,7 +32,14 @@ pub async fn authnz(State(ctx): State, mut request: Request, next: N "x-pluralkit-systemid", HeaderValue::from_str(format!("{system_id}").as_str()).unwrap(), ); + authenticated = true; } } - next.run(request).await + let mut response = next.run(request).await; + if authenticated { + response + .headers_mut() + .insert(DID_AUTHENTICATE_HEADER, HeaderValue::from_static("1")); + } + response } diff --git a/services/api/src/middleware/logger.rs b/services/api/src/middleware/logger.rs index 7250f225..cfb84b4a 100644 --- a/services/api/src/middleware/logger.rs +++ b/services/api/src/middleware/logger.rs @@ -1,7 +1,7 @@ use std::time::Instant; use axum::{extract::MatchedPath, extract::Request, middleware::Next, response::Response}; -use metrics::histogram; +use metrics::{counter, histogram}; use tracing::{info, span, warn, Instrument, Level}; use crate::util::header_or_unknown; @@ -10,11 +10,12 @@ use crate::util::header_or_unknown; // todo: change as necessary const MIN_LOG_TIME: u128 = 2_000; +pub const DID_AUTHENTICATE_HEADER: &'static str = "x-pluralkit-didauthenticate"; + pub async fn logger(request: Request, next: Next) -> Response { let method = request.method().clone(); - let request_id = header_or_unknown(request.headers().get("Fly-Request-Id")); - let remote_ip = header_or_unknown(request.headers().get("Fly-Client-IP")); + let remote_ip = header_or_unknown(request.headers().get("X-PluralKit-Client-IP")); let user_agent = header_or_unknown(request.headers().get("User-Agent")); let endpoint = request @@ -26,10 +27,9 @@ pub async fn logger(request: Request, next: Next) -> Response { let uri = request.uri().clone(); - let request_id_span = span!( + let request_span = span!( Level::INFO, "request", - request_id, remote_ip, method = method.as_str(), endpoint = endpoint.clone(), @@ -37,9 +37,37 @@ pub async fn logger(request: Request, next: Next) -> Response { ); let start = Instant::now(); - let response = next.run(request).instrument(request_id_span).await; + let mut response = next.run(request).instrument(request_span).await; let elapsed = start.elapsed().as_millis(); + let authenticated = { + let headers = response.headers_mut(); + println!("{:#?}", headers.keys()); + if headers.contains_key(DID_AUTHENTICATE_HEADER) { + headers.remove(DID_AUTHENTICATE_HEADER); + true + } else { + false + } + }; + + counter!( + "pluralkit_api_requests", + "method" => method.to_string(), + "endpoint" => endpoint.clone(), + "status" => response.status().to_string(), + "authenticated" => authenticated.to_string(), + ) + .increment(1); + histogram!( + "pluralkit_api_requests_bucket", + "method" => method.to_string(), + "endpoint" => endpoint.clone(), + "status" => response.status().to_string(), + "authenticated" => authenticated.to_string(), + ) + .record(elapsed as f64 / 1_000_f64); + info!( "{} handled request for {} {} in {}ms", response.status(), @@ -47,15 +75,17 @@ pub async fn logger(request: Request, next: Next) -> Response { endpoint, elapsed ); - histogram!( - "pk_http_requests", - "method" => method.to_string(), - "route" => endpoint.clone(), - "status" => response.status().to_string() - ) - .record((elapsed as f64) / 1_000_f64); if elapsed > MIN_LOG_TIME { + counter!( + "pluralkit_api_slow_requests_count", + "method" => method.to_string(), + "endpoint" => endpoint.clone(), + "status" => response.status().to_string(), + "authenticated" => authenticated.to_string(), + ) + .increment(1); + warn!( "request to {} full path {} (endpoint {}) took a long time ({}ms)!", method, diff --git a/services/gateway/Cargo.toml b/services/gateway/Cargo.toml index b1a2daa7..6b9f3e25 100644 --- a/services/gateway/Cargo.toml +++ b/services/gateway/Cargo.toml @@ -24,3 +24,5 @@ twilight-cache-inmemory = { workspace = true } twilight-util = { workspace = true } twilight-model = { workspace = true } twilight-http = { workspace = true } + +serde_variant = "0.1.3" diff --git a/services/gateway/src/discord/gateway.rs b/services/gateway/src/discord/gateway.rs index 15d5ac80..40ed9bf7 100644 --- a/services/gateway/src/discord/gateway.rs +++ b/services/gateway/src/discord/gateway.rs @@ -1,4 +1,5 @@ use libpk::_config::ClusterSettings; +use metrics::counter; use std::sync::{mpsc::Sender, Arc}; use tracing::{info, warn}; use twilight_gateway::{ @@ -85,6 +86,12 @@ pub async fn runner( while let Some(item) = shard.next_event(EventTypeFlags::all()).await { match item { Ok(event) => { + counter!( + "pluralkit_gateway_events", + "shard_id" => shard.id().number().to_string(), + "event_type" => serde_variant::to_variant_name(&event.kind()).unwrap(), + ) + .increment(1); if let Err(error) = shard_state .handle_event(shard.id().number(), event.clone()) .await diff --git a/services/gateway/src/discord/shard_state.rs b/services/gateway/src/discord/shard_state.rs index 1a4b57ef..f92ff866 100644 --- a/services/gateway/src/discord/shard_state.rs +++ b/services/gateway/src/discord/shard_state.rs @@ -1,5 +1,6 @@ use bytes::Bytes; use fred::{clients::RedisPool, interfaces::HashesInterface}; +use metrics::{counter, gauge}; use prost::Message; use tracing::info; use twilight_gateway::Event; @@ -59,6 +60,13 @@ impl ShardStateManager { shard_id, if resumed { "resumed" } else { "ready" } ); + counter!( + "pluralkit_gateway_shard_reconnect", + "shard_id" => shard_id.to_string(), + "resumed" => resumed.to_string(), + ) + .increment(1); + gauge!("pluralkit_gateway_shard_up").increment(1); let mut info = self.get_shard(shard_id).await?; info.last_connection = chrono::offset::Utc::now().timestamp() as i32; info.up = true; @@ -68,6 +76,7 @@ impl ShardStateManager { async fn socket_closed(&self, shard_id: u32) -> anyhow::Result<()> { info!("shard {} closed", shard_id); + gauge!("pluralkit_gateway_shard_up").decrement(1); let mut info = self.get_shard(shard_id).await?; info.up = false; info.disconnection_count += 1; diff --git a/services/gateway/src/logger.rs b/services/gateway/src/logger.rs index aa65bc67..459aef31 100644 --- a/services/gateway/src/logger.rs +++ b/services/gateway/src/logger.rs @@ -1,6 +1,9 @@ use std::time::Instant; -use axum::{extract::MatchedPath, extract::Request, middleware::Next, response::Response}; +use axum::{ + extract::MatchedPath, extract::Request, http::StatusCode, middleware::Next, response::Response, +}; +use metrics::{counter, histogram}; use tracing::{info, span, warn, Instrument, Level}; // log any requests that take longer than 2 seconds @@ -30,13 +33,30 @@ pub async fn logger(request: Request, next: Next) -> Response { let response = next.run(request).instrument(request_id_span).await; let elapsed = start.elapsed().as_millis(); - info!( - "{} handled request for {} {} in {}ms", - response.status(), - method, - uri.path(), - elapsed - ); + counter!( + "pluralkit_gateway_cache_api_requests", + "method" => method.to_string(), + "endpoint" => endpoint.clone(), + "status" => response.status().to_string(), + ) + .increment(1); + histogram!( + "pluralkit_gateway_cache_api_requests_bucket", + "method" => method.to_string(), + "endpoint" => endpoint.clone(), + "status" => response.status().to_string(), + ) + .record(elapsed as f64 / 1_000_f64); + + if response.status() != StatusCode::FOUND { + info!( + "{} handled request for {} {} in {}ms", + response.status(), + method, + uri.path(), + elapsed + ); + } if elapsed > MIN_LOG_TIME { warn!(