feat(infra): add performance remediation for k8s deployment
- Increase resource limits: 1Gi memory, 2 CPU cores per pod
- Tune health probes: 10s timeout, 5 failures (liveness: 15s period x 5 failures = 75s grace period; readiness: 10s period x 5 failures = 50s)
- Add Traefik rate limiting: 50 req/sec avg, 100 burst
- Add in-flight request limiting: max 100 concurrent in-flight requests

Fixes pod crashes under moderate load (50+ concurrent connections).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
f2fc30878d
commit
0abed6ae55
|
|
@ -222,25 +222,26 @@ resource "kubernetes_stateful_set" "app" {
|
|||
|
||||
resources {
|
||||
requests = {
|
||||
memory = "256Mi"
|
||||
cpu = "100m"
|
||||
memory = "512Mi"
|
||||
cpu = "200m"
|
||||
}
|
||||
limits = {
|
||||
memory = "512Mi"
|
||||
cpu = "1000m"
|
||||
memory = "1Gi"
|
||||
cpu = "2000m"
|
||||
}
|
||||
}
|
||||
|
||||
# Health checks hit the LiteFS proxy
|
||||
# Tuned for resilience under load: longer timeouts, more failures allowed
|
||||
liveness_probe {
|
||||
http_get {
|
||||
path = "/api/health"
|
||||
port = 8080
|
||||
}
|
||||
initial_delay_seconds = 30
|
||||
period_seconds = 10
|
||||
timeout_seconds = 5
|
||||
failure_threshold = 3
|
||||
period_seconds = 15
|
||||
timeout_seconds = 10
|
||||
failure_threshold = 5
|
||||
}
|
||||
|
||||
readiness_probe {
|
||||
|
|
@ -249,9 +250,9 @@ resource "kubernetes_stateful_set" "app" {
|
|||
port = 8080
|
||||
}
|
||||
initial_delay_seconds = 5
|
||||
period_seconds = 5
|
||||
timeout_seconds = 3
|
||||
failure_threshold = 3
|
||||
period_seconds = 10
|
||||
timeout_seconds = 10
|
||||
failure_threshold = 5
|
||||
}
|
||||
|
||||
volume_mount {
|
||||
|
|
@ -358,7 +359,7 @@ resource "kubernetes_ingress_v1" "app" {
|
|||
annotations = {
|
||||
"cert-manager.io/cluster-issuer" = var.use_staging_certs ? "letsencrypt-staging" : "letsencrypt-prod"
|
||||
"traefik.ingress.kubernetes.io/router.entrypoints" = "websecure"
|
||||
"traefik.ingress.kubernetes.io/router.middlewares" = "${kubernetes_namespace.abaci.metadata[0].name}-hsts@kubernetescrd"
|
||||
"traefik.ingress.kubernetes.io/router.middlewares" = "${kubernetes_namespace.abaci.metadata[0].name}-hsts@kubernetescrd,${kubernetes_namespace.abaci.metadata[0].name}-rate-limit@kubernetescrd,${kubernetes_namespace.abaci.metadata[0].name}-in-flight-req@kubernetescrd"
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -413,6 +414,41 @@ resource "kubernetes_manifest" "hsts_middleware" {
|
|||
}
|
||||
}
|
||||
|
||||
# Rate limiting middleware - protect against traffic spikes
|
||||
# Declares a Traefik Middleware custom resource (traefik.io/v1alpha1) named
# "rate-limit" in the abaci namespace, applied through kubernetes_manifest.
# The app ingress's "traefik.ingress.kubernetes.io/router.middlewares"
# annotation refers to it as "<namespace>-rate-limit@kubernetescrd", so the
# name below and that annotation must be changed together.
resource "kubernetes_manifest" "rate_limit_middleware" {
|
||||
manifest = {
|
||||
apiVersion = "traefik.io/v1alpha1"
|
||||
kind = "Middleware"
|
||||
metadata = {
|
||||
# Referenced by the ingress annotation as "<namespace>-rate-limit@kubernetescrd".
name = "rate-limit"
|
||||
namespace = kubernetes_namespace.abaci.metadata[0].name
|
||||
}
|
||||
spec = {
|
||||
# NOTE(review): neither `period` nor `sourceCriterion` is set, so Traefik's
# defaults apply. Presumably that means a 1s period and grouping by client
# remote address (i.e. the limit is per source, not global) - confirm against
# the Traefik RateLimit docs and against how client IPs appear behind the
# load balancer.
rateLimit = {
|
||||
average = 50 # average allowed request rate; "per second" assumes the default 1s period - confirm
|
||||
burst = 100 # up to 100 requests may be admitted in a short burst above the average rate
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# In-flight request limiting - cap the number of concurrent in-flight requests
|
||||
# Declares a Traefik Middleware custom resource (traefik.io/v1alpha1) named
# "in-flight-req" in the abaci namespace, applied through kubernetes_manifest.
# The app ingress's "traefik.ingress.kubernetes.io/router.middlewares"
# annotation refers to it as "<namespace>-in-flight-req@kubernetescrd", so the
# name below and that annotation must be changed together.
resource "kubernetes_manifest" "in_flight_middleware" {
|
||||
manifest = {
|
||||
apiVersion = "traefik.io/v1alpha1"
|
||||
kind = "Middleware"
|
||||
metadata = {
|
||||
# Referenced by the ingress annotation as "<namespace>-in-flight-req@kubernetescrd".
name = "in-flight-req"
|
||||
namespace = kubernetes_namespace.abaci.metadata[0].name
|
||||
}
|
||||
spec = {
|
||||
# NOTE(review): no `sourceCriterion` is set, so Traefik's default grouping
# applies (presumably by request host, which would make this an effectively
# global cap for a single-host ingress) - confirm against the Traefik
# InFlightReq docs.
inFlightReq = {
|
||||
amount = 100 # max simultaneous in-flight requests (requests, not TCP connections)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# HTTP to HTTPS redirect
|
||||
resource "kubernetes_ingress_v1" "app_http_redirect" {
|
||||
metadata {
|
||||
|
|
|
|||
Loading…
Reference in New Issue