fix: add retry middleware for zero-downtime deployments

The problem: during deployments, users pinned via sticky session to the restarting container experienced ~60s of downtime because:

1. Health checks were too slow (10s interval)
2. There was no retry on failure - requests just failed

The fix:

- Add retry middleware: 3 attempts with a 100ms initial interval
- Reduce the health check interval from 10s to 3s
- Add a health check timeout of 2s

Now, when your pinned server restarts:

1. The request fails
2. Traefik retries on the OTHER healthy server
3. You get a response (possibly with a new server_id cookie)

Combined with Redis for session state, this should give true zero-downtime deployments.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-16 05:25:19 -06:00 · 2026-01-16 05:25:19 -06:00 · 2082710ab2
parent 49eb2c8c36
commit 2082710ab2
3 changed files with 30 additions and 12 deletions
--- a/nas-deployment/docker-compose.blue.yaml
+++ b/nas-deployment/docker-compose.blue.yaml
@@ -26,19 +26,25 @@ services:
      traefik.http.routers.abaci.entrypoints: websecure
      traefik.http.routers.abaci.tls: "true"
      traefik.http.routers.abaci.tls.certresolver: myresolver
-      traefik.http.routers.abaci.middlewares: hsts@docker
+      # Chain middlewares: retry failed requests, then HSTS headers
      traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker
      traefik.http.routers.abaci.service: abaci
      traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
      traefik.http.routers.abaci-http.entrypoints: web
      traefik.http.routers.abaci-http.middlewares: redirect-https@docker
      traefik.http.services.abaci.loadbalancer.server.port: "3000"
      # Faster health checks for quicker failover during deployments
      traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
-      traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s
+      traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s
-      # Sticky sessions required for Socket.IO and remote camera sessions
+      traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s
-      # Without this, desktop and phone may hit different instances
+      # Sticky sessions for Socket.IO (Redis handles cross-instance state)
      # If the pinned server is unhealthy, Traefik fails over to another server; the retry middleware helps too
      traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
      traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
      traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
      # Retry middleware: retry on another server if request fails (zero-downtime deploys)
      traefik.http.middlewares.retry.retry.attempts: "3"
      traefik.http.middlewares.retry.retry.initialinterval: 100ms
      traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
      traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
      traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"
--- a/nas-deployment/docker-compose.green.yaml
+++ b/nas-deployment/docker-compose.green.yaml
@@ -26,19 +26,25 @@ services:
      traefik.http.routers.abaci.entrypoints: websecure
      traefik.http.routers.abaci.tls: "true"
      traefik.http.routers.abaci.tls.certresolver: myresolver
-      traefik.http.routers.abaci.middlewares: hsts@docker
+      # Chain middlewares: retry failed requests, then HSTS headers
      traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker
      traefik.http.routers.abaci.service: abaci
      traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
      traefik.http.routers.abaci-http.entrypoints: web
      traefik.http.routers.abaci-http.middlewares: redirect-https@docker
      traefik.http.services.abaci.loadbalancer.server.port: "3000"
      # Faster health checks for quicker failover during deployments
      traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
-      traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s
+      traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s
-      # Sticky sessions required for Socket.IO and remote camera sessions
+      traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s
-      # Without this, desktop and phone may hit different instances
+      # Sticky sessions for Socket.IO (Redis handles cross-instance state)
      # If the pinned server is unhealthy, Traefik fails over to another server; the retry middleware helps too
      traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
      traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
      traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
      # Retry middleware: retry on another server if request fails (zero-downtime deploys)
      traefik.http.middlewares.retry.retry.attempts: "3"
      traefik.http.middlewares.retry.retry.initialinterval: 100ms
      traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
      traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
      traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"
--- a/nas-deployment/docker-compose.yaml
+++ b/nas-deployment/docker-compose.yaml
@@ -42,19 +42,25 @@ x-traefik-labels: &traefik-labels
  traefik.http.routers.abaci.entrypoints: websecure
  traefik.http.routers.abaci.tls: "true"
  traefik.http.routers.abaci.tls.certresolver: myresolver
-  traefik.http.routers.abaci.middlewares: hsts@docker
+  # Chain middlewares: retry failed requests, then HSTS headers
  traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker
  traefik.http.routers.abaci.service: abaci
  traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
  traefik.http.routers.abaci-http.entrypoints: web
  traefik.http.routers.abaci-http.middlewares: redirect-https@docker
  traefik.http.services.abaci.loadbalancer.server.port: "3000"
  # Faster health checks for quicker failover during deployments
  traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
-  traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s
+  traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s
-  # Sticky sessions required for Socket.IO and remote camera sessions
+  traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s
-  # Without this, desktop and phone may hit different instances
+  # Sticky sessions for Socket.IO (Redis handles cross-instance state)
  # If the pinned server is unhealthy, Traefik fails over to another server; the retry middleware helps too
  traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
  traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
  traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
  # Retry middleware: retry on another server if request fails (zero-downtime deploys)
  traefik.http.middlewares.retry.retry.attempts: "3"
  traefik.http.middlewares.retry.retry.initialinterval: 100ms
  traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
  traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
  traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"