From 2082710ab25d58ef2eac9f9cc150ceff0e7bfe37 Mon Sep 17 00:00:00 2001
From: Thomas Hallock <thomas@evolutioniq.com>
Date: Fri, 16 Jan 2026 05:25:19 -0600
Subject: [PATCH] fix: add retry middleware for zero-downtime deployments

The problem: during deployments, users pinned by a sticky session to
the restarting container experienced ~60s of downtime because:
1. Health checks were too slow (10s interval), so failover away from
   the restarting container was delayed
2. There was no retry on failure - requests simply failed

The fix:
- Add retry middleware: 3 attempts with 100ms initial interval
- Reduce health check interval from 10s to 3s
- Add health check timeout of 2s

Now, when your pinned server restarts:
1. The request to the restarting server fails
2. Traefik retries it (up to 3 attempts) on the OTHER, healthy server
3. You get a response (possibly with a new server_id cookie)

Combined with Redis for session state, this should give true
zero-downtime deployments.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 nas-deployment/docker-compose.blue.yaml  | 14 ++++++++++----
 nas-deployment/docker-compose.green.yaml | 14 ++++++++++----
 nas-deployment/docker-compose.yaml       | 14 ++++++++++----
 3 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/nas-deployment/docker-compose.blue.yaml b/nas-deployment/docker-compose.blue.yaml
index f61817e6..c195487f 100644
--- a/nas-deployment/docker-compose.blue.yaml
+++ b/nas-deployment/docker-compose.blue.yaml
@@ -26,19 +26,25 @@ services:
       traefik.http.routers.abaci.entrypoints: websecure
       traefik.http.routers.abaci.tls: "true"
       traefik.http.routers.abaci.tls.certresolver: myresolver
-      traefik.http.routers.abaci.middlewares: hsts@docker
+      # Chain middlewares: retry failed requests, then HSTS headers
+      traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker
       traefik.http.routers.abaci.service: abaci
       traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
       traefik.http.routers.abaci-http.entrypoints: web
       traefik.http.routers.abaci-http.middlewares: redirect-https@docker
       traefik.http.services.abaci.loadbalancer.server.port: "3000"
+      # Faster health checks for quicker failover during deployments
       traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
-      traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s
-      # Sticky sessions required for Socket.IO and remote camera sessions
-      # Without this, desktop and phone may hit different instances
+      traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s
+      traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s
+      # Sticky sessions for Socket.IO (Redis handles cross-instance state)
+      # If the pinned server is unhealthy, Traefik fails over; the retry middleware also helps
       traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
       traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
       traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
+      # Retry middleware: retry on another server if request fails (zero-downtime deploys)
+      traefik.http.middlewares.retry.retry.attempts: "3"
+      traefik.http.middlewares.retry.retry.initialinterval: 100ms
       traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
       traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
       traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"
diff --git a/nas-deployment/docker-compose.green.yaml b/nas-deployment/docker-compose.green.yaml
index a7f81156..2f2d8d8a 100644
--- a/nas-deployment/docker-compose.green.yaml
+++ b/nas-deployment/docker-compose.green.yaml
@@ -26,19 +26,25 @@ services:
       traefik.http.routers.abaci.entrypoints: websecure
       traefik.http.routers.abaci.tls: "true"
       traefik.http.routers.abaci.tls.certresolver: myresolver
-      traefik.http.routers.abaci.middlewares: hsts@docker
+      # Chain middlewares: retry failed requests, then HSTS headers
+      traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker
       traefik.http.routers.abaci.service: abaci
       traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
       traefik.http.routers.abaci-http.entrypoints: web
       traefik.http.routers.abaci-http.middlewares: redirect-https@docker
       traefik.http.services.abaci.loadbalancer.server.port: "3000"
+      # Faster health checks for quicker failover during deployments
       traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
-      traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s
-      # Sticky sessions required for Socket.IO and remote camera sessions
-      # Without this, desktop and phone may hit different instances
+      traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s
+      traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s
+      # Sticky sessions for Socket.IO (Redis handles cross-instance state)
+      # If the pinned server is unhealthy, Traefik fails over; the retry middleware also helps
       traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
       traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
       traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
+      # Retry middleware: retry on another server if request fails (zero-downtime deploys)
+      traefik.http.middlewares.retry.retry.attempts: "3"
+      traefik.http.middlewares.retry.retry.initialinterval: 100ms
       traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
       traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
       traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"
diff --git a/nas-deployment/docker-compose.yaml b/nas-deployment/docker-compose.yaml
index 5b99511f..dbe12eeb 100644
--- a/nas-deployment/docker-compose.yaml
+++ b/nas-deployment/docker-compose.yaml
@@ -42,19 +42,25 @@ x-traefik-labels: &traefik-labels
   traefik.http.routers.abaci.entrypoints: websecure
   traefik.http.routers.abaci.tls: "true"
   traefik.http.routers.abaci.tls.certresolver: myresolver
-  traefik.http.routers.abaci.middlewares: hsts@docker
+  # Chain middlewares: retry failed requests, then HSTS headers
+  traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker
   traefik.http.routers.abaci.service: abaci
   traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
   traefik.http.routers.abaci-http.entrypoints: web
   traefik.http.routers.abaci-http.middlewares: redirect-https@docker
   traefik.http.services.abaci.loadbalancer.server.port: "3000"
+  # Faster health checks for quicker failover during deployments
   traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
-  traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s
-  # Sticky sessions required for Socket.IO and remote camera sessions
-  # Without this, desktop and phone may hit different instances
+  traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s
+  traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s
+  # Sticky sessions for Socket.IO (Redis handles cross-instance state)
+  # If the pinned server is unhealthy, Traefik fails over; the retry middleware also helps
   traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
   traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
   traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
+  # Retry middleware: retry on another server if request fails (zero-downtime deploys)
+  traefik.http.middlewares.retry.retry.attempts: "3"
+  traefik.http.middlewares.retry.retry.initialinterval: 100ms
   traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
   traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
   traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"