From 2082710ab25d58ef2eac9f9cc150ceff0e7bfe37 Mon Sep 17 00:00:00 2001 From: Thomas Hallock Date: Fri, 16 Jan 2026 05:25:19 -0600 Subject: [PATCH] fix: add retry middleware for zero-downtime deployments The problem: During deployments, users pinned via sticky session to the restarting container experienced ~60s of downtime because: 1. Health checks were too slow (10s interval) 2. No retry on failure - requests just failed The fix: - Add retry middleware: 3 attempts with 100ms initial interval - Reduce health check interval from 10s to 3s - Add health check timeout of 2s Now when your pinned server restarts: 1. Request fails 2. Traefik retries the request; with sticky cookies the retry reaches the OTHER healthy server once the health check has marked the pinned one down 3. You get a response (possibly with a new server_id cookie) Combined with Redis for session state, this should give true zero-downtime deployments. Co-Authored-By: Claude Opus 4.5 --- nas-deployment/docker-compose.blue.yaml | 14 ++++++++++---- nas-deployment/docker-compose.green.yaml | 14 ++++++++++---- nas-deployment/docker-compose.yaml | 14 ++++++++++---- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/nas-deployment/docker-compose.blue.yaml b/nas-deployment/docker-compose.blue.yaml index f61817e6..c195487f 100644 --- a/nas-deployment/docker-compose.blue.yaml +++ b/nas-deployment/docker-compose.blue.yaml @@ -26,19 +26,25 @@ services: traefik.http.routers.abaci.entrypoints: websecure traefik.http.routers.abaci.tls: "true" traefik.http.routers.abaci.tls.certresolver: myresolver - traefik.http.routers.abaci.middlewares: hsts@docker + # Chain middlewares: retry failed requests, then HSTS headers + traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker traefik.http.routers.abaci.service: abaci traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)" traefik.http.routers.abaci-http.entrypoints: web traefik.http.routers.abaci-http.middlewares: redirect-https@docker traefik.http.services.abaci.loadbalancer.server.port: "3000" + # Faster health checks for quicker failover during 
deployments traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health - traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s - # Sticky sessions required for Socket.IO and remote camera sessions - # Without this, desktop and phone may hit different instances + traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s + traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s + # Sticky sessions for Socket.IO (Redis handles cross-instance state) + # If the pinned server is unhealthy, Traefik will fail over; the retry middleware also helps traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true" traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true" + # Retry middleware: retry on another server if request fails (zero-downtime deploys) + traefik.http.middlewares.retry.retry.attempts: "3" + traefik.http.middlewares.retry.retry.initialinterval: 100ms traefik.http.middlewares.redirect-https.redirectscheme.scheme: https traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true" traefik.http.middlewares.hsts.headers.stsSeconds: "63072000" diff --git a/nas-deployment/docker-compose.green.yaml b/nas-deployment/docker-compose.green.yaml index a7f81156..2f2d8d8a 100644 --- a/nas-deployment/docker-compose.green.yaml +++ b/nas-deployment/docker-compose.green.yaml @@ -26,19 +26,25 @@ services: traefik.http.routers.abaci.entrypoints: websecure traefik.http.routers.abaci.tls: "true" traefik.http.routers.abaci.tls.certresolver: myresolver - traefik.http.routers.abaci.middlewares: hsts@docker + # Chain middlewares: retry failed requests, then HSTS headers + traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker traefik.http.routers.abaci.service: abaci traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)" traefik.http.routers.abaci-http.entrypoints: web traefik.http.routers.abaci-http.middlewares: redirect-https@docker 
traefik.http.services.abaci.loadbalancer.server.port: "3000" + # Faster health checks for quicker failover during deployments traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health - traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s - # Sticky sessions required for Socket.IO and remote camera sessions - # Without this, desktop and phone may hit different instances + traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s + traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s + # Sticky sessions for Socket.IO (Redis handles cross-instance state) + # If the pinned server is unhealthy, Traefik will fail over; the retry middleware also helps traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true" traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true" + # Retry middleware: retry on another server if request fails (zero-downtime deploys) + traefik.http.middlewares.retry.retry.attempts: "3" + traefik.http.middlewares.retry.retry.initialinterval: 100ms traefik.http.middlewares.redirect-https.redirectscheme.scheme: https traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true" traefik.http.middlewares.hsts.headers.stsSeconds: "63072000" diff --git a/nas-deployment/docker-compose.yaml b/nas-deployment/docker-compose.yaml index 5b99511f..dbe12eeb 100644 --- a/nas-deployment/docker-compose.yaml +++ b/nas-deployment/docker-compose.yaml @@ -42,19 +42,25 @@ x-traefik-labels: &traefik-labels traefik.http.routers.abaci.entrypoints: websecure traefik.http.routers.abaci.tls: "true" traefik.http.routers.abaci.tls.certresolver: myresolver - traefik.http.routers.abaci.middlewares: hsts@docker + # Chain middlewares: retry failed requests, then HSTS headers + traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker traefik.http.routers.abaci.service: abaci traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)" 
traefik.http.routers.abaci-http.entrypoints: web traefik.http.routers.abaci-http.middlewares: redirect-https@docker traefik.http.services.abaci.loadbalancer.server.port: "3000" + # Faster health checks for quicker failover during deployments traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health - traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s - # Sticky sessions required for Socket.IO and remote camera sessions - # Without this, desktop and phone may hit different instances + traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s + traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s + # Sticky sessions for Socket.IO (Redis handles cross-instance state) + # If the pinned server is unhealthy, Traefik will fail over; the retry middleware also helps traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true" traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true" + # Retry middleware: retry on another server if request fails (zero-downtime deploys) + traefik.http.middlewares.retry.retry.attempts: "3" + traefik.http.middlewares.retry.retry.initialinterval: 100ms traefik.http.middlewares.redirect-https.redirectscheme.scheme: https traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true" traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"