fix: add retry middleware for zero-downtime deployments

The problem: During deployments, users pinned via sticky session to
the restarting container experienced ~60s of downtime because:
1. Health checks ran too infrequently (10s interval), so failover
   away from the restarting container was slow
2. There was no retry on failure - requests just failed

The fix:
- Add retry middleware: 3 attempts with a 100ms initial interval
- Reduce the health check interval from 10s to 3s
- Add a health check timeout of 2s

Now when your pinned server restarts:
1. The request to the pinned server fails
2. Traefik retries it on the OTHER, healthy server
3. You get a response (possibly with a new server_id cookie)

Combined with Redis for session state, this should give true
zero-downtime deployments.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Thomas Hallock 2026-01-16 05:25:19 -06:00
parent 49eb2c8c36
commit 2082710ab2
3 changed files with 30 additions and 12 deletions

View File

@ -26,19 +26,25 @@ services:
traefik.http.routers.abaci.entrypoints: websecure traefik.http.routers.abaci.entrypoints: websecure
traefik.http.routers.abaci.tls: "true" traefik.http.routers.abaci.tls: "true"
traefik.http.routers.abaci.tls.certresolver: myresolver traefik.http.routers.abaci.tls.certresolver: myresolver
traefik.http.routers.abaci.middlewares: hsts@docker # Chain middlewares: retry failed requests, then HSTS headers
traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker
traefik.http.routers.abaci.service: abaci traefik.http.routers.abaci.service: abaci
traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)" traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
traefik.http.routers.abaci-http.entrypoints: web traefik.http.routers.abaci-http.entrypoints: web
traefik.http.routers.abaci-http.middlewares: redirect-https@docker traefik.http.routers.abaci-http.middlewares: redirect-https@docker
traefik.http.services.abaci.loadbalancer.server.port: "3000" traefik.http.services.abaci.loadbalancer.server.port: "3000"
# Faster health checks for quicker failover during deployments
traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s
# Sticky sessions required for Socket.IO and remote camera sessions traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s
# Without this, desktop and phone may hit different instances # Sticky sessions for Socket.IO (Redis handles cross-instance state)
# If pinned server is unhealthy, Traefik will failover + retry middleware helps
traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true" traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true" traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
# Retry middleware: retry on another server if request fails (zero-downtime deploys)
traefik.http.middlewares.retry.retry.attempts: "3"
traefik.http.middlewares.retry.retry.initialinterval: 100ms
traefik.http.middlewares.redirect-https.redirectscheme.scheme: https traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true" traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
traefik.http.middlewares.hsts.headers.stsSeconds: "63072000" traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"

View File

@ -26,19 +26,25 @@ services:
traefik.http.routers.abaci.entrypoints: websecure traefik.http.routers.abaci.entrypoints: websecure
traefik.http.routers.abaci.tls: "true" traefik.http.routers.abaci.tls: "true"
traefik.http.routers.abaci.tls.certresolver: myresolver traefik.http.routers.abaci.tls.certresolver: myresolver
traefik.http.routers.abaci.middlewares: hsts@docker # Chain middlewares: retry failed requests, then HSTS headers
traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker
traefik.http.routers.abaci.service: abaci traefik.http.routers.abaci.service: abaci
traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)" traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
traefik.http.routers.abaci-http.entrypoints: web traefik.http.routers.abaci-http.entrypoints: web
traefik.http.routers.abaci-http.middlewares: redirect-https@docker traefik.http.routers.abaci-http.middlewares: redirect-https@docker
traefik.http.services.abaci.loadbalancer.server.port: "3000" traefik.http.services.abaci.loadbalancer.server.port: "3000"
# Faster health checks for quicker failover during deployments
traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s
# Sticky sessions required for Socket.IO and remote camera sessions traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s
# Without this, desktop and phone may hit different instances # Sticky sessions for Socket.IO (Redis handles cross-instance state)
# If pinned server is unhealthy, Traefik will failover + retry middleware helps
traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true" traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true" traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
# Retry middleware: retry on another server if request fails (zero-downtime deploys)
traefik.http.middlewares.retry.retry.attempts: "3"
traefik.http.middlewares.retry.retry.initialinterval: 100ms
traefik.http.middlewares.redirect-https.redirectscheme.scheme: https traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true" traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
traefik.http.middlewares.hsts.headers.stsSeconds: "63072000" traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"

View File

@ -42,19 +42,25 @@ x-traefik-labels: &traefik-labels
traefik.http.routers.abaci.entrypoints: websecure traefik.http.routers.abaci.entrypoints: websecure
traefik.http.routers.abaci.tls: "true" traefik.http.routers.abaci.tls: "true"
traefik.http.routers.abaci.tls.certresolver: myresolver traefik.http.routers.abaci.tls.certresolver: myresolver
traefik.http.routers.abaci.middlewares: hsts@docker # Chain middlewares: retry failed requests, then HSTS headers
traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker
traefik.http.routers.abaci.service: abaci traefik.http.routers.abaci.service: abaci
traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)" traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
traefik.http.routers.abaci-http.entrypoints: web traefik.http.routers.abaci-http.entrypoints: web
traefik.http.routers.abaci-http.middlewares: redirect-https@docker traefik.http.routers.abaci-http.middlewares: redirect-https@docker
traefik.http.services.abaci.loadbalancer.server.port: "3000" traefik.http.services.abaci.loadbalancer.server.port: "3000"
# Faster health checks for quicker failover during deployments
traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s
# Sticky sessions required for Socket.IO and remote camera sessions traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s
# Without this, desktop and phone may hit different instances # Sticky sessions for Socket.IO (Redis handles cross-instance state)
# If pinned server is unhealthy, Traefik will failover + retry middleware helps
traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true" traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true" traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
# Retry middleware: retry on another server if request fails (zero-downtime deploys)
traefik.http.middlewares.retry.retry.attempts: "3"
traefik.http.middlewares.retry.retry.initialinterval: 100ms
traefik.http.middlewares.redirect-https.redirectscheme.scheme: https traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true" traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
traefik.http.middlewares.hsts.headers.stsSeconds: "63072000" traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"