fix: add retry middleware for zero-downtime deployments
The problem: During deployments, users pinned via sticky session to the restarting container experienced ~60s of downtime because:
1. Health checks were too slow (10s interval)
2. No retry on failure - requests just failed

The fix:
- Add retry middleware: 3 attempts with 100ms initial interval
- Reduce health check interval from 10s to 3s
- Add health check timeout of 2s

Now when your pinned server restarts:
1. Request fails
2. Traefik retries on the OTHER healthy server
3. You get a response (maybe with new server_id cookie)

Combined with Redis for session state, this should give true zero-downtime deployments.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
49eb2c8c36
commit
2082710ab2
|
|
@ -26,19 +26,25 @@ services:
|
||||||
traefik.http.routers.abaci.entrypoints: websecure
|
traefik.http.routers.abaci.entrypoints: websecure
|
||||||
traefik.http.routers.abaci.tls: "true"
|
traefik.http.routers.abaci.tls: "true"
|
||||||
traefik.http.routers.abaci.tls.certresolver: myresolver
|
traefik.http.routers.abaci.tls.certresolver: myresolver
|
||||||
traefik.http.routers.abaci.middlewares: hsts@docker
|
# Chain middlewares: retry failed requests, then HSTS headers
|
||||||
|
traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker
|
||||||
traefik.http.routers.abaci.service: abaci
|
traefik.http.routers.abaci.service: abaci
|
||||||
traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
|
traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
|
||||||
traefik.http.routers.abaci-http.entrypoints: web
|
traefik.http.routers.abaci-http.entrypoints: web
|
||||||
traefik.http.routers.abaci-http.middlewares: redirect-https@docker
|
traefik.http.routers.abaci-http.middlewares: redirect-https@docker
|
||||||
traefik.http.services.abaci.loadbalancer.server.port: "3000"
|
traefik.http.services.abaci.loadbalancer.server.port: "3000"
|
||||||
|
# Faster health checks for quicker failover during deployments
|
||||||
traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
|
traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
|
||||||
traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s
|
traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s
|
||||||
# Sticky sessions required for Socket.IO and remote camera sessions
|
traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s
|
||||||
# Without this, desktop and phone may hit different instances
|
# Sticky sessions for Socket.IO (Redis handles cross-instance state)
|
||||||
|
# If the pinned server is unhealthy, Traefik will fail over, and the retry middleware helps
|
||||||
traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
|
traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
|
||||||
traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
|
traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
|
||||||
traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
|
traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
|
||||||
|
# Retry middleware: retry on another server if request fails (zero-downtime deploys)
|
||||||
|
traefik.http.middlewares.retry.retry.attempts: "3"
|
||||||
|
traefik.http.middlewares.retry.retry.initialinterval: 100ms
|
||||||
traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
|
traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
|
||||||
traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
|
traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
|
||||||
traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"
|
traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"
|
||||||
|
|
|
||||||
|
|
@ -26,19 +26,25 @@ services:
|
||||||
traefik.http.routers.abaci.entrypoints: websecure
|
traefik.http.routers.abaci.entrypoints: websecure
|
||||||
traefik.http.routers.abaci.tls: "true"
|
traefik.http.routers.abaci.tls: "true"
|
||||||
traefik.http.routers.abaci.tls.certresolver: myresolver
|
traefik.http.routers.abaci.tls.certresolver: myresolver
|
||||||
traefik.http.routers.abaci.middlewares: hsts@docker
|
# Chain middlewares: retry failed requests, then HSTS headers
|
||||||
|
traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker
|
||||||
traefik.http.routers.abaci.service: abaci
|
traefik.http.routers.abaci.service: abaci
|
||||||
traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
|
traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
|
||||||
traefik.http.routers.abaci-http.entrypoints: web
|
traefik.http.routers.abaci-http.entrypoints: web
|
||||||
traefik.http.routers.abaci-http.middlewares: redirect-https@docker
|
traefik.http.routers.abaci-http.middlewares: redirect-https@docker
|
||||||
traefik.http.services.abaci.loadbalancer.server.port: "3000"
|
traefik.http.services.abaci.loadbalancer.server.port: "3000"
|
||||||
|
# Faster health checks for quicker failover during deployments
|
||||||
traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
|
traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
|
||||||
traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s
|
traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s
|
||||||
# Sticky sessions required for Socket.IO and remote camera sessions
|
traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s
|
||||||
# Without this, desktop and phone may hit different instances
|
# Sticky sessions for Socket.IO (Redis handles cross-instance state)
|
||||||
|
# If the pinned server is unhealthy, Traefik will fail over, and the retry middleware helps
|
||||||
traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
|
traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
|
||||||
traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
|
traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
|
||||||
traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
|
traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
|
||||||
|
# Retry middleware: retry on another server if request fails (zero-downtime deploys)
|
||||||
|
traefik.http.middlewares.retry.retry.attempts: "3"
|
||||||
|
traefik.http.middlewares.retry.retry.initialinterval: 100ms
|
||||||
traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
|
traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
|
||||||
traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
|
traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
|
||||||
traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"
|
traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"
|
||||||
|
|
|
||||||
|
|
@ -42,19 +42,25 @@ x-traefik-labels: &traefik-labels
|
||||||
traefik.http.routers.abaci.entrypoints: websecure
|
traefik.http.routers.abaci.entrypoints: websecure
|
||||||
traefik.http.routers.abaci.tls: "true"
|
traefik.http.routers.abaci.tls: "true"
|
||||||
traefik.http.routers.abaci.tls.certresolver: myresolver
|
traefik.http.routers.abaci.tls.certresolver: myresolver
|
||||||
traefik.http.routers.abaci.middlewares: hsts@docker
|
# Chain middlewares: retry failed requests, then HSTS headers
|
||||||
|
traefik.http.routers.abaci.middlewares: retry@docker,hsts@docker
|
||||||
traefik.http.routers.abaci.service: abaci
|
traefik.http.routers.abaci.service: abaci
|
||||||
traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
|
traefik.http.routers.abaci-http.rule: "Host(`abaci.one`)"
|
||||||
traefik.http.routers.abaci-http.entrypoints: web
|
traefik.http.routers.abaci-http.entrypoints: web
|
||||||
traefik.http.routers.abaci-http.middlewares: redirect-https@docker
|
traefik.http.routers.abaci-http.middlewares: redirect-https@docker
|
||||||
traefik.http.services.abaci.loadbalancer.server.port: "3000"
|
traefik.http.services.abaci.loadbalancer.server.port: "3000"
|
||||||
|
# Faster health checks for quicker failover during deployments
|
||||||
traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
|
traefik.http.services.abaci.loadbalancer.healthcheck.path: /api/health
|
||||||
traefik.http.services.abaci.loadbalancer.healthcheck.interval: 10s
|
traefik.http.services.abaci.loadbalancer.healthcheck.interval: 3s
|
||||||
# Sticky sessions required for Socket.IO and remote camera sessions
|
traefik.http.services.abaci.loadbalancer.healthcheck.timeout: 2s
|
||||||
# Without this, desktop and phone may hit different instances
|
# Sticky sessions for Socket.IO (Redis handles cross-instance state)
|
||||||
|
# If the pinned server is unhealthy, Traefik will fail over, and the retry middleware helps
|
||||||
traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
|
traefik.http.services.abaci.loadbalancer.sticky.cookie.name: server_id
|
||||||
traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
|
traefik.http.services.abaci.loadbalancer.sticky.cookie.secure: "true"
|
||||||
traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
|
traefik.http.services.abaci.loadbalancer.sticky.cookie.httpOnly: "true"
|
||||||
|
# Retry middleware: retry on another server if request fails (zero-downtime deploys)
|
||||||
|
traefik.http.middlewares.retry.retry.attempts: "3"
|
||||||
|
traefik.http.middlewares.retry.retry.initialinterval: 100ms
|
||||||
traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
|
traefik.http.middlewares.redirect-https.redirectscheme.scheme: https
|
||||||
traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
|
traefik.http.middlewares.redirect-https.redirectscheme.permanent: "true"
|
||||||
traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"
|
traefik.http.middlewares.hsts.headers.stsSeconds: "63072000"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue