feat(observability): add OpenTelemetry tracing with Tempo backend

- Add instrumentation.js for OTel SDK bootstrap via --require flag
- Add tracing.ts utility functions (getCurrentTraceId, recordError, withSpan)
- Install @opentelemetry packages for auto-instrumentation
- Update Dockerfile to copy instrumentation.js and use --require
- Add trace IDs to error responses in API routes

Traces are exported to Tempo via OTLP/gRPC when OTEL_ENABLED=true, or
automatically when running in production on Kubernetes (NODE_ENV=production
and the KUBERNETES_SERVICE_HOST env var present).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Thomas Hallock 2026-01-24 16:31:18 -06:00
parent 8362db4572
commit dcad5bca46
7 changed files with 1708 additions and 4 deletions

View File

@ -151,6 +151,7 @@ COPY --from=builder --chown=nextjs:nodejs /app/apps/web/styled-system ./apps/web
# Copy server files (compiled from TypeScript) # Copy server files (compiled from TypeScript)
COPY --from=builder --chown=nextjs:nodejs /app/apps/web/server.js ./apps/web/ COPY --from=builder --chown=nextjs:nodejs /app/apps/web/server.js ./apps/web/
COPY --from=builder --chown=nextjs:nodejs /app/apps/web/instrumentation.js ./apps/web/
COPY --from=builder --chown=nextjs:nodejs /app/apps/web/dist ./apps/web/dist COPY --from=builder --chown=nextjs:nodejs /app/apps/web/dist ./apps/web/dist
# Copy database migrations # Copy database migrations
@ -204,4 +205,5 @@ ENV NODE_ENV=production
# Default: run without LiteFS (for local dev and Docker Compose) # Default: run without LiteFS (for local dev and Docker Compose)
# For k8s with LiteFS: override with command "litefs mount" and run as root # For k8s with LiteFS: override with command "litefs mount" and run as root
CMD ["node", "server.js"] # Use --require to load OpenTelemetry instrumentation before any other modules
CMD ["node", "--require", "./instrumentation.js", "server.js"]

View File

@ -0,0 +1,85 @@
/**
* OpenTelemetry Instrumentation Bootstrap
*
* This file must be loaded BEFORE any other modules via:
* node --require ./instrumentation.js server.js
*
* Environment variables:
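* - OTEL_ENABLED: Set to 'true' to enable tracing (tracing is also enabled
*   automatically when NODE_ENV is 'production' and KUBERNETES_SERVICE_HOST is set)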
* - OTEL_EXPORTER_OTLP_ENDPOINT: Tempo endpoint (default: http://tempo:4317)
* - OTEL_SERVICE_NAME: Service name for traces (default: abaci-app)
*/
// Tracing is on when explicitly requested via OTEL_ENABLED=true, or implicitly
// when NODE_ENV is 'production' and KUBERNETES_SERVICE_HOST is set.
const isEnabled =
  process.env.OTEL_ENABLED === 'true' ||
  (process.env.NODE_ENV === 'production' && Boolean(process.env.KUBERNETES_SERVICE_HOST))

// Incoming request paths that should not produce server spans (probes, metrics
// scrapes and static assets would otherwise dominate the trace volume).
const IGNORED_INCOMING_PATHS = [
  /^\/api\/health/,
  /^\/api\/metrics/,
  /^\/api\/heartbeat/,
  /^\/_next\/static/,
  /^\/favicon\.ico/,
]

/**
 * Decide whether the HTTP instrumentation should skip an incoming request.
 *
 * @param request - The incoming request; only its `url` property is read.
 * @returns `true` when the request URL starts with one of the ignored paths.
 */
function shouldIgnoreIncomingRequest(request) {
  const url = (request && request.url) || ''
  return IGNORED_INCOMING_PATHS.some((pattern) => pattern.test(url))
}

if (isEnabled) {
  // Required lazily so that a disabled deployment never loads the OTel packages.
  const { NodeSDK } = require('@opentelemetry/sdk-node')
  const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node')
  const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc')
  const resources = require('@opentelemetry/resources')
  const semconv = require('@opentelemetry/semantic-conventions')

  const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://tempo:4317'
  const serviceName = process.env.OTEL_SERVICE_NAME || 'abaci-app'
  const serviceVersion = process.env.npm_package_version || '0.0.0'
  const environment = process.env.NODE_ENV || 'development'
  const podName = process.env.HOSTNAME || 'unknown'

  console.log(`[Tracing] Initializing OpenTelemetry - endpoint: ${endpoint}, service: ${serviceName}, pod: ${podName}`)

  // ATTR_DEPLOYMENT_ENVIRONMENT_NAME is not part of the stable
  // semantic-conventions entry point; fall back to the literal attribute name
  // so the key never becomes the string "undefined".
  const deploymentEnvironmentKey =
    semconv.ATTR_DEPLOYMENT_ENVIRONMENT_NAME || 'deployment.environment.name'

  const resourceAttributes = {
    [semconv.ATTR_SERVICE_NAME]: serviceName,
    [semconv.ATTR_SERVICE_VERSION]: serviceVersion,
    [deploymentEnvironmentKey]: environment,
    'k8s.pod.name': podName,
  }

  // @opentelemetry/resources 2.x replaced the `Resource` constructor with the
  // `resourceFromAttributes` factory; keep the constructor as a fallback so the
  // script also works if a 1.x version is installed.
  const resource =
    typeof resources.resourceFromAttributes === 'function'
      ? resources.resourceFromAttributes(resourceAttributes)
      : new resources.Resource(resourceAttributes)

  const traceExporter = new OTLPTraceExporter({
    url: endpoint,
  })

  const sdk = new NodeSDK({
    resource,
    traceExporter,
    instrumentations: [
      getNodeAutoInstrumentations({
        // Disable noisy instrumentations
        '@opentelemetry/instrumentation-fs': { enabled: false },
        '@opentelemetry/instrumentation-dns': { enabled: false },
        '@opentelemetry/instrumentation-net': { enabled: false },
        // Configure HTTP instrumentation. `ignoreIncomingPaths` is no longer a
        // supported option, so the filtering is done through the request hook.
        '@opentelemetry/instrumentation-http': {
          ignoreIncomingRequestHook: shouldIgnoreIncomingRequest,
        },
      }),
    ],
  })

  sdk.start()
  console.log('[Tracing] OpenTelemetry SDK started')

  // Graceful shutdown: flush pending spans when the process is asked to stop.
  // NOTE(review): these listeners only shut the SDK down and do not exit the
  // process; this presumably relies on server.js handling the same signals —
  // confirm.
  const shutdown = () => {
    sdk
      .shutdown()
      .then(() => console.log('[Tracing] SDK shut down successfully'))
      .catch((error) => console.error('[Tracing] Error shutting down SDK:', error))
  }
  process.on('SIGTERM', shutdown)
  process.on('SIGINT', shutdown)
} else {
  console.log('[Tracing] OpenTelemetry disabled (set OTEL_ENABLED=true or run in k8s to enable)')
}

View File

@ -32,6 +32,12 @@
"@dnd-kit/utilities": "^3.2.2", "@dnd-kit/utilities": "^3.2.2",
"@flatten-js/core": "^1.6.8", "@flatten-js/core": "^1.6.8",
"@number-flow/react": "^0.5.10", "@number-flow/react": "^0.5.10",
"@opentelemetry/api": "^1.9.0",
"@opentelemetry/auto-instrumentations-node": "^0.69.0",
"@opentelemetry/exporter-trace-otlp-grpc": "^0.211.0",
"@opentelemetry/resources": "^2.5.0",
"@opentelemetry/sdk-node": "^0.211.0",
"@opentelemetry/semantic-conventions": "^1.39.0",
"@paralleldrive/cuid2": "^2.2.2", "@paralleldrive/cuid2": "^2.2.2",
"@radix-ui/react-accordion": "^1.1.2", "@radix-ui/react-accordion": "^1.1.2",
"@radix-ui/react-checkbox": "^1.0.4", "@radix-ui/react-checkbox": "^1.0.4",

View File

@ -16,6 +16,7 @@ import type { WorksheetFormState, WorksheetProblem } from '@/app/create/workshee
import { db } from '@/db' import { db } from '@/db'
import { worksheetShares } from '@/db/schema' import { worksheetShares } from '@/db/schema'
import { generateShareId } from '@/lib/generateShareId' import { generateShareId } from '@/lib/generateShareId'
import { getCurrentTraceId, recordError } from '@/lib/tracing'
export async function POST(request: NextRequest) { export async function POST(request: NextRequest) {
const startTime = Date.now() const startTime = Date.now()
@ -146,6 +147,9 @@ export async function POST(request: NextRequest) {
}) })
} catch (error) { } catch (error) {
console.error('Typst compilation error:', error) console.error('Typst compilation error:', error)
if (error instanceof Error) {
recordError(error)
}
// Extract the actual Typst error message // Extract the actual Typst error message
const stderr = const stderr =
@ -153,10 +157,12 @@ export async function POST(request: NextRequest) {
? String((error as any).stderr) ? String((error as any).stderr)
: 'Unknown compilation error' : 'Unknown compilation error'
const traceId = getCurrentTraceId()
return NextResponse.json( return NextResponse.json(
{ {
error: 'Failed to compile worksheet PDF', error: 'Failed to compile worksheet PDF',
details: stderr, details: stderr,
...(traceId && { traceId }),
...(process.env.NODE_ENV === 'development' && { ...(process.env.NODE_ENV === 'development' && {
typstSource: typstSource.split('\n').slice(0, 20).join('\n') + '\n...', typstSource: typstSource.split('\n').slice(0, 20).join('\n') + '\n...',
}), }),
@ -181,14 +187,19 @@ export async function POST(request: NextRequest) {
}) })
} catch (error) { } catch (error) {
console.error('Error generating worksheet:', error) console.error('Error generating worksheet:', error)
if (error instanceof Error) {
recordError(error)
}
const errorMessage = error instanceof Error ? error.message : String(error) const errorMessage = error instanceof Error ? error.message : String(error)
const errorStack = error instanceof Error ? error.stack : undefined const errorStack = error instanceof Error ? error.stack : undefined
const traceId = getCurrentTraceId()
return NextResponse.json( return NextResponse.json(
{ {
error: 'Failed to generate worksheet', error: 'Failed to generate worksheet',
message: errorMessage, message: errorMessage,
...(traceId && { traceId }),
...(process.env.NODE_ENV === 'development' && { stack: errorStack }), ...(process.env.NODE_ENV === 'development' && { stack: errorStack }),
}, },
{ status: 500 } { status: 500 }

View File

@ -11,6 +11,7 @@ import { canPerformAction } from '@/lib/classroom'
import { getDbUserId } from '@/lib/viewer' import { getDbUserId } from '@/lib/viewer'
import type { GameResultsReport } from '@/lib/arcade/game-sdk/types' import type { GameResultsReport } from '@/lib/arcade/game-sdk/types'
import { metrics } from '@/lib/metrics' import { metrics } from '@/lib/metrics'
import { getCurrentTraceId, recordError } from '@/lib/tracing'
interface SaveGameResultRequest { interface SaveGameResultRequest {
playerId: string playerId: string
@ -82,6 +83,13 @@ export async function POST(request: Request) {
return NextResponse.json(result[0]) return NextResponse.json(result[0])
} catch (error) { } catch (error) {
console.error('Error saving game result:', error) console.error('Error saving game result:', error)
return NextResponse.json({ error: 'Failed to save game result' }, { status: 500 }) if (error instanceof Error) {
recordError(error)
}
const traceId = getCurrentTraceId()
return NextResponse.json(
{ error: 'Failed to save game result', ...(traceId && { traceId }) },
{ status: 500 }
)
} }
} }

110
apps/web/src/lib/tracing.ts Normal file
View File

@ -0,0 +1,110 @@
/**
* OpenTelemetry Tracing Utilities
*
* This file exports helper functions for working with traces in the application.
* The OpenTelemetry SDK is initialized via instrumentation.js (loaded with --require).
*
* Usage:
* import { getCurrentTraceId, recordError } from '@/lib/tracing'
*
* // In error handlers:
* const traceId = getCurrentTraceId()
* return NextResponse.json({ error: 'Something failed', traceId }, { status: 500 })
*/
import { trace, context, SpanStatusCode } from '@opentelemetry/api'
// Export utilities for manual tracing
/**
 * Look up the trace ID of the span that is active in the current context.
 *
 * Intended for attaching a correlation ID to error responses.
 *
 * @returns The active span's trace ID, or `null` when no span is active.
 */
export function getCurrentTraceId(): string | null {
  const activeSpan = trace.getActiveSpan()
  return activeSpan ? activeSpan.spanContext().traceId : null
}
/**
 * Read the trace and span identifiers of the currently active span, e.g. to
 * attach them to log lines.
 *
 * @returns `{ traceId, spanId }` for the active span, or `null` when no span
 *   is active.
 */
export function getTraceContext(): { traceId: string; spanId: string } | null {
  const activeSpan = trace.getActiveSpan()
  if (!activeSpan) {
    return null
  }
  const { traceId, spanId } = activeSpan.spanContext()
  return { traceId, spanId }
}
/**
 * Attach an error to the currently active span and mark the span as failed.
 *
 * Does nothing when no span is active.
 *
 * @param error - The error to record as an exception on the span.
 * @param attributes - Optional extra attributes to set on the span.
 */
export function recordError(error: Error, attributes?: Record<string, string>): void {
  const activeSpan = trace.getActiveSpan()
  if (activeSpan) {
    activeSpan.recordException(error)
    activeSpan.setStatus({ code: SpanStatusCode.ERROR, message: error.message })
    if (attributes) {
      activeSpan.setAttributes(attributes)
    }
  }
}
/**
 * Run a synchronous operation inside a new active span.
 *
 * The span is ended as soon as `fn` returns or throws, so a `fn` that returns
 * a Promise would have its span closed before the Promise settles; use
 * `withSpanAsync` for asynchronous work.
 *
 * @param name - Span name.
 * @param fn - The operation to trace.
 * @param attributes - Optional attributes set on the span before `fn` runs.
 * @returns Whatever `fn` returns.
 * @throws Rethrows anything thrown by `fn` after marking the span as failed.
 */
export function withSpan<T>(
  name: string,
  fn: () => T,
  attributes?: Record<string, string>
): T {
  const tracer = trace.getTracer('abaci-app')
  return tracer.startActiveSpan(name, (span) => {
    try {
      if (attributes) {
        span.setAttributes(attributes)
      }
      return fn()
    } catch (error) {
      // Mark the span as failed for every thrown value, not only Error
      // instances, so a `throw 'message'` does not look like a success.
      const exception = error instanceof Error ? error : String(error)
      span.recordException(exception)
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: typeof exception === 'string' ? exception : exception.message,
      })
      throw error
    } finally {
      // Single exit point: the span is ended on both success and failure.
      span.end()
    }
  })
}
/**
 * Run an asynchronous operation inside a new active span.
 *
 * The span stays open until the Promise returned by `fn` settles.
 *
 * @param name - Span name.
 * @param fn - The asynchronous operation to trace.
 * @param attributes - Optional attributes set on the span before `fn` runs.
 * @returns A Promise resolving to the value `fn` resolves to.
 * @throws Rejects with whatever `fn` throws or rejects with, after marking
 *   the span as failed.
 */
export async function withSpanAsync<T>(
  name: string,
  fn: () => Promise<T>,
  attributes?: Record<string, string>
): Promise<T> {
  const tracer = trace.getTracer('abaci-app')
  return tracer.startActiveSpan(name, async (span) => {
    try {
      if (attributes) {
        span.setAttributes(attributes)
      }
      // Await here so the `finally` below runs only after `fn` has settled.
      return await fn()
    } catch (error) {
      // Mark the span as failed for every rejection value, not only Error
      // instances, so a non-Error rejection does not look like a success.
      const exception = error instanceof Error ? error : String(error)
      span.recordException(exception)
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: typeof exception === 'string' ? exception : exception.message,
      })
      throw error
    } finally {
      // Single exit point: the span is ended on both success and failure.
      span.end()
    }
  })
}
export { trace, context }

File diff suppressed because it is too large Load Diff