feat(observability): add OpenTelemetry tracing with Tempo backend
- Add instrumentation.js for OTel SDK bootstrap via --require flag
- Add tracing.ts utility functions (getCurrentTraceId, recordError, withSpan)
- Install @opentelemetry packages for auto-instrumentation
- Update Dockerfile to copy instrumentation.js and use --require
- Add trace IDs to error responses in API routes

Traces are exported to Tempo via OTLP/gRPC when running in production (KUBERNETES_SERVICE_HOST env var present).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
8362db4572
commit
dcad5bca46
|
|
@ -151,6 +151,7 @@ COPY --from=builder --chown=nextjs:nodejs /app/apps/web/styled-system ./apps/web
|
|||
|
||||
# Copy server files (compiled from TypeScript)
|
||||
COPY --from=builder --chown=nextjs:nodejs /app/apps/web/server.js ./apps/web/
|
||||
COPY --from=builder --chown=nextjs:nodejs /app/apps/web/instrumentation.js ./apps/web/
|
||||
COPY --from=builder --chown=nextjs:nodejs /app/apps/web/dist ./apps/web/dist
|
||||
|
||||
# Copy database migrations
|
||||
|
|
@ -204,4 +205,5 @@ ENV NODE_ENV=production
|
|||
|
||||
# Default: run without LiteFS (for local dev and Docker Compose)
|
||||
# For k8s with LiteFS: override with command "litefs mount" and run as root
|
||||
CMD ["node", "server.js"]
|
||||
# Use --require to load OpenTelemetry instrumentation before any other modules
|
||||
CMD ["node", "--require", "./instrumentation.js", "server.js"]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,85 @@
|
|||
/**
 * OpenTelemetry Instrumentation Bootstrap
 *
 * This file must be loaded BEFORE any other modules via:
 *   node --require ./instrumentation.js server.js
 *
 * Tracing is enabled when OTEL_ENABLED is 'true', or when NODE_ENV is
 * 'production' and KUBERNETES_SERVICE_HOST is set. Otherwise this file only
 * logs that tracing is disabled and loads no OpenTelemetry packages.
 *
 * Environment variables:
 * - OTEL_ENABLED: Set to 'true' to enable tracing
 * - OTEL_EXPORTER_OTLP_ENDPOINT: Tempo endpoint (default: http://tempo:4317)
 * - OTEL_SERVICE_NAME: Service name for traces (default: abaci-app)
 */

const isEnabled =
  process.env.OTEL_ENABLED === 'true' ||
  (process.env.NODE_ENV === 'production' && Boolean(process.env.KUBERNETES_SERVICE_HOST))

if (isEnabled) {
  // Required lazily so a disabled deployment never loads the OTel packages.
  const { NodeSDK } = require('@opentelemetry/sdk-node')
  const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node')
  const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc')
  // @opentelemetry/resources 2.x does not export a `Resource` constructor;
  // resources are created with the `resourceFromAttributes` factory instead.
  const { resourceFromAttributes } = require('@opentelemetry/resources')
  const { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } = require('@opentelemetry/semantic-conventions')

  // Spelled out as a literal so the key can never be `undefined`.
  // NOTE(review): this constant appears to be exported only from the
  // `/incubating` entry point of @opentelemetry/semantic-conventions - confirm.
  const ATTR_DEPLOYMENT_ENVIRONMENT_NAME = 'deployment.environment.name'

  const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://tempo:4317'
  const serviceName = process.env.OTEL_SERVICE_NAME || 'abaci-app'
  const serviceVersion = process.env.npm_package_version || '0.0.0'
  const environment = process.env.NODE_ENV || 'development'
  // The value of HOSTNAME is reported as the pod name.
  const podName = process.env.HOSTNAME || 'unknown'

  console.log(`[Tracing] Initializing OpenTelemetry - endpoint: ${endpoint}, service: ${serviceName}, pod: ${podName}`)

  const resource = resourceFromAttributes({
    [ATTR_SERVICE_NAME]: serviceName,
    [ATTR_SERVICE_VERSION]: serviceVersion,
    [ATTR_DEPLOYMENT_ENVIRONMENT_NAME]: environment,
    'k8s.pod.name': podName,
  })

  const traceExporter = new OTLPTraceExporter({
    url: endpoint,
  })

  // Incoming requests whose URL matches any of these patterns are not traced.
  const ignoredIncomingPaths = [
    /^\/api\/health/,
    /^\/api\/metrics/,
    /^\/api\/heartbeat/,
    /^\/_next\/static/,
    /^\/favicon\.ico/,
  ]

  const sdk = new NodeSDK({
    resource,
    traceExporter,
    instrumentations: [
      getNodeAutoInstrumentations({
        // Disable noisy instrumentations
        '@opentelemetry/instrumentation-fs': { enabled: false },
        '@opentelemetry/instrumentation-dns': { enabled: false },
        '@opentelemetry/instrumentation-net': { enabled: false },
        // Configure HTTP instrumentation
        '@opentelemetry/instrumentation-http': {
          // `ignoreIncomingPaths` is no longer a supported option of the HTTP
          // instrumentation; the hook below is its replacement. Returning
          // true skips tracing for that request.
          ignoreIncomingRequestHook: (request) => {
            const url = request.url || ''
            return ignoredIncomingPaths.some((pattern) => pattern.test(url))
          },
        },
      }),
    ],
  })

  sdk.start()
  console.log('[Tracing] OpenTelemetry SDK started')

  // Graceful shutdown: shut the SDK down when the process is asked to stop.
  // NOTE(review): this handler does not exit the process itself; presumably
  // server.js owns process termination - confirm.
  const shutdown = () => {
    sdk
      .shutdown()
      .then(() => console.log('[Tracing] SDK shut down successfully'))
      .catch((error) => console.error('[Tracing] Error shutting down SDK:', error))
  }

  process.on('SIGTERM', shutdown)
  process.on('SIGINT', shutdown)
} else {
  console.log('[Tracing] OpenTelemetry disabled (set OTEL_ENABLED=true or run in k8s to enable)')
}
|
||||
|
|
@ -32,6 +32,12 @@
|
|||
"@dnd-kit/utilities": "^3.2.2",
|
||||
"@flatten-js/core": "^1.6.8",
|
||||
"@number-flow/react": "^0.5.10",
|
||||
"@opentelemetry/api": "^1.9.0",
|
||||
"@opentelemetry/auto-instrumentations-node": "^0.69.0",
|
||||
"@opentelemetry/exporter-trace-otlp-grpc": "^0.211.0",
|
||||
"@opentelemetry/resources": "^2.5.0",
|
||||
"@opentelemetry/sdk-node": "^0.211.0",
|
||||
"@opentelemetry/semantic-conventions": "^1.39.0",
|
||||
"@paralleldrive/cuid2": "^2.2.2",
|
||||
"@radix-ui/react-accordion": "^1.1.2",
|
||||
"@radix-ui/react-checkbox": "^1.0.4",
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ import type { WorksheetFormState, WorksheetProblem } from '@/app/create/workshee
|
|||
import { db } from '@/db'
|
||||
import { worksheetShares } from '@/db/schema'
|
||||
import { generateShareId } from '@/lib/generateShareId'
|
||||
import { getCurrentTraceId, recordError } from '@/lib/tracing'
|
||||
|
||||
export async function POST(request: NextRequest) {
|
||||
const startTime = Date.now()
|
||||
|
|
@ -146,6 +147,9 @@ export async function POST(request: NextRequest) {
|
|||
})
|
||||
} catch (error) {
|
||||
console.error('Typst compilation error:', error)
|
||||
if (error instanceof Error) {
|
||||
recordError(error)
|
||||
}
|
||||
|
||||
// Extract the actual Typst error message
|
||||
const stderr =
|
||||
|
|
@ -153,10 +157,12 @@ export async function POST(request: NextRequest) {
|
|||
? String((error as any).stderr)
|
||||
: 'Unknown compilation error'
|
||||
|
||||
const traceId = getCurrentTraceId()
|
||||
return NextResponse.json(
|
||||
{
|
||||
error: 'Failed to compile worksheet PDF',
|
||||
details: stderr,
|
||||
...(traceId && { traceId }),
|
||||
...(process.env.NODE_ENV === 'development' && {
|
||||
typstSource: typstSource.split('\n').slice(0, 20).join('\n') + '\n...',
|
||||
}),
|
||||
|
|
@ -181,14 +187,19 @@ export async function POST(request: NextRequest) {
|
|||
})
|
||||
} catch (error) {
|
||||
console.error('Error generating worksheet:', error)
|
||||
if (error instanceof Error) {
|
||||
recordError(error)
|
||||
}
|
||||
|
||||
const errorMessage = error instanceof Error ? error.message : String(error)
|
||||
const errorStack = error instanceof Error ? error.stack : undefined
|
||||
const traceId = getCurrentTraceId()
|
||||
|
||||
return NextResponse.json(
|
||||
{
|
||||
error: 'Failed to generate worksheet',
|
||||
message: errorMessage,
|
||||
...(traceId && { traceId }),
|
||||
...(process.env.NODE_ENV === 'development' && { stack: errorStack }),
|
||||
},
|
||||
{ status: 500 }
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ import { canPerformAction } from '@/lib/classroom'
|
|||
import { getDbUserId } from '@/lib/viewer'
|
||||
import type { GameResultsReport } from '@/lib/arcade/game-sdk/types'
|
||||
import { metrics } from '@/lib/metrics'
|
||||
import { getCurrentTraceId, recordError } from '@/lib/tracing'
|
||||
|
||||
interface SaveGameResultRequest {
|
||||
playerId: string
|
||||
|
|
@ -82,6 +83,13 @@ export async function POST(request: Request) {
|
|||
return NextResponse.json(result[0])
|
||||
} catch (error) {
|
||||
console.error('Error saving game result:', error)
|
||||
return NextResponse.json({ error: 'Failed to save game result' }, { status: 500 })
|
||||
if (error instanceof Error) {
|
||||
recordError(error)
|
||||
}
|
||||
const traceId = getCurrentTraceId()
|
||||
return NextResponse.json(
|
||||
{ error: 'Failed to save game result', ...(traceId && { traceId }) },
|
||||
{ status: 500 }
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,110 @@
|
|||
/**
|
||||
* OpenTelemetry Tracing Utilities
|
||||
*
|
||||
* This file exports helper functions for working with traces in the application.
|
||||
* The OpenTelemetry SDK is initialized via instrumentation.js (loaded with --require).
|
||||
*
|
||||
* Usage:
|
||||
* import { getCurrentTraceId, recordError } from '@/lib/tracing'
|
||||
*
|
||||
* // In error handlers:
|
||||
* const traceId = getCurrentTraceId()
|
||||
* return NextResponse.json({ error: 'Something failed', traceId }, { status: 500 })
|
||||
*/
|
||||
|
||||
import { trace, context, SpanStatusCode } from '@opentelemetry/api'
|
||||
|
||||
// Export utilities for manual tracing
|
||||
|
||||
/**
 * Look up the trace ID of the currently active span.
 *
 * Handy for attaching a correlation ID to error responses.
 *
 * @returns The active span's trace ID, or `null` when no span is active.
 */
export function getCurrentTraceId(): string | null {
  const activeSpan = trace.getActiveSpan()
  return activeSpan ? activeSpan.spanContext().traceId : null
}
|
||||
|
||||
/**
 * Read the trace and span identifiers of the currently active span,
 * e.g. for inclusion in log lines.
 *
 * @returns `{ traceId, spanId }` for the active span, or `null` when no span
 *   is active.
 */
export function getTraceContext(): { traceId: string; spanId: string } | null {
  const activeSpan = trace.getActiveSpan()
  if (!activeSpan) {
    return null
  }

  const { traceId, spanId } = activeSpan.spanContext()
  return { traceId, spanId }
}
|
||||
|
||||
/**
 * Attach an error to the currently active span and mark that span as failed.
 *
 * Does nothing when no span is active.
 *
 * @param error - The error to record as an exception on the span; its
 *   message is also used as the span's status message.
 * @param attributes - Optional extra attributes to set on the span.
 */
export function recordError(error: Error, attributes?: Record<string, string>): void {
  const activeSpan = trace.getActiveSpan()

  if (activeSpan) {
    activeSpan.recordException(error)
    activeSpan.setStatus({ code: SpanStatusCode.ERROR, message: error.message })

    if (attributes) {
      activeSpan.setAttributes(attributes)
    }
  }
}
|
||||
|
||||
/**
 * Run a synchronous operation inside a new active span.
 *
 * The span is obtained from the 'abaci-app' tracer and is always ended,
 * whether `fn` returns or throws. If `fn` throws, the thrown value is
 * recorded on the span, the span status is set to ERROR, and the value is
 * rethrown unchanged. Values that are not `Error` instances are recorded
 * using their string form.
 *
 * The span ends as soon as `fn` returns, so for promise-returning work use
 * `withSpanAsync` instead.
 *
 * @param name - Name of the span to create.
 * @param fn - The operation to run while the span is active.
 * @param attributes - Optional attributes set on the span before `fn` runs.
 * @returns Whatever `fn` returns.
 * @throws Whatever `fn` throws.
 */
export function withSpan<T>(
  name: string,
  fn: () => T,
  attributes?: Record<string, string>
): T {
  const tracer = trace.getTracer('abaci-app')
  return tracer.startActiveSpan(name, (span) => {
    try {
      if (attributes) {
        span.setAttributes(attributes)
      }
      return fn()
    } catch (error) {
      // Mark the span as failed for every thrown value, not only Error instances.
      const message = error instanceof Error ? error.message : String(error)
      span.recordException(error instanceof Error ? error : message)
      span.setStatus({ code: SpanStatusCode.ERROR, message })
      throw error
    } finally {
      // Single exit point guarantees the span is ended exactly once.
      span.end()
    }
  })
}
|
||||
|
||||
/**
 * Run an async operation inside a new active span.
 *
 * The span is obtained from the 'abaci-app' tracer and is ended once the
 * promise returned by `fn` settles. If `fn` throws or rejects, the reason is
 * recorded on the span, the span status is set to ERROR, and the reason is
 * rethrown unchanged. Reasons that are not `Error` instances are recorded
 * using their string form.
 *
 * @param name - Name of the span to create.
 * @param fn - The async operation to run while the span is active.
 * @param attributes - Optional attributes set on the span before `fn` runs.
 * @returns A promise resolving to the value `fn` resolves to.
 * @throws Whatever `fn` throws or rejects with.
 */
export async function withSpanAsync<T>(
  name: string,
  fn: () => Promise<T>,
  attributes?: Record<string, string>
): Promise<T> {
  const tracer = trace.getTracer('abaci-app')
  return tracer.startActiveSpan(name, async (span) => {
    try {
      if (attributes) {
        span.setAttributes(attributes)
      }
      // `await` here (rather than returning the promise) keeps the span open
      // until the operation settles and routes rejections into `catch`.
      return await fn()
    } catch (error) {
      // Mark the span as failed for every rejection reason, not only Error instances.
      const message = error instanceof Error ? error.message : String(error)
      span.recordException(error instanceof Error ? error : message)
      span.setStatus({ code: SpanStatusCode.ERROR, message })
      throw error
    } finally {
      // Single exit point guarantees the span is ended exactly once.
      span.end()
    }
  })
}
|
||||
|
||||
export { trace, context }
|
||||
1486
pnpm-lock.yaml
1486
pnpm-lock.yaml
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue