feat(observability): add OpenTelemetry tracing with Tempo backend
- Add instrumentation.js for OTel SDK bootstrap via --require flag - Add tracing.ts utility functions (getCurrentTraceId, recordError, withSpan) - Install @opentelemetry packages for auto-instrumentation - Update Dockerfile to copy instrumentation.js and use --require - Add trace IDs to error responses in API routes Traces are exported to Tempo via OTLP/gRPC when running in production (KUBERNETES_SERVICE_HOST env var present). Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
8362db4572
commit
dcad5bca46
|
|
@ -151,6 +151,7 @@ COPY --from=builder --chown=nextjs:nodejs /app/apps/web/styled-system ./apps/web
|
||||||
|
|
||||||
# Copy server files (compiled from TypeScript)
|
# Copy server files (compiled from TypeScript)
|
||||||
COPY --from=builder --chown=nextjs:nodejs /app/apps/web/server.js ./apps/web/
|
COPY --from=builder --chown=nextjs:nodejs /app/apps/web/server.js ./apps/web/
|
||||||
|
COPY --from=builder --chown=nextjs:nodejs /app/apps/web/instrumentation.js ./apps/web/
|
||||||
COPY --from=builder --chown=nextjs:nodejs /app/apps/web/dist ./apps/web/dist
|
COPY --from=builder --chown=nextjs:nodejs /app/apps/web/dist ./apps/web/dist
|
||||||
|
|
||||||
# Copy database migrations
|
# Copy database migrations
|
||||||
|
|
@ -204,4 +205,5 @@ ENV NODE_ENV=production
|
||||||
|
|
||||||
# Default: run without LiteFS (for local dev and Docker Compose)
|
# Default: run without LiteFS (for local dev and Docker Compose)
|
||||||
# For k8s with LiteFS: override with command "litefs mount" and run as root
|
# For k8s with LiteFS: override with command "litefs mount" and run as root
|
||||||
CMD ["node", "server.js"]
|
# Use --require to load OpenTelemetry instrumentation before any other modules
|
||||||
|
CMD ["node", "--require", "./instrumentation.js", "server.js"]
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,85 @@
|
||||||
|
/**
|
||||||
|
* OpenTelemetry Instrumentation Bootstrap
|
||||||
|
*
|
||||||
|
* This file must be loaded BEFORE any other modules via:
|
||||||
|
* node --require ./instrumentation.js server.js
|
||||||
|
*
|
||||||
|
* Environment variables:
|
||||||
|
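* - OTEL_ENABLED: Set to 'true' to enable tracing (tracing is also enabled automatically when NODE_ENV=production and KUBERNETES_SERVICE_HOST is set)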
|
||||||
|
* - OTEL_EXPORTER_OTLP_ENDPOINT: Tempo endpoint (default: http://tempo:4317)
|
||||||
|
* - OTEL_SERVICE_NAME: Service name for traces (default: abaci-app)
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Tracing is on when explicitly requested via OTEL_ENABLED, or implicitly when
// a production build runs with KUBERNETES_SERVICE_HOST present in the environment.
const isEnabled =
  process.env.OTEL_ENABLED === 'true' ||
  (process.env.NODE_ENV === 'production' && process.env.KUBERNETES_SERVICE_HOST)

// Incoming request paths that should never produce a trace (probes, metrics
// scrapes, static assets). Hoisted so the patterns are built once.
const IGNORED_INCOMING_PATHS = [
  /^\/api\/health/,
  /^\/api\/metrics/,
  /^\/api\/heartbeat/,
  /^\/_next\/static/,
  /^\/favicon\.ico/,
]

/**
 * Decide whether an incoming request URL should be excluded from tracing.
 *
 * @param {string | undefined} url - The request URL (path + query) as seen by the HTTP server.
 * @returns {boolean} true when the URL matches one of IGNORED_INCOMING_PATHS.
 */
function isIgnoredIncomingPath(url) {
  return typeof url === 'string' && IGNORED_INCOMING_PATHS.some((pattern) => pattern.test(url))
}

if (isEnabled) {
  const { NodeSDK } = require('@opentelemetry/sdk-node')
  const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node')
  const { OTLPTraceExporter } = require('@opentelemetry/exporter-trace-otlp-grpc')
  // @opentelemetry/resources 2.x no longer exports a constructible `Resource`
  // class; resources are built with `resourceFromAttributes` instead.
  const { resourceFromAttributes } = require('@opentelemetry/resources')
  const { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } = require('@opentelemetry/semantic-conventions')

  const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT || 'http://tempo:4317'
  const serviceName = process.env.OTEL_SERVICE_NAME || 'abaci-app'
  const serviceVersion = process.env.npm_package_version || '0.0.0'
  const environment = process.env.NODE_ENV || 'development'
  const podName = process.env.HOSTNAME || 'unknown'

  console.log(`[Tracing] Initializing OpenTelemetry - endpoint: ${endpoint}, service: ${serviceName}, pod: ${podName}`)

  const resource = resourceFromAttributes({
    [ATTR_SERVICE_NAME]: serviceName,
    [ATTR_SERVICE_VERSION]: serviceVersion,
    // Written as a literal: the matching ATTR_* constant appears to live only in
    // the package's incubating entry point, and destructuring it from the stable
    // entry would yield `undefined` (producing an attribute literally named "undefined").
    'deployment.environment.name': environment,
    'k8s.pod.name': podName,
  })

  const traceExporter = new OTLPTraceExporter({
    url: endpoint,
  })

  const sdk = new NodeSDK({
    resource,
    traceExporter,
    instrumentations: [
      getNodeAutoInstrumentations({
        // Disable noisy instrumentations
        '@opentelemetry/instrumentation-fs': { enabled: false },
        '@opentelemetry/instrumentation-dns': { enabled: false },
        '@opentelemetry/instrumentation-net': { enabled: false },
        // Configure HTTP instrumentation.
        // `ignoreIncomingPaths` is no longer a supported option of the HTTP
        // instrumentation, so the filter is expressed as a request hook.
        '@opentelemetry/instrumentation-http': {
          ignoreIncomingRequestHook: (request) => isIgnoredIncomingPath(request.url),
        },
      }),
    ],
  })

  sdk.start()
  console.log('[Tracing] OpenTelemetry SDK started')

  // Graceful shutdown: flush pending spans when the process is asked to stop.
  // NOTE(review): registering SIGTERM/SIGINT listeners replaces Node's default
  // terminate-on-signal behavior and this handler does not exit the process —
  // confirm server.js performs the actual exit.
  const shutdown = () => {
    sdk
      .shutdown()
      .then(() => console.log('[Tracing] SDK shut down successfully'))
      .catch((error) => console.error('[Tracing] Error shutting down SDK:', error))
  }

  process.on('SIGTERM', shutdown)
  process.on('SIGINT', shutdown)
} else {
  console.log('[Tracing] OpenTelemetry disabled (set OTEL_ENABLED=true or run in k8s to enable)')
}
|
||||||
|
|
@ -32,6 +32,12 @@
|
||||||
"@dnd-kit/utilities": "^3.2.2",
|
"@dnd-kit/utilities": "^3.2.2",
|
||||||
"@flatten-js/core": "^1.6.8",
|
"@flatten-js/core": "^1.6.8",
|
||||||
"@number-flow/react": "^0.5.10",
|
"@number-flow/react": "^0.5.10",
|
||||||
|
"@opentelemetry/api": "^1.9.0",
|
||||||
|
"@opentelemetry/auto-instrumentations-node": "^0.69.0",
|
||||||
|
"@opentelemetry/exporter-trace-otlp-grpc": "^0.211.0",
|
||||||
|
"@opentelemetry/resources": "^2.5.0",
|
||||||
|
"@opentelemetry/sdk-node": "^0.211.0",
|
||||||
|
"@opentelemetry/semantic-conventions": "^1.39.0",
|
||||||
"@paralleldrive/cuid2": "^2.2.2",
|
"@paralleldrive/cuid2": "^2.2.2",
|
||||||
"@radix-ui/react-accordion": "^1.1.2",
|
"@radix-ui/react-accordion": "^1.1.2",
|
||||||
"@radix-ui/react-checkbox": "^1.0.4",
|
"@radix-ui/react-checkbox": "^1.0.4",
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,7 @@ import type { WorksheetFormState, WorksheetProblem } from '@/app/create/workshee
|
||||||
import { db } from '@/db'
|
import { db } from '@/db'
|
||||||
import { worksheetShares } from '@/db/schema'
|
import { worksheetShares } from '@/db/schema'
|
||||||
import { generateShareId } from '@/lib/generateShareId'
|
import { generateShareId } from '@/lib/generateShareId'
|
||||||
|
import { getCurrentTraceId, recordError } from '@/lib/tracing'
|
||||||
|
|
||||||
export async function POST(request: NextRequest) {
|
export async function POST(request: NextRequest) {
|
||||||
const startTime = Date.now()
|
const startTime = Date.now()
|
||||||
|
|
@ -146,6 +147,9 @@ export async function POST(request: NextRequest) {
|
||||||
})
|
})
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Typst compilation error:', error)
|
console.error('Typst compilation error:', error)
|
||||||
|
if (error instanceof Error) {
|
||||||
|
recordError(error)
|
||||||
|
}
|
||||||
|
|
||||||
// Extract the actual Typst error message
|
// Extract the actual Typst error message
|
||||||
const stderr =
|
const stderr =
|
||||||
|
|
@ -153,10 +157,12 @@ export async function POST(request: NextRequest) {
|
||||||
? String((error as any).stderr)
|
? String((error as any).stderr)
|
||||||
: 'Unknown compilation error'
|
: 'Unknown compilation error'
|
||||||
|
|
||||||
|
const traceId = getCurrentTraceId()
|
||||||
return NextResponse.json(
|
return NextResponse.json(
|
||||||
{
|
{
|
||||||
error: 'Failed to compile worksheet PDF',
|
error: 'Failed to compile worksheet PDF',
|
||||||
details: stderr,
|
details: stderr,
|
||||||
|
...(traceId && { traceId }),
|
||||||
...(process.env.NODE_ENV === 'development' && {
|
...(process.env.NODE_ENV === 'development' && {
|
||||||
typstSource: typstSource.split('\n').slice(0, 20).join('\n') + '\n...',
|
typstSource: typstSource.split('\n').slice(0, 20).join('\n') + '\n...',
|
||||||
}),
|
}),
|
||||||
|
|
@ -181,14 +187,19 @@ export async function POST(request: NextRequest) {
|
||||||
})
|
})
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error generating worksheet:', error)
|
console.error('Error generating worksheet:', error)
|
||||||
|
if (error instanceof Error) {
|
||||||
|
recordError(error)
|
||||||
|
}
|
||||||
|
|
||||||
const errorMessage = error instanceof Error ? error.message : String(error)
|
const errorMessage = error instanceof Error ? error.message : String(error)
|
||||||
const errorStack = error instanceof Error ? error.stack : undefined
|
const errorStack = error instanceof Error ? error.stack : undefined
|
||||||
|
const traceId = getCurrentTraceId()
|
||||||
|
|
||||||
return NextResponse.json(
|
return NextResponse.json(
|
||||||
{
|
{
|
||||||
error: 'Failed to generate worksheet',
|
error: 'Failed to generate worksheet',
|
||||||
message: errorMessage,
|
message: errorMessage,
|
||||||
|
...(traceId && { traceId }),
|
||||||
...(process.env.NODE_ENV === 'development' && { stack: errorStack }),
|
...(process.env.NODE_ENV === 'development' && { stack: errorStack }),
|
||||||
},
|
},
|
||||||
{ status: 500 }
|
{ status: 500 }
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ import { canPerformAction } from '@/lib/classroom'
|
||||||
import { getDbUserId } from '@/lib/viewer'
|
import { getDbUserId } from '@/lib/viewer'
|
||||||
import type { GameResultsReport } from '@/lib/arcade/game-sdk/types'
|
import type { GameResultsReport } from '@/lib/arcade/game-sdk/types'
|
||||||
import { metrics } from '@/lib/metrics'
|
import { metrics } from '@/lib/metrics'
|
||||||
|
import { getCurrentTraceId, recordError } from '@/lib/tracing'
|
||||||
|
|
||||||
interface SaveGameResultRequest {
|
interface SaveGameResultRequest {
|
||||||
playerId: string
|
playerId: string
|
||||||
|
|
@ -82,6 +83,13 @@ export async function POST(request: Request) {
|
||||||
return NextResponse.json(result[0])
|
return NextResponse.json(result[0])
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error saving game result:', error)
|
console.error('Error saving game result:', error)
|
||||||
return NextResponse.json({ error: 'Failed to save game result' }, { status: 500 })
|
if (error instanceof Error) {
|
||||||
|
recordError(error)
|
||||||
|
}
|
||||||
|
const traceId = getCurrentTraceId()
|
||||||
|
return NextResponse.json(
|
||||||
|
{ error: 'Failed to save game result', ...(traceId && { traceId }) },
|
||||||
|
{ status: 500 }
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,110 @@
|
||||||
|
/**
|
||||||
|
* OpenTelemetry Tracing Utilities
|
||||||
|
*
|
||||||
|
* This file exports helper functions for working with traces in the application.
|
||||||
|
* The OpenTelemetry SDK is initialized via instrumentation.js (loaded with --require).
|
||||||
|
*
|
||||||
|
* Usage:
|
||||||
|
* import { getCurrentTraceId, recordError } from '@/lib/tracing'
|
||||||
|
*
|
||||||
|
* // In error handlers:
|
||||||
|
* const traceId = getCurrentTraceId()
|
||||||
|
* return NextResponse.json({ error: 'Something failed', traceId }, { status: 500 })
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { trace, context, SpanStatusCode } from '@opentelemetry/api'
|
||||||
|
|
||||||
|
// Export utilities for manual tracing
|
||||||
|
|
||||||
|
/**
 * Return the trace ID of the span that is active in the current context,
 * or `null` when no span is active. Handy for attaching to error responses.
 */
export function getCurrentTraceId(): string | null {
  const activeSpan = trace.getActiveSpan()
  return activeSpan ? activeSpan.spanContext().traceId : null
}
|
||||||
|
|
||||||
|
/**
 * Return the trace ID and span ID of the currently active span (e.g. to
 * enrich log lines), or `null` when no span is active.
 */
export function getTraceContext(): { traceId: string; spanId: string } | null {
  const activeSpan = trace.getActiveSpan()
  if (activeSpan) {
    const { traceId, spanId } = activeSpan.spanContext()
    return { traceId, spanId }
  }
  return null
}
|
||||||
|
|
||||||
|
/**
 * Record an error on the currently active span and mark the span as failed.
 * Does nothing when no span is active.
 *
 * @param error - The error to record as an exception event.
 * @param attributes - Optional extra attributes to set on the span.
 */
export function recordError(error: Error, attributes?: Record<string, string>): void {
  const activeSpan = trace.getActiveSpan()
  if (activeSpan) {
    activeSpan.recordException(error)
    activeSpan.setStatus({ code: SpanStatusCode.ERROR, message: error.message })
    if (attributes) {
      activeSpan.setAttributes(attributes)
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Run a synchronous operation inside a new active span.
 *
 * The span is always ended, whether `fn` returns or throws. Any thrown value
 * (not only `Error` instances) is recorded on the span and marks it as failed
 * before being re-thrown unchanged.
 *
 * For operations that return a promise use `withSpanAsync`; this function ends
 * the span as soon as `fn` returns.
 *
 * @param name - Span name.
 * @param fn - The operation to trace.
 * @param attributes - Optional attributes to set on the span.
 * @returns Whatever `fn` returns.
 * @throws Re-throws whatever `fn` throws.
 */
export function withSpan<T>(
  name: string,
  fn: () => T,
  attributes?: Record<string, string>
): T {
  const tracer = trace.getTracer('abaci-app')
  return tracer.startActiveSpan(name, (span) => {
    try {
      if (attributes) {
        span.setAttributes(attributes)
      }
      return fn()
    } catch (error) {
      // Non-Error throwables (strings, plain objects) must still fail the span;
      // they are recorded in string form.
      const exception = error instanceof Error ? error : String(error)
      span.recordException(exception)
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: typeof exception === 'string' ? exception : exception.message,
      })
      throw error
    } finally {
      // Single exit point so the span is ended exactly once on every path.
      span.end()
    }
  })
}
|
||||||
|
|
||||||
|
/**
 * Run an asynchronous operation inside a new active span.
 *
 * The span is ended once the promise returned by `fn` settles. Any rejection
 * or thrown value (not only `Error` instances) is recorded on the span and
 * marks it as failed before being re-thrown unchanged.
 *
 * @param name - Span name.
 * @param fn - The async operation to trace.
 * @param attributes - Optional attributes to set on the span.
 * @returns A promise resolving to whatever `fn` resolves to.
 * @throws Rejects with whatever `fn` throws or rejects with.
 */
export async function withSpanAsync<T>(
  name: string,
  fn: () => Promise<T>,
  attributes?: Record<string, string>
): Promise<T> {
  const tracer = trace.getTracer('abaci-app')
  return tracer.startActiveSpan(name, async (span) => {
    try {
      if (attributes) {
        span.setAttributes(attributes)
      }
      // `await` here (rather than returning the promise) keeps the rejection
      // inside this try block so it is recorded before the span ends.
      return await fn()
    } catch (error) {
      // Non-Error rejections (strings, plain objects) must still fail the span;
      // they are recorded in string form.
      const exception = error instanceof Error ? error : String(error)
      span.recordException(exception)
      span.setStatus({
        code: SpanStatusCode.ERROR,
        message: typeof exception === 'string' ? exception : exception.message,
      })
      throw error
    } finally {
      // Single exit point so the span is ended exactly once on every path.
      span.end()
    }
  })
}
|
||||||
|
|
||||||
|
export { trace, context }
|
||||||
1486
pnpm-lock.yaml
1486
pnpm-lock.yaml
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue