Prometheus scrapes metrics from /metrics — prom-client exposes them from Node.js. import { Registry, Counter, Gauge, Histogram, Summary, collectDefaultMetrics } from "prom-client" sets up the registry. collectDefaultMetrics({ register }) adds Node.js process metrics (heap, event loop, GC). new Counter({ name: "http_requests_total", help: "...", labelNames: ["method", "route", "status_code"], registers: [register] }) increments on each request. new Histogram({ name: "http_request_duration_seconds", buckets: [0.005, 0.01, 0.05, 0.1, 0.5, 1, 5] }) measures latency — timer = hist.startTimer() then timer({ method, route, status_code }) records duration. new Gauge({ name: "queue_depth", help: "..." }) tracks current values — .set(n) or .inc()/.dec(). new Summary({ name: "...", percentiles: [0.5, 0.9, 0.99] }) for quantile-based latency. Grafana Dashboard JSON: panels[].targets[].expr is PromQL — rate(http_requests_total[5m]), histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[5m])). Loki via pino-loki: const logger = pino({ transport: { target: "pino-loki", options: { host: LOKI_HOST, labels: { app: "my-app" } } } }). Alert rules: YAML groups[].rules[].alert with expr, for: 5m, labels.severity, annotations.summary. remote_write to Grafana Cloud: url: https://prometheus-prod-{id}.grafana.net/api/prom/push with basic_auth. Claude Code generates prom-client instrumentation, Grafana Dashboard JSON, Loki logging, and alert rules.
CLAUDE.md for Grafana + Prometheus
## Observability Stack
- Metrics: prom-client >= 15 — collectDefaultMetrics + custom Counter/Gauge/Histogram
- Endpoint: GET /metrics → register.metrics() with Content-Type: register.contentType
- Histogram buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10] (seconds)
- Labels: method, route (normalized, no IDs), status_code
- Loki: pino-loki transport — structured JSON logs with app/env/service labels
- Grafana Cloud: remote_write with PROMETHEUS_URL + PROMETHEUS_USER + PROMETHEUS_PASSWORD
- Alert rules: YAML, 5m evaluation interval, severity: warning|critical
Prometheus Metrics Registry
// lib/metrics/registry.ts — prom-client setup with default + custom metrics
import {
Registry,
collectDefaultMetrics,
Counter,
Gauge,
Histogram,
Summary,
} from "prom-client"
// Single shared registry — every metric below registers here so the
// /metrics endpoint can render them all in one scrape.
export const register = new Registry()
// Node.js process metrics (heap, GC, event loop lag, file descriptors)
// The "app_" prefix namespaces the defaults, e.g. app_process_cpu_seconds_total.
collectDefaultMetrics({ register, prefix: "app_" })
// ── HTTP metrics ───────────────────────────────────────────────────────────
// Monotonic request counter — PromQL rate() over this gives RPS, and the
// status_code label lets dashboards/alerts derive the 5xx error ratio
// (see the "Error Rate" panel and HighErrorRate alert later in this doc).
export const httpRequestsTotal = new Counter({
name: "http_requests_total",
help: "Total number of HTTP requests",
labelNames: ["method", "route", "status_code"] as const,
registers: [register],
})
// Request latency histogram. Buckets follow the CLAUDE.md spec above:
// [0.005 … 10] seconds — the previous list stopped at 5s, so the mandated
// 10s bucket was missing and anything slower than 5s fell straight into
// +Inf. Adding a bucket is backward-compatible: existing `le` series and
// histogram_quantile() queries are unaffected.
export const httpRequestDuration = new Histogram({
  name: "http_request_duration_seconds",
  help: "HTTP request duration in seconds",
  labelNames: ["method", "route", "status_code"] as const,
  buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10],
  registers: [register],
})
// Request body size distribution. Only observed when a Content-Length
// header is present (see the middleware below), so chunked bodies are
// not counted.
export const httpRequestSize = new Histogram({
name: "http_request_size_bytes",
help: "HTTP request body size in bytes",
labelNames: ["method", "route"] as const,
buckets: [100, 1_000, 10_000, 100_000, 1_000_000],
registers: [register],
})
// ── Business metrics ───────────────────────────────────────────────────────
// Point-in-time session count — a Gauge, so callers use .set()/.inc()/.dec().
export const activeUsers = new Gauge({
name: "app_active_users",
help: "Number of currently active users (sessions < 30m)",
registers: [register],
})
// Pending jobs per queue — feeds the JobQueueBacklog alert (> 500 for 5m).
export const jobQueueDepth = new Gauge({
name: "app_job_queue_depth",
help: "Number of pending background jobs",
labelNames: ["queue_name"] as const,
registers: [register],
})
// Job execution latency; buckets stretch to 300s since background jobs are
// expected to run far longer than HTTP requests.
export const jobDuration = new Histogram({
name: "app_job_duration_seconds",
help: "Background job execution duration",
labelNames: ["job_type", "status"] as const,
buckets: [0.1, 0.5, 1, 5, 10, 30, 60, 120, 300],
registers: [register],
})
// Query latency by operation/table — used via trackDbQuery() in the
// middleware module. Sub-millisecond lowest bucket for indexed reads.
export const databaseQueryDuration = new Histogram({
name: "app_db_query_duration_seconds",
help: "Database query latency",
labelNames: ["operation", "table"] as const,
buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
registers: [register],
})
// Hit/miss counter — hit ratio in PromQL:
//   sum(rate(...{result="hit"}[5m])) / sum(rate(...[5m]))
export const cacheHits = new Counter({
name: "app_cache_operations_total",
help: "Cache hit/miss counts",
labelNames: ["result"] as const, // "hit" | "miss"
registers: [register],
})
// Outbound call latency as a Summary with pre-computed percentiles.
// NOTE(review): Summary quantiles are computed per-process client-side and
// cannot be meaningfully aggregated across instances — confirm this is
// intentional vs. using a Histogram.
export const externalApiDuration = new Summary({
name: "app_external_api_duration_seconds",
help: "External API call duration",
labelNames: ["service", "endpoint", "status"] as const,
percentiles: [0.5, 0.9, 0.95, 0.99],
registers: [register],
})
Metrics Middleware
// lib/metrics/middleware.ts — Next.js / Express HTTP instrumentation
import { httpRequestsTotal, httpRequestDuration, httpRequestSize } from "./registry"
/**
 * Normalize a URL path for use as a Prometheus label value.
 *
 * Collapses UUIDs and purely numeric path segments to ":id" so the `route`
 * label stays low-cardinality (per the CLAUDE.md rule: "route (normalized,
 * no IDs)").
 *
 * Fixes vs. the original: the query string is stripped FIRST, so the
 * ID-collapsing regexes only ever run against the path portion, and the
 * `split("?")[0]` access is guarded for `noUncheckedIndexedAccess`.
 */
function normalizeRoute(url: string): string {
  // split() always yields at least one element; `?? url` satisfies
  // strict indexed-access checking.
  const path = url.split("?")[0] ?? url
  return path
    .replace(/\/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi, "/:id")
    .replace(/\/\d+/g, "/:id")
}
/**
 * Wrap a Next.js route handler with Prometheus instrumentation.
 *
 * Records request count, latency, and — when a Content-Length header is
 * present — request body size. The status_code label comes from the
 * returned Response when there is one, defaults to "200" otherwise, and
 * is forced to "500" when the handler throws (the error is re-thrown).
 *
 * @param handler route handler to instrument (interface unchanged)
 * @param route   optional explicit route label; defaults to the normalized
 *                request pathname (IDs collapsed to ":id")
 */
export function withMetrics<T>(
  handler: (req: Request) => Promise<T>,
  route?: string,
): (req: Request) => Promise<T> {
  return async (req: Request) => {
    const method = req.method ?? "GET"
    const normalRoute = route ?? normalizeRoute(new URL(req.url).pathname)
    // Start the timer with the labels known up front; status_code is
    // supplied when the timer is stopped in `finally`.
    const endTimer = httpRequestDuration.startTimer({ method, route: normalRoute })
    // Explicit radix 10 — never let parseInt infer the base.
    const contentLen = parseInt(req.headers.get("content-length") ?? "0", 10)
    // A malformed header yields NaN; NaN > 0 is false, so it is skipped.
    if (contentLen > 0) {
      httpRequestSize.observe({ method, route: normalRoute }, contentLen)
    }
    let statusCode = "200"
    try {
      const result = await handler(req)
      // Extract the real status when the handler returned a Response.
      if (result instanceof Response) statusCode = String(result.status)
      return result
    } catch (err) {
      statusCode = "500"
      throw err
    } finally {
      // Always record, whether the handler resolved or threw.
      endTimer({ status_code: statusCode })
      httpRequestsTotal.inc({ method, route: normalRoute, status_code: statusCode })
    }
  }
}
/**
 * Instrument a database operation with app_db_query_duration_seconds.
 *
 * The timer is stopped in a single `finally` (replacing the original's
 * duplicated end() calls in both the try and catch paths); errors from
 * `fn` are re-thrown untouched.
 *
 * @param operation label, e.g. "select" / "insert"
 * @param table     label, e.g. "users"
 * @param fn        the query to time
 */
export async function trackDbQuery<T>(
  operation: string,
  table: string,
  fn: () => Promise<T>,
): Promise<T> {
  // Dynamic import kept from the original — presumably to break an import
  // cycle with ./registry; confirm before converting to a static import
  // (the module already imports other metrics statically at the top).
  const { databaseQueryDuration } = await import("./registry")
  const end = databaseQueryDuration.startTimer({ operation, table })
  try {
    return await fn()
  } finally {
    end()
  }
}
/metrics Endpoint
// app/api/metrics/route.ts — Prometheus scrape endpoint
import { NextResponse } from "next/server"
import { register } from "@/lib/metrics/registry"
// Optional bearer-token guard for the scrape endpoint — leave
// METRICS_TOKEN unset to allow open access (e.g. inside a private network).
const METRICS_TOKEN = process.env.METRICS_TOKEN
/**
 * Prometheus scrape endpoint. Renders every registered metric in the
 * text exposition format with the registry's matching Content-Type.
 */
export async function GET(req: Request) {
  // When a token is configured, require "Authorization: Bearer <token>".
  if (METRICS_TOKEN && req.headers.get("authorization") !== `Bearer ${METRICS_TOKEN}`) {
    return new NextResponse("Unauthorized", { status: 401 })
  }
  const body = await register.metrics()
  return new NextResponse(body, {
    headers: { "Content-Type": register.contentType },
  })
}
Loki Structured Logging
// lib/logger.ts — pino with Grafana Loki transport
import pino, { type Logger } from "pino"
// Dev mode (or missing LOKI_URL) falls back to a plain debug-level logger.
const isDev = process.env.NODE_ENV !== "production"
const lokiUrl = process.env.LOKI_URL // e.g. https://logs-prod-{id}.grafana.net
/**
 * Build the process-wide pino logger: plain console logger in dev /
 * when Loki is not configured; otherwise dual transports — stdout plus
 * a pino-loki push transport with basic auth.
 */
function buildLogger(): Logger {
if (isDev || !lokiUrl) {
return pino({ level: "debug" })
}
return pino({
level: "info",
transport: {
targets: [
// Console transport for Cloud Run / ECS logs
// (destination: 1 is file descriptor 1, i.e. stdout)
{ target: "pino/file", options: { destination: 1 }, level: "info" },
// Loki transport
{
target: "pino-loki",
options: {
host: lokiUrl,
// NOTE(review): the `!` assertions assume LOKI_USER/LOKI_PASSWORD are
// set whenever LOKI_URL is — a missing var fails silently at runtime
// rather than at startup; consider validating env up front.
basicAuth: {
username: process.env.LOKI_USER!,
password: process.env.LOKI_PASSWORD!,
},
// Static labels only — keep these low-cardinality for Loki.
labels: {
app: process.env.APP_NAME ?? "my-app",
environment: process.env.NODE_ENV ?? "production",
},
// Map pino level numbers to Grafana Loki severity labels
levelMap: {
10: "debug", 20: "debug", 30: "info",
40: "warning", 50: "error", 60: "critical",
},
},
level: "info",
},
],
},
// Attached to every log line emitted by this logger.
base: { service: "api" },
serializers: {
req: pino.stdSerializers.req,
err: pino.stdSerializers.err,
},
})
}
// Process-wide singleton — import this rather than calling pino() directly.
export const logger = buildLogger()
/**
 * Create a child logger that tags every subsequent line with the given
 * request context (requestId + route).
 */
export function requestLogger(requestId: string, route: string): Logger {
  const requestContext = { requestId, route }
  return logger.child(requestContext)
}
Grafana Dashboard JSON
{
"__inputs": [{ "name": "DS_PROMETHEUS", "type": "datasource", "pluginId": "prometheus" }],
"title": "Application Overview",
"uid": "app-overview",
"refresh": "30s",
"panels": [
{
"title": "Request Rate",
"type": "timeseries",
"gridPos": { "x": 0, "y": 0, "w": 12, "h": 8 },
"targets": [{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum(rate(http_requests_total[5m])) by (route)",
"legendFormat": "{{route}}"
}]
},
{
"title": "P99 Latency",
"type": "timeseries",
"gridPos": { "x": 12, "y": 0, "w": 12, "h": 8 },
"fieldConfig": { "defaults": { "unit": "s" } },
"targets": [{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route))",
"legendFormat": "p99 {{route}}"
}]
},
{
"title": "Error Rate",
"type": "stat",
"gridPos": { "x": 0, "y": 8, "w": 6, "h": 4 },
"fieldConfig": { "defaults": { "unit": "percentunit", "thresholds": {
"steps": [{ "value": 0, "color": "green" }, { "value": 0.01, "color": "yellow" }, { "value": 0.05, "color": "red" }]
}}},
"targets": [{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m]))"
}]
},
{
"title": "Active Users",
"type": "stat",
"gridPos": { "x": 6, "y": 8, "w": 6, "h": 4 },
"targets": [{ "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "expr": "app_active_users" }]
},
{
"title": "Job Queue Depth",
"type": "timeseries",
"gridPos": { "x": 12, "y": 8, "w": 12, "h": 8 },
"targets": [{
"datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" },
"expr": "app_job_queue_depth",
"legendFormat": "{{queue_name}}"
}]
}
]
}
Alert Rules
# prometheus/alerts.yaml — Prometheus alerting rules
groups:
- name: application
interval: 30s
rules:
- alert: HighErrorRate
expr: |
sum(rate(http_requests_total{status_code=~"5.."}[5m]))
/ sum(rate(http_requests_total[5m])) > 0.05
for: 5m
labels:
severity: critical
annotations:
summary: "High HTTP error rate ({{ $value | humanizePercentage }})"
description: "Error rate above 5% for 5 minutes on instance {{ $labels.instance }}"
- alert: SlowP99Latency
expr: |
histogram_quantile(0.99,
sum(rate(http_request_duration_seconds_bucket[5m])) by (le, route)
) > 2
for: 10m
labels:
severity: warning
annotations:
summary: "P99 latency above 2s on route {{ $labels.route }}"
- alert: JobQueueBacklog
expr: app_job_queue_depth > 500
for: 5m
labels:
severity: warning
annotations:
summary: "Job queue backlog: {{ $value }} pending jobs"
- alert: HighMemoryUsage
expr: |
app_nodejs_heap_size_used_bytes / app_nodejs_heap_size_total_bytes > 0.9
for: 5m
labels:
severity: critical
annotations:
summary: "Node.js heap usage above 90%"
For the Datadog alternative when needing an all-in-one SaaS observability platform with APM traces, infrastructure monitoring, log management, and RUM in a single product with minimal self-hosting — Datadog provides a unified paid platform while Prometheus + Grafana is open-source and self-hostable with more flexibility in storage backends and alerting rules. For the OpenTelemetry alternative when needing vendor-neutral distributed tracing with spans, context propagation, and the ability to export to any backend (Jaeger, Tempo, OTLP) alongside metrics in a unified SDK — OpenTelemetry is the standard for traces while Prometheus + prom-client is the standard for pull-based metrics scraping. The Claude Skills 360 bundle includes Grafana/Prometheus skill sets covering prom-client instrumentation, Loki logging, and Grafana dashboards. Start with the free tier to try observability stack generation.