云原生监控
📖 概述
云原生监控是现代应用运维的核心组成部分,通过全面的可观测性(Observability)来确保系统的健康运行。它包括指标监控、日志管理、分布式追踪和告警系统,为运维团队提供完整的系统洞察。
🎯 学习目标
- 掌握云原生监控的核心概念
- 学习 Prometheus、Grafana 等监控工具
- 了解日志聚合和分析技术
- 掌握分布式追踪和 APM 实践
🏗️ 监控架构
1. 三大支柱
javascript
// 可观测性三大支柱示例
/**
 * Minimal illustration of the three observability pillars: metrics, logs and traces.
 *
 * The collaborators created in the constructor (MetricsCollector, LogManager,
 * TraceCollector) and the `get*()` / `generate*Id()` helpers called below are
 * not defined in this snippet; they must be provided elsewhere.
 */
class ObservabilityPillars {
  constructor() {
    this.metrics = new MetricsCollector();
    this.logs = new LogManager();
    this.traces = new TraceCollector();
  }

  /**
   * Takes a snapshot of metrics grouped by layer.
   * @returns {{business: object, application: object, infrastructure: object}}
   */
  collectMetrics() {
    return {
      // Business-level metrics
      business: {
        activeUsers: this.getActiveUsers(),
        ordersPerMinute: this.getOrderRate(),
        revenue: this.getRevenue(),
      },
      // Application-level metrics
      application: {
        responseTime: this.getResponseTime(),
        errorRate: this.getErrorRate(),
        throughput: this.getThroughput(),
      },
      // Infrastructure-level metrics
      infrastructure: {
        cpuUsage: this.getCPUUsage(),
        memoryUsage: this.getMemoryUsage(),
        diskIO: this.getDiskIO(),
        networkIO: this.getNetworkIO(),
      },
    };
  }

  /**
   * Builds one structured log record.
   * @param {string} level - Log level.
   * @param {string} message - Log message.
   * @param {object} [context] - Correlation ids and any extra fields.
   * @returns {object} The log record.
   */
  generateLogs(level, message, context = {}) {
    return {
      timestamp: new Date().toISOString(),
      level,
      message,
      service: 'nodejs-app',
      version: process.env.APP_VERSION,
      traceId: context.traceId,
      spanId: context.spanId,
      userId: context.userId,
      requestId: context.requestId,
      // Spread last: any key present in `context` overrides the fields above.
      ...context,
    };
  }

  /**
   * Creates a span record for a distributed trace.
   * @param {string} operationName - Name of the traced operation.
   * @param {object|null} [parentSpan] - Parent span; when given, the new span
   *   joins the parent's trace.
   * @returns {object} The span record.
   */
  createTrace(operationName, parentSpan = null) {
    return {
      // A child span must share its parent's trace id, otherwise the two spans
      // can never be correlated; only a root span gets a fresh trace id.
      traceId: parentSpan?.traceId ?? this.generateTraceId(),
      spanId: this.generateSpanId(),
      parentSpanId: parentSpan?.spanId,
      operationName,
      startTime: Date.now(),
      tags: {},
      logs: [],
    };
  }
}
📊 Prometheus 监控
1. 指标收集
javascript
// prometheus-metrics.js
const promClient = require('prom-client');
// 创建指标收集器
/**
 * Registers the application's Prometheus metrics through prom-client and
 * exposes small helpers to update them.
 *
 * Relies on `promClient` being in scope (required at the top of this snippet).
 */
class PrometheusMetrics {
  constructor() {
    // Enable prom-client's default metrics.
    promClient.collectDefaultMetrics({
      timeout: 10000,
      gcDurationBuckets: [0.001, 0.01, 0.1, 1, 2, 5],
    });
    this.register = promClient.register;
    this.setupCustomMetrics();
  }

  /** Creates every custom counter, gauge and histogram used by the helpers below. */
  setupCustomMetrics() {
    // HTTP request counter
    this.httpRequestsTotal = new promClient.Counter({
      name: 'http_requests_total',
      help: 'Total number of HTTP requests',
      labelNames: ['method', 'route', 'status_code'],
    });

    // HTTP request duration
    this.httpRequestDuration = new promClient.Histogram({
      name: 'http_request_duration_seconds',
      help: 'Duration of HTTP requests in seconds',
      labelNames: ['method', 'route', 'status_code'],
      buckets: [0.1, 0.3, 0.5, 0.7, 1, 3, 5, 7, 10],
    });

    // Active connections
    this.activeConnections = new promClient.Gauge({
      name: 'active_connections',
      help: 'Number of active connections',
    });

    // Database connection pool
    this.dbConnectionPool = new promClient.Gauge({
      name: 'db_connection_pool_size',
      help: 'Database connection pool size',
      labelNames: ['state'], // active, idle, waiting
    });

    // Business metrics
    this.businessMetrics = {
      ordersTotal: new promClient.Counter({
        name: 'orders_total',
        help: 'Total number of orders',
        labelNames: ['status'],
      }),
      revenueTotal: new promClient.Counter({
        name: 'revenue_total',
        help: 'Total revenue in cents',
      }),
      activeUsers: new promClient.Gauge({
        name: 'active_users',
        help: 'Number of active users',
      }),
    };

    // Custom metrics
    this.customMetrics = {
      queueSize: new promClient.Gauge({
        name: 'queue_size',
        help: 'Size of processing queue',
        labelNames: ['queue_name'],
      }),
      cacheHitRatio: new promClient.Gauge({
        name: 'cache_hit_ratio',
        help: 'Cache hit ratio',
      }),
      errorsByType: new promClient.Counter({
        name: 'errors_by_type_total',
        help: 'Total errors by type',
        labelNames: ['error_type', 'service'],
      }),
    };
  }

  /**
   * Builds an Express middleware that counts each request and observes its
   * duration (in seconds) once the response has finished.
   * @returns {(req: object, res: object, next: Function) => void}
   */
  createExpressMiddleware() {
    return (req, res, next) => {
      const start = Date.now();
      res.on('finish', () => {
        const duration = (Date.now() - start) / 1000;
        const labels = {
          method: req.method,
          // Use the matched route pattern only. Falling back to the raw URL path
          // for unmatched requests (404s, scanners) would create a new time
          // series per distinct URL and blow up label cardinality.
          route: req.route?.path || 'unmatched',
          status_code: res.statusCode,
        };
        this.httpRequestsTotal.inc(labels);
        this.httpRequestDuration.observe(labels, duration);
      });
      next();
    };
  }

  /**
   * Publishes the pool's active / idle / waiting connection counts.
   * @param {{totalCount: number, idleCount: number, waitingCount: number}} pool
   */
  updateDbPoolMetrics(pool) {
    this.dbConnectionPool.set({ state: 'active' }, pool.totalCount - pool.idleCount);
    this.dbConnectionPool.set({ state: 'idle' }, pool.idleCount);
    this.dbConnectionPool.set({ state: 'waiting' }, pool.waitingCount);
  }

  /**
   * Records a business event. Unknown event types are ignored.
   * @param {string} eventType - 'order_created', 'order_completed', 'revenue',
   *   'user_login' or 'user_logout'.
   * @param {number} [value=1] - Increment for order and revenue events.
   * @param {object} [labels] - Accepted for interface compatibility; not used.
   */
  recordBusinessEvent(eventType, value = 1, labels = {}) {
    switch (eventType) {
      case 'order_created':
        this.businessMetrics.ordersTotal.inc({ status: 'created' }, value);
        break;
      case 'order_completed':
        this.businessMetrics.ordersTotal.inc({ status: 'completed' }, value);
        break;
      case 'revenue':
        this.businessMetrics.revenueTotal.inc(value);
        break;
      case 'user_login':
        this.businessMetrics.activeUsers.inc();
        break;
      case 'user_logout':
        this.businessMetrics.activeUsers.dec();
        break;
    }
  }

  /**
   * Counts one error of the given type.
   * @param {string} errorType
   * @param {string} [service='nodejs-app']
   */
  recordError(errorType, service = 'nodejs-app') {
    this.customMetrics.errorsByType.inc({ error_type: errorType, service });
  }

  /** Sets the current size of the named queue. */
  updateQueueSize(queueName, size) {
    this.customMetrics.queueSize.set({ queue_name: queueName }, size);
  }

  /** Sets the current cache hit ratio. */
  updateCacheHitRatio(ratio) {
    this.customMetrics.cacheHitRatio.set(ratio);
  }

  /** Returns the registry's metrics output (awaited by the caller). */
  getMetrics() {
    return this.register.metrics();
  }

  /** Clears the registry. */
  clearMetrics() {
    this.register.clear();
  }
}
// 使用示例
const metrics = new PrometheusMetrics();

// Express application integration
const express = require('express');
const app = express();

// Install the monitoring middleware for every request.
app.use(metrics.createExpressMiddleware());

/** Scrape endpoint handler: serves the registry in the Prometheus text format. */
async function serveMetrics(req, res) {
  res.set('Content-Type', promClient.register.contentType);
  const payload = await metrics.getMetrics();
  res.end(payload);
}

/** Creates an order and records the matching business metrics. */
async function handleOrderWithMetrics(req, res) {
  let order;
  try {
    order = await createOrder(req.body);
    // Record business metrics
    metrics.recordBusinessEvent('order_created');
    metrics.recordBusinessEvent('revenue', order.amount);
    res.json(order);
  } catch (error) {
    metrics.recordError('order_creation_failed');
    res.status(500).json({ error: error.message });
  }
}

app.get('/metrics', serveMetrics);
app.post('/orders', handleOrderWithMetrics);

/** Refreshes the gauges that have to be polled rather than pushed. */
function refreshPolledMetrics() {
  // Database connection pool
  if (database.pool) {
    metrics.updateDbPoolMetrics(database.pool);
  }

  // Queue sizes
  for (const queueName of ['processing', 'notifications']) {
    metrics.updateQueueSize(queueName, getQueueSize(queueName));
  }

  // Cache hit ratio
  metrics.updateCacheHitRatio(cache.getHitRatio());
}

// Refresh every 10 seconds.
setInterval(() => {
  refreshPolledMetrics();
}, 10000);
2. Prometheus 配置
yaml
# prometheus.yml
# Global defaults, rule files, Alertmanager target and scrape jobs.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    region: 'us-east-1'

rule_files:
  - "alert_rules.yml"
  - "recording_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Node.js application
  - job_name: 'nodejs-app'
    static_configs:
      - targets: ['nodejs-app:3000']
    metrics_path: '/metrics'
    scrape_interval: 5s
    scrape_timeout: 3s

  # Kubernetes service discovery
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Keep only pods annotated with prometheus.io/scrape: "true".
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      # Use the prometheus.io/path annotation as the metrics path.
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # Rewrite the scrape address to use the prometheus.io/port annotation.
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      # Copy pod labels onto the scraped series.
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name

  # Infrastructure monitoring
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']

  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']
# Alerting rules
# alert_rules.yml
groups:
  - name: nodejs-app-alerts
    rules:
      - alert: HighErrorRate
        # Aggregate both sides per instance before dividing. Without the
        # aggregation the division only pairs series with identical label sets,
        # so 5xx series are divided by themselves and the ratio is always 1.
        expr: >-
          sum by (instance) (rate(http_requests_total{status_code=~"5.."}[5m]))
          /
          sum by (instance) (rate(http_requests_total[5m]))
          > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} for {{ $labels.instance }}"

      - alert: HighResponseTime
        # Sum the buckets per instance (keeping `le`) so the quantile is computed
        # over all routes of an instance rather than per label combination.
        expr: >-
          histogram_quantile(0.95,
          sum by (le, instance) (rate(http_request_duration_seconds_bucket[5m])))
          > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High response time detected"
          description: "95th percentile response time is {{ $value }}s for {{ $labels.instance }}"

      - alert: HighMemoryUsage
        expr: (process_resident_memory_bytes / 1024 / 1024) > 1000
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage"
          description: "Memory usage is {{ $value }}MB for {{ $labels.instance }}"

      - alert: DatabaseConnectionPoolExhausted
        expr: db_connection_pool_size{state="waiting"} > 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Database connection pool exhausted"
          description: "{{ $value }} connections are waiting for {{ $labels.instance }}"
📝 日志管理
1. 结构化日志
javascript
// structured-logging.js
const winston = require('winston');
const { ElasticsearchTransport } = require('winston-elasticsearch');
/**
 * Thin wrapper around a winston logger that emits JSON records and offers
 * helpers for common event shapes (HTTP requests, business events, database
 * operations, external API calls).
 *
 * Relies on `winston` and `ElasticsearchTransport` being in scope.
 */
class StructuredLogger {
  constructor() {
    this.logger = winston.createLogger({
      level: process.env.LOG_LEVEL || 'info',
      format: winston.format.combine(
        winston.format.timestamp(),
        winston.format.errors({ stack: true }),
        winston.format.json(),
        winston.format.printf(this.customFormat)
      ),
      defaultMeta: this.getDefaultMeta(),
      transports: this.createTransports(),
    });
    this.setupErrorHandling();
  }

  /** Fields attached to every record: service identity, environment, host and pid. */
  getDefaultMeta() {
    return {
      service: process.env.SERVICE_NAME || 'nodejs-app',
      version: process.env.APP_VERSION || '1.0.0',
      environment: process.env.NODE_ENV || 'development',
      hostname: require('os').hostname(),
      pid: process.pid,
    };
  }

  /**
   * Builds the transport list: console always; in production also rotating
   * files and, when ELASTICSEARCH_URL is set, Elasticsearch.
   * @returns {Array<object>}
   */
  createTransports() {
    const transports = [
      // Console output
      new winston.transports.Console({
        format: winston.format.combine(
          winston.format.colorize(),
          winston.format.simple()
        ),
      }),
    ];

    // File logs in production only
    if (process.env.NODE_ENV === 'production') {
      transports.push(
        new winston.transports.File({
          filename: 'logs/error.log',
          level: 'error',
          maxsize: 5242880, // 5MB
          maxFiles: 5,
        }),
        new winston.transports.File({
          filename: 'logs/combined.log',
          maxsize: 5242880, // 5MB
          maxFiles: 10,
        })
      );

      // Elasticsearch integration
      if (process.env.ELASTICSEARCH_URL) {
        transports.push(
          new ElasticsearchTransport({
            level: 'info',
            clientOpts: {
              node: process.env.ELASTICSEARCH_URL,
              auth: {
                username: process.env.ELASTICSEARCH_USER,
                password: process.env.ELASTICSEARCH_PASSWORD,
              },
            },
            // NOTE(review): this name is computed once, when the transport is
            // built, so a long-running process keeps writing to the index of
            // the month it started in. SERVICE_NAME also has no fallback here,
            // unlike getDefaultMeta(). Confirm whether a rolling index is wanted.
            index: `logs-${process.env.SERVICE_NAME}-${new Date().toISOString().slice(0, 7)}`,
          })
        );
      }
    }
    return transports;
  }

  /**
   * printf formatter: renames `timestamp` to `@timestamp`, upper-cases the
   * level and serializes the record as one JSON line. Does not use `this`,
   * so it is safe to pass unbound.
   */
  customFormat(info) {
    const { timestamp, level, message, service, ...meta } = info;
    return JSON.stringify({
      '@timestamp': timestamp,
      level: level.toUpperCase(),
      message,
      service,
      ...meta,
    });
  }

  /** Reports failures of the logger itself on stderr. */
  setupErrorHandling() {
    this.logger.on('error', (error) => {
      console.error('Logger error:', error);
    });
  }

  /**
   * Converts a thrown or rejected value into a loggable object.
   * @param {*} error - Error instance, error-like object, primitive, or falsy.
   * @returns {object|undefined} `undefined` when `error` is falsy.
   */
  static serializeError(error) {
    if (!error) {
      return undefined;
    }
    if (typeof error === 'object') {
      return { name: error.name, message: error.message, stack: error.stack };
    }
    // Promise rejections may carry a string or number; keep its text instead of
    // logging an object whose fields are all undefined.
    return { message: String(error) };
  }

  /**
   * Creates a child logger that adds `meta` to every record.
   *
   * The child keeps the full StructuredLogger API (logRequest,
   * logBusinessEvent, the three-argument error(), ...). Returning the raw
   * winston child would drop those helpers, which request handlers call on
   * `req.logger`.
   * @param {object} meta - Fields bound to every record of the child.
   * @returns {StructuredLogger}
   */
  child(meta) {
    const scoped = Object.create(this);
    scoped.logger = this.logger.child(meta);
    return scoped;
  }

  // Level methods
  info(message, meta = {}) {
    this.logger.info(message, meta);
  }

  warn(message, meta = {}) {
    this.logger.warn(message, meta);
  }

  /**
   * Logs at error level with the serialized error under the `error` key.
   * @param {string} message
   * @param {*} [error=null] - Error (or rejection reason) to attach.
   * @param {object} [meta={}]
   */
  error(message, error = null, meta = {}) {
    this.logger.error(message, {
      ...meta,
      error: StructuredLogger.serializeError(error),
    });
  }

  debug(message, meta = {}) {
    this.logger.debug(message, meta);
  }

  /** Logs one finished HTTP request. */
  logRequest(req, res, duration) {
    this.info('HTTP Request', {
      method: req.method,
      url: req.originalUrl,
      statusCode: res.statusCode,
      duration,
      userAgent: req.get('User-Agent'),
      ip: req.ip,
      userId: req.user?.id,
      requestId: req.requestId,
    });
  }

  /** Logs a business event with its type under `eventType`. */
  logBusinessEvent(event, data = {}) {
    this.info('Business Event', {
      eventType: event,
      ...data,
    });
  }

  /** Logs a database operation: error level on failure, debug level otherwise. */
  logDatabaseOperation(operation, table, duration, error = null) {
    if (error) {
      this.error('Database Operation Failed', error, {
        operation,
        table,
        duration,
      });
    } else {
      this.debug('Database Operation', {
        operation,
        table,
        duration,
      });
    }
  }

  /** Logs an external API call: error level on failure, info level otherwise. */
  logExternalApiCall(service, endpoint, method, statusCode, duration, error = null) {
    const logData = {
      externalService: service,
      endpoint,
      method,
      statusCode,
      duration,
    };
    if (error) {
      this.error('External API Call Failed', error, logData);
    } else {
      this.info('External API Call', logData);
    }
  }
}
// Express 中间件
function createLoggingMiddleware(logger) {
return (req, res, next) => {
const start = Date.now();
// 生成请求ID
req.requestId = require('uuid').v4();
// 创建请求专用日志器
req.logger = logger.child({
requestId: req.requestId,
method: req.method,
url: req.originalUrl
});
// 记录请求开始
req.logger.info('Request started');
res.on('finish', () => {
const duration = Date.now() - start;
logger.logRequest(req, res, duration);
});
next();
};
}
// 全局错误日志
/**
 * Installs process-level handlers that log otherwise unhandled failures.
 *
 * - `uncaughtException`: logged with `fatal: true`, then the process exits with code 1.
 * - `unhandledRejection`: logged together with the promise's string form; the
 *   process is not terminated by this handler.
 * @param {object} logger - Logger exposing `error(message, error, meta)`.
 */
function setupGlobalErrorLogging(logger) {
  const onUncaughtException = (error) => {
    logger.error('Uncaught Exception', error, { fatal: true });
    process.exit(1);
  };

  const onUnhandledRejection = (reason, promise) => {
    const meta = { promise: promise.toString() };
    logger.error('Unhandled Rejection', reason, meta);
  };

  process.on('uncaughtException', onUncaughtException);
  process.on('unhandledRejection', onUnhandledRejection);
}
// 使用示例
const logger = new StructuredLogger();

// Install the process-wide error logging.
setupGlobalErrorLogging(logger);

// Express application
const app = express();
app.use(createLoggingMiddleware(logger));

/** Creates an order, logging the attempt, the resulting business event and any failure. */
async function handleOrderWithLogging(req, res) {
  try {
    req.logger.info('Creating order', { orderData: req.body });

    const order = await createOrder(req.body);

    const eventData = {
      orderId: order.id,
      amount: order.amount,
      userId: req.user.id,
    };
    req.logger.logBusinessEvent('order_created', eventData);

    res.json(order);
  } catch (error) {
    const failureMeta = {
      orderData: req.body,
      userId: req.user?.id,
    };
    req.logger.error('Order creation failed', error, failureMeta);
    res.status(500).json({ error: 'Internal server error' });
  }
}

// Use the request-scoped logger inside business logic.
app.post('/orders', handleOrderWithLogging);
2. 日志聚合配置
yaml
# filebeat.yml
filebeat.inputs:
  # Application log files, already written as JSON lines.
  - type: log
    enabled: true
    paths:
      - /var/log/nodejs-app/*.log
    fields:
      service: nodejs-app
      environment: production
    fields_under_root: true
    json.keys_under_root: true
    json.add_error_key: true

  # Container stdout collected from the Docker log directory.
  - type: docker
    containers.ids:
      - "*"
    containers.path: "/var/lib/docker/containers"
    containers.stream: "stdout"
    processors:
      - add_docker_metadata:
          host: "unix:///var/run/docker.sock"

output.elasticsearch:
  hosts: ["elasticsearch:9200"]
  username: "${ELASTICSEARCH_USERNAME}"
  password: "${ELASTICSEARCH_PASSWORD}"
  index: "logs-%{[service]}-%{+yyyy.MM.dd}"

# A custom output index is only honoured when index lifecycle management is
# disabled, and it requires a matching template name and pattern.
setup.ilm.enabled: false
setup.template.name: "logs"
setup.template.pattern: "logs-*"
setup.template.settings:
  index.number_of_shards: 1
  index.codec: best_compression

processors:
  - add_host_metadata:
      when.not.contains.tags: forwarded
  - add_kubernetes_metadata:
      host: ${NODE_NAME}
      matchers:
        - logs_path:
            logs_path: "/var/log/containers/"
# logstash.conf
# Pipeline: receive events from Beats, enrich events of the nodejs-app service,
# index everything into Elasticsearch and forward error events to Slack.
input {
# Listen for Beats connections on port 5044.
beats {
port => 5044
}
}
filter {
# Only events whose `service` field is "nodejs-app" are enriched.
if [service] == "nodejs-app" {
# Parse the JSON document carried in the `message` field.
json {
source => "message"
}
# Parse the ISO8601 value of `@timestamp`.
date {
match => [ "@timestamp", "ISO8601" ]
}
# Tag ERROR-level events so the output stage can route them.
if [level] == "ERROR" {
mutate {
add_tag => ["error"]
}
}
# Copy the nested error message and stack into top-level fields.
if [error] {
mutate {
add_field => { "error_message" => "%{[error][message]}" }
add_field => { "error_stack" => "%{[error][stack]}" }
}
}
}
}
output {
# Every event goes to a per-service, per-day index.
elasticsearch {
hosts => ["elasticsearch:9200"]
index => "logs-%{service}-%{+YYYY.MM.dd}"
}
# Events tagged "error" are additionally posted to Slack.
# NOTE(review): `slack` is presumably a separately installed output plugin — confirm it is available.
if "error" in [tags] {
slack {
url => "${SLACK_WEBHOOK_URL}"
channel => "#alerts"
username => "LogAlert"
icon_emoji => ":warning:"
format => "Error in %{service}: %{message}"
}
}
}
🔍 分布式追踪
1. OpenTelemetry 集成
javascript
// tracing.js
const { NodeSDK } = require('@opentelemetry/sdk-node');
const { Resource } = require('@opentelemetry/resources');
const { SemanticResourceAttributes } = require('@opentelemetry/semantic-conventions');
const { JaegerExporter } = require('@opentelemetry/exporter-jaeger');
const { getNodeAutoInstrumentations } = require('@opentelemetry/auto-instrumentations-node');
class DistributedTracing {
constructor() {
this.serviceName = process.env.SERVICE_NAME || 'nodejs-app';
this.serviceVersion = process.env.APP_VERSION || '1.0.0';
this.environment = process.env.NODE_ENV || 'development';
this.setupTracing();
}
setupTracing() {
// 创建 Jaeger 导出器
const jaegerExporter = new JaegerExporter({
endpoint: process.env.JAEGER_ENDPOINT || 'http://jaeger:14268/api/traces',
});
// 配置 SDK
const sdk = new NodeSDK({
resource: new Resource({
[SemanticResourceAttributes.SERVICE_NAME]: this.serviceName,
[SemanticResourceAttributes.SERVICE_VERSION]: this.serviceVersion,
[SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: this.environment,
}),
traceExporter: jaegerExporter,
instrumentations: [getNodeAutoInstrumentations({
'@opentelemetry/instrumentation-fs': {
enabled: false, // 禁用文件系统追踪
},
})],
});
// 启动 SDK
sdk.start();
console.log('分布式追踪已初始化');
}
// 手动创建 span
createSpan(name, operation, parentSpan = null) {
const tracer = require('@opentelemetry/api').trace.getTracer(this.serviceName);
const span = tracer.startSpan(name, {
parent: parentSpan,
kind: require('@opentelemetry/api').SpanKind.INTERNAL,
attributes: {
'operation.name': operation,
'service.name': this.serviceName,
}
});
return span;
}
// 装饰器模式添加追踪
traceMethod(operationName) {
return function(target, propertyName, descriptor) {
const originalMethod = descriptor.value;
descriptor.value = async function(...args) {
const tracer = require('@opentelemetry/api').trace.getTracer('nodejs-app');
const span = tracer.startSpan(`${target.constructor.name}.${propertyName}`);
try {
span.setAttributes({
'operation.name': operationName,
'method.name': propertyName,
'class.name': target.constructor.name
});
const result = await originalMethod.apply(this, args);
span.setStatus({ code: require('@opentelemetry/api').SpanStatusCode.OK });
return result;
} catch (error) {
span.recordException(error);
span.setStatus({
code: require('@opentelemetry/api').SpanStatusCode.ERROR,
message: error.message
});
throw error;
} finally {
span.end();
}
};
return descriptor;
};
}
}
// 业务服务追踪
class UserService {
constructor() {
this.tracing = new DistributedTracing();
}
@tracing.traceMethod('user.create')
async createUser(userData) {
const span = require('@opentelemetry/api').trace.getActiveSpan();
span?.setAttributes({
'user.email': userData.email,
'user.role': userData.role
});
// 数据库操作会自动被追踪
const user = await database.users.create(userData);
// 外部API调用
await this.sendWelcomeEmail(user.email);
return user;
}
async sendWelcomeEmail(email) {
const tracer = require('@opentelemetry/api').trace.getTracer('nodejs-app');
const span = tracer.startSpan('email.send_welcome');
try {
span.setAttributes({
'email.recipient': email,
'email.type': 'welcome'
});
// 外部服务调用
const response = await fetch('https://email-service.com/api/send', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ to: email, template: 'welcome' })
});
span.setAttributes({
'http.status_code': response.status,
'email.sent': response.ok
});
if (!response.ok) {
throw new Error(`Email service error: ${response.status}`);
}
} catch (error) {
span.recordException(error);
throw error;
} finally {
span.end();
}
}
}
// Express 中间件追踪
function createTracingMiddleware() {
return (req, res, next) => {
const tracer = require('@opentelemetry/api').trace.getTracer('nodejs-app');
const span = tracer.startSpan(`${req.method} ${req.route?.path || req.path}`);
span.setAttributes({
'http.method': req.method,
'http.url': req.url,
'http.route': req.route?.path,
'user.id': req.user?.id,
'request.id': req.requestId
});
// 将 span 添加到请求对象
req.span = span;
res.on('finish', () => {
span.setAttributes({
'http.status_code': res.statusCode,
'http.response.size': res.get('content-length') || 0
});
if (res.statusCode >= 400) {
span.setStatus({
code: require('@opentelemetry/api').SpanStatusCode.ERROR,
message: `HTTP ${res.statusCode}`
});
}
span.end();
});
next();
};
}
2. 自定义追踪
javascript
// custom-tracing.js
class CustomTracing {
constructor() {
this.activeSpans = new Map();
}
// 开始业务操作追踪
startBusinessOperation(operationName, context = {}) {
const tracer = require('@opentelemetry/api').trace.getTracer('nodejs-app');
const span = tracer.startSpan(operationName);
span.setAttributes({
'business.operation': operationName,
...context
});
const operationId = require('uuid').v4();
this.activeSpans.set(operationId, span);
return operationId;
}
// 添加业务事件
addBusinessEvent(operationId, eventName, data = {}) {
const span = this.activeSpans.get(operationId);
if (span) {
span.addEvent(eventName, {
timestamp: Date.now(),
...data
});
}
}
// 结束业务操作
endBusinessOperation(operationId, success = true, error = null) {
const span = this.activeSpans.get(operationId);
if (span) {
if (success) {
span.setStatus({ code: require('@opentelemetry/api').SpanStatusCode.OK });
} else {
span.setStatus({
code: require('@opentelemetry/api').SpanStatusCode.ERROR,
message: error?.message || 'Operation failed'
});
if (error) {
span.recordException(error);
}
}
span.end();
this.activeSpans.delete(operationId);
}
}
// 数据库操作追踪
async traceDbOperation(operation, table, query, params = []) {
const tracer = require('@opentelemetry/api').trace.getTracer('nodejs-app');
const span = tracer.startSpan(`db.${operation}`);
span.setAttributes({
'db.system': 'postgresql',
'db.operation': operation,
'db.table': table,
'db.statement': query
});
const startTime = Date.now();
try {
const result = await database.query(query, params);
span.setAttributes({
'db.rows_affected': result.rowCount || 0,
'db.duration': Date.now() - startTime
});
return result;
} catch (error) {
span.recordException(error);
span.setStatus({
code: require('@opentelemetry/api').SpanStatusCode.ERROR,
message: error.message
});
throw error;
} finally {
span.end();
}
}
// 缓存操作追踪
async traceCacheOperation(operation, key, value = null) {
const tracer = require('@opentelemetry/api').trace.getTracer('nodejs-app');
const span = tracer.startSpan(`cache.${operation}`);
span.setAttributes({
'cache.operation': operation,
'cache.key': key,
'cache.system': 'redis'
});
try {
let result;
switch (operation) {
case 'get':
result = await redis.get(key);
span.setAttributes({
'cache.hit': result !== null
});
break;
case 'set':
result = await redis.set(key, value);
break;
case 'del':
result = await redis.del(key);
break;
}
return result;
} catch (error) {
span.recordException(error);
throw error;
} finally {
span.end();
}
}
}
// 使用示例
const customTracing = new CustomTracing();

// Order creation traced as one business operation with step events.
app.post('/orders', async (req, res) => {
  // `req.user?.id` rather than `req.user.id`: this line runs before the `try`,
  // so a missing user would otherwise reject the handler without a response.
  const operationId = customTracing.startBusinessOperation('create_order', {
    'user.id': req.user?.id,
    'order.type': req.body.type,
  });

  try {
    // Validate the order data
    customTracing.addBusinessEvent(operationId, 'validation_started');
    const validatedData = await validateOrderData(req.body);
    customTracing.addBusinessEvent(operationId, 'validation_completed');

    // Check inventory
    // NOTE(review): the query result is not inspected, so stock is read but
    // never enforced — confirm whether a quantity check belongs here.
    customTracing.addBusinessEvent(operationId, 'inventory_check_started');
    const inventoryCheck = await customTracing.traceDbOperation(
      'select', 'inventory',
      'SELECT quantity FROM inventory WHERE product_id = $1',
      [validatedData.productId]
    );
    customTracing.addBusinessEvent(operationId, 'inventory_check_completed');

    // Create the order
    customTracing.addBusinessEvent(operationId, 'order_creation_started');
    const insertResult = await customTracing.traceDbOperation(
      'insert', 'orders',
      'INSERT INTO orders (user_id, product_id, quantity, amount) VALUES ($1, $2, $3, $4) RETURNING *',
      [req.user.id, validatedData.productId, validatedData.quantity, validatedData.amount]
    );
    // traceDbOperation returns the query result, not the row: the inserted order
    // is the first returned row. Falls back to the result itself in case
    // database.query already resolves to a row — assumes a result object with a
    // `rows` array; TODO confirm against the database helper.
    const order = insertResult.rows?.[0] ?? insertResult;
    customTracing.addBusinessEvent(operationId, 'order_creation_completed');

    // Update the cache
    await customTracing.traceCacheOperation('set', `order:${order.id}`, JSON.stringify(order));

    customTracing.endBusinessOperation(operationId, true);
    res.json(order);
  } catch (error) {
    customTracing.endBusinessOperation(operationId, false, error);
    res.status(500).json({ error: error.message });
  }
});
🚨 告警系统
1. Alertmanager 配置
yaml
# alertmanager.yml
global:
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'alerts@company.com'
  smtp_auth_username: 'alerts@company.com'
  # Placeholder only: do not commit a real SMTP password to this file.
  smtp_auth_password: 'password'

route:
  group_by: ['alertname', 'service']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'default'
  routes:
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 10s
      repeat_interval: 1h
    - match:
        severity: warning
      receiver: 'warning-alerts'
      repeat_interval: 6h

receivers:
  - name: 'default'
    email_configs:
      - to: 'team@company.com'
        headers:
          Subject: '{{ .GroupLabels.alertname }} - {{ .Status }}'
        # The plain-text body field of an email receiver is `text`
        # (`html` for HTML); `body` is not a valid field. The subject is set
        # through `headers`, which is where email receivers take it from.
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          Service: {{ .Labels.service }}
          Instance: {{ .Labels.instance }}
          {{ end }}

  - name: 'critical-alerts'
    email_configs:
      - to: 'oncall@company.com'
        headers:
          Subject: 'CRITICAL: {{ .GroupLabels.alertname }}'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/...'
        channel: '#critical-alerts'
        title: 'Critical Alert'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'
    webhook_configs:
      - url: 'https://api.pagerduty.com/integration/...'

  - name: 'warning-alerts'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/...'
        channel: '#alerts'
        title: 'Warning Alert'
        text: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

# Suppress a warning while a critical alert with the same name, service and
# instance is firing.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'service', 'instance']
2. 自定义告警
javascript
// alerting.js
class AlertingSystem {
constructor() {
this.alertChannels = {
slack: this.createSlackChannel(),
email: this.createEmailChannel(),
webhook: this.createWebhookChannel()
};
this.alertRules = new Map();
this.setupAlertRules();
}
setupAlertRules() {
// 错误率告警
this.alertRules.set('high_error_rate', {
condition: (metrics) => metrics.errorRate > 0.05,
severity: 'critical',
message: (metrics) => `错误率过高: ${(metrics.errorRate * 100).toFixed(2)}%`,
cooldown: 300000 // 5分钟冷却
});
// 响应时间告警
this.alertRules.set('high_response_time', {
condition: (metrics) => metrics.avgResponseTime > 2000,
severity: 'warning',
message: (metrics) => `平均响应时间过高: ${metrics.avgResponseTime}ms`,
cooldown: 600000 // 10分钟冷却
});
// 内存使用告警
this.alertRules.set('high_memory_usage', {
condition: (metrics) => metrics.memoryUsage > 0.9,
severity: 'warning',
message: (metrics) => `内存使用率过高: ${(metrics.memoryUsage * 100).toFixed(2)}%`,
cooldown: 900000 // 15分钟冷却
});
// 数据库连接池告警
this.alertRules.set('db_pool_exhausted', {
condition: (metrics) => metrics.dbPoolWaiting > 0,
severity: 'critical',
message: (metrics) => `数据库连接池耗尽: ${metrics.dbPoolWaiting} 个连接等待`,
cooldown: 60000 // 1分钟冷却
});
}
createSlackChannel() {
const { WebClient } = require('@slack/web-api');
return new WebClient(process.env.SLACK_TOKEN);
}
createEmailChannel() {
const nodemailer = require('nodemailer');
return nodemailer.createTransporter({
service: 'gmail',
auth: {
user: process.env.EMAIL_USER,
pass: process.env.EMAIL_PASSWORD
}
});
}
createWebhookChannel() {
return {
send: async (url, data) => {
const response = await fetch(url, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(data)
});
return response.ok;
}
};
}
async checkAlerts(metrics) {
const currentTime = Date.now();
for (const [ruleName, rule] of this.alertRules) {
const lastAlert = this.getLastAlertTime(ruleName);
// 检查冷却时间
if (lastAlert && (currentTime - lastAlert) < rule.cooldown) {
continue;
}
// 检查告警条件
if (rule.condition(metrics)) {
await this.triggerAlert(ruleName, rule, metrics);
this.setLastAlertTime(ruleName, currentTime);
}
}
}
async triggerAlert(ruleName, rule, metrics) {
const alert = {
name: ruleName,
severity: rule.severity,
message: rule.message(metrics),
timestamp: new Date().toISOString(),
service: process.env.SERVICE_NAME || 'nodejs-app',
instance: require('os').hostname(),
metrics: metrics
};
console.log(`触发告警: ${alert.name} - ${alert.message}`);
// 发送到不同渠道
await Promise.all([
this.sendSlackAlert(alert),
this.sendEmailAlert(alert),
this.sendWebhookAlert(alert)
]);
}
async sendSlackAlert(alert) {
if (!this.alertChannels.slack) return;
const color = alert.severity === 'critical' ? 'danger' : 'warning';
const channel = alert.severity === 'critical' ? '#critical-alerts' : '#alerts';
try {
await this.alertChannels.slack.chat.postMessage({
channel: channel,
attachments: [{
color: color,
title: `${alert.severity.toUpperCase()}: ${alert.name}`,
text: alert.message,
fields: [
{ title: 'Service', value: alert.service, short: true },
{ title: 'Instance', value: alert.instance, short: true },
{ title: 'Time', value: alert.timestamp, short: true }
]
}]
});
} catch (error) {
console.error('Slack 告警发送失败:', error);
}
}
async sendEmailAlert(alert) {
if (!this.alertChannels.email) return;
const recipients = alert.severity === 'critical'
? process.env.CRITICAL_ALERT_EMAILS?.split(',')
: process.env.ALERT_EMAILS?.split(',');
if (!recipients) return;
try {
await this.alertChannels.email.sendMail({
from: process.env.EMAIL_FROM,
to: recipients.join(','),
subject: `${alert.severity.toUpperCase()}: ${alert.name}`,
html: `
<h2>${alert.message}</h2>
<p><strong>Service:</strong> ${alert.service}</p>
<p><strong>Instance:</strong> ${alert.instance}</p>
<p><strong>Time:</strong> ${alert.timestamp}</p>
<p><strong>Metrics:</strong></p>
<pre>${JSON.stringify(alert.metrics, null, 2)}</pre>
`
});
} catch (error) {
console.error('邮件告警发送失败:', error);
}
}
async sendWebhookAlert(alert) {
const webhookUrl = process.env.ALERT_WEBHOOK_URL;
if (!webhookUrl) return;
try {
await this.alertChannels.webhook.send(webhookUrl, alert);
} catch (error) {
console.error('Webhook 告警发送失败:', error);
}
}
getLastAlertTime(ruleName) {
// 实际应用中应该使用持久化存储
return this.lastAlertTimes?.get(ruleName);
}
setLastAlertTime(ruleName, time) {
if (!this.lastAlertTimes) {
this.lastAlertTimes = new Map();
}
this.lastAlertTimes.set(ruleName, time);
}
}
// 使用示例
const alerting = new AlertingSystem();

// Evaluate the alert rules every 30 seconds.
setInterval(async () => {
  // Nothing awaits the promise returned by this callback, so a failure while
  // collecting or checking must be handled here; otherwise it surfaces as an
  // unhandled rejection.
  try {
    const metrics = await collectMetrics();
    await alerting.checkAlerts(metrics);
  } catch (error) {
    console.error('告警检查失败:', error);
  }
}, 30000);
/**
 * Gathers the metrics snapshot evaluated by the alert rules.
 *
 * `memoryUsage` is the ratio of used heap to total heap as reported by
 * `process.memoryUsage()`. The remaining values come from helpers defined
 * elsewhere, called one after another in the order shown.
 * @returns {Promise<{errorRate: *, avgResponseTime: *, memoryUsage: number, dbPoolWaiting: *, activeConnections: *}>}
 */
async function collectMetrics() {
  const errorRate = await calculateErrorRate();
  const avgResponseTime = await calculateAvgResponseTime();

  const heapUsed = process.memoryUsage().heapUsed;
  const heapTotal = process.memoryUsage().heapTotal;
  const memoryUsage = heapUsed / heapTotal;

  const dbPoolWaiting = await getDbPoolWaitingCount();
  const activeConnections = getActiveConnectionCount();

  return { errorRate, avgResponseTime, memoryUsage, dbPoolWaiting, activeConnections };
}
📚 最佳实践总结
- 全面覆盖:实现指标、日志、追踪的完整监控
- 标准化:使用标准格式和命名规范
- 自动化:自动化告警和响应流程
- 可视化:构建直观的监控仪表板
- 性能考虑:监控系统本身不应影响应用性能
- 数据保留:合理设置数据保留策略
- 安全性:保护监控数据的安全性
- 文档化:维护监控系统的文档和运维手册
通过掌握这些云原生监控技术,您将能够构建完整的可观测性体系,确保系统的稳定运行。