监控基础
📋 概述
应用程序监控是确保系统稳定性和性能的关键实践。通过收集、分析和可视化各种指标，我们可以及时发现问题、优化性能并提高用户体验。
🎯 学习目标
- 理解监控的核心概念和重要性
- 掌握Node.js应用的监控策略
- 学会设置关键指标和告警
- 了解监控工具和最佳实践
📚 监控基础概念
监控的四个黄金信号
1. 延迟(Latency)
处理一个请求所花费的时间，成功请求和失败请求都要统计；两者最好分开观察，因为快速失败的请求会拉低整体延迟。
javascript
// Express middleware that records the response time of every request.
const responseTime = require('response-time');

/**
 * Logs one request and forwards its duration to the metrics backend.
 *
 * @param {object} req - Express request; `method`, `url` and `route` are read.
 * @param {object} res - Express response; `statusCode` is read.
 * @param {number} time - Duration supplied by `response-time` (logged with an `ms` suffix).
 */
function recordResponseTime(req, res, time) {
  // Human-readable trace of the request.
  console.log(`${req.method} ${req.url} - ${time}ms`);

  // Ship the same measurement to the monitoring system.
  const labels = {
    method: req.method,
    // Prefer the matched route pattern; fall back to the raw URL when no route matched.
    route: req.route?.path || req.url,
    status: res.statusCode,
  };
  metrics.histogram('http_request_duration_ms', time, labels);
}

app.use(responseTime(recordResponseTime));
2. 流量(Traffic)
系统处理的请求数量。
javascript
// Count every request once its response has been sent.
app.use((req, res, next) => {
  // `req.route` is only populated after Express has matched a route handler,
  // which happens after this middleware runs. Reading it on 'finish' lets the
  // label carry the route pattern instead of always falling back to the raw URL.
  res.on('finish', () => {
    metrics.increment('http_requests_total', {
      method: req.method,
      route: req.route?.path || req.url,
    });
  });
  next();
});
3. 错误率(Errors)
失败请求的比例。
javascript
// Error-handling middleware: counts the failure, logs it, and answers with a 500.
app.use((err, req, res, next) => {
  // Record the error in the metrics backend, labelled by request and error type.
  metrics.increment('http_errors_total', {
    method: req.method,
    route: req.route?.path || req.url,
    error: err.name,
  });

  console.error('Request error:', {
    method: req.method,
    url: req.url,
    error: err.message,
    stack: err.stack,
  });

  // If the response has already started streaming, a second status/body cannot
  // be written; hand the error to Express' default handler instead.
  if (res.headersSent) {
    return next(err);
  }

  res.status(500).json({ error: 'Internal Server Error' });
});
4. 饱和度(Saturation)
系统资源的使用程度。
javascript
const os = require('os');
/**
 * Publishes one sample of process memory, process CPU time and OS load average
 * as gauges on the `metrics` client.
 */
function collectSaturationMetrics() {
  const { rss, heapUsed, heapTotal } = process.memoryUsage();
  const { user, system } = process.cpuUsage();

  // Memory, in bytes.
  metrics.gauge('memory_usage_bytes', rss);
  metrics.gauge('memory_heap_used_bytes', heapUsed);
  metrics.gauge('memory_heap_total_bytes', heapTotal);

  // CPU time as reported by process.cpuUsage() (not a percentage).
  metrics.gauge('cpu_user_microseconds', user);
  metrics.gauge('cpu_system_microseconds', system);

  // System load averages over 1, 5 and 15 minutes.
  const [load1, load5, load15] = os.loadavg();
  metrics.gauge('system_load_1m', load1);
  metrics.gauge('system_load_5m', load5);
  metrics.gauge('system_load_15m', load15);
}

// Sample every 10 seconds.
setInterval(collectSaturationMetrics, 10000);
🛠 Node.js应用监控实现
基础监控设置
健康检查端点
javascript
// health.js
const express = require('express');
const router = express.Router();
let isHealthy = true;
let readinessChecks = [];
/**
 * Registers a readiness check to be run by the `/ready` endpoint.
 *
 * @param {string} name - Key under which the check's result is reported.
 * @param {Function} checkFn - Function invoked to perform the check.
 */
function addReadinessCheck(name, checkFn) {
  const entry = { name, check: checkFn };
  readinessChecks.push(entry);
}
// Liveness endpoint: 200 with basic process info while `isHealthy` is true,
// otherwise 503.
router.get('/health', (req, res) => {
  if (!isHealthy) {
    res.status(503).json({
      status: 'DOWN',
      timestamp: new Date().toISOString(),
    });
    return;
  }

  const body = {
    status: 'UP',
    timestamp: new Date().toISOString(),
    uptime: process.uptime(),
    // Falls back to '1.0.0' when APP_VERSION is not set.
    version: process.env.APP_VERSION || '1.0.0',
  };
  res.status(200).json(body);
});
// Readiness endpoint: runs every registered check and reports 200 only when
// all of them succeed, 503 otherwise.
router.get('/ready', async (req, res) => {
  // The checks are independent of each other, so run them concurrently; the
  // endpoint then takes as long as the slowest check rather than the sum.
  const outcomes = await Promise.all(
    readinessChecks.map(async ({ name, check }) => {
      try {
        const result = await check();
        return { name, ready: true, report: { status: 'UP', ...result } };
      } catch (error) {
        // A check may throw something that is not an Error; never let the
        // formatting of the failure reject the whole handler.
        const reason = error?.message ?? String(error);
        return { name, ready: false, report: { status: 'DOWN', error: reason } };
      }
    })
  );

  // Promise.all preserves input order, so results keep registration order.
  const results = {};
  for (const { name, report } of outcomes) {
    results[name] = report;
  }
  const allReady = outcomes.every(({ ready }) => ready);

  res.status(allReady ? 200 : 503).json({
    status: allReady ? 'READY' : 'NOT_READY',
    checks: results,
    timestamp: new Date().toISOString(),
  });
});
/**
 * Awaits `probe()` and resolves with the elapsed wall-clock time.
 *
 * @param {Function} probe - Operation to time.
 * @returns {Promise<{responseTime: number}>} Elapsed time in milliseconds.
 */
async function timeProbe(probe) {
  const startedAt = Date.now();
  await probe();
  return { responseTime: Date.now() - startedAt };
}

// Database connectivity check: a trivial query round-trip.
addReadinessCheck('database', () => timeProbe(() => db.query('SELECT 1')));

// Redis connectivity check: a PING round-trip.
addReadinessCheck('redis', () => timeProbe(() => redis.ping()));
module.exports = router;
指标收集中间件
javascript
// metrics.js
const client = require('prom-client');
// Registry that every metric below is attached to.
const register = new client.Registry();

// Default metrics provided by prom-client, collected into the same registry.
client.collectDefaultMetrics({ register });

// Label sets shared by the paired counter/histogram definitions.
const HTTP_LABELS = ['method', 'route', 'status_code'];
const DB_LABELS = ['operation', 'table'];

// --- HTTP request metrics ---
const httpRequestsTotal = new client.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: [...HTTP_LABELS],
  registers: [register],
});

const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: [...HTTP_LABELS],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
  registers: [register],
});

// --- Database query metrics ---
const dbQueriesTotal = new client.Counter({
  name: 'db_queries_total',
  help: 'Total number of database queries',
  labelNames: [...DB_LABELS],
  registers: [register],
});

const dbQueryDuration = new client.Histogram({
  name: 'db_query_duration_seconds',
  help: 'Duration of database queries in seconds',
  labelNames: [...DB_LABELS],
  buckets: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1],
  registers: [register],
});

// --- Business metrics ---
const activeUsers = new client.Gauge({
  name: 'active_users_total',
  help: 'Number of active users',
  registers: [register],
});

const ordersTotal = new client.Counter({
  name: 'orders_total',
  help: 'Total number of orders',
  labelNames: ['status'],
  registers: [register],
});
/**
 * Builds an Express middleware that records, for every finished response, one
 * increment of `httpRequestsTotal` and one observation of `httpRequestDuration`.
 *
 * @returns {Function} Middleware with the `(req, res, next)` signature.
 */
function monitoringMiddleware() {
  return function trackRequest(req, res, next) {
    const startedAt = Date.now();

    // Fresh label object per metric call, built when the response has finished
    // so that the final status code (and matched route, if any) is visible.
    const buildLabels = () => ({
      method: req.method,
      route: req.route?.path || req.url,
      status_code: res.statusCode,
    });

    const onFinish = () => {
      // Histogram is declared in seconds; Date.now() is milliseconds.
      const elapsedSeconds = (Date.now() - startedAt) / 1000;
      httpRequestsTotal.inc(buildLabels());
      httpRequestDuration.observe(buildLabels(), elapsedSeconds);
    };

    res.on('finish', onFinish);
    next();
  };
}
/**
 * Method-decorator factory that instruments an async database method.
 *
 * On success the wrapped call increments `dbQueriesTotal` and observes its
 * duration (seconds) in `dbQueryDuration`. On failure it increments
 * `dbQueriesTotal` under the operation label `<operation>_error` and rethrows.
 *
 * @param {string} operation - Value for the `operation` label.
 * @param {string} table - Value for the `table` label.
 * @returns {Function} Decorator `(target, propertyKey, descriptor) => descriptor`.
 */
function monitorDbQuery(operation, table) {
  return (target, propertyKey, descriptor) => {
    const wrapped = descriptor.value;

    // Regular function (not an arrow) so `this` is the instance the method is called on.
    descriptor.value = async function (...args) {
      const startedAt = Date.now();
      let result;
      try {
        result = await wrapped.apply(this, args);
        dbQueriesTotal.inc({ operation, table });
        const elapsedSeconds = (Date.now() - startedAt) / 1000;
        dbQueryDuration.observe({ operation, table }, elapsedSeconds);
      } catch (error) {
        dbQueriesTotal.inc({ operation: `${operation}_error`, table });
        throw error;
      }
      return result;
    };

    return descriptor;
  };
}
/**
 * Builds the request handler that exposes the registry's metrics for scraping.
 *
 * @returns {Function} Async `(req, res)` handler. Responds with the serialized
 *   metrics and the registry's content type, or with status 500 and the error
 *   message as a plain string when serialization fails.
 */
function metricsEndpoint() {
  return async (req, res) => {
    try {
      // Serialize first so the content-type header is only set once there is
      // a metrics body to go with it.
      const body = await register.metrics();
      res.set('Content-Type', register.contentType);
      res.end(body);
    } catch (error) {
      // res.end() only accepts a string or Buffer; passing the Error object
      // itself makes Node throw a TypeError from inside this catch block.
      const message = error instanceof Error ? error.message : String(error);
      res.status(500).end(message);
    }
  };
}
// Public surface of this module: the registry, the middleware/decorator/endpoint
// factories, and the individual metric objects grouped under `metrics`.
module.exports = {
register,
monitoringMiddleware,
monitorDbQuery,
metricsEndpoint,
metrics: {
httpRequestsTotal,
httpRequestDuration,
dbQueriesTotal,
dbQueryDuration,
activeUsers,
ordersTotal
}
};
应用程序集成
javascript
// app.js
const express = require('express');
const { monitoringMiddleware, metricsEndpoint } = require('./middleware/metrics');
const healthRouter = require('./routes/health');
const app = express();

// Monitoring middleware goes first so every route registered below is measured.
app.use(monitoringMiddleware());

// Health-check routes. The router already declares '/health' and '/ready', so
// it is mounted at the root; mounting it under '/health' would expose the
// endpoints as '/health/health' and '/health/ready'.
app.use(healthRouter);

// Metrics endpoint.
app.get('/metrics', metricsEndpoint());

// Business routes.
app.use('/api', require('./routes/api'));

// Final error handler.
app.use((err, req, res, next) => {
  console.error('Unhandled error:', err);

  // Once the response has started, a new status/body cannot be written;
  // delegate to Express' default error handler.
  if (res.headersSent) {
    return next(err);
  }

  // No manual counter increment here: monitoringMiddleware records every
  // response (including this 500) on 'finish', so counting again would double
  // the error. The previous increment also referenced `metrics`, which this
  // file never imports.
  res.status(500).json({ error: 'Internal Server Error' });
});

const port = process.env.PORT || 3000;
app.listen(port, () => {
  console.log(`Server running on port ${port}`);
});
📊 关键指标定义
系统指标
javascript
// system-metrics.js
const os = require('os');
const fs = require('fs').promises;
const { metrics } = require('./metrics');
/**
 * Periodically samples process and OS statistics and writes them to gauges on
 * the imported `metrics` object.
 */
class SystemMetrics {
  constructor() {
    // Reference point for the uptime gauge.
    this.startTime = Date.now();
    // Handle of the active sampling timer, or null when not running.
    this.collectInterval = null;
  }

  /**
   * Starts periodic collection. Calling it while already running replaces the
   * existing timer instead of leaking it.
   *
   * @param {number} [interval=10000] - Sampling period in milliseconds.
   */
  start(interval = 10000) {
    this.stop();
    this.collectInterval = setInterval(() => {
      // collectMetrics() handles its own errors, so the promise is not awaited.
      void this.collectMetrics();
    }, interval);
  }

  /** Stops periodic collection; safe to call when not running. */
  stop() {
    if (this.collectInterval) {
      clearInterval(this.collectInterval);
      this.collectInterval = null;
    }
  }

  /**
   * Takes one sample of every metric. Errors are logged, never thrown.
   *
   * @returns {Promise<void>}
   */
  async collectMetrics() {
    try {
      // Memory (bytes).
      const memUsage = process.memoryUsage();
      metrics.memoryUsage.set(memUsage.rss);
      metrics.heapUsed.set(memUsage.heapUsed);
      metrics.heapTotal.set(memUsage.heapTotal);
      metrics.external.set(memUsage.external);

      // CPU time as reported by process.cpuUsage().
      const cpuUsage = process.cpuUsage();
      metrics.cpuUser.set(cpuUsage.user);
      metrics.cpuSystem.set(cpuUsage.system);

      // System load averages (1, 5, 15 minutes).
      const [load1, load5, load15] = os.loadavg();
      metrics.systemLoad1.set(load1);
      metrics.systemLoad5.set(load5);
      metrics.systemLoad15.set(load15);

      // Milliseconds since this collector was constructed.
      metrics.uptime.set(Date.now() - this.startTime);

      // Open file descriptors: best effort, must not abort the rest of the sample.
      try {
        const fdCount = await this.getFileDescriptorCount();
        // A negative count is the "unavailable" sentinel; publishing it would
        // show up as a bogus data point.
        if (fdCount >= 0) {
          metrics.openFileDescriptors.set(fdCount);
        }
      } catch (error) {
        console.warn('Failed to get file descriptor count:', error.message);
      }

      // Event-loop lag: time until a setImmediate callback actually runs.
      const start = process.hrtime();
      setImmediate(() => {
        const [seconds, nanoseconds] = process.hrtime(start);
        const millisec = (seconds * 1e9 + nanoseconds) / 1e6;
        metrics.eventLoopLag.set(millisec);
      });
    } catch (error) {
      console.error('Error collecting system metrics:', error);
    }
  }

  /**
   * Counts the entries of `/proc/<pid>/fd`.
   *
   * @returns {Promise<number>} The count, or -1 when the directory cannot be
   *   read (e.g. on non-Linux systems).
   */
  async getFileDescriptorCount() {
    try {
      const fdDir = await fs.readdir(`/proc/${process.pid}/fd`);
      return fdDir.length;
    } catch (error) {
      // Fallback for non-Linux systems
      return -1;
    }
  }
}
module.exports = SystemMetrics;
业务指标
javascript
// business-metrics.js
const { metrics } = require('./metrics');
/**
 * Records business-level events (sessions, orders, API calls, errors) on the
 * imported `metrics` object and keeps an in-memory map of logged-in users.
 */
class BusinessMetrics {
  constructor() {
    // userId -> login timestamp (ms since epoch).
    this.userSessions = new Map();
  }

  /** Registers a login for `userId` and refreshes the active-user gauge. */
  userLogin(userId) {
    const sessions = this.userSessions;
    sessions.set(userId, Date.now());
    metrics.activeUsers.set(sessions.size);
    metrics.userLogins.inc();
  }

  /**
   * Ends the session of `userId`, observing its length in seconds.
   * Does nothing when the user has no tracked session.
   */
  userLogout(userId) {
    const sessions = this.userSessions;
    if (!sessions.has(userId)) {
      return;
    }

    const startedAt = sessions.get(userId);
    const elapsedSeconds = (Date.now() - startedAt) / 1000;
    metrics.sessionDuration.observe(elapsedSeconds);

    sessions.delete(userId);
    metrics.activeUsers.set(sessions.size);
  }

  /** Counts a created order and observes its amount. */
  orderCreated(orderId, amount, userId) {
    const labels = { status: 'created' };
    metrics.ordersTotal.inc(labels);
    metrics.orderValue.observe(amount);

    console.log(`Order created: ${orderId}, Amount: ${amount}, User: ${userId}`);
  }

  /** Counts a completed order and adds its amount to the revenue counter. */
  orderCompleted(orderId, amount) {
    const labels = { status: 'completed' };
    metrics.ordersTotal.inc(labels);
    metrics.revenue.inc(amount);

    console.log(`Order completed: ${orderId}, Revenue: ${amount}`);
  }

  /** Counts a cancelled order, also broken down by cancellation reason. */
  orderCancelled(orderId, reason) {
    const labels = { status: 'cancelled' };
    metrics.ordersTotal.inc(labels);
    metrics.orderCancellations.inc({ reason });

    console.log(`Order cancelled: ${orderId}, Reason: ${reason}`);
  }

  /** Records one API call: a count labelled with status, and its duration. */
  apiCall(endpoint, method, duration, statusCode) {
    const countLabels = { endpoint, method, status: statusCode };
    metrics.apiCalls.inc(countLabels);

    const timingLabels = { endpoint, method };
    metrics.apiDuration.observe(timingLabels, duration);
  }

  /**
   * Counts an error by type and originating module, then logs it.
   *
   * @param {Error} error - The error to record.
   * @param {object} [context={}] - Extra details; `context.module` labels the
   *   counter and defaults to 'unknown'.
   */
  trackError(error, context = {}) {
    const labels = {
      type: error.name,
      module: context.module || 'unknown',
    };
    metrics.errors.inc(labels);

    const details = {
      error: error.message,
      stack: error.stack,
      context,
    };
    console.error('Error tracked:', details);
  }

  /**
   * Drops sessions older than `maxAge` and refreshes the active-user gauge
   * when anything was removed.
   *
   * @param {number} [maxAge=86400000] - Maximum session age in ms (24 hours).
   */
  cleanupSessions(maxAge = 24 * 60 * 60 * 1000) {
    const now = Date.now();
    const sessions = this.userSessions;

    const expiredIds = [...sessions]
      .filter(([, loginTime]) => now - loginTime > maxAge)
      .map(([userId]) => userId);

    expiredIds.forEach((userId) => sessions.delete(userId));

    const cleanedCount = expiredIds.length;
    if (cleanedCount === 0) {
      return;
    }

    metrics.activeUsers.set(sessions.size);
    console.log(`Cleaned up ${cleanedCount} expired sessions`);
  }
}
module.exports = BusinessMetrics;
🚨 告警配置
告警规则定义
javascript
// alerting.js
const nodemailer = require('nodemailer');
const { Webhook } = require('discord-webhook-node');
/**
 * Evaluates a fixed set of alert rules against a metrics snapshot and sends
 * triggered alerts to the channels enabled in `config` (e-mail, Discord, Slack).
 */
class AlertManager {
  /**
   * @param {object} [config={}] - Channel settings. Reads `email`
   *   (host, port, secure, user, password, from, recipients), `discord`
   *   (enabled, webhookUrl) and `slack` (enabled, webhookUrl).
   */
  constructor(config = {}) {
    this.config = config;
    // Rule name -> timestamp (ms) of the last time it fired; drives cooldowns.
    this.alerts = new Map();
    this.emailTransporter = this.setupEmail();
    // Only build the Discord client when there is a URL to post to.
    const discordUrl = config.discord?.webhookUrl;
    this.discordWebhook = discordUrl ? new Webhook(discordUrl) : null;
  }

  /**
   * Creates the SMTP transport from `config.email`.
   *
   * @returns {object|null} The nodemailer transport, or null when e-mail is not configured.
   */
  setupEmail() {
    if (!this.config.email) return null;

    // The nodemailer factory is `createTransport`; `createTransporter` does not exist.
    return nodemailer.createTransport({
      host: this.config.email.host,
      port: this.config.email.port,
      secure: this.config.email.secure,
      auth: {
        user: this.config.email.user,
        pass: this.config.email.password,
      },
    });
  }

  /**
   * Returns the alert rules. Each rule has a `name`, a `condition(metrics)`
   * predicate, a `severity`, a `message` and a `cooldown` in milliseconds.
   *
   * @returns {object[]} The rule list.
   */
  defineAlerts() {
    return [
      {
        name: 'high_error_rate',
        condition: (metrics) => {
          const errorRate = this.calculateErrorRate(metrics);
          return errorRate > 0.05; // more than 5% of requests failed
        },
        severity: 'warning',
        message: 'High error rate detected',
        cooldown: 5 * 60 * 1000, // 5 minutes
      },
      {
        name: 'high_response_time',
        condition: (metrics) => {
          const avgResponseTime = this.getAverageResponseTime(metrics);
          return avgResponseTime > 1000; // threshold of 1000 (1 second if the value is in ms)
        },
        severity: 'warning',
        message: 'High response time detected',
        cooldown: 5 * 60 * 1000,
      },
      {
        name: 'high_memory_usage',
        condition: () => {
          const { rss } = process.memoryUsage();
          const rssGigabytes = rss / (1024 * 1024 * 1024);
          return rssGigabytes > 1; // resident set above 1 GB
        },
        severity: 'critical',
        message: 'High memory usage detected',
        cooldown: 10 * 60 * 1000, // 10 minutes
      },
      {
        name: 'service_down',
        condition: () => {
          // Fires when a critical dependency is unavailable.
          return !this.isServiceHealthy();
        },
        severity: 'critical',
        message: 'Service is down',
        cooldown: 1 * 60 * 1000, // 1 minute
      },
    ];
  }

  /**
   * Evaluates every rule and triggers those whose condition holds and whose
   * cooldown has elapsed. A rule that throws is logged and skipped.
   *
   * @param {object} [metrics={}] - Snapshot read by the rule conditions
   *   (`httpRequestsTotal`, `httpErrorsTotal`, `avgResponseTime`).
   * @returns {Promise<void>}
   */
  async checkAlerts(metrics = {}) {
    const alerts = this.defineAlerts();
    const now = Date.now();

    for (const alert of alerts) {
      const lastTriggered = this.alerts.get(alert.name) || 0;

      // Still cooling down from the previous notification.
      if (now - lastTriggered < alert.cooldown) {
        continue;
      }

      try {
        if (alert.condition(metrics)) {
          await this.triggerAlert(alert);
          this.alerts.set(alert.name, now);
        }
      } catch (error) {
        console.error(`Error checking alert ${alert.name}:`, error);
      }
    }
  }

  /**
   * Logs the alert and delivers it to every enabled channel.
   *
   * @param {object} alert - A rule returned by `defineAlerts()`.
   * @returns {Promise<void>}
   */
  async triggerAlert(alert) {
    console.warn(`🚨 ALERT: ${alert.name} - ${alert.message}`);

    const alertData = {
      name: alert.name,
      severity: alert.severity,
      message: alert.message,
      timestamp: new Date().toISOString(),
      hostname: require('os').hostname(),
      environment: process.env.NODE_ENV || 'development',
    };

    // E-mail
    if (this.emailTransporter && this.config.email?.recipients) {
      await this.sendEmailAlert(alertData);
    }

    // Discord
    if (this.config.discord?.enabled && this.discordWebhook) {
      await this.sendDiscordAlert(alertData);
    }

    // Slack
    if (this.config.slack?.enabled) {
      await this.sendSlackAlert(alertData);
    }
  }

  /**
   * Sends the alert as an HTML e-mail. Delivery failures are logged, not thrown.
   *
   * @param {object} alert - Alert data built by `triggerAlert`.
   */
  async sendEmailAlert(alert) {
    try {
      const subject = `🚨 Alert: ${alert.name} [${alert.severity.toUpperCase()}]`;
      const html = `
<h2>Alert Triggered</h2>
<p><strong>Name:</strong> ${alert.name}</p>
<p><strong>Severity:</strong> ${alert.severity}</p>
<p><strong>Message:</strong> ${alert.message}</p>
<p><strong>Time:</strong> ${alert.timestamp}</p>
<p><strong>Host:</strong> ${alert.hostname}</p>
<p><strong>Environment:</strong> ${alert.environment}</p>
`;

      await this.emailTransporter.sendMail({
        from: this.config.email.from,
        to: this.config.email.recipients,
        subject,
        html,
      });
    } catch (error) {
      console.error('Failed to send email alert:', error);
    }
  }

  /**
   * Sends the alert to Discord as an embed. Delivery failures are logged, not thrown.
   *
   * @param {object} alert - Alert data built by `triggerAlert`.
   */
  async sendDiscordAlert(alert) {
    try {
      // Red for critical, orange otherwise.
      const color = alert.severity === 'critical' ? 0xFF0000 : 0xFFA500;

      // NOTE(review): confirm that this webhook client's send() accepts a raw
      // `{ embeds: [...] }` payload rather than requiring its own message builder.
      await this.discordWebhook.send({
        embeds: [{
          title: `🚨 Alert: ${alert.name}`,
          description: alert.message,
          color,
          fields: [
            { name: 'Severity', value: alert.severity, inline: true },
            { name: 'Host', value: alert.hostname, inline: true },
            { name: 'Environment', value: alert.environment, inline: true },
          ],
          timestamp: alert.timestamp,
        }],
      });
    } catch (error) {
      console.error('Failed to send Discord alert:', error);
    }
  }

  /**
   * Posts the alert to the Slack incoming webhook at `config.slack.webhookUrl`.
   * Uses the global `fetch` (Node.js 18+). Delivery failures are logged, not thrown.
   *
   * @param {object} alert - Alert data built by `triggerAlert`.
   */
  async sendSlackAlert(alert) {
    try {
      const text = [
        `🚨 Alert: ${alert.name} [${alert.severity.toUpperCase()}]`,
        alert.message,
        `Host: ${alert.hostname} | Environment: ${alert.environment} | Time: ${alert.timestamp}`,
      ].join('\n');

      const response = await fetch(this.config.slack.webhookUrl, {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ text }),
      });
      if (!response.ok) {
        throw new Error(`Slack webhook responded with HTTP ${response.status}`);
      }
    } catch (error) {
      console.error('Failed to send Slack alert:', error);
    }
  }

  /**
   * @param {object} metrics - Snapshot with `httpRequestsTotal` and `httpErrorsTotal`.
   * @returns {number} Errors divided by total requests, or 0 when there were no requests.
   */
  calculateErrorRate(metrics) {
    const totalRequests = metrics.httpRequestsTotal || 0;
    const errorRequests = metrics.httpErrorsTotal || 0;
    return totalRequests > 0 ? errorRequests / totalRequests : 0;
  }

  /**
   * @param {object} metrics - Snapshot with `avgResponseTime`.
   * @returns {number} The average response time, or 0 when absent.
   */
  getAverageResponseTime(metrics) {
    return metrics.avgResponseTime || 0;
  }

  /**
   * Placeholder health probe.
   *
   * @returns {boolean} Always true; a real implementation should check the
   *   dependencies the service relies on.
   */
  isServiceHealthy() {
    return true;
  }
}
module.exports = AlertManager;
🔧 监控工具集成
Prometheus集成
javascript
// prometheus-config.js
const client = require('prom-client');
// 创建自定义指标
const httpRequestsTotal = new client.Counter({
name: 'http_requests_total',
help: 'Total number of HTTP requests',
labelNames: ['method', 'route', 'status_code']
});
const httpRequestDuration = new client.Histogram({
name: 'http_request_duration_seconds',
help: 'Duration of HTTP requests in seconds',
labelNames: ['method', 'route', 'status_code'],
buckets: [0.1, 0.3, 0.5, 0.7, 1, 3, 5, 7, 10]
});
const activeConnections = new client.Gauge({
name: 'active_connections',
help: 'Number of active connections'
});
// 导出配置
module.exports = {
register: client.register,
httpRequestsTotal,
httpRequestDuration,
activeConnections
};
Grafana仪表板配置
json
{
"dashboard": {
"title": "Node.js Application Dashboard",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "rate(http_requests_total[5m])",
"legendFormat": "{{method}} {{route}}"
}
]
},
{
"title": "Response Time",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "95th percentile"
},
{
"expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))",
"legendFormat": "50th percentile"
}
]
},
{
"title": "Error Rate",
"type": "singlestat",
"targets": [
{
"expr": "sum(rate(http_requests_total{status_code=~\"5..\"}[5m])) / sum(rate(http_requests_total[5m])) * 100",
"legendFormat": "Error Rate %"
}
]
},
{
"title": "Memory Usage",
"type": "graph",
"targets": [
{
"expr": "process_resident_memory_bytes",
"legendFormat": "RSS Memory"
},
{
"expr": "nodejs_heap_size_used_bytes",
"legendFormat": "Heap Used"
}
]
}
]
}
}
📝 总结
有效的监控系统应该包括:
- 全面的指标收集:系统、应用和业务指标
- 智能的告警机制:基于阈值和趋势的告警
- 可视化仪表板:直观的数据展示
- 日志聚合:结构化的日志收集和分析
- 性能追踪:请求链路追踪
监控不仅仅是收集数据，更重要的是从数据中获得洞察，持续改进系统的稳定性和性能。