告警配置
📋 概述
告警系统是监控体系的重要组成部分,能够在系统出现问题时及时通知相关人员。有效的告警配置可以帮助团队快速响应问题,减少系统停机时间。
🎯 学习目标
- 理解告警系统的核心概念
- 掌握告警规则的设计原则
- 学会配置多种告警渠道
- 了解告警疲劳的预防和处理
📚 告警系统概念
告警级别
javascript
/**
 * Severity levels an alert can carry, listed from least to most severe.
 * Frozen so that code sharing this constant cannot add or overwrite levels.
 */
const AlertLevels = Object.freeze({
  INFO: 'info', // informational notice
  WARNING: 'warning', // warning, needs attention
  CRITICAL: 'critical', // severe, must be handled immediately
  FATAL: 'fatal', // fatal, system unavailable
});
告警状态
javascript
/**
 * States an alert can be in.
 * Frozen so that code sharing this constant cannot add or overwrite states.
 */
const AlertStates = Object.freeze({
  PENDING: 'pending', // waiting
  FIRING: 'firing', // currently firing
  RESOLVED: 'resolved', // resolved
  SUPPRESSED: 'suppressed', // suppressed
});
告警生命周期
mermaid
graph LR
A[正常状态] --> B[阈值触发]
B --> C[等待确认]
C --> D[发送告警]
D --> E[问题解决]
E --> F[告警恢复]
F --> A
D --> G[告警升级]
G --> H[通知更多人员]
🛠 告警规则设计
Node.js应用告警规则
javascript
// alert-rules.js
/**
 * Evaluates metric values against threshold rules and emits "firing" and
 * "resolved" notifications through `sendNotification`.
 *
 * Per-rule tracking state lives in `alertHistory`, keyed by rule name. An
 * entry starts as 'pending' when the condition is first met and becomes
 * 'firing' once the condition has held for the rule's `duration`.
 */
class AlertRuleManager {
  constructor() {
    // `rules` and `suppressions` are initialised here but not used by any
    // method of this class.
    this.rules = [];
    this.alertHistory = new Map();
    this.suppressions = new Map();
  }

  /**
   * Returns the built-in rule definitions.
   *
   * @returns {Array<object>} Rules with `name`, `description`, `severity`,
   *   `condition` ({ metric, operator, threshold, duration }), `labels` and
   *   `annotations`.
   */
  defineRules() {
    return [
      {
        name: 'high_error_rate',
        description: '错误率过高',
        severity: 'critical',
        condition: {
          metric: 'http_error_rate',
          operator: '>',
          threshold: 5, // 5%
          duration: '5m'
        },
        labels: {
          team: 'backend',
          service: 'api'
        },
        annotations: {
          summary: 'API错误率过高',
          description: '在过去5分钟内,API错误率超过5%',
          runbook: 'https://wiki.company.com/runbooks/high-error-rate'
        }
      },
      {
        name: 'high_response_time',
        description: '响应时间过长',
        severity: 'warning',
        condition: {
          metric: 'http_response_time_p95',
          operator: '>',
          threshold: 1000, // 1 second
          duration: '10m'
        },
        labels: {
          team: 'backend',
          service: 'api'
        },
        annotations: {
          summary: 'API响应时间过长',
          description: '95分位响应时间超过1秒,持续10分钟',
          runbook: 'https://wiki.company.com/runbooks/high-response-time'
        }
      },
      {
        name: 'high_memory_usage',
        description: '内存使用率过高',
        severity: 'warning',
        condition: {
          metric: 'memory_usage_percent',
          operator: '>',
          threshold: 85,
          duration: '15m'
        },
        labels: {
          team: 'infrastructure',
          service: 'api'
        },
        annotations: {
          summary: '内存使用率过高',
          description: '内存使用率超过85%,持续15分钟',
          runbook: 'https://wiki.company.com/runbooks/high-memory-usage'
        }
      },
      {
        name: 'database_connection_exhaustion',
        description: '数据库连接池耗尽',
        severity: 'critical',
        condition: {
          metric: 'db_connection_pool_usage_percent',
          operator: '>',
          threshold: 90,
          duration: '2m'
        },
        labels: {
          team: 'database',
          service: 'postgres'
        },
        annotations: {
          summary: '数据库连接池即将耗尽',
          description: '连接池使用率超过90%,可能影响新请求',
          runbook: 'https://wiki.company.com/runbooks/db-connection-pool'
        }
      },
      {
        name: 'queue_backlog',
        description: '队列积压严重',
        severity: 'warning',
        condition: {
          metric: 'queue_size',
          operator: '>',
          threshold: 1000,
          duration: '5m'
        },
        labels: {
          team: 'backend',
          service: 'worker'
        },
        annotations: {
          summary: '队列积压严重',
          description: '队列中待处理任务超过1000个',
          runbook: 'https://wiki.company.com/runbooks/queue-backlog'
        }
      }
    ];
  }

  /**
   * Evaluates one rule against the current metric value and advances its
   * pending -> firing -> resolved state.
   *
   * @param {object} rule - Rule as returned by `defineRules`.
   * @param {number} currentValue - Latest value of the rule's metric.
   * @returns {Promise<void>}
   */
  async evaluateAlert(rule, currentValue) {
    const now = Date.now();
    const ruleKey = rule.name;

    if (!this.evaluateCondition(rule.condition, currentValue)) {
      const tracked = this.alertHistory.get(ruleKey);
      if (!tracked) return;

      // Drop the entry before notifying so that a failing notification
      // cannot leave a stale 'resolved' entry that blocks future firing.
      this.alertHistory.delete(ruleKey);
      if (tracked.state === 'firing') {
        tracked.state = 'resolved';
        await this.resolveAlert(rule, currentValue);
      }
      return;
    }

    let tracked = this.alertHistory.get(ruleKey);
    if (!tracked) {
      // First breach: remember when the condition started to hold.
      tracked = { startTime: now, state: 'pending' };
      this.alertHistory.set(ruleKey, tracked);
    }

    // Already firing: notify once per incident only.
    if (tracked.state !== 'pending') return;

    // A missing or zero duration yields 0 here, so such rules fire on the
    // very first breach instead of waiting for a second evaluation.
    const requiredDuration = this.parseDuration(rule.condition.duration);
    if (now - tracked.startTime >= requiredDuration) {
      tracked.state = 'firing';
      await this.fireAlert(rule, currentValue);
    }
  }

  /**
   * Compares a value with the rule threshold using the rule's operator.
   *
   * @param {{operator: string, threshold: number}} condition
   * @param {number} value
   * @returns {boolean} Comparison result; `false` for an unknown operator.
   */
  evaluateCondition(condition, value) {
    switch (condition.operator) {
      case '>':
        return value > condition.threshold;
      case '<':
        return value < condition.threshold;
      case '>=':
        return value >= condition.threshold;
      case '<=':
        return value <= condition.threshold;
      case '==':
        // Deliberately strict: no type coercion between value and threshold.
        return value === condition.threshold;
      case '!=':
        return value !== condition.threshold;
      default:
        return false;
    }
  }

  /**
   * Converts a duration such as '30s', '5m' or '2h' to milliseconds.
   *
   * @param {string} duration
   * @returns {number} Milliseconds, or 0 when the input is not a string or
   *   does not match `<digits><s|m|h>`.
   */
  parseDuration(duration) {
    if (typeof duration !== 'string') return 0;

    const match = duration.match(/^(\d+)([smh])$/);
    if (!match) return 0;

    const value = Number.parseInt(match[1], 10);
    const unit = match[2];
    switch (unit) {
      case 's': return value * 1000;
      case 'm': return value * 60 * 1000;
      case 'h': return value * 60 * 60 * 1000;
      default: return 0;
    }
  }

  /**
   * Logs and sends a 'firing' notification for the rule.
   *
   * @param {object} rule
   * @param {number} value - Metric value that triggered the alert.
   * @returns {Promise<void>}
   */
  async fireAlert(rule, value) {
    console.log(`🚨 ALERT FIRING: ${rule.name}`);
    const alert = {
      name: rule.name,
      severity: rule.severity,
      state: 'firing',
      value: value,
      timestamp: new Date().toISOString(),
      labels: rule.labels,
      annotations: rule.annotations
    };
    await this.sendNotification(alert);
  }

  /**
   * Logs and sends a 'resolved' notification for the rule; the summary gets
   * a recovery suffix appended.
   *
   * @param {object} rule
   * @param {number} value - Metric value at the time of recovery.
   * @returns {Promise<void>}
   */
  async resolveAlert(rule, value) {
    console.log(`✅ ALERT RESOLVED: ${rule.name}`);
    const alert = {
      name: rule.name,
      severity: rule.severity,
      state: 'resolved',
      value: value,
      timestamp: new Date().toISOString(),
      labels: rule.labels,
      annotations: {
        ...rule.annotations,
        summary: `${rule.annotations.summary} - 已恢复`
      }
    };
    await this.sendNotification(alert);
  }

  /**
   * Delivery hook. Intentionally empty in this class: the notification
   * channels are expected to be wired in here.
   *
   * @param {object} alert
   * @returns {Promise<void>}
   */
  async sendNotification(alert) {
    // Implement the delivery logic here (calls into the notification channels).
  }
}
module.exports = AlertRuleManager;
📢 通知渠道配置
Slack通知
javascript
// slack-notifier.js
const { WebClient } = require('@slack/web-api');
/**
 * Sends alert notifications to a Slack channel as a Block Kit message.
 */
class SlackNotifier {
  /**
   * @param {string} token - Token passed to the Slack `WebClient`.
   * @param {string} defaultChannel - Channel every alert is posted to.
   */
  constructor(token, defaultChannel) {
    this.client = new WebClient(token);
    this.defaultChannel = defaultChannel;
  }

  /**
   * Posts one alert to `defaultChannel`. Delivery is best-effort: a failure
   * is logged and not rethrown.
   *
   * @param {object} alert - Needs `severity`, `state` and `annotations`
   *   (`summary`, `description`, optional `runbook`); `labels` is optional.
   * @returns {Promise<void>}
   */
  async sendAlert(alert) {
    const emoji = this.getEmojiForSeverity(alert.severity);
    const labels = alert.labels ?? {};

    const blocks = [
      {
        type: 'header',
        text: {
          type: 'plain_text',
          // Header blocks are limited to 150 characters; an over-long
          // summary would make Slack reject the whole message.
          text: this.truncateForHeader(`${emoji} ${alert.annotations.summary}`)
        }
      },
      {
        type: 'section',
        fields: [
          {
            type: 'mrkdwn',
            text: `*Severity:* ${alert.severity.toUpperCase()}`
          },
          {
            type: 'mrkdwn',
            text: `*Status:* ${alert.state.toUpperCase()}`
          },
          {
            type: 'mrkdwn',
            text: `*Service:* ${labels.service || 'Unknown'}`
          },
          {
            type: 'mrkdwn',
            text: `*Team:* ${labels.team || 'Unknown'}`
          }
        ]
      },
      {
        type: 'section',
        text: {
          type: 'mrkdwn',
          text: alert.annotations.description
        }
      }
    ];

    if (alert.annotations.runbook) {
      blocks.push({
        type: 'actions',
        elements: [
          {
            type: 'button',
            text: {
              type: 'plain_text',
              text: 'View Runbook'
            },
            url: alert.annotations.runbook,
            style: 'primary'
          }
        ]
      });
    }

    try {
      await this.client.chat.postMessage({
        channel: this.defaultChannel,
        blocks: blocks,
        text: alert.annotations.summary // fallback text
      });
    } catch (error) {
      console.error('Failed to send Slack notification:', error);
    }
  }

  /**
   * Shortens text to at most `limit` UTF-16 code units without splitting a
   * surrogate pair, appending an ellipsis when something was cut.
   *
   * @param {string} text
   * @param {number} [limit=150]
   * @returns {string}
   */
  truncateForHeader(text, limit = 150) {
    if (text.length <= limit) return text;

    let kept = '';
    for (const symbol of text) {
      // Reserve one unit for the trailing ellipsis.
      if (kept.length + symbol.length > limit - 1) break;
      kept += symbol;
    }
    return `${kept}…`;
  }

  /**
   * @param {string} severity
   * @returns {string} Hex colour for the severity; grey when unknown.
   */
  getColorForSeverity(severity) {
    const colors = {
      info: '#36a64f',
      warning: '#ff9500',
      critical: '#ff0000',
      fatal: '#8b0000'
    };
    return colors[severity] || '#cccccc';
  }

  /**
   * @param {string} severity
   * @returns {string} Emoji for the severity; a question mark when unknown.
   */
  getEmojiForSeverity(severity) {
    const emojis = {
      info: '💡',
      warning: '⚠️',
      critical: '🚨',
      fatal: '💀'
    };
    return emojis[severity] || '❓';
  }
}
module.exports = SlackNotifier;
邮件通知
javascript
// email-notifier.js
const nodemailer = require('nodemailer');
/**
 * Sends alert notifications by e-mail through an SMTP transport.
 */
class EmailNotifier {
  /**
   * @param {object} config
   * @param {object} config.smtp - `host`, `port`, `secure`, `user`, `password`.
   * @param {string} config.from - Sender address.
   * @param {object} config.recipients - Recipients per team name, plus an
   *   optional `default` entry used when no team matches.
   */
  constructor(config) {
    // The nodemailer factory is `createTransport`; `createTransporter` does
    // not exist and made this constructor throw.
    this.transporter = nodemailer.createTransport({
      host: config.smtp.host,
      port: config.smtp.port,
      secure: config.smtp.secure,
      auth: {
        user: config.smtp.user,
        pass: config.smtp.password
      }
    });
    this.from = config.from;
    this.recipients = config.recipients;
  }

  /**
   * Builds and sends the alert e-mail. Delivery is best-effort: a failure
   * is logged and not rethrown.
   *
   * @param {object} alert
   * @returns {Promise<void>}
   */
  async sendAlert(alert) {
    const mailOptions = {
      from: this.from,
      to: this.getRecipientsForAlert(alert),
      subject: this.buildSubject(alert),
      text: this.buildTextBody(alert),
      html: this.buildHtmlBody(alert)
    };

    try {
      await this.transporter.sendMail(mailOptions);
    } catch (error) {
      console.error('Failed to send email notification:', error);
    }
  }

  /**
   * @param {object} alert
   * @returns {string} Subject line; any state other than 'firing' is
   *   presented as resolved.
   */
  buildSubject(alert) {
    const isFiring = alert.state === 'firing';
    const prefix = isFiring ? '🚨' : '✅';
    const action = isFiring ? 'ALERT' : 'RESOLVED';
    return `${prefix} ${action}: ${alert.annotations.summary}`;
  }

  /**
   * Escapes the characters that are significant in HTML text and in
   * double-quoted attribute values.
   *
   * @param {*} value - Converted with `String()` before escaping.
   * @returns {string}
   */
  escapeHtml(value) {
    return String(value)
      .replaceAll('&', '&amp;')
      .replaceAll('<', '&lt;')
      .replaceAll('>', '&gt;')
      .replaceAll('"', '&quot;')
      .replaceAll("'", '&#39;');
  }

  /**
   * Renders the HTML body. Every alert-supplied value is escaped so that
   * metric or label text cannot inject markup into the message.
   *
   * @param {object} alert
   * @returns {string}
   */
  buildHtmlBody(alert) {
    const statusColor = alert.state === 'firing' ? '#ff0000' : '#00ff00';
    const labels = alert.labels ?? {};
    const esc = (value) => this.escapeHtml(value);
    const cell = 'padding: 8px; border: 1px solid #ddd;';

    const runbookSection = alert.annotations.runbook ? `
<div style="margin: 20px 0;">
<a href="${esc(alert.annotations.runbook)}"
style="background-color: #007cba; color: white; padding: 10px 20px;
text-decoration: none; border-radius: 5px; display: inline-block;">
View Runbook
</a>
</div>
` : '';

    return `
<html>
<body style="font-family: Arial, sans-serif; max-width: 600px; margin: 0 auto;">
<div style="background-color: #f5f5f5; padding: 20px; border-radius: 5px;">
<h2 style="color: ${statusColor}; margin-top: 0;">
${esc(alert.annotations.summary)}
</h2>
<table style="width: 100%; border-collapse: collapse; margin: 20px 0;">
<tr>
<td style="${cell} font-weight: bold;">Severity:</td>
<td style="${cell}">${esc(alert.severity.toUpperCase())}</td>
</tr>
<tr>
<td style="${cell} font-weight: bold;">Status:</td>
<td style="${cell} color: ${statusColor};">
${esc(alert.state.toUpperCase())}
</td>
</tr>
<tr>
<td style="${cell} font-weight: bold;">Service:</td>
<td style="${cell}">${esc(labels.service || 'Unknown')}</td>
</tr>
<tr>
<td style="${cell} font-weight: bold;">Team:</td>
<td style="${cell}">${esc(labels.team || 'Unknown')}</td>
</tr>
<tr>
<td style="${cell} font-weight: bold;">Time:</td>
<td style="${cell}">${esc(alert.timestamp)}</td>
</tr>
</table>
<div style="margin: 20px 0;">
<h3>Description:</h3>
<p>${esc(alert.annotations.description)}</p>
</div>
${runbookSection}
</div>
</body>
</html>
`;
  }

  /**
   * Renders the plain-text alternative of the message.
   *
   * @param {object} alert
   * @returns {string}
   */
  buildTextBody(alert) {
    const labels = alert.labels ?? {};
    const { summary, description, runbook } = alert.annotations;

    return [
      `ALERT: ${summary}`,
      `Severity: ${alert.severity.toUpperCase()}`,
      `Status: ${alert.state.toUpperCase()}`,
      `Service: ${labels.service || 'Unknown'}`,
      `Team: ${labels.team || 'Unknown'}`,
      `Time: ${alert.timestamp}`,
      'Description:',
      `${description}`,
      runbook ? `Runbook: ${runbook}` : ''
    ].join('\n').trim();
  }

  /**
   * Picks the recipients by the alert's team label.
   *
   * @param {object} alert
   * @returns {*} The team's recipients when configured, otherwise
   *   `recipients.default`, otherwise an empty array.
   */
  getRecipientsForAlert(alert) {
    const recipients = this.recipients ?? {};
    const team = alert.labels?.team;
    if (team && recipients[team]) {
      return recipients[team];
    }
    return recipients.default || [];
  }
}
module.exports = EmailNotifier;
钉钉通知
javascript
// dingtalk-notifier.js
const crypto = require('crypto');
const axios = require('axios');
/**
 * Sends alert notifications to a DingTalk robot webhook as markdown messages.
 */
class DingTalkNotifier {
  /**
   * @param {string} webhook - Robot webhook URL.
   * @param {string} [secret] - Signing secret; when omitted, requests are
   *   posted to the webhook unsigned.
   */
  constructor(webhook, secret) {
    this.webhook = webhook;
    this.secret = secret;
  }

  /**
   * Posts one alert to the webhook. Delivery is best-effort: a failure is
   * logged and not rethrown.
   *
   * @param {object} alert
   * @returns {Promise<void>}
   */
  async sendAlert(alert) {
    const url = this.buildSignedUrl(Date.now());
    const payload = {
      msgtype: 'markdown',
      markdown: {
        title: alert.annotations.summary,
        text: this.buildMarkdownContent(alert)
      },
      at: {
        atMobiles: this.getAtMobiles(alert),
        isAtAll: alert.severity === 'critical' || alert.severity === 'fatal'
      }
    };

    try {
      await axios.post(url, payload);
    } catch (error) {
      console.error('Failed to send DingTalk notification:', error);
    }
  }

  /**
   * Appends the `timestamp` and `sign` query parameters to the webhook URL.
   *
   * @param {number} timestamp - Milliseconds since the epoch.
   * @returns {string} The signed URL, or the bare webhook when no secret is
   *   configured.
   */
  buildSignedUrl(timestamp) {
    if (!this.secret) return this.webhook;

    // Use '?' only when the webhook carries no query string yet.
    const separator = this.webhook.includes('?') ? '&' : '?';
    const sign = this.generateSign(timestamp);
    return `${this.webhook}${separator}timestamp=${timestamp}&sign=${sign}`;
  }

  /**
   * Computes the request signature: HMAC-SHA256 over
   * `"<timestamp>\n<secret>"` keyed with the secret, base64-encoded and then
   * URL-encoded.
   *
   * @param {number} timestamp
   * @returns {string}
   */
  generateSign(timestamp) {
    const stringToSign = `${timestamp}\n${this.secret}`;
    return encodeURIComponent(
      crypto.createHmac('sha256', this.secret)
        .update(stringToSign)
        .digest('base64')
    );
  }

  /**
   * Renders the markdown body of the message.
   *
   * @param {object} alert
   * @returns {string}
   */
  buildMarkdownContent(alert) {
    const statusEmoji = alert.state === 'firing' ? '🚨' : '✅';
    const severityColor = this.getSeverityColor(alert.severity);
    const runbookLink = alert.annotations.runbook
      ? `[查看处理手册](${alert.annotations.runbook})`
      : '';

    // Leading and trailing empty entries keep the surrounding newlines of
    // the message body.
    return [
      '',
      `# ${statusEmoji} ${alert.annotations.summary}`,
      '---',
      '**告警详情:**',
      `- **严重程度:** <font color="${severityColor}">${alert.severity.toUpperCase()}</font>`,
      `- **状态:** ${alert.state.toUpperCase()}`,
      `- **服务:** ${alert.labels.service || 'Unknown'}`,
      `- **团队:** ${alert.labels.team || 'Unknown'}`,
      `- **时间:** ${alert.timestamp}`,
      '**描述:**',
      `${alert.annotations.description}`,
      runbookLink,
      ''
    ].join('\n');
  }

  /**
   * @param {string} severity
   * @returns {string} Hex colour for the severity; grey when unknown.
   */
  getSeverityColor(severity) {
    const colors = {
      info: '#108ee9',
      warning: '#ff9500',
      critical: '#ff0000',
      fatal: '#8b0000'
    };
    return colors[severity] || '#cccccc';
  }

  /**
   * Chooses which phone numbers to @-mention.
   *
   * @param {object} alert
   * @returns {string[]} The team's numbers for critical/fatal alerts,
   *   otherwise an empty array.
   */
  getAtMobiles(alert) {
    const teamMobiles = {
      backend: ['13800138001', '13800138002'],
      frontend: ['13800138003', '13800138004'],
      infrastructure: ['13800138005', '13800138006']
    };

    if (alert.severity === 'critical' || alert.severity === 'fatal') {
      // Severe alerts mention everyone on the owning team.
      return teamMobiles[alert.labels.team] || [];
    }
    return [];
  }
}
module.exports = DingTalkNotifier;
🔧 告警管理器
统一告警管理
javascript
// alert-manager.js
const SlackNotifier = require('./slack-notifier');
const EmailNotifier = require('./email-notifier');
const DingTalkNotifier = require('./dingtalk-notifier');
/**
 * Central dispatcher: applies suppression and escalation, picks the
 * notification channels for an alert and fans it out to the configured
 * notifiers.
 */
class AlertManager {
  /**
   * @param {object} config - Optional `slack`, `email` and `dingtalk`
   *   sections (each enabled through its `enabled` flag) and an optional
   *   `escalation.interval` in milliseconds.
   */
  constructor(config) {
    this.config = config;
    this.notifiers = this.initializeNotifiers();
    this.suppressions = new Map();
    this.escalations = new Map();
  }

  /**
   * Creates a notifier for every channel enabled in the configuration.
   *
   * @returns {object} Notifiers keyed by channel name.
   */
  initializeNotifiers() {
    const notifiers = {};

    if (this.config.slack?.enabled) {
      notifiers.slack = new SlackNotifier(
        this.config.slack.token,
        this.config.slack.channel
      );
    }
    if (this.config.email?.enabled) {
      notifiers.email = new EmailNotifier(this.config.email);
    }
    if (this.config.dingtalk?.enabled) {
      notifiers.dingtalk = new DingTalkNotifier(
        this.config.dingtalk.webhook,
        this.config.dingtalk.secret
      );
    }
    return notifiers;
  }

  /**
   * Key shared by suppression and escalation tracking.
   *
   * @param {object} alert
   * @returns {string}
   */
  getAlertKey(alert) {
    return `${alert.name}_${alert.labels?.service}`;
  }

  /**
   * Sends the alert to every applicable, configured channel in parallel.
   * A channel failure is logged per channel and never rethrown.
   *
   * @param {object} alert
   * @returns {Promise<void>}
   */
  async sendAlert(alert) {
    const isResolved = alert.state === 'resolved';

    if (this.isAlertSuppressed(alert)) {
      // The incident is over even if nobody is told about it.
      if (isResolved) this.escalations.delete(this.getAlertKey(alert));
      console.log(`Alert ${alert.name} is suppressed`);
      return;
    }

    const escalationLevel = this.getEscalationLevel(alert);

    // Only channels that have a configured notifier can receive anything.
    const channels = this.getNotificationChannels(alert, escalationLevel)
      .filter((channel) => this.notifiers[channel]);

    // The async wrapper turns a synchronous throw into a rejection, so one
    // broken notifier cannot prevent the others from being called.
    const outcomes = await Promise.allSettled(
      channels.map(async (channel) => this.notifiers[channel].sendAlert(alert))
    );

    // "Delivered" means the notifier did not reject; a notifier that handles
    // its own errors internally is counted here as well.
    const delivered = [];
    outcomes.forEach((outcome, index) => {
      if (outcome.status === 'fulfilled') {
        delivered.push(channels[index]);
      } else {
        console.error(`Failed to send ${channels[index]} notification:`, outcome.reason);
      }
    });
    if (delivered.length > 0) {
      console.log(`Alert ${alert.name} sent to channels: ${delivered.join(', ')}`);
    }

    // Forget the escalation once the alert resolves; otherwise entries pile
    // up and a later re-fire would be escalated immediately.
    if (isResolved) this.escalations.delete(this.getAlertKey(alert));

    this.recordAlertHistory(alert);
  }

  /**
   * @param {object} alert
   * @returns {boolean} `true` while an unexpired suppression exists; an
   *   expired suppression is removed as a side effect.
   */
  isAlertSuppressed(alert) {
    const suppressionKey = this.getAlertKey(alert);
    const suppression = this.suppressions.get(suppressionKey);
    if (!suppression) return false;

    if (Date.now() > suppression.expiresAt) {
      this.suppressions.delete(suppressionKey);
      return false;
    }
    return true;
  }

  /**
   * Returns the escalation level for the alert, creating or advancing its
   * tracking entry. The level rises by one whenever more than the escalation
   * interval has passed since the previous escalation.
   *
   * @param {object} alert
   * @returns {number} 0 for a first occurrence.
   */
  getEscalationLevel(alert) {
    const escalationKey = this.getAlertKey(alert);
    const escalation = this.escalations.get(escalationKey);

    if (!escalation) {
      this.escalations.set(escalationKey, {
        level: 0,
        lastEscalation: Date.now(),
        alertCount: 1
      });
      return 0;
    }

    const timeSinceLastEscalation = Date.now() - escalation.lastEscalation;
    // `??` keeps an explicitly configured 0; the default is 30 minutes.
    const escalationInterval = this.config.escalation?.interval ?? 30 * 60 * 1000;

    if (timeSinceLastEscalation > escalationInterval) {
      escalation.level++;
      escalation.lastEscalation = Date.now();
    }
    escalation.alertCount++;
    return escalation.level;
  }

  /**
   * @param {object} alert
   * @param {number} escalationLevel
   * @returns {string[]} De-duplicated channel names, base channels first.
   */
  getNotificationChannels(alert, escalationLevel) {
    const channels = [];

    // Base channels by severity.
    if (alert.severity === 'info') {
      channels.push('slack');
    } else if (alert.severity === 'warning') {
      channels.push('slack', 'dingtalk');
    } else if (alert.severity === 'critical' || alert.severity === 'fatal') {
      channels.push('slack', 'email', 'dingtalk');
    }

    // Escalated alerts always reach e-mail; from the second level on, also
    // DingTalk.
    if (escalationLevel > 0) {
      channels.push('email');
      if (escalationLevel > 1) {
        channels.push('dingtalk');
      }
    }

    return [...new Set(channels)];
  }

  /**
   * Suppresses an alert for a service.
   *
   * @param {string} alertName
   * @param {string} service
   * @param {number} [duration=3600000] - Milliseconds; defaults to one hour.
   */
  suppressAlert(alertName, service, duration = 60 * 60 * 1000) {
    const suppressionKey = `${alertName}_${service}`;
    this.suppressions.set(suppressionKey, {
      expiresAt: Date.now() + duration
    });
  }

  /**
   * Logs the alert to the console; the place to hook in a database or log
   * system.
   *
   * @param {object} alert
   */
  recordAlertHistory(alert) {
    console.log('Alert history:', {
      name: alert.name,
      severity: alert.severity,
      state: alert.state,
      timestamp: alert.timestamp,
      labels: alert.labels
    });
  }

  /**
   * @returns {{suppressions: number, escalations: number, activeAlerts: Array}}
   *   Counts of tracked suppressions and escalations, plus one entry per
   *   tracked escalation.
   */
  getAlertStats() {
    return {
      suppressions: this.suppressions.size,
      escalations: this.escalations.size,
      activeAlerts: Array.from(this.escalations.entries()).map(([key, value]) => ({
        alert: key,
        level: value.level,
        count: value.alertCount
      }))
    };
  }
}
module.exports = AlertManager;
🚨 告警最佳实践
告警疲劳预防
javascript
// alert-fatigue-prevention.js
/**
 * Helpers that reduce alert noise: grouping, rate limiting and
 * time-of-day routing.
 */
class AlertFatiguePrevention {
  constructor() {
    // Timestamps of recently accepted alerts, keyed by "<name>_<service>".
    this.alertCounts = new Map();
    // Initialised here but not used by any method of this class.
    this.alertGroups = new Map();
  }

  /**
   * Groups alerts by service and alert-name prefix.
   *
   * @param {Array<object>} alerts
   * @returns {Array<{alerts: object[], summary: string, severity: string}>}
   *   One entry per group, carrying the highest severity seen in the group.
   */
  aggregateAlerts(alerts) {
    const groups = new Map();

    for (const alert of alerts) {
      const groupKey = this.getGroupKey(alert);
      if (!groups.has(groupKey)) {
        groups.set(groupKey, {
          alerts: [],
          summary: this.generateGroupSummary(alert),
          severity: alert.severity
        });
      }

      const group = groups.get(groupKey);
      group.alerts.push(alert);
      if (this.getSeverityLevel(alert.severity) > this.getSeverityLevel(group.severity)) {
        group.severity = alert.severity;
      }
    }

    return Array.from(groups.values());
  }

  /**
   * @param {object} alert
   * @returns {string} "<service>_<first underscore-separated part of name>".
   */
  getGroupKey(alert) {
    return `${alert.labels.service}_${alert.name.split('_')[0]}`;
  }

  /**
   * @param {object} alert
   * @returns {string}
   */
  generateGroupSummary(alert) {
    return `${alert.labels.service} service issues`;
  }

  /**
   * @param {string} severity
   * @returns {number} 1 (info) to 4 (fatal); 0 when unknown.
   */
  getSeverityLevel(severity) {
    const levels = { info: 1, warning: 2, critical: 3, fatal: 4 };
    return levels[severity] || 0;
  }

  /**
   * Rate limiter: allows at most 3 alerts per name/service pair within a
   * 15-minute window. An allowed alert is recorded; a throttled one is not.
   *
   * @param {object} alert
   * @param {number} [now=Date.now()] - Current time in milliseconds;
   *   injectable so the window can be exercised deterministically.
   * @returns {boolean} `true` when the alert should be dropped.
   */
  shouldThrottleAlert(alert, now = Date.now()) {
    const key = `${alert.name}_${alert.labels.service}`;
    const throttleWindow = 15 * 60 * 1000;
    const maxAlertsPerWindow = 3;

    // Keep only the timestamps that are still inside the window.
    const recent = (this.alertCounts.get(key) ?? [])
      .filter((timestamp) => now - timestamp < throttleWindow);
    this.alertCounts.set(key, recent);

    if (recent.length >= maxAlertsPerWindow) {
      return true;
    }
    recent.push(now);
    return false;
  }

  /**
   * Chooses channels by time of day and team.
   *
   * @param {object} alert
   * @param {Date} [now=new Date()] - Current time; its local hour decides
   *   whether it is business hours (09:00 up to, but excluding, 18:00).
   * @returns {string[]} Channel names; may be empty outside business hours
   *   for alerts below critical.
   */
  routeAlert(alert, now = new Date()) {
    const routes = [];

    const hour = now.getHours();
    // `< 18`: the hour starting at 18:00 is already outside business hours.
    const isBusinessHours = hour >= 9 && hour < 18;

    if (isBusinessHours) {
      routes.push('slack', 'email');
    } else if (alert.severity === 'critical' || alert.severity === 'fatal') {
      // Outside business hours only high-priority alerts are sent.
      routes.push('dingtalk', 'email');
    }

    if (alert.labels.team === 'oncall') {
      routes.push('phone');
    }

    return routes;
  }
}
module.exports = AlertFatiguePrevention;
告警质量评估
javascript
// alert-quality-metrics.js
/**
 * Accumulates counters about raised and resolved alerts and derives a
 * quality score from them.
 */
class AlertQualityMetrics {
  constructor() {
    this.metrics = {
      totalAlerts: 0,
      falsePositives: 0,
      resolvedAlerts: 0,
      averageResolutionTime: 0,
      alertsBySeverity: new Map(),
      alertsByService: new Map()
    };
  }

  /**
   * Counts one raised alert, overall and per severity / per service.
   *
   * @param {object} alert - Needs `severity` and `labels.service`.
   */
  recordAlert(alert) {
    const { metrics } = this;
    const increment = (counter, key) => {
      counter.set(key, (counter.get(key) || 0) + 1);
    };

    metrics.totalAlerts += 1;
    increment(metrics.alertsBySeverity, alert.severity);
    increment(metrics.alertsByService, alert.labels.service);
  }

  /**
   * Counts one resolution and folds its duration into the running average.
   *
   * @param {object} alert - Not read by this method.
   * @param {number} resolutionTime
   * @param {boolean} [wasFalsePositive=false]
   */
  recordResolution(alert, resolutionTime, wasFalsePositive = false) {
    const { metrics } = this;
    const previouslyResolved = metrics.resolvedAlerts;

    metrics.resolvedAlerts = previouslyResolved + 1;
    if (wasFalsePositive) {
      metrics.falsePositives += 1;
    }

    // Rebuild the previous total from the old average, then re-average.
    const accumulated = metrics.averageResolutionTime * previouslyResolved;
    metrics.averageResolutionTime = (accumulated + resolutionTime) / metrics.resolvedAlerts;
  }

  /**
   * @returns {number} `(1 - falsePositiveRate) * resolutionRate * 100`, or
   *   100 when no alert has been recorded.
   */
  calculateQualityScore() {
    const { totalAlerts, falsePositives, resolvedAlerts } = this.metrics;
    if (totalAlerts === 0) {
      return 100;
    }

    const falsePositiveRate = falsePositives / totalAlerts;
    const resolutionRate = resolvedAlerts / totalAlerts;
    return (1 - falsePositiveRate) * resolutionRate * 100;
  }

  /**
   * @returns {{summary: object, breakdown: object}} Snapshot of the counters,
   *   with the per-severity and per-service maps converted to plain objects.
   */
  generateReport() {
    const { metrics } = this;
    const summary = {
      totalAlerts: metrics.totalAlerts,
      resolvedAlerts: metrics.resolvedAlerts,
      falsePositives: metrics.falsePositives,
      averageResolutionTime: metrics.averageResolutionTime,
      qualityScore: this.calculateQualityScore()
    };
    const breakdown = {
      bySeverity: Object.fromEntries(metrics.alertsBySeverity),
      byService: Object.fromEntries(metrics.alertsByService)
    };
    return { summary, breakdown };
  }
}
module.exports = AlertQualityMetrics;
📝 总结
有效的告警配置应该包括:
- 合理的告警规则:基于业务影响的阈值设置
- 多样的通知渠道:确保告警能够及时送达
- 智能的告警管理:防止告警疲劳和噪音
- 完善的升级机制:确保重要问题得到及时处理
- 持续的优化改进:基于质量指标优化告警规则
良好的告警系统是高可用服务的重要保障,需要持续调优和改进。