运维自动化
📋 概述
运维自动化是通过自动化工具和脚本来减少手工操作,提高运维效率和可靠性的实践。它涵盖了系统监控、故障处理、部署发布、资源管理等各个方面。
🎯 学习目标
- 理解运维自动化的核心概念和价值
- 掌握常见运维任务的自动化实现
- 学会设计和实施自动化运维流程
- 了解运维自动化工具和最佳实践
📚 运维自动化基础
自动化层次
mermaid
graph TB
A[运维自动化] --> B[基础设施自动化]
A --> C[配置管理自动化]
A --> D[部署自动化]
A --> E[监控自动化]
A --> F[故障处理自动化]
B --> B1[资源创建]
B --> B2[环境配置]
C --> C1[配置分发]
C --> C2[配置变更]
D --> D1[应用部署]
D --> D2[版本管理]
E --> E1[指标收集]
E --> E2[告警处理]
F --> F1[自动恢复]
F --> F2[故障隔离]
自动化原则
javascript
const AutomationPrinciples = {
IDEMPOTENCY: '幂等性 - 重复执行产生相同结果',
RELIABILITY: '可靠性 - 自动化系统本身要稳定',
OBSERVABILITY: '可观测性 - 自动化过程要可监控',
ROLLBACK: '可回滚 - 支持快速回滚到之前状态',
TESTING: '可测试 - 自动化脚本要经过充分测试'
};
🛠 系统监控自动化
健康检查自动化
javascript
// health-check-automation.js
const axios = require('axios');
const nodemailer = require('nodemailer');
const { Webhook } = require('discord-webhook-node');
class HealthCheckAutomation {
constructor(config) {
this.config = config;
this.services = config.services || [];
this.notifications = this.initializeNotifications();
this.serviceStatus = new Map();
}
initializeNotifications() {
const notifications = {};
if (this.config.email) {
notifications.email = nodemailer.createTransporter(this.config.email);
}
if (this.config.slack) {
notifications.slack = new Webhook(this.config.slack.webhookUrl);
}
return notifications;
}
async startMonitoring() {
console.log('🔍 启动健康检查自动化监控');
// 初始化服务状态
for (const service of this.services) {
this.serviceStatus.set(service.name, {
status: 'unknown',
lastCheck: null,
consecutiveFailures: 0
});
}
// 定期检查
setInterval(() => {
this.performHealthChecks();
}, this.config.checkInterval || 60000); // 默认1分钟
// 立即执行一次检查
await this.performHealthChecks();
}
async performHealthChecks() {
const checkPromises = this.services.map(service =>
this.checkService(service)
);
const results = await Promise.allSettled(checkPromises);
// 处理检查结果
results.forEach((result, index) => {
const service = this.services[index];
if (result.status === 'fulfilled') {
this.handleServiceResult(service, result.value);
} else {
this.handleServiceResult(service, {
healthy: false,
error: result.reason.message
});
}
});
}
async checkService(service) {
const startTime = Date.now();
try {
const response = await axios.get(service.url, {
timeout: service.timeout || 10000,
headers: service.headers || {}
});
const responseTime = Date.now() - startTime;
const isHealthy = this.evaluateHealth(service, response, responseTime);
return {
healthy: isHealthy,
responseTime,
statusCode: response.status,
data: response.data
};
} catch (error) {
return {
healthy: false,
error: error.message,
responseTime: Date.now() - startTime
};
}
}
evaluateHealth(service, response, responseTime) {
// 检查状态码
if (response.status < 200 || response.status >= 300) {
return false;
}
// 检查响应时间
if (service.maxResponseTime && responseTime > service.maxResponseTime) {
return false;
}
// 检查响应内容
if (service.expectedContent) {
const content = typeof response.data === 'string'
? response.data
: JSON.stringify(response.data);
if (!content.includes(service.expectedContent)) {
return false;
}
}
return true;
}
handleServiceResult(service, result) {
const currentStatus = this.serviceStatus.get(service.name);
const now = new Date();
if (result.healthy) {
// 服务健康
if (currentStatus.status === 'down') {
// 服务恢复
this.handleServiceRecovery(service, result);
}
this.serviceStatus.set(service.name, {
status: 'up',
lastCheck: now,
consecutiveFailures: 0,
lastResult: result
});
} else {
// 服务异常
const consecutiveFailures = currentStatus.consecutiveFailures + 1;
this.serviceStatus.set(service.name, {
status: 'down',
lastCheck: now,
consecutiveFailures,
lastResult: result
});
// 检查是否需要发送告警
if (this.shouldSendAlert(service, consecutiveFailures)) {
this.handleServiceFailure(service, result, consecutiveFailures);
}
// 尝试自动恢复
if (service.autoRecover) {
this.attemptAutoRecovery(service);
}
}
}
shouldSendAlert(service, consecutiveFailures) {
const alertThreshold = service.alertThreshold || 3;
return consecutiveFailures === alertThreshold ||
(consecutiveFailures > alertThreshold && consecutiveFailures % 10 === 0);
}
async handleServiceFailure(service, result, consecutiveFailures) {
const alertData = {
service: service.name,
url: service.url,
status: 'DOWN',
error: result.error || 'Service unhealthy',
consecutiveFailures,
timestamp: new Date().toISOString()
};
console.error(`🚨 服务异常: ${service.name}`, alertData);
// 发送通知
await this.sendAlert(alertData);
}
async handleServiceRecovery(service, result) {
const recoveryData = {
service: service.name,
url: service.url,
status: 'RECOVERED',
responseTime: result.responseTime,
timestamp: new Date().toISOString()
};
console.log(`✅ 服务恢复: ${service.name}`, recoveryData);
// 发送恢复通知
await this.sendRecoveryNotification(recoveryData);
}
async sendAlert(alertData) {
const message = `🚨 服务告警\n服务: ${alertData.service}\nURL: ${alertData.url}\n状态: ${alertData.status}\n错误: ${alertData.error}\n连续失败次数: ${alertData.consecutiveFailures}`;
// 发送邮件
if (this.notifications.email) {
try {
await this.notifications.email.sendMail({
from: this.config.email.from,
to: this.config.email.alerts,
subject: `🚨 服务告警: ${alertData.service}`,
text: message
});
} catch (error) {
console.error('邮件发送失败:', error);
}
}
// 发送Slack通知
if (this.notifications.slack) {
try {
await this.notifications.slack.send({
content: message,
embeds: [{
title: '🚨 服务告警',
color: 0xff0000,
fields: [
{ name: '服务', value: alertData.service, inline: true },
{ name: '状态', value: alertData.status, inline: true },
{ name: '错误', value: alertData.error, inline: false }
],
timestamp: alertData.timestamp
}]
});
} catch (error) {
console.error('Slack通知发送失败:', error);
}
}
}
async attemptAutoRecovery(service) {
if (!service.recoveryActions) return;
console.log(`🔄 尝试自动恢复服务: ${service.name}`);
for (const action of service.recoveryActions) {
try {
await this.executeRecoveryAction(action);
console.log(`✅ 恢复动作执行成功: ${action.type}`);
} catch (error) {
console.error(`❌ 恢复动作执行失败: ${action.type}`, error);
}
}
}
async executeRecoveryAction(action) {
switch (action.type) {
case 'restart_service':
await this.restartService(action.service);
break;
case 'clear_cache':
await this.clearCache(action.cache);
break;
case 'scale_up':
await this.scaleService(action.service, action.instances);
break;
case 'webhook':
await this.callWebhook(action.url, action.payload);
break;
default:
throw new Error(`未知的恢复动作类型: ${action.type}`);
}
}
async restartService(serviceName) {
// 实现服务重启逻辑
console.log(`重启服务: ${serviceName}`);
// 这里可以调用Docker、systemctl、PM2等
}
async clearCache(cacheName) {
// 实现缓存清理逻辑
console.log(`清理缓存: ${cacheName}`);
}
async scaleService(serviceName, instances) {
// 实现服务扩容逻辑
console.log(`扩容服务: ${serviceName} 到 ${instances} 实例`);
}
async callWebhook(url, payload) {
await axios.post(url, payload);
}
getServiceStatus() {
const status = {};
for (const [serviceName, serviceStatus] of this.serviceStatus) {
status[serviceName] = serviceStatus;
}
return status;
}
}
module.exports = HealthCheckAutomation;
配置示例
javascript
// health-check-config.js
const config = {
checkInterval: 30000, // 30秒检查一次
services: [
{
name: 'nodejs-api',
url: 'http://localhost:3000/health',
timeout: 5000,
maxResponseTime: 2000,
expectedContent: 'OK',
alertThreshold: 3,
autoRecover: true,
recoveryActions: [
{
type: 'restart_service',
service: 'nodejs-app'
},
{
type: 'clear_cache',
cache: 'redis'
}
]
},
{
name: 'database',
url: 'http://localhost:3000/db-health',
timeout: 10000,
alertThreshold: 2,
recoveryActions: [
{
type: 'webhook',
url: 'http://localhost:3000/api/db/reconnect',
payload: { action: 'reconnect' }
}
]
}
],
email: {
host: 'smtp.gmail.com',
port: 587,
secure: false,
auth: {
user: 'alerts@company.com',
pass: 'password'
},
from: 'alerts@company.com',
alerts: ['admin@company.com', 'ops@company.com']
},
slack: {
webhookUrl: 'https://hooks.slack.com/services/...'
}
};
module.exports = config;
🔄 部署自动化
零停机部署自动化
javascript
// zero-downtime-deployment.js
const { exec } = require('child_process');
const axios = require('axios');
const fs = require('fs').promises;
class ZeroDowntimeDeployment {
constructor(config) {
this.config = config;
this.deploymentId = `deploy-${Date.now()}`;
this.rollbackData = null;
}
async deploy(version) {
console.log(`🚀 开始零停机部署: ${version}`);
console.log(`部署ID: ${this.deploymentId}`);
try {
// 预检查
await this.preDeploymentChecks();
// 创建回滚点
await this.createRollbackPoint();
// 执行部署
await this.executeDeployment(version);
// 健康检查
await this.performHealthCheck();
// 流量切换
await this.switchTraffic();
// 后部署验证
await this.postDeploymentVerification();
// 清理旧版本
await this.cleanup();
console.log('✅ 部署成功完成');
} catch (error) {
console.error('❌ 部署失败:', error.message);
await this.rollback();
throw error;
}
}
async preDeploymentChecks() {
console.log('🔍 执行部署前检查...');
// 检查系统资源
await this.checkSystemResources();
// 检查依赖服务
await this.checkDependencies();
// 检查当前服务状态
await this.checkCurrentServiceHealth();
console.log('✅ 部署前检查通过');
}
async checkSystemResources() {
const diskUsage = await this.executeCommand("df -h / | tail -1 | awk '{print $5}' | sed 's/%//'");
const memUsage = await this.executeCommand("free | grep Mem | awk '{printf \"%.0f\", $3/$2 * 100.0}'");
if (parseInt(diskUsage) > 90) {
throw new Error(`磁盘使用率过高: ${diskUsage}%`);
}
if (parseInt(memUsage) > 90) {
throw new Error(`内存使用率过高: ${memUsage}%`);
}
console.log(`系统资源检查通过 - 磁盘: ${diskUsage}%, 内存: ${memUsage}%`);
}
async checkDependencies() {
const dependencies = this.config.dependencies || [];
for (const dep of dependencies) {
try {
const response = await axios.get(dep.url, { timeout: 5000 });
if (response.status !== 200) {
throw new Error(`依赖服务异常: ${dep.name}`);
}
} catch (error) {
throw new Error(`依赖服务不可用: ${dep.name} - ${error.message}`);
}
}
console.log('✅ 依赖服务检查通过');
}
async checkCurrentServiceHealth() {
try {
const response = await axios.get(this.config.healthCheckUrl, { timeout: 5000 });
if (response.status !== 200) {
throw new Error('当前服务健康检查失败');
}
} catch (error) {
throw new Error(`当前服务不健康: ${error.message}`);
}
}
async createRollbackPoint() {
console.log('💾 创建回滚点...');
const timestamp = new Date().toISOString();
const backupDir = `/tmp/rollback-${this.deploymentId}`;
// 创建备份目录
await this.executeCommand(`mkdir -p ${backupDir}`);
// 备份当前版本
await this.executeCommand(`cp -r ${this.config.appDir} ${backupDir}/app`);
// 备份配置文件
if (this.config.configFiles) {
for (const configFile of this.config.configFiles) {
await this.executeCommand(`cp ${configFile} ${backupDir}/`);
}
}
// 记录当前状态
const rollbackData = {
deploymentId: this.deploymentId,
timestamp,
backupDir,
previousVersion: await this.getCurrentVersion(),
configFiles: this.config.configFiles || []
};
await fs.writeFile(`${backupDir}/rollback-info.json`, JSON.stringify(rollbackData, null, 2));
this.rollbackData = rollbackData;
console.log(`✅ 回滚点创建完成: ${backupDir}`);
}
async executeDeployment(version) {
console.log(`📦 执行部署: ${version}`);
// 下载新版本
await this.downloadVersion(version);
// 停止应用
await this.stopApplication();
// 更新应用文件
await this.updateApplication(version);
// 更新配置
await this.updateConfiguration();
// 启动应用
await this.startApplication();
console.log('✅ 应用部署完成');
}
async downloadVersion(version) {
console.log(`⬇️ 下载版本: ${version}`);
const downloadUrl = `${this.config.artifactRepository}/${version}.tar.gz`;
const downloadPath = `/tmp/${version}.tar.gz`;
await this.executeCommand(`wget -O ${downloadPath} ${downloadUrl}`);
await this.executeCommand(`tar -xzf ${downloadPath} -C /tmp/`);
console.log(`✅ 版本下载完成: ${downloadPath}`);
}
async stopApplication() {
console.log('⏹️ 停止应用...');
if (this.config.stopCommand) {
await this.executeCommand(this.config.stopCommand);
} else {
// 默认使用PM2
await this.executeCommand('pm2 stop all');
}
// 等待进程完全停止
await this.sleep(5000);
console.log('✅ 应用已停止');
}
async updateApplication(version) {
console.log('📁 更新应用文件...');
const sourceDir = `/tmp/${version}`;
const targetDir = this.config.appDir;
// 备份当前版本
await this.executeCommand(`mv ${targetDir} ${targetDir}.backup`);
// 部署新版本
await this.executeCommand(`mv ${sourceDir} ${targetDir}`);
// 安装依赖
if (this.config.installDependencies !== false) {
await this.executeCommand(`cd ${targetDir} && npm ci --only=production`);
}
console.log('✅ 应用文件更新完成');
}
async updateConfiguration() {
console.log('⚙️ 更新配置...');
if (this.config.configUpdates) {
for (const update of this.config.configUpdates) {
await this.executeCommand(update);
}
}
console.log('✅ 配置更新完成');
}
async startApplication() {
console.log('▶️ 启动应用...');
if (this.config.startCommand) {
await this.executeCommand(this.config.startCommand);
} else {
// 默认使用PM2
await this.executeCommand(`cd ${this.config.appDir} && pm2 start ecosystem.config.js`);
}
// 等待应用启动
await this.sleep(10000);
console.log('✅ 应用已启动');
}
async performHealthCheck() {
console.log('🔍 执行健康检查...');
const maxRetries = 30;
const retryInterval = 2000;
for (let i = 0; i < maxRetries; i++) {
try {
const response = await axios.get(this.config.healthCheckUrl, { timeout: 5000 });
if (response.status === 200) {
console.log('✅ 健康检查通过');
return;
}
} catch (error) {
console.log(`健康检查失败 (${i + 1}/${maxRetries}): ${error.message}`);
}
await this.sleep(retryInterval);
}
throw new Error('健康检查超时失败');
}
async switchTraffic() {
console.log('🔄 切换流量...');
if (this.config.loadBalancer) {
// 更新负载均衡器配置
await this.updateLoadBalancer();
}
if (this.config.dnsUpdate) {
// 更新DNS记录
await this.updateDNS();
}
console.log('✅ 流量切换完成');
}
async postDeploymentVerification() {
console.log('🔍 执行部署后验证...');
// 功能测试
if (this.config.functionalTests) {
await this.runFunctionalTests();
}
// 性能测试
if (this.config.performanceTests) {
await this.runPerformanceTests();
}
console.log('✅ 部署后验证通过');
}
async rollback() {
if (!this.rollbackData) {
throw new Error('没有可用的回滚数据');
}
console.log('🔄 开始回滚...');
try {
// 停止当前应用
await this.stopApplication();
// 恢复应用文件
await this.executeCommand(`rm -rf ${this.config.appDir}`);
await this.executeCommand(`cp -r ${this.rollbackData.backupDir}/app ${this.config.appDir}`);
// 恢复配置文件
for (const configFile of this.rollbackData.configFiles) {
const fileName = configFile.split('/').pop();
await this.executeCommand(`cp ${this.rollbackData.backupDir}/${fileName} ${configFile}`);
}
// 启动应用
await this.startApplication();
// 验证回滚
await this.performHealthCheck();
console.log('✅ 回滚完成');
} catch (error) {
console.error('❌ 回滚失败:', error.message);
throw error;
}
}
async cleanup() {
console.log('🧹 清理临时文件...');
// 清理下载的文件
await this.executeCommand('rm -rf /tmp/deploy-*');
// 清理旧的备份(保留最近3个)
await this.executeCommand(`ls -dt ${this.config.appDir}.backup* | tail -n +4 | xargs rm -rf`);
console.log('✅ 清理完成');
}
async executeCommand(command) {
return new Promise((resolve, reject) => {
exec(command, (error, stdout, stderr) => {
if (error) {
reject(new Error(`命令执行失败: ${command}\n${error.message}`));
} else {
resolve(stdout.trim());
}
});
});
}
async getCurrentVersion() {
try {
const packageJson = await fs.readFile(`${this.config.appDir}/package.json`, 'utf8');
const pkg = JSON.parse(packageJson);
return pkg.version;
} catch (error) {
return 'unknown';
}
}
sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms));
}
}
module.exports = ZeroDowntimeDeployment;
🔧 系统维护自动化
系统清理自动化
bash
#!/bin/bash
# system-cleanup-automation.sh
set -e
# 配置变量
LOG_DIR="/var/log"
TEMP_DIR="/tmp"
LOG_RETENTION_DAYS=30
TEMP_FILE_AGE_HOURS=24
DISK_THRESHOLD=85
MEMORY_THRESHOLD=80
# 日志函数
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1" | tee -a /var/log/system-cleanup.log
}
# 检查磁盘使用率
check_disk_usage() {
local usage=$(df / | tail -1 | awk '{print $5}' | sed 's/%//')
if [ "$usage" -gt "$DISK_THRESHOLD" ]; then
log "⚠️ 磁盘使用率过高: ${usage}%,开始清理"
return 0
else
log "✅ 磁盘使用率正常: ${usage}%"
return 1
fi
}
# 清理日志文件
cleanup_logs() {
log "🧹 开始清理日志文件..."
# 清理系统日志
find /var/log -name "*.log" -mtime +$LOG_RETENTION_DAYS -type f -delete
find /var/log -name "*.gz" -mtime +$LOG_RETENTION_DAYS -type f -delete
# 清理应用日志
find /opt/*/logs -name "*.log" -mtime +$LOG_RETENTION_DAYS -type f -delete 2>/dev/null || true
# 清理journal日志
journalctl --vacuum-time=${LOG_RETENTION_DAYS}d
log "✅ 日志清理完成"
}
# 清理临时文件
cleanup_temp_files() {
log "🧹 开始清理临时文件..."
# 清理/tmp目录
find $TEMP_DIR -type f -atime +1 -delete 2>/dev/null || true
find $TEMP_DIR -type d -empty -delete 2>/dev/null || true
# 清理用户临时文件
find /home/*/tmp -type f -mtime +1 -delete 2>/dev/null || true
log "✅ 临时文件清理完成"
}
# 清理包管理器缓存
cleanup_package_cache() {
log "🧹 开始清理包管理器缓存..."
# 清理APT缓存
if command -v apt-get >/dev/null 2>&1; then
apt-get clean
apt-get autoremove -y
fi
# 清理YUM缓存
if command -v yum >/dev/null 2>&1; then
yum clean all
fi
# 清理NPM缓存
if command -v npm >/dev/null 2>&1; then
npm cache clean --force
fi
log "✅ 包管理器缓存清理完成"
}
# 清理Docker资源
cleanup_docker() {
if command -v docker >/dev/null 2>&1; then
log "🧹 开始清理Docker资源..."
# 清理停止的容器
docker container prune -f
# 清理未使用的镜像
docker image prune -f
# 清理未使用的网络
docker network prune -f
# 清理未使用的卷
docker volume prune -f
log "✅ Docker资源清理完成"
fi
}
# 内存清理
cleanup_memory() {
local mem_usage=$(free | grep Mem | awk '{printf "%.0f", $3/$2 * 100.0}')
if [ "$mem_usage" -gt "$MEMORY_THRESHOLD" ]; then
log "⚠️ 内存使用率过高: ${mem_usage}%,开始内存清理"
# 清理页面缓存
sync && echo 1 > /proc/sys/vm/drop_caches
# 清理交换空间
swapoff -a && swapon -a
log "✅ 内存清理完成"
else
log "✅ 内存使用率正常: ${mem_usage}%"
fi
}
# 服务状态检查和重启
check_and_restart_services() {
log "🔍 检查服务状态..."
local services=("nginx" "nodejs-app" "redis" "postgresql")
for service in "${services[@]}"; do
if systemctl is-active --quiet "$service"; then
log "✅ 服务正常: $service"
else
log "⚠️ 服务异常: $service,尝试重启"
systemctl restart "$service"
if systemctl is-active --quiet "$service"; then
log "✅ 服务重启成功: $service"
else
log "❌ 服务重启失败: $service"
fi
fi
done
}
# 生成清理报告
generate_report() {
local report_file="/tmp/cleanup-report-$(date +%Y%m%d-%H%M%S).txt"
cat << EOF > "$report_file"
系统清理报告
=============
执行时间: $(date)
主机名: $(hostname)
磁盘使用情况:
$(df -h)
内存使用情况:
$(free -h)
系统负载:
$(uptime)
服务状态:
$(systemctl status nginx nodejs-app redis postgresql --no-pager -l)
Docker状态:
$(docker system df 2>/dev/null || echo "Docker未安装")
清理日志:
$(tail -20 /var/log/system-cleanup.log)
EOF
log "📊 清理报告已生成: $report_file"
# 发送报告邮件(如果配置了)
if [ -n "$REPORT_EMAIL" ]; then
mail -s "系统清理报告 - $(hostname)" "$REPORT_EMAIL" < "$report_file"
log "📧 报告已发送到: $REPORT_EMAIL"
fi
}
# 主函数
main() {
log "🚀 开始系统维护自动化"
# 检查是否需要清理
if check_disk_usage; then
cleanup_logs
cleanup_temp_files
cleanup_package_cache
cleanup_docker
fi
# 内存清理
cleanup_memory
# 服务检查
check_and_restart_services
# 生成报告
generate_report
log "✅ 系统维护自动化完成"
}
# 执行主函数
main "$@"
定时任务配置
bash
# 系统维护定时任务
# /etc/cron.d/system-maintenance
# 每天凌晨2点执行系统清理
0 2 * * * root /opt/scripts/system-cleanup-automation.sh
# 每小时检查服务状态
0 * * * * root /opt/scripts/service-health-check.sh
# 每周日凌晨3点执行完整系统维护
0 3 * * 0 root /opt/scripts/weekly-maintenance.sh
# 每天检查磁盘空间
*/30 * * * * root /opt/scripts/disk-space-monitor.sh
📊 性能监控自动化
性能指标收集
javascript
// performance-monitor.js
const os = require('os');
const fs = require('fs').promises;
const { performance } = require('perf_hooks');
class PerformanceMonitor {
constructor(config) {
this.config = config;
this.metrics = [];
this.isRunning = false;
}
start() {
if (this.isRunning) return;
this.isRunning = true;
console.log('📊 启动性能监控');
// 定期收集指标
this.metricsInterval = setInterval(() => {
this.collectMetrics();
}, this.config.interval || 30000);
// 定期分析和报告
this.analysisInterval = setInterval(() => {
this.analyzePerformance();
}, this.config.analysisInterval || 300000);
}
stop() {
if (!this.isRunning) return;
this.isRunning = false;
clearInterval(this.metricsInterval);
clearInterval(this.analysisInterval);
console.log('⏹️ 停止性能监控');
}
async collectMetrics() {
const timestamp = Date.now();
const metrics = {
timestamp,
cpu: this.getCPUMetrics(),
memory: this.getMemoryMetrics(),
disk: await this.getDiskMetrics(),
network: await this.getNetworkMetrics(),
process: this.getProcessMetrics(),
application: await this.getApplicationMetrics()
};
this.metrics.push(metrics);
// 保持最近1小时的数据
const oneHourAgo = timestamp - 60 * 60 * 1000;
this.metrics = this.metrics.filter(m => m.timestamp > oneHourAgo);
// 检查异常
this.checkAnomalies(metrics);
}
getCPUMetrics() {
const cpus = os.cpus();
const loadAvg = os.loadavg();
return {
count: cpus.length,
model: cpus[0].model,
speed: cpus[0].speed,
loadAverage: {
'1m': loadAvg[0],
'5m': loadAvg[1],
'15m': loadAvg[2]
},
usage: this.calculateCPUUsage()
};
}
calculateCPUUsage() {
const cpus = os.cpus();
let totalIdle = 0;
let totalTick = 0;
cpus.forEach(cpu => {
for (const type in cpu.times) {
totalTick += cpu.times[type];
}
totalIdle += cpu.times.idle;
});
return {
idle: totalIdle / totalTick * 100,
usage: (1 - totalIdle / totalTick) * 100
};
}
getMemoryMetrics() {
const totalMem = os.totalmem();
const freeMem = os.freemem();
const usedMem = totalMem - freeMem;
const processMemory = process.memoryUsage();
return {
total: totalMem,
free: freeMem,
used: usedMem,
usagePercent: (usedMem / totalMem) * 100,
process: {
rss: processMemory.rss,
heapTotal: processMemory.heapTotal,
heapUsed: processMemory.heapUsed,
external: processMemory.external,
arrayBuffers: processMemory.arrayBuffers
}
};
}
async getDiskMetrics() {
try {
const stats = await fs.stat('/');
// 这里简化处理,实际应该读取/proc/diskstats
return {
available: 'N/A', // 需要实现具体的磁盘空间检查
usage: 'N/A'
};
} catch (error) {
return { error: error.message };
}
}
async getNetworkMetrics() {
const interfaces = os.networkInterfaces();
const metrics = {};
for (const [name, addresses] of Object.entries(interfaces)) {
if (addresses) {
metrics[name] = addresses.filter(addr => !addr.internal);
}
}
return metrics;
}
getProcessMetrics() {
return {
pid: process.pid,
uptime: process.uptime(),
version: process.version,
platform: process.platform,
arch: process.arch,
title: process.title,
argv: process.argv,
execPath: process.execPath,
cwd: process.cwd()
};
}
async getApplicationMetrics() {
// 应用特定的指标
return {
activeConnections: await this.getActiveConnections(),
responseTime: await this.measureResponseTime(),
errorRate: this.calculateErrorRate(),
throughput: this.calculateThroughput()
};
}
async getActiveConnections() {
// 实现获取活跃连接数的逻辑
return 0;
}
async measureResponseTime() {
const start = performance.now();
try {
// 模拟一个内部健康检查请求
await new Promise(resolve => setTimeout(resolve, 1));
return performance.now() - start;
} catch (error) {
return -1;
}
}
calculateErrorRate() {
// 从最近的指标计算错误率
return 0;
}
calculateThroughput() {
// 计算吞吐量
return 0;
}
checkAnomalies(metrics) {
const anomalies = [];
// CPU使用率异常
if (metrics.cpu.usage > 90) {
anomalies.push({
type: 'high_cpu',
value: metrics.cpu.usage,
threshold: 90,
severity: 'critical'
});
}
// 内存使用率异常
if (metrics.memory.usagePercent > 85) {
anomalies.push({
type: 'high_memory',
value: metrics.memory.usagePercent,
threshold: 85,
severity: 'warning'
});
}
// 负载异常
if (metrics.cpu.loadAverage['1m'] > metrics.cpu.count * 2) {
anomalies.push({
type: 'high_load',
value: metrics.cpu.loadAverage['1m'],
threshold: metrics.cpu.count * 2,
severity: 'critical'
});
}
if (anomalies.length > 0) {
this.handleAnomalies(anomalies, metrics);
}
}
handleAnomalies(anomalies, metrics) {
console.warn('⚠️ 检测到性能异常:', anomalies);
// 触发自动化处理
anomalies.forEach(anomaly => {
this.executeAnomalyResponse(anomaly, metrics);
});
}
executeAnomalyResponse(anomaly, metrics) {
switch (anomaly.type) {
case 'high_cpu':
this.handleHighCPU(anomaly, metrics);
break;
case 'high_memory':
this.handleHighMemory(anomaly, metrics);
break;
case 'high_load':
this.handleHighLoad(anomaly, metrics);
break;
}
}
handleHighCPU(anomaly, metrics) {
console.log('🔄 处理高CPU使用率');
// 实现CPU优化策略
}
handleHighMemory(anomaly, metrics) {
console.log('🔄 处理高内存使用率');
// 触发垃圾回收
if (global.gc) {
global.gc();
}
}
handleHighLoad(anomaly, metrics) {
console.log('🔄 处理高系统负载');
// 实现负载优化策略
}
async analyzePerformance() {
if (this.metrics.length < 10) return;
console.log('📈 分析性能趋势');
const analysis = {
timestamp: Date.now(),
period: '5min',
summary: this.generateSummary(),
trends: this.analyzeTrends(),
recommendations: this.generateRecommendations()
};
await this.saveAnalysis(analysis);
if (this.config.reportCallback) {
this.config.reportCallback(analysis);
}
}
generateSummary() {
const recent = this.metrics.slice(-10);
const avgCPU = recent.reduce((sum, m) => sum + m.cpu.usage, 0) / recent.length;
const avgMemory = recent.reduce((sum, m) => sum + m.memory.usagePercent, 0) / recent.length;
const avgLoad = recent.reduce((sum, m) => sum + m.cpu.loadAverage['1m'], 0) / recent.length;
return {
averageCPU: avgCPU.toFixed(2),
averageMemory: avgMemory.toFixed(2),
averageLoad: avgLoad.toFixed(2),
sampleCount: recent.length
};
}
analyzeTrends() {
// 分析性能趋势
return {
cpu: 'stable',
memory: 'increasing',
load: 'stable'
};
}
generateRecommendations() {
// 生成优化建议
return [
'考虑增加内存容量',
'优化数据库查询',
'启用缓存机制'
];
}
async saveAnalysis(analysis) {
try {
const filename = `/tmp/performance-analysis-${analysis.timestamp}.json`;
await fs.writeFile(filename, JSON.stringify(analysis, null, 2));
console.log(`📊 性能分析已保存: ${filename}`);
} catch (error) {
console.error('保存性能分析失败:', error);
}
}
}
module.exports = PerformanceMonitor;
📝 总结
运维自动化为Node.js应用提供了:
- 提高效率:减少重复性手工操作
- 降低错误:自动化流程减少人为失误
- 快速响应:自动检测和处理问题
- 一致性:标准化的操作流程
- 可扩展性:支持大规模环境管理
通过合理的自动化策略和工具选择,可以构建出高效、可靠的运维体系。