#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TSP assistant monitoring service.

Monitors system status in real time and runs alert checks.
"""

import logging
import threading
import time
from typing import Dict, Any, List
from datetime import datetime, timedelta

from .alert_system import AlertSystem, AlertRule, AlertLevel, AlertType

# Module-level logger used by everything defined in this file.
logger = logging.getLogger(__name__)


class MonitorService:
    """Background service that periodically runs the alert rules.

    A daemon thread calls ``self.alert_system.check_all_rules()`` every
    ``check_interval`` seconds and hands each triggered alert to a handler
    selected by the alert's ``'level'`` entry.  Alerts are dictionaries; the
    code in this class reads the keys ``'message'``, ``'level'``,
    ``'timestamp'`` and ``'rule_name'``.
    """

    def __init__(self):
        """Create the alert system and initialise the (stopped) monitor state."""
        self.alert_system = AlertSystem()
        self.is_running = False
        self.monitor_thread = None
        self.check_interval = 60  # seconds between two rule checks
        # Set by stop() to wake the monitor loop out of its wait immediately.
        self._stop_event = threading.Event()

    def start(self):
        """Start the monitor thread; log a warning and return if already running."""
        if self.is_running:
            logger.warning("监控服务已在运行")
            return

        # Use a fresh event for every run: a loop left over from a previous
        # run keeps its own, already-set event and so can never be revived by
        # flipping is_running back to True.
        self._stop_event = threading.Event()
        self.is_running = True
        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.monitor_thread.start()
        logger.info("监控服务已启动")

    def stop(self):
        """Signal the monitor loop to exit and wait up to 5 seconds for it."""
        self.is_running = False
        # Wake the loop right away instead of letting it finish its sleep.
        self._stop_event.set()

        worker = self.monitor_thread
        # Joining the current thread raises RuntimeError, which would happen
        # if stop() were called from inside the monitor loop itself.
        if worker and worker is not threading.current_thread():
            worker.join(timeout=5)
        logger.info("监控服务已停止")

    def _monitor_loop(self):
        """Thread body: check rules, dispatch alerts, wait, repeat until stopped."""
        # Bind this run's event once so a later start() cannot swap it out.
        stop_event = self._stop_event

        while self.is_running and not stop_event.is_set():
            try:
                # Run the alert checks.
                triggered_alerts = self.alert_system.check_all_rules()

                if triggered_alerts:
                    logger.info(f"触发 {len(triggered_alerts)} 个预警")
                    for alert in triggered_alerts:
                        self._handle_alert(alert)

                # Wait for the next check; returns early when stop() is called.
                stop_event.wait(self.check_interval)

            except Exception as e:
                logger.error(f"监控循环异常: {e}")
                # Back off for 10 seconds after an error (still interruptible).
                stop_event.wait(10)

    def _handle_alert(self, alert: Dict[str, Any]):
        """Log one alert and route it to the handler for its level.

        Levels ``'critical'``, ``'error'`` and ``'warning'`` have dedicated
        handlers; any other level is treated as informational.  Exceptions
        (including missing keys) are logged and not propagated.
        """
        try:
            # Record the alert.
            logger.warning(f"预警触发: {alert['message']}")

            # Take different measures depending on the alert level.
            level = alert['level']
            if level == 'critical':
                self._handle_critical_alert(alert)
            elif level == 'error':
                self._handle_error_alert(alert)
            elif level == 'warning':
                self._handle_warning_alert(alert)
            else:
                self._handle_info_alert(alert)

        except Exception as e:
            logger.error(f"处理预警失败: {e}")

    def _handle_critical_alert(self, alert: Dict[str, Any]):
        """Handle a critical alert: notify, log at CRITICAL, try auto-recovery."""
        # Send an urgent notification.
        self._send_notification(alert, "紧急")

        # Write to the log.
        logger.critical(f"严重预警: {alert['message']}")

        # Attempt automatic recovery measures.
        self._attempt_auto_recovery(alert)

    def _handle_error_alert(self, alert: Dict[str, Any]):
        """Handle an error alert: notify and log at ERROR."""
        # Send an error notification.
        self._send_notification(alert, "错误")

        # Write to the log.
        logger.error(f"错误预警: {alert['message']}")

    def _handle_warning_alert(self, alert: Dict[str, Any]):
        """Handle a warning alert: notify and log at WARNING."""
        # Send a warning notification.
        self._send_notification(alert, "警告")

        # Write to the log.
        logger.warning(f"警告预警: {alert['message']}")

    def _handle_info_alert(self, alert: Dict[str, Any]):
        """Handle an informational alert: log at INFO only, no notification."""
        logger.info(f"信息预警: {alert['message']}")

    def _send_notification(self, alert: Dict[str, Any], level: str):
        """Build a notification payload for *alert* and log it.

        Args:
            alert: Alert dictionary; ``'message'``, ``'timestamp'`` and
                ``'rule_name'`` are read (``KeyError`` if one is missing).
            level: Severity label stored in the payload's ``"level"`` field.
        """
        # Email, SMS, DingTalk or other channels could be integrated here.
        notification = {
            "level": level,
            "message": alert['message'],
            "timestamp": alert['timestamp'],
            "rule_name": alert['rule_name']
        }

        # Record the notification.
        logger.info(f"发送通知: {notification}")

        # TODO: implement the actual delivery logic,
        # e.g. sending an email, SMS or DingTalk message.

    def _attempt_auto_recovery(self, alert: Dict[str, Any]):
        """Run the recovery action matching the alert's rule name, if any.

        Rule names without a matching action are ignored; exceptions are
        logged and not propagated.
        """
        try:
            rule_name = alert['rule_name']

            if rule_name == "内存使用预警":
                # Try to free memory.
                self._cleanup_memory()
            elif rule_name == "错误率预警":
                # Try to restart the related services.
                self._restart_services()
            elif rule_name == "响应时间预警":
                # Try to optimise performance.
                self._optimize_performance()

        except Exception as e:
            logger.error(f"自动恢复失败: {e}")

    def _cleanup_memory(self):
        """Force a garbage-collection pass and log it; errors are logged."""
        try:
            import gc
            gc.collect()
            logger.info("执行内存清理")
        except Exception as e:
            logger.error(f"内存清理失败: {e}")

    def _restart_services(self):
        """Placeholder for restarting services; currently only logs the attempt."""
        try:
            # Logic for restarting the related services can be implemented here.
            logger.info("尝试重启服务")
        except Exception as e:
            logger.error(f"重启服务失败: {e}")

    def _optimize_performance(self):
        """Placeholder for performance tuning; currently only logs the attempt."""
        try:
            # Performance-optimisation logic can be implemented here.
            logger.info("尝试优化性能")
        except Exception as e:
            logger.error(f"性能优化失败: {e}")

    def get_system_health(self) -> Dict[str, Any]:
        """Return a snapshot of the system health.

        Returns:
            A dict with ``health_score``, ``status``, ``active_alerts``
            (count), ``alert_statistics``, ``monitor_status``
            (``"running"``/``"stopped"``) and ``last_check`` (ISO timestamp
            of this call).  If anything raises, ``{"error": <message>}`` is
            returned instead.
        """
        try:
            # Fetch the currently active alerts.
            active_alerts = self.alert_system.get_active_alerts()

            # Fetch the alert statistics.
            alert_stats = self.alert_system.get_alert_statistics()

            # Compute the health score.
            health_score = self._calculate_health_score(active_alerts, alert_stats)

            return {
                "health_score": health_score,
                "status": self._get_health_status(health_score),
                "active_alerts": len(active_alerts),
                "alert_statistics": alert_stats,
                "monitor_status": "running" if self.is_running else "stopped",
                "last_check": datetime.now().isoformat()
            }

        except Exception as e:
            logger.error(f"获取系统健康状态失败: {e}")
            return {"error": str(e)}

    def _calculate_health_score(self, active_alerts: List[Dict[str, Any]], alert_stats: Dict[str, Any]) -> float:
        """Compute a 0-100 health score from the active alerts.

        Starts at 100 and deducts 20 per critical, 10 per error, 5 per
        warning and 1 per alert of any other level.

        Args:
            active_alerts: Alert dictionaries; each must carry ``'level'``.
            alert_stats: Accepted for interface compatibility; not used.

        Returns:
            The score clamped to ``[0.0, 100.0]``, always a float, or
            ``50.0`` if the computation raises.
        """
        try:
            base_score = 100.0

            # Deduct points for every active alert.
            for alert in active_alerts:
                level = alert['level']
                if level == 'critical':
                    base_score -= 20
                elif level == 'error':
                    base_score -= 10
                elif level == 'warning':
                    base_score -= 5
                else:
                    base_score -= 1

            # Clamp with float bounds so the result stays a float at the floor.
            return max(0.0, min(100.0, base_score))

        except Exception as e:
            logger.error(f"计算健康分数失败: {e}")
            return 50.0

    def _get_health_status(self, health_score: float) -> str:
        """Map a health score to a label.

        ``>= 90`` excellent, ``>= 70`` good, ``>= 50`` fair, ``>= 30`` poor,
        anything lower critical.
        """
        if health_score >= 90:
            return "excellent"
        elif health_score >= 70:
            return "good"
        elif health_score >= 50:
            return "fair"
        elif health_score >= 30:
            return "poor"
        else:
            return "critical"

    def add_custom_rule(self, rule: "AlertRule") -> bool:
        """Add a custom rule; delegates to ``alert_system.add_custom_rule``."""
        return self.alert_system.add_custom_rule(rule)

    def update_rule(self, rule_name: str, **kwargs) -> bool:
        """Update a rule; delegates to ``alert_system.update_rule``."""
        return self.alert_system.update_rule(rule_name, **kwargs)

    def delete_rule(self, rule_name: str) -> bool:
        """Delete a rule; delegates to ``alert_system.delete_rule``."""
        return self.alert_system.delete_rule(rule_name)

    def get_rules(self) -> Dict[str, Any]:
        """Return every rule in ``alert_system.rules`` as a plain dict, keyed by name."""
        return {
            name: {
                "name": rule.name,
                "description": rule.description,
                "alert_type": rule.alert_type.value,
                "level": rule.level.value,
                "threshold": rule.threshold,
                "enabled": rule.enabled,
                "check_interval": rule.check_interval,
                "cooldown": rule.cooldown
            }
            for name, rule in self.alert_system.rules.items()
        }