TCPDashboard/data/manager_components/manager_health_monitor.py
Vasily.onl 2890ba2efa Implement Service Configuration Manager for data collection service
- Introduced `service_config.py` to manage configuration loading, validation, and schema management, enhancing modularity and security.
- Created a `ServiceConfig` class for handling configuration with robust error handling and default values.
- Refactored `DataCollectionService` to utilize the new `ServiceConfig`, streamlining configuration management and improving readability.
- Added a `CollectorFactory` to encapsulate collector creation logic, promoting separation of concerns.
- Updated `CollectorManager` and related components to align with the new architecture, ensuring better maintainability.
- Enhanced logging practices across the service for improved monitoring and debugging.

These changes significantly improve the architecture and maintainability of the data collection service, aligning with project standards for modularity and performance.
2025-06-10 12:55:27 +08:00

185 lines
7.3 KiB
Python

"""
Manager Health Monitor for monitoring collector health and auto-recovery.
This module handles health monitoring of data collectors including periodic health checks,
auto-restart functionality, and health status tracking.
"""
import asyncio
from datetime import datetime, timezone
from typing import Set, Dict, Optional
from ..base_collector import BaseDataCollector, CollectorStatus
class ManagerHealthMonitor:
    """Monitors the health of data collectors and provides auto-recovery.

    The monitor runs a background asyncio task that wakes up every
    ``global_health_check_interval`` seconds, asks the lifecycle manager for
    the enabled collectors, records how many are running/failed, and schedules
    a restart for unhealthy collectors whose configuration enables
    ``auto_restart``.

    The background loop only iterates while the running flag is set, so
    callers are expected to call ``set_running_state(True)`` before
    ``start_monitoring()``.
    """

    def __init__(self,
                 global_health_check_interval: float = 60.0,
                 logger_manager=None,
                 lifecycle_manager=None):
        """
        Initialize the health monitor.

        Args:
            global_health_check_interval: Seconds between global health checks
            logger_manager: Logger manager instance for logging operations
                (optional; every log call is skipped when it is None)
            lifecycle_manager: Lifecycle manager for restart operations
                (optional; health checks are skipped when it is None)
        """
        self.global_health_check_interval = global_health_check_interval
        self.logger_manager = logger_manager
        self.lifecycle_manager = lifecycle_manager

        # Health monitoring state
        self._running = False
        self._last_global_check = datetime.now(timezone.utc)
        self._global_health_task: Optional[asyncio.Task] = None

        # Strong references to in-flight auto-restart tasks. The event loop
        # only keeps weak references to tasks, so a fire-and-forget task with
        # no other reference may be garbage-collected before it finishes.
        self._restart_tasks: Set[asyncio.Task] = set()

        # Health statistics
        self._health_stats = {
            'last_global_check': None,
            'running_collectors': 0,
            'failed_collectors': 0
        }

    def set_running_state(self, running: bool) -> None:
        """Set the running state of the monitor.

        The background loop keeps iterating only while this flag is True.
        """
        self._running = running

    def get_health_stats(self) -> Dict:
        """Get health monitoring statistics.

        Returns:
            A shallow copy of the statistics dictionary with the keys
            'last_global_check', 'running_collectors' and 'failed_collectors'.
        """
        return self._health_stats.copy()

    def get_last_global_check(self) -> datetime:
        """Get the timestamp of the last global health check."""
        return self._last_global_check

    async def start_monitoring(self) -> None:
        """Start the global health monitoring task.

        Does nothing (apart from logging a warning) when a monitoring task
        already exists and has not finished.
        """
        if self._global_health_task and not self._global_health_task.done():
            if self.logger_manager:
                self.logger_manager.log_warning("Health monitoring is already running")
            return

        if self.logger_manager:
            self.logger_manager.log_debug("Starting health monitoring")

        self._global_health_task = asyncio.create_task(self._global_health_monitor())

    async def stop_monitoring(self) -> None:
        """Stop the global health monitoring task.

        Cancels the background task, if one is still active, and waits for it
        to finish. Restart tasks that were already scheduled are not cancelled.
        """
        if self._global_health_task and not self._global_health_task.done():
            self._global_health_task.cancel()
            try:
                await self._global_health_task
            except asyncio.CancelledError:
                # Expected outcome of the cancel() above.
                pass

            if self.logger_manager:
                self.logger_manager.log_debug("Health monitoring stopped")

    async def _global_health_monitor(self) -> None:
        """Global health monitoring for all collectors.

        Loops while the running flag is set: sleep one interval, record the
        check timestamp, then run a health check when a lifecycle manager is
        available. Unexpected errors are logged and do not end the loop.
        """
        if self.logger_manager:
            self.logger_manager.log_debug("Starting global health monitor")

        while self._running:
            try:
                await asyncio.sleep(self.global_health_check_interval)

                self._last_global_check = datetime.now(timezone.utc)
                self._health_stats['last_global_check'] = self._last_global_check

                # Perform health check if lifecycle manager is available
                if self.lifecycle_manager:
                    await self._perform_health_check()

            except asyncio.CancelledError:
                if self.logger_manager:
                    self.logger_manager.log_debug("Global health monitor cancelled")
                break
            except Exception as e:
                if self.logger_manager:
                    self.logger_manager.log_error(f"Error in global health monitor: {e}", exc_info=True)
                # Back off for one extra interval after a failure. A cancel
                # arriving during this sleep propagates out of the task; the
                # awaiting stop_monitoring() swallows it.
                await asyncio.sleep(self.global_health_check_interval)

    def _schedule_restart(self, collector_name: str) -> asyncio.Task:
        """Schedule a non-blocking restart of ``collector_name``.

        The created task is stored in ``self._restart_tasks`` until it
        completes so that it cannot be garbage-collected mid-execution.

        Returns:
            The task wrapping ``lifecycle_manager.restart_collector(...)``.
        """
        restart_task = asyncio.create_task(
            self.lifecycle_manager.restart_collector(collector_name)
        )
        self._restart_tasks.add(restart_task)
        # Drop the reference once the restart has finished (or failed).
        restart_task.add_done_callback(self._restart_tasks.discard)
        return restart_task

    async def _perform_health_check(self) -> None:
        """Perform health check on all enabled collectors.

        Counts collectors that are healthy and RUNNING as running, counts
        unhealthy collectors as failed, and schedules a restart for unhealthy
        collectors whose config has ``auto_restart`` set, unless the collector
        is currently STARTING or STOPPING. Updates the running/failed counters
        in the health statistics.
        """
        if not self.lifecycle_manager:
            return

        enabled_collectors = self.lifecycle_manager.get_enabled_collectors()
        collectors = self.lifecycle_manager.get_collectors()

        running_count = 0
        failed_count = 0

        for collector_name in enabled_collectors:
            if collector_name not in collectors:
                continue

            collector = collectors[collector_name]
            health_status = collector.get_health_status()

            if health_status['is_healthy'] and collector.status == CollectorStatus.RUNNING:
                running_count += 1
            elif not health_status['is_healthy']:
                failed_count += 1
                if self.logger_manager:
                    # 'issues' is treated as optional, matching
                    # perform_immediate_health_check(); a status dict without
                    # it must not abort the whole health-check pass.
                    issues = health_status.get('issues', [])
                    self.logger_manager.log_warning(
                        f"Collector {collector_name} is unhealthy: {issues}"
                    )

                # Auto-restart if needed and not already restarting
                config = self.lifecycle_manager.get_collector_config(collector_name)
                if (config and config.auto_restart and
                        collector.status not in [CollectorStatus.STARTING, CollectorStatus.STOPPING]):
                    if self.logger_manager:
                        self.logger_manager.log_info(f"Auto-restarting unhealthy collector: {collector_name}")
                    # Schedule the restart without awaiting to avoid blocking
                    self._schedule_restart(collector_name)

        # Update health statistics
        self._health_stats['running_collectors'] = running_count
        self._health_stats['failed_collectors'] = failed_count

        if self.logger_manager:
            self.logger_manager.log_debug(
                f"Health check complete - Running: {running_count}, Failed: {failed_count}"
            )

    async def perform_immediate_health_check(self) -> Dict[str, Dict]:
        """
        Perform an immediate health check on all collectors.

        Only enabled collectors known to the lifecycle manager are reported.
        This does not update the statistics and does not trigger restarts.

        Returns:
            Dictionary mapping collector names to their health status
            ('is_healthy', 'status' and 'issues'); empty when no lifecycle
            manager is configured.
        """
        if not self.lifecycle_manager:
            return {}

        enabled_collectors = self.lifecycle_manager.get_enabled_collectors()
        collectors = self.lifecycle_manager.get_collectors()

        health_results = {}
        for collector_name in enabled_collectors:
            if collector_name not in collectors:
                continue

            collector = collectors[collector_name]
            health_status = collector.get_health_status()
            health_results[collector_name] = {
                'is_healthy': health_status['is_healthy'],
                'status': collector.status.value,
                'issues': health_status.get('issues', [])
            }

        return health_results

    def get_health_task(self) -> Optional[asyncio.Task]:
        """Get the current health monitoring task."""
        return self._global_health_task