- Introduced a comprehensive data collection framework, including `CollectorServiceConfig`, `BaseDataCollector`, and `CollectorManager`, enhancing modularity and maintainability. - Developed `CollectorFactory` for streamlined collector creation, promoting separation of concerns and improved configuration handling. - Enhanced `DataCollectionService` to utilize the new architecture, ensuring robust error handling and logging practices. - Added `TaskManager` for efficient management of asynchronous tasks, improving performance and resource management. - Implemented health monitoring and auto-recovery features in `CollectorManager`, ensuring reliable operation of data collectors. - Updated imports across the codebase to reflect the new structure, ensuring consistent access to components. These changes significantly improve the architecture and maintainability of the data collection service, aligning with project standards for modularity, performance, and error handling.
185 lines
7.3 KiB
Python
185 lines
7.3 KiB
Python
"""
|
|
Manager Health Monitor for monitoring collector health and auto-recovery.
|
|
|
|
This module handles health monitoring of data collectors including periodic health checks,
|
|
auto-restart functionality, and health status tracking.
|
|
"""
|
|
|
|
import asyncio
|
|
from datetime import datetime, timezone
|
|
from typing import Set, Dict, Optional
|
|
from ..collector.base_collector import BaseDataCollector, CollectorStatus
|
|
|
|
|
|
class ManagerHealthMonitor:
|
|
"""Monitors the health of data collectors and provides auto-recovery."""
|
|
|
|
def __init__(self,
|
|
global_health_check_interval: float = 60.0,
|
|
logger_manager=None,
|
|
lifecycle_manager=None):
|
|
"""
|
|
Initialize the health monitor.
|
|
|
|
Args:
|
|
global_health_check_interval: Seconds between global health checks
|
|
logger_manager: Logger manager instance for logging operations
|
|
lifecycle_manager: Lifecycle manager for restart operations
|
|
"""
|
|
self.global_health_check_interval = global_health_check_interval
|
|
self.logger_manager = logger_manager
|
|
self.lifecycle_manager = lifecycle_manager
|
|
|
|
# Health monitoring state
|
|
self._running = False
|
|
self._last_global_check = datetime.now(timezone.utc)
|
|
self._global_health_task: Optional[asyncio.Task] = None
|
|
|
|
# Health statistics
|
|
self._health_stats = {
|
|
'last_global_check': None,
|
|
'running_collectors': 0,
|
|
'failed_collectors': 0
|
|
}
|
|
|
|
def set_running_state(self, running: bool) -> None:
|
|
"""Set the running state of the monitor."""
|
|
self._running = running
|
|
|
|
def get_health_stats(self) -> Dict:
|
|
"""Get health monitoring statistics."""
|
|
return self._health_stats.copy()
|
|
|
|
def get_last_global_check(self) -> datetime:
|
|
"""Get the timestamp of the last global health check."""
|
|
return self._last_global_check
|
|
|
|
async def start_monitoring(self) -> None:
|
|
"""Start the global health monitoring task."""
|
|
if self._global_health_task and not self._global_health_task.done():
|
|
if self.logger_manager:
|
|
self.logger_manager.log_warning("Health monitoring is already running")
|
|
return
|
|
|
|
if self.logger_manager:
|
|
self.logger_manager.log_debug("Starting health monitoring")
|
|
|
|
self._global_health_task = asyncio.create_task(self._global_health_monitor())
|
|
|
|
async def stop_monitoring(self) -> None:
|
|
"""Stop the global health monitoring task."""
|
|
if self._global_health_task and not self._global_health_task.done():
|
|
self._global_health_task.cancel()
|
|
try:
|
|
await self._global_health_task
|
|
except asyncio.CancelledError:
|
|
pass
|
|
|
|
if self.logger_manager:
|
|
self.logger_manager.log_debug("Health monitoring stopped")
|
|
|
|
async def _global_health_monitor(self) -> None:
|
|
"""Global health monitoring for all collectors."""
|
|
if self.logger_manager:
|
|
self.logger_manager.log_debug("Starting global health monitor")
|
|
|
|
while self._running:
|
|
try:
|
|
await asyncio.sleep(self.global_health_check_interval)
|
|
|
|
self._last_global_check = datetime.now(timezone.utc)
|
|
self._health_stats['last_global_check'] = self._last_global_check
|
|
|
|
# Perform health check if lifecycle manager is available
|
|
if self.lifecycle_manager:
|
|
await self._perform_health_check()
|
|
|
|
except asyncio.CancelledError:
|
|
if self.logger_manager:
|
|
self.logger_manager.log_debug("Global health monitor cancelled")
|
|
break
|
|
except Exception as e:
|
|
if self.logger_manager:
|
|
self.logger_manager.log_error(f"Error in global health monitor: {e}", exc_info=True)
|
|
await asyncio.sleep(self.global_health_check_interval)
|
|
|
|
async def _perform_health_check(self) -> None:
|
|
"""Perform health check on all enabled collectors."""
|
|
if not self.lifecycle_manager:
|
|
return
|
|
|
|
enabled_collectors = self.lifecycle_manager.get_enabled_collectors()
|
|
collectors = self.lifecycle_manager.get_collectors()
|
|
|
|
running_count = 0
|
|
failed_count = 0
|
|
|
|
for collector_name in enabled_collectors:
|
|
if collector_name not in collectors:
|
|
continue
|
|
|
|
collector = collectors[collector_name]
|
|
health_status = collector.get_health_status()
|
|
|
|
if health_status['is_healthy'] and collector.status == CollectorStatus.RUNNING:
|
|
running_count += 1
|
|
elif not health_status['is_healthy']:
|
|
failed_count += 1
|
|
if self.logger_manager:
|
|
self.logger_manager.log_warning(
|
|
f"Collector {collector_name} is unhealthy: {health_status['issues']}"
|
|
)
|
|
|
|
# Auto-restart if needed and not already restarting
|
|
config = self.lifecycle_manager.get_collector_config(collector_name)
|
|
if (config and config.auto_restart and
|
|
collector.status not in [CollectorStatus.STARTING, CollectorStatus.STOPPING]):
|
|
|
|
if self.logger_manager:
|
|
self.logger_manager.log_info(f"Auto-restarting unhealthy collector: {collector_name}")
|
|
|
|
# Create restart task without awaiting to avoid blocking
|
|
asyncio.create_task(self.lifecycle_manager.restart_collector(collector_name))
|
|
|
|
# Update health statistics
|
|
self._health_stats['running_collectors'] = running_count
|
|
self._health_stats['failed_collectors'] = failed_count
|
|
|
|
if self.logger_manager:
|
|
self.logger_manager.log_debug(
|
|
f"Health check complete - Running: {running_count}, Failed: {failed_count}"
|
|
)
|
|
|
|
async def perform_immediate_health_check(self) -> Dict[str, Dict]:
|
|
"""
|
|
Perform an immediate health check on all collectors.
|
|
|
|
Returns:
|
|
Dictionary mapping collector names to their health status
|
|
"""
|
|
if not self.lifecycle_manager:
|
|
return {}
|
|
|
|
enabled_collectors = self.lifecycle_manager.get_enabled_collectors()
|
|
collectors = self.lifecycle_manager.get_collectors()
|
|
|
|
health_results = {}
|
|
|
|
for collector_name in enabled_collectors:
|
|
if collector_name not in collectors:
|
|
continue
|
|
|
|
collector = collectors[collector_name]
|
|
health_status = collector.get_health_status()
|
|
|
|
health_results[collector_name] = {
|
|
'is_healthy': health_status['is_healthy'],
|
|
'status': collector.status.value,
|
|
'issues': health_status.get('issues', [])
|
|
}
|
|
|
|
return health_results
|
|
|
|
def get_health_task(self) -> Optional[asyncio.Task]:
|
|
"""Get the current health monitoring task."""
|
|
return self._global_health_task |