2025-05-30 20:33:56 +08:00
|
|
|
"""
|
|
|
|
|
Data Collector Manager for supervising and managing multiple data collectors.
|
|
|
|
|
|
|
|
|
|
This module provides centralized management of data collectors with health monitoring,
|
|
|
|
|
auto-recovery, and coordinated lifecycle management.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import asyncio
|
|
|
|
|
from typing import Dict, List, Optional, Any, Set
|
|
|
|
|
|
|
|
|
|
from utils.logger import get_logger
|
|
|
|
|
from .base_collector import BaseDataCollector, CollectorStatus
|
2025-06-10 12:55:27 +08:00
|
|
|
from .collector_types import ManagerStatus, CollectorConfig
|
|
|
|
|
from .manager_components import (
|
|
|
|
|
CollectorLifecycleManager,
|
|
|
|
|
ManagerHealthMonitor,
|
|
|
|
|
ManagerStatsTracker,
|
|
|
|
|
ManagerLogger
|
|
|
|
|
)
|
2025-05-30 20:33:56 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
class CollectorManager:
    """
    Manages multiple data collectors with health monitoring and auto-recovery.

    The manager is responsible for:
    - Starting and stopping collectors
    - Health monitoring and auto-restart
    - Coordinated lifecycle management
    - Status reporting and metrics

    The manager itself is a thin facade: it owns four components created in
    ``__init__`` (``logger_manager``, ``lifecycle_manager``, ``health_monitor``
    and ``stats_tracker``) and forwards almost every public call to one of
    them. Only the overall ``status`` / ``_running`` state and the set of
    tracked asyncio tasks live on the manager.
    """

    def __init__(self,
                 manager_name: str = "collector_manager",
                 global_health_check_interval: float = 60.0,
                 restart_delay: float = 5.0,
                 logger: Optional[Any] = None,
                 log_errors_only: bool = False):
        """Initialize the collector manager with component-based architecture.

        Args:
            manager_name: Name used in log messages and in ``__repr__``.
            global_health_check_interval: Passed as the first argument to
                ``ManagerHealthMonitor``.
            restart_delay: Stored on the instance as ``self.restart_delay``.
                NOTE(review): it is not forwarded to any component in this
                class -- confirm whether a component is expected to read it.
            logger: Passed through to ``ManagerLogger``.
            log_errors_only: Passed through to ``ManagerLogger``.
        """
        self.manager_name = manager_name
        self.restart_delay = restart_delay

        # Initialize components. Order matters: every later component
        # receives the earlier ones as constructor arguments.
        self.logger_manager = ManagerLogger(logger, log_errors_only)
        self.lifecycle_manager = CollectorLifecycleManager(self.logger_manager)
        self.health_monitor = ManagerHealthMonitor(
            global_health_check_interval, self.logger_manager, self.lifecycle_manager)
        # NOTE(review): 30.0 is presumably the statistics cache refresh
        # interval in seconds -- confirm against ManagerStatsTracker.
        self.stats_tracker = ManagerStatsTracker(
            30.0, self.logger_manager, self.lifecycle_manager, self.health_monitor)

        # Manager state
        self.status = ManagerStatus.STOPPED
        self._running = False
        self._tasks: Set[asyncio.Task] = set()

        if self.logger_manager.is_debug_enabled():
            self.logger_manager.log_info(f"Initialized collector manager: {manager_name}")

    def _set_components_running(self, running: bool) -> None:
        """Propagate the running flag to the lifecycle, health and stats components."""
        self.lifecycle_manager.set_running_state(running)
        self.health_monitor.set_running_state(running)
        self.stats_tracker.set_running_state(running)

    def add_collector(self, collector: BaseDataCollector, config: Optional[CollectorConfig] = None) -> None:
        """Add a collector to be managed (delegates to the lifecycle manager)."""
        self.lifecycle_manager.add_collector(collector, config)

    def remove_collector(self, collector_name: str) -> bool:
        """Remove a collector from management (delegates to the lifecycle manager)."""
        return self.lifecycle_manager.remove_collector(collector_name)

    def enable_collector(self, collector_name: str) -> bool:
        """Enable a collector (will be started if manager is running)."""
        return self.lifecycle_manager.enable_collector(collector_name)

    def disable_collector(self, collector_name: str) -> bool:
        """Disable a collector (will be stopped if running)."""
        return self.lifecycle_manager.disable_collector(collector_name)

    async def start(self) -> bool:
        """Start the collector manager and all enabled collectors.

        Returns:
            True if the manager is running when the call returns (including
            the case where it was already running or starting), False if any
            step of the start-up sequence raised.

        On failure the status becomes ``ManagerStatus.ERROR`` and the
        manager's own ``_running`` flag is cleared so the two never disagree.
        Components are left untouched; ``stop()`` still performs a full
        teardown from the ERROR state because it only short-circuits on
        ``STOPPED``.
        """
        if self.status in (ManagerStatus.RUNNING, ManagerStatus.STARTING):
            self.logger_manager.log_warning("Collector manager is already running or starting")
            return True

        self.logger_manager.log_info("Starting collector manager")
        self.status = ManagerStatus.STARTING

        try:
            self._running = True

            # Set running state for all components
            self._set_components_running(True)

            # Start collectors and monitoring
            await self.lifecycle_manager.start_all_enabled_collectors()
            await self.health_monitor.start_monitoring()

            # Track health monitoring task; the done-callback removes it from
            # the set again so finished tasks are not kept alive.
            health_task = self.health_monitor.get_health_task()
            if health_task:
                self._tasks.add(health_task)
                health_task.add_done_callback(self._tasks.discard)

            # Start statistics cache updates
            await self.stats_tracker.start_cache_updates()

            self.status = ManagerStatus.RUNNING
            enabled_count = len(self.lifecycle_manager.get_enabled_collectors())
            self.logger_manager.log_info(f"Collector manager started - Managing {enabled_count} collectors")
            return True

        except Exception as e:
            self.status = ManagerStatus.ERROR
            # A failed start must not leave the manager flagged as running.
            self._running = False
            self.logger_manager.log_error(f"Failed to start collector manager: {e}", exc_info=True)
            return False

    async def stop(self) -> None:
        """Stop the collector manager and all collectors.

        Shutdown order: clear running flags, stop health monitoring and
        statistics updates, cancel and await tracked manager tasks, then stop
        every collector. Any exception is logged and leaves the manager in
        ``ManagerStatus.ERROR``; nothing is re-raised.
        """
        if self.status == ManagerStatus.STOPPED:
            self.logger_manager.log_warning("Collector manager is already stopped")
            return

        self.logger_manager.log_info("Stopping collector manager")
        self.status = ManagerStatus.STOPPING
        self._running = False

        try:
            # Set running state for all components
            self._set_components_running(False)

            # Stop monitoring and statistics
            await self.health_monitor.stop_monitoring()
            await self.stats_tracker.stop_cache_updates()

            # Cancel manager tasks. Work on a snapshot: done-callbacks
            # discard tasks from self._tasks while we are waiting on them.
            pending = list(self._tasks)
            for task in pending:
                task.cancel()
            if pending:
                # return_exceptions=True keeps CancelledError from escaping.
                await asyncio.gather(*pending, return_exceptions=True)

            # Stop all collectors
            await self.lifecycle_manager.stop_all_collectors()

            self.status = ManagerStatus.STOPPED
            self.logger_manager.log_info("Collector manager stopped")

        except Exception as e:
            self.status = ManagerStatus.ERROR
            self.logger_manager.log_error(f"Error stopping collector manager: {e}", exc_info=True)

    async def restart_collector(self, collector_name: str) -> bool:
        """Restart a specific collector (delegates to the lifecycle manager)."""
        return await self.lifecycle_manager.restart_collector(collector_name)

    async def restart_all_collectors(self) -> Dict[str, bool]:
        """Restart all enabled collectors (delegates to the lifecycle manager)."""
        return await self.lifecycle_manager.restart_all_collectors()

    def get_status(self, force_refresh: bool = False) -> Dict[str, Any]:
        """Get manager status and statistics.

        Args:
            force_refresh: Forwarded to ``stats_tracker.get_status``.

        Returns:
            A new dict holding everything the stats tracker reported plus a
            ``'manager_status'`` entry with the manager's current status
            value. The tracker's own dict is not modified.
        """
        tracker_status = self.stats_tracker.get_status(force_refresh)
        # Copy instead of mutating: the tracker may hand out a cached dict.
        return {**tracker_status, 'manager_status': self.status.value}

    def get_collector_status(self, collector_name: str) -> Optional[Dict[str, Any]]:
        """Get status for a specific collector (delegates to the stats tracker)."""
        return self.stats_tracker.get_collector_status(collector_name)

    def list_collectors(self) -> List[str]:
        """List all managed collector names."""
        return self.stats_tracker.list_collectors()

    def get_running_collectors(self) -> List[str]:
        """Get names of currently running collectors."""
        return self.stats_tracker.get_running_collectors()

    def get_failed_collectors(self) -> List[str]:
        """Get names of failed or unhealthy collectors."""
        return self.stats_tracker.get_failed_collectors()

    def __repr__(self) -> str:
        """String representation of the manager."""
        return f"CollectorManager(name={self.manager_name}, status={self.status.value})"
|