- Introduced `service_config.py` to manage configuration loading, validation, and schema management, enhancing modularity and security.
- Created a `ServiceConfig` class for handling configuration with robust error handling and default values.
- Refactored `DataCollectionService` to utilize the new `ServiceConfig`, streamlining configuration management and improving readability.
- Added a `CollectorFactory` to encapsulate collector creation logic, promoting separation of concerns.
- Updated `CollectorManager` and related components to align with the new architecture, ensuring better maintainability.
- Enhanced logging practices across the service for improved monitoring and debugging.

These changes significantly improve the architecture and maintainability of the data collection service, aligning with project standards for modularity and performance.
183 lines
7.5 KiB
Python
183 lines
7.5 KiB
Python
"""
Data Collector Manager for supervising and managing multiple data collectors.

This module provides centralized management of data collectors with health monitoring,
auto-recovery, and coordinated lifecycle management.
"""
|
|
|
|
import asyncio
|
|
from typing import Dict, List, Optional, Any, Set
|
|
|
|
from utils.logger import get_logger
|
|
from .base_collector import BaseDataCollector, CollectorStatus
|
|
from .collector_types import ManagerStatus, CollectorConfig
|
|
from .manager_components import (
|
|
CollectorLifecycleManager,
|
|
ManagerHealthMonitor,
|
|
ManagerStatsTracker,
|
|
ManagerLogger
|
|
)
|
|
|
|
|
|
class CollectorManager:
    """
    Manages multiple data collectors with health monitoring and auto-recovery.

    The manager is responsible for:
    - Starting and stopping collectors
    - Health monitoring and auto-restart
    - Coordinated lifecycle management
    - Status reporting and metrics

    The actual work is delegated to four components built in ``__init__``
    (``ManagerLogger``, ``CollectorLifecycleManager``, ``ManagerHealthMonitor``
    and ``ManagerStatsTracker``). This class wires them together, owns the
    overall ``ManagerStatus`` and sequences start-up and shutdown.
    """

    def __init__(self,
                 manager_name: str = "collector_manager",
                 global_health_check_interval: float = 60.0,
                 restart_delay: float = 5.0,
                 logger=None,
                 log_errors_only: bool = False):
        """
        Initialize the collector manager with component-based architecture.

        Args:
            manager_name: Name used in log output and in ``repr()``.
            global_health_check_interval: Forwarded to ``ManagerHealthMonitor``
                as its first argument (presumably seconds between health
                checks - confirm against that class).
            restart_delay: Stored on the instance as ``self.restart_delay``;
                it is not read anywhere else in this class.
            logger: Forwarded to ``ManagerLogger``; may be ``None``.
            log_errors_only: Forwarded to ``ManagerLogger``.
        """
        self.manager_name = manager_name
        self.restart_delay = restart_delay

        # Initialize components. Order matters: each later component is
        # handed the ones created before it.
        self.logger_manager = ManagerLogger(logger, log_errors_only)
        self.lifecycle_manager = CollectorLifecycleManager(self.logger_manager)
        self.health_monitor = ManagerHealthMonitor(
            global_health_check_interval, self.logger_manager, self.lifecycle_manager)
        # NOTE(review): 30.0 is the tracker's first positional argument -
        # presumably the statistics cache refresh interval; confirm against
        # ManagerStatsTracker before turning it into a parameter.
        self.stats_tracker = ManagerStatsTracker(
            30.0, self.logger_manager, self.lifecycle_manager, self.health_monitor)

        # Manager state
        self.status = ManagerStatus.STOPPED
        self._running = False
        self._tasks: Set[asyncio.Task] = set()

        if self.logger_manager.is_debug_enabled():
            self.logger_manager.log_info(f"Initialized collector manager: {manager_name}")

    def add_collector(self, collector: BaseDataCollector, config: Optional[CollectorConfig] = None) -> None:
        """Add a collector to be managed (delegates to the lifecycle manager)."""
        self.lifecycle_manager.add_collector(collector, config)

    def remove_collector(self, collector_name: str) -> bool:
        """Remove a collector from management; returns the lifecycle manager's result."""
        return self.lifecycle_manager.remove_collector(collector_name)

    def enable_collector(self, collector_name: str) -> bool:
        """Enable a collector (will be started if manager is running)."""
        return self.lifecycle_manager.enable_collector(collector_name)

    def disable_collector(self, collector_name: str) -> bool:
        """Disable a collector (will be stopped if running)."""
        return self.lifecycle_manager.disable_collector(collector_name)

    async def start(self) -> bool:
        """
        Start the collector manager and all enabled collectors.

        Returns:
            ``True`` when the manager reaches ``RUNNING`` or was already
            running/starting; ``False`` when any start-up step raised, in
            which case the status is set to ``ERROR`` and the error is logged.
        """
        if self.status in [ManagerStatus.RUNNING, ManagerStatus.STARTING]:
            self.logger_manager.log_warning("Collector manager is already running or starting")
            return True

        self.logger_manager.log_info("Starting collector manager")
        self.status = ManagerStatus.STARTING

        try:
            self._running = True

            # Set running state for all components
            self.lifecycle_manager.set_running_state(True)
            self.health_monitor.set_running_state(True)
            self.stats_tracker.set_running_state(True)

            # Start collectors and monitoring
            await self.lifecycle_manager.start_all_enabled_collectors()
            await self.health_monitor.start_monitoring()

            # Track the health monitoring task so stop() can cancel it; the
            # done-callback drops it from the set once it finishes.
            health_task = self.health_monitor.get_health_task()
            if health_task:
                self._tasks.add(health_task)
                health_task.add_done_callback(self._tasks.discard)

            # Start statistics cache updates
            await self.stats_tracker.start_cache_updates()

            self.status = ManagerStatus.RUNNING
            enabled_count = len(self.lifecycle_manager.get_enabled_collectors())
            self.logger_manager.log_info(f"Collector manager started - Managing {enabled_count} collectors")
            return True

        except Exception as e:
            # The manager did not come up, so its own flag must not keep
            # claiming it is running. Components are left as they were;
            # stop() still proceeds from ERROR and resets them.
            self._running = False
            self.status = ManagerStatus.ERROR
            self.logger_manager.log_error(f"Failed to start collector manager: {e}", exc_info=True)
            return False

    async def stop(self) -> None:
        """
        Stop the collector manager and all collectors.

        Does nothing (beyond a warning) when the status is already
        ``STOPPED``. If any shutdown step raises, the remaining steps are
        skipped, the status becomes ``ERROR`` and the error is logged; the
        exception is not propagated.
        """
        if self.status == ManagerStatus.STOPPED:
            self.logger_manager.log_warning("Collector manager is already stopped")
            return

        self.logger_manager.log_info("Stopping collector manager")
        self.status = ManagerStatus.STOPPING
        self._running = False

        try:
            # Set running state for all components
            self.lifecycle_manager.set_running_state(False)
            self.health_monitor.set_running_state(False)
            self.stats_tracker.set_running_state(False)

            # Stop monitoring and statistics
            await self.health_monitor.stop_monitoring()
            await self.stats_tracker.stop_cache_updates()

            # Cancel manager tasks. Work on a snapshot: each task's
            # done-callback removes it from self._tasks, so the live set may
            # shrink while we wait.
            pending = list(self._tasks)
            for task in pending:
                task.cancel()
            if pending:
                await asyncio.gather(*pending, return_exceptions=True)

            # Stop all collectors
            await self.lifecycle_manager.stop_all_collectors()

            self.status = ManagerStatus.STOPPED
            self.logger_manager.log_info("Collector manager stopped")

        except Exception as e:
            self.status = ManagerStatus.ERROR
            self.logger_manager.log_error(f"Error stopping collector manager: {e}", exc_info=True)

    async def restart_collector(self, collector_name: str) -> bool:
        """Restart a specific collector (delegates to the lifecycle manager)."""
        return await self.lifecycle_manager.restart_collector(collector_name)

    async def restart_all_collectors(self) -> Dict[str, bool]:
        """Restart all enabled collectors; returns the lifecycle manager's result mapping."""
        return await self.lifecycle_manager.restart_all_collectors()

    def get_status(self, force_refresh: bool = False) -> Dict[str, Any]:
        """
        Get manager status and statistics.

        Args:
            force_refresh: Passed through to the stats tracker.

        Returns:
            A new dict holding the tracker's status entries plus a
            ``'manager_status'`` key with the current status value.
        """
        # Copy before adding our key so a dict the tracker may be holding on
        # to (e.g. a cache) is never modified by this call.
        status_dict = dict(self.stats_tracker.get_status(force_refresh))
        status_dict['manager_status'] = self.status.value
        return status_dict

    def get_collector_status(self, collector_name: str) -> Optional[Dict[str, Any]]:
        """Get status for a specific collector."""
        return self.stats_tracker.get_collector_status(collector_name)

    def list_collectors(self) -> List[str]:
        """List all managed collector names."""
        return self.stats_tracker.list_collectors()

    def get_running_collectors(self) -> List[str]:
        """Get names of currently running collectors."""
        return self.stats_tracker.get_running_collectors()

    def get_failed_collectors(self) -> List[str]:
        """Get names of failed or unhealthy collectors."""
        return self.stats_tracker.get_failed_collectors()

    def __repr__(self) -> str:
        """String representation of the manager."""
        return f"CollectorManager(name={self.manager_name}, status={self.status.value})"