- Introduced `BaseDataCollector` and `CollectorManager` classes for standardized data collection and centralized management.
- Added health monitoring features, including auto-restart capabilities and detailed status reporting for collectors.
- Updated `env.template` to include new logging and health check configurations.
- Enhanced documentation in `docs/data_collectors.md` to provide comprehensive guidance on the new data collection system.
- Added unit tests for `BaseDataCollector` and `CollectorManager` to ensure reliability and functionality.
529 lines
20 KiB
Python
529 lines
20 KiB
Python
"""
|
|
Data Collector Manager for supervising and managing multiple data collectors.
|
|
|
|
This module provides centralized management of data collectors with health monitoring,
|
|
auto-recovery, and coordinated lifecycle management.
|
|
"""
|
|
|
|
import asyncio
|
|
import time
|
|
from datetime import datetime, timezone, timedelta
|
|
from typing import Dict, List, Optional, Any, Set
|
|
from dataclasses import dataclass
|
|
from enum import Enum
|
|
|
|
from utils.logger import get_logger
|
|
from .base_collector import BaseDataCollector, CollectorStatus
|
|
|
|
|
|
class ManagerStatus(Enum):
    """Status of the collector manager."""

    # Not running (initial state, and the state after a clean stop).
    STOPPED = "stopped"
    # start() has been entered but has not finished yet.
    STARTING = "starting"
    # start() completed successfully.
    RUNNING = "running"
    # stop() has been entered but has not finished yet.
    STOPPING = "stopping"
    # start() or stop() raised an unexpected exception.
    ERROR = "error"
|
|
|
|
|
|
@dataclass
class CollectorConfig:
    """Configuration for a data collector."""

    # Identifier stored with the configuration.
    name: str
    # Exchange the collector is attached to.
    exchange: str
    # Symbols the collector handles.
    symbols: List[str]
    # Data type identifiers, stored as plain strings.
    data_types: List[str]
    # Whether the collector is meant to be restarted automatically.
    auto_restart: bool = True
    # Per-collector health check interval (presumably seconds - TODO confirm).
    health_check_interval: float = 30.0
    # Whether the collector is enabled for management.
    enabled: bool = True
|
|
|
|
|
|
class CollectorManager:
    """
    Manages multiple data collectors with health monitoring and auto-recovery.

    The manager is responsible for:
    - Starting and stopping collectors
    - Health monitoring and auto-restart
    - Coordinated lifecycle management
    - Status reporting and metrics
    """

    def __init__(self,
                 manager_name: str = "collector_manager",
                 global_health_check_interval: float = 60.0,
                 restart_delay: float = 5.0):
        """
        Initialize the collector manager.

        Args:
            manager_name: Name for logging
            global_health_check_interval: Seconds between global health checks
            restart_delay: Delay between restart attempts. Stored on the
                instance; nothing in this class currently reads it.
        """
        self.manager_name = manager_name
        self.global_health_check_interval = global_health_check_interval
        self.restart_delay = restart_delay

        # Initialize logger
        self.logger = get_logger("data_collector_manager", verbose=True)

        # Manager state
        self.status = ManagerStatus.STOPPED
        self._running = False
        # Long-lived manager tasks; every task in this set is cancelled by stop().
        self._tasks: Set[asyncio.Task] = set()
        # Fire-and-forget tasks (collector start/stop/restart). They are kept
        # here only so that a strong reference exists until they finish.
        self._background_tasks: Set[asyncio.Task] = set()

        # Collector management
        self._collectors: Dict[str, BaseDataCollector] = {}
        self._collector_configs: Dict[str, CollectorConfig] = {}
        self._enabled_collectors: Set[str] = set()

        # Health monitoring
        self._last_global_check = datetime.now(timezone.utc)
        self._global_health_task = None

        # Statistics
        self._stats = {
            'total_collectors': 0,
            'running_collectors': 0,
            'failed_collectors': 0,
            'restarts_performed': 0,
            'last_global_check': None,
            'uptime_start': None
        }

        self.logger.info(f"Initialized collector manager: {manager_name}")

    def _spawn_background(self, coro_func, *args: Any, **kwargs: Any) -> bool:
        """
        Schedule ``coro_func(*args, **kwargs)`` as a background task.

        The coroutine is only created once a running event loop has been
        confirmed, so no un-awaited coroutine object is left behind when the
        caller is running outside of an event loop. The task is referenced
        from ``self._background_tasks`` until it completes.

        Args:
            coro_func: Coroutine function to call
            *args: Positional arguments for ``coro_func``
            **kwargs: Keyword arguments for ``coro_func``

        Returns:
            True if the task was scheduled, False if no event loop is running
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            return False

        task = asyncio.create_task(coro_func(*args, **kwargs))
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)
        return True

    def add_collector(self,
                      collector: "BaseDataCollector",
                      config: Optional["CollectorConfig"] = None) -> None:
        """
        Add a collector to be managed.

        The key the collector is registered under is generated here from the
        exchange name plus a time-based suffix; ``config.name`` is not used as
        the key. Use ``list_collectors()`` to discover the generated name.

        Args:
            collector: Data collector instance
            config: Optional configuration (will create default if not provided)
        """
        # Use a more unique name to avoid duplicates
        collector_name = f"{collector.exchange_name}_{int(time.time() * 1000000) % 1000000}"

        # Ensure unique name
        counter = 1
        base_name = collector_name
        while collector_name in self._collectors:
            collector_name = f"{base_name}_{counter}"
            counter += 1

        if config is None:
            config = CollectorConfig(
                name=collector_name,
                exchange=collector.exchange_name,
                symbols=list(collector.symbols),
                data_types=[dt.value for dt in collector.data_types],
                auto_restart=collector.auto_restart,
                health_check_interval=collector.health_check_interval
            )

        self._collectors[collector_name] = collector
        self._collector_configs[collector_name] = config

        if config.enabled:
            self._enabled_collectors.add(collector_name)

        self._stats['total_collectors'] = len(self._collectors)

        self.logger.info(f"Added collector: {collector_name} ({collector.exchange_name}) - "
                         f"Symbols: {', '.join(collector.symbols)} - Enabled: {config.enabled}")

    def remove_collector(self, collector_name: str) -> bool:
        """
        Remove a collector from management.

        If the collector is not already stopped, a forced stop is scheduled in
        the background when an event loop is running; this method does not
        wait for that stop to finish.

        Args:
            collector_name: Name of the collector to remove

        Returns:
            True if removed successfully, False if not found
        """
        if collector_name not in self._collectors:
            self.logger.warning(f"Collector not found: {collector_name}")
            return False

        # Stop the collector first (only if event loop is running)
        collector = self._collectors[collector_name]
        if collector.status != CollectorStatus.STOPPED:
            if not self._spawn_background(collector.stop, force=True):
                # No event loop running, just log
                self.logger.info(f"Collector {collector_name} will be removed without stopping (no event loop)")

        # Remove from management
        del self._collectors[collector_name]
        del self._collector_configs[collector_name]
        self._enabled_collectors.discard(collector_name)

        self._stats['total_collectors'] = len(self._collectors)

        self.logger.info(f"Removed collector: {collector_name}")
        return True

    def enable_collector(self, collector_name: str) -> bool:
        """
        Enable a collector (will be started if manager is running).

        Args:
            collector_name: Name of the collector to enable

        Returns:
            True if enabled successfully, False if not found
        """
        if collector_name not in self._collectors:
            self.logger.warning(f"Collector not found: {collector_name}")
            return False

        self._enabled_collectors.add(collector_name)
        self._collector_configs[collector_name].enabled = True

        # Start the collector if manager is running (only if event loop is running)
        if self._running and not self._spawn_background(self._start_collector, collector_name):
            # No event loop running, will be started when manager starts
            self.logger.debug(f"Collector {collector_name} enabled but will start when manager starts")

        self.logger.info(f"Enabled collector: {collector_name}")
        return True

    def disable_collector(self, collector_name: str) -> bool:
        """
        Disable a collector (will be stopped if running).

        The forced stop is scheduled in the background when an event loop is
        running; this method does not wait for it to finish.

        Args:
            collector_name: Name of the collector to disable

        Returns:
            True if disabled successfully, False if not found
        """
        if collector_name not in self._collectors:
            self.logger.warning(f"Collector not found: {collector_name}")
            return False

        self._enabled_collectors.discard(collector_name)
        self._collector_configs[collector_name].enabled = False

        # Stop the collector (only if event loop is running)
        collector = self._collectors[collector_name]
        if not self._spawn_background(collector.stop, force=True):
            # No event loop running, just log
            self.logger.debug(f"Collector {collector_name} disabled but cannot stop (no event loop)")

        self.logger.info(f"Disabled collector: {collector_name}")
        return True

    async def start(self) -> bool:
        """
        Start the collector manager and all enabled collectors.

        Enabled collectors are started concurrently with a shared 30 second
        timeout, then the global health monitor task is launched.

        Returns:
            True if started successfully (or already running/starting),
            False otherwise
        """
        if self.status in [ManagerStatus.RUNNING, ManagerStatus.STARTING]:
            self.logger.warning("Collector manager is already running or starting")
            return True

        self.logger.info("Starting collector manager")
        self.status = ManagerStatus.STARTING

        try:
            self._running = True
            self._stats['uptime_start'] = datetime.now(timezone.utc)

            # Start all enabled collectors
            start_tasks = [
                asyncio.create_task(self._start_collector(collector_name))
                for collector_name in self._enabled_collectors
            ]

            # Wait for all collectors to start (with timeout)
            if start_tasks:
                try:
                    await asyncio.wait_for(asyncio.gather(*start_tasks, return_exceptions=True), timeout=30.0)
                except asyncio.TimeoutError:
                    self.logger.warning("Some collectors took too long to start")

            # Start global health monitoring
            health_task = asyncio.create_task(self._global_health_monitor())
            self._global_health_task = health_task
            self._tasks.add(health_task)
            health_task.add_done_callback(self._tasks.discard)

            self.status = ManagerStatus.RUNNING
            self.logger.info(f"Collector manager started - Managing {len(self._enabled_collectors)} collectors")
            return True

        except Exception as e:
            # A failed start must not leave the manager flagged as running,
            # otherwise enable_collector() would keep starting collectors.
            self._running = False
            self.status = ManagerStatus.ERROR
            self.logger.error(f"Failed to start collector manager: {e}")
            return False

    async def stop(self) -> None:
        """
        Stop the collector manager and all collectors.

        Manager tasks are cancelled first, then every managed collector
        (enabled or not) is force-stopped with a shared 30 second timeout.
        """
        if self.status == ManagerStatus.STOPPED:
            self.logger.warning("Collector manager is already stopped")
            return

        self.logger.info("Stopping collector manager")
        self.status = ManagerStatus.STOPPING
        self._running = False

        try:
            # Cancel manager tasks
            for task in list(self._tasks):
                task.cancel()

            if self._tasks:
                await asyncio.gather(*self._tasks, return_exceptions=True)
            self._global_health_task = None

            # Stop all collectors
            stop_tasks = [
                asyncio.create_task(collector.stop(force=True))
                for collector in self._collectors.values()
            ]

            # Wait for all collectors to stop (with timeout)
            if stop_tasks:
                try:
                    await asyncio.wait_for(asyncio.gather(*stop_tasks, return_exceptions=True), timeout=30.0)
                except asyncio.TimeoutError:
                    self.logger.warning("Some collectors took too long to stop")

            self.status = ManagerStatus.STOPPED
            self.logger.info("Collector manager stopped")

        except Exception as e:
            self.status = ManagerStatus.ERROR
            self.logger.error(f"Error stopping collector manager: {e}")

    async def restart_collector(self, collector_name: str) -> bool:
        """
        Restart a specific collector.

        Args:
            collector_name: Name of the collector to restart

        Returns:
            True if restarted successfully, False otherwise
        """
        if collector_name not in self._collectors:
            self.logger.warning(f"Collector not found: {collector_name}")
            return False

        collector = self._collectors[collector_name]
        self.logger.info(f"Restarting collector: {collector_name}")

        try:
            success = await collector.restart()
        except Exception as e:
            self.logger.error(f"Error restarting collector {collector_name}: {e}")
            return False

        if success:
            self._stats['restarts_performed'] += 1
            self.logger.info(f"Successfully restarted collector: {collector_name}")
        else:
            self.logger.error(f"Failed to restart collector: {collector_name}")
        return success

    async def _start_collector(self, collector_name: str) -> bool:
        """
        Start a specific collector.

        Args:
            collector_name: Name of the collector to start

        Returns:
            True if started successfully, False otherwise
        """
        if collector_name not in self._collectors:
            self.logger.warning(f"Collector not found: {collector_name}")
            return False

        collector = self._collectors[collector_name]

        try:
            success = await collector.start()
        except Exception as e:
            self.logger.error(f"Error starting collector {collector_name}: {e}")
            return False

        if success:
            self.logger.info(f"Started collector: {collector_name}")
        else:
            self.logger.error(f"Failed to start collector: {collector_name}")
        return success

    async def _global_health_monitor(self) -> None:
        """
        Global health monitoring for all collectors.

        Runs until ``self._running`` is cleared or the task is cancelled. Each
        cycle sleeps for ``global_health_check_interval`` seconds, inspects
        every enabled collector, schedules a restart for unhealthy collectors
        that have ``auto_restart`` set, and refreshes the running/failed
        counters in the statistics.
        """
        self.logger.debug("Starting global health monitor")

        while self._running:
            try:
                await asyncio.sleep(self.global_health_check_interval)

                self._last_global_check = datetime.now(timezone.utc)
                self._stats['last_global_check'] = self._last_global_check

                # Check each enabled collector
                running_count = 0
                failed_count = 0

                for collector_name in self._enabled_collectors:
                    collector = self._collectors[collector_name]
                    health_status = collector.get_health_status()

                    if health_status['is_healthy'] and collector.status == CollectorStatus.RUNNING:
                        running_count += 1
                    elif not health_status['is_healthy']:
                        failed_count += 1
                        self.logger.warning(f"Collector {collector_name} is unhealthy: {health_status['issues']}")

                        # Auto-restart if needed and not already restarting
                        if (collector.auto_restart and
                                collector.status not in [CollectorStatus.STARTING, CollectorStatus.STOPPING]):
                            self.logger.info(f"Auto-restarting unhealthy collector: {collector_name}")
                            self._spawn_background(self.restart_collector, collector_name)

                # Update global statistics
                self._stats['running_collectors'] = running_count
                self._stats['failed_collectors'] = failed_count

                self.logger.debug(f"Health check complete - Running: {running_count}, Failed: {failed_count}")

            except asyncio.CancelledError:
                self.logger.debug("Global health monitor cancelled")
                break
            except Exception as e:
                # No extra sleep here: the next iteration already begins with
                # a full interval sleep, so sleeping again would double the
                # gap between checks after every error.
                self.logger.error(f"Error in global health monitor: {e}")

    def get_status(self) -> Dict[str, Any]:
        """
        Get manager status and statistics.

        The ``running_collectors`` / ``failed_collectors`` statistics are only
        refreshed by the global health monitor, so they reflect the most
        recent health check rather than the current instant.

        Returns:
            Dictionary containing status information. ``statistics`` is a
            shallow copy, so mutating it does not affect the manager.
        """
        uptime_seconds = None
        if self._stats['uptime_start']:
            uptime_seconds = (datetime.now(timezone.utc) - self._stats['uptime_start']).total_seconds()

        # Get individual collector statuses
        collector_statuses = {
            name: {
                'status': collector.status.value,
                'enabled': name in self._enabled_collectors,
                'health': collector.get_health_status()
            }
            for name, collector in self._collectors.items()
        }

        return {
            'manager_status': self.status.value,
            'uptime_seconds': uptime_seconds,
            'statistics': dict(self._stats),
            'collectors': collector_statuses,
            'enabled_collectors': list(self._enabled_collectors),
            'total_collectors': len(self._collectors)
        }

    def get_collector_status(self, collector_name: str) -> Optional[Dict[str, Any]]:
        """
        Get status for a specific collector.

        Args:
            collector_name: Name of the collector

        Returns:
            Collector status dict or None if not found. ``config`` is a
            shallow copy of the stored configuration's attributes.
        """
        if collector_name not in self._collectors:
            return None

        collector = self._collectors[collector_name]
        return {
            'name': collector_name,
            'config': dict(vars(self._collector_configs[collector_name])),
            'status': collector.get_status(),
            'health': collector.get_health_status()
        }

    def list_collectors(self) -> List[str]:
        """
        List all managed collector names.

        Returns:
            List of collector names
        """
        return list(self._collectors)

    def get_running_collectors(self) -> List[str]:
        """
        Get names of currently running collectors.

        Returns:
            List of running collector names
        """
        return [
            name for name, collector in self._collectors.items()
            if collector.status == CollectorStatus.RUNNING
        ]

    def get_failed_collectors(self) -> List[str]:
        """
        Get names of failed or unhealthy collectors.

        Returns:
            List of failed collector names
        """
        return [
            name for name, collector in self._collectors.items()
            if not collector.get_health_status()['is_healthy']
        ]

    async def restart_all_collectors(self) -> Dict[str, bool]:
        """
        Restart all enabled collectors.

        Restarts are launched concurrently and then awaited one by one.

        Returns:
            Dictionary mapping collector names to restart success status
        """
        self.logger.info("Restarting all enabled collectors")

        restart_tasks = [
            (collector_name, asyncio.create_task(self.restart_collector(collector_name)))
            for collector_name in self._enabled_collectors
        ]

        # Wait for all restarts to complete
        results = {}
        for collector_name, task in restart_tasks:
            try:
                results[collector_name] = await task
            except Exception as e:
                self.logger.error(f"Error restarting {collector_name}: {e}")
                results[collector_name] = False

        successful_restarts = sum(1 for success in results.values() if success)
        self.logger.info(f"Restart complete - {successful_restarts}/{len(results)} collectors restarted successfully")

        return results

    def __repr__(self) -> str:
        """String representation of the manager."""
        return f"<CollectorManager({self.manager_name}, {len(self._collectors)} collectors, {self.status.value})>"