Implement enhanced data collection system with health monitoring and management
- Introduced `BaseDataCollector` and `CollectorManager` classes for standardized data collection and centralized management. - Added health monitoring features, including auto-restart capabilities and detailed status reporting for collectors. - Updated `env.template` to include new logging and health check configurations. - Enhanced documentation in `docs/data_collectors.md` to provide comprehensive guidance on the new data collection system. - Added unit tests for `BaseDataCollector` and `CollectorManager` to ensure reliability and functionality.
This commit is contained in:
25
data/__init__.py
Normal file
25
data/__init__.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""
|
||||
Data collection and processing package for the Crypto Trading Bot Platform.
|
||||
|
||||
This package contains modules for collecting market data from various exchanges,
|
||||
processing and validating the data, and storing it in the database.
|
||||
"""
|
||||
|
||||
from .base_collector import (
|
||||
BaseDataCollector, DataCollectorError, DataValidationError,
|
||||
DataType, CollectorStatus, MarketDataPoint, OHLCVData
|
||||
)
|
||||
from .collector_manager import CollectorManager, ManagerStatus, CollectorConfig
|
||||
|
||||
__all__ = [
|
||||
'BaseDataCollector',
|
||||
'DataCollectorError',
|
||||
'DataValidationError',
|
||||
'DataType',
|
||||
'CollectorStatus',
|
||||
'MarketDataPoint',
|
||||
'OHLCVData',
|
||||
'CollectorManager',
|
||||
'ManagerStatus',
|
||||
'CollectorConfig'
|
||||
]
|
||||
667
data/base_collector.py
Normal file
667
data/base_collector.py
Normal file
@@ -0,0 +1,667 @@
|
||||
"""
|
||||
Abstract base class for data collectors.
|
||||
|
||||
This module provides a common interface for all data collection implementations,
|
||||
ensuring consistency across different exchange connectors and data sources.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from abc import ABC, abstractmethod
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from decimal import Decimal
|
||||
from typing import Dict, List, Optional, Any, Callable, Set
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
from utils.logger import get_logger
|
||||
|
||||
|
||||
class DataType(Enum):
|
||||
"""Types of data that can be collected."""
|
||||
TICKER = "ticker"
|
||||
TRADE = "trade"
|
||||
ORDERBOOK = "orderbook"
|
||||
CANDLE = "candle"
|
||||
BALANCE = "balance"
|
||||
|
||||
|
||||
class CollectorStatus(Enum):
|
||||
"""Status of the data collector."""
|
||||
STOPPED = "stopped"
|
||||
STARTING = "starting"
|
||||
RUNNING = "running"
|
||||
STOPPING = "stopping"
|
||||
ERROR = "error"
|
||||
RECONNECTING = "reconnecting"
|
||||
UNHEALTHY = "unhealthy" # Added for health monitoring
|
||||
|
||||
|
||||
@dataclass
|
||||
class MarketDataPoint:
|
||||
"""Standardized market data structure."""
|
||||
exchange: str
|
||||
symbol: str
|
||||
timestamp: datetime
|
||||
data_type: DataType
|
||||
data: Dict[str, Any]
|
||||
|
||||
def __post_init__(self):
|
||||
"""Validate data after initialization."""
|
||||
if not self.timestamp.tzinfo:
|
||||
self.timestamp = self.timestamp.replace(tzinfo=timezone.utc)
|
||||
|
||||
|
||||
@dataclass
|
||||
class OHLCVData:
|
||||
"""OHLCV (Open, High, Low, Close, Volume) data structure."""
|
||||
symbol: str
|
||||
timeframe: str
|
||||
timestamp: datetime
|
||||
open: Decimal
|
||||
high: Decimal
|
||||
low: Decimal
|
||||
close: Decimal
|
||||
volume: Decimal
|
||||
trades_count: Optional[int] = None
|
||||
|
||||
def __post_init__(self):
|
||||
"""Validate OHLCV data after initialization."""
|
||||
if not self.timestamp.tzinfo:
|
||||
self.timestamp = self.timestamp.replace(tzinfo=timezone.utc)
|
||||
|
||||
# Validate price data
|
||||
if not all(isinstance(price, (Decimal, float, int)) for price in [self.open, self.high, self.low, self.close]):
|
||||
raise DataValidationError("All OHLCV prices must be numeric")
|
||||
|
||||
if not isinstance(self.volume, (Decimal, float, int)):
|
||||
raise DataValidationError("Volume must be numeric")
|
||||
|
||||
# Convert to Decimal for precision
|
||||
self.open = Decimal(str(self.open))
|
||||
self.high = Decimal(str(self.high))
|
||||
self.low = Decimal(str(self.low))
|
||||
self.close = Decimal(str(self.close))
|
||||
self.volume = Decimal(str(self.volume))
|
||||
|
||||
# Validate price relationships
|
||||
if not (self.low <= self.open <= self.high and self.low <= self.close <= self.high):
|
||||
raise DataValidationError(f"Invalid OHLCV data: prices don't match expected relationships for {self.symbol}")
|
||||
|
||||
|
||||
class DataCollectorError(Exception):
|
||||
"""Base exception for data collector errors."""
|
||||
pass
|
||||
|
||||
|
||||
class DataValidationError(DataCollectorError):
|
||||
"""Exception raised when data validation fails."""
|
||||
pass
|
||||
|
||||
|
||||
class ConnectionError(DataCollectorError):
|
||||
"""Exception raised when connection to data source fails."""
|
||||
pass
|
||||
|
||||
|
||||
class BaseDataCollector(ABC):
|
||||
"""
|
||||
Abstract base class for all data collectors.
|
||||
|
||||
This class defines the interface that all data collection implementations
|
||||
must follow, providing consistency across different exchanges and data sources.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
exchange_name: str,
|
||||
symbols: List[str],
|
||||
data_types: Optional[List[DataType]] = None,
|
||||
component_name: Optional[str] = None,
|
||||
auto_restart: bool = True,
|
||||
health_check_interval: float = 30.0):
|
||||
"""
|
||||
Initialize the base data collector.
|
||||
|
||||
Args:
|
||||
exchange_name: Name of the exchange (e.g., 'okx', 'binance')
|
||||
symbols: List of trading symbols to collect data for
|
||||
data_types: Types of data to collect (default: [DataType.CANDLE])
|
||||
component_name: Name for logging (default: based on exchange_name)
|
||||
auto_restart: Enable automatic restart on failures (default: True)
|
||||
health_check_interval: Seconds between health checks (default: 30.0)
|
||||
"""
|
||||
self.exchange_name = exchange_name.lower()
|
||||
self.symbols = set(symbols)
|
||||
self.data_types = data_types or [DataType.CANDLE]
|
||||
self.auto_restart = auto_restart
|
||||
self.health_check_interval = health_check_interval
|
||||
|
||||
# Initialize logger
|
||||
component = component_name or f"{self.exchange_name}_collector"
|
||||
self.logger = get_logger(component, verbose=True)
|
||||
|
||||
# Collector state
|
||||
self.status = CollectorStatus.STOPPED
|
||||
self._running = False
|
||||
self._should_be_running = False # Track desired state
|
||||
self._tasks: Set[asyncio.Task] = set()
|
||||
|
||||
# Data callbacks
|
||||
self._data_callbacks: Dict[DataType, List[Callable]] = {
|
||||
data_type: [] for data_type in DataType
|
||||
}
|
||||
|
||||
# Connection management
|
||||
self._connection = None
|
||||
self._reconnect_attempts = 0
|
||||
self._max_reconnect_attempts = 5
|
||||
self._reconnect_delay = 5.0 # seconds
|
||||
|
||||
# Health monitoring
|
||||
self._last_heartbeat = datetime.now(timezone.utc)
|
||||
self._last_data_received = None
|
||||
self._health_check_task = None
|
||||
self._max_silence_duration = timedelta(minutes=5) # Max time without data before unhealthy
|
||||
|
||||
# Statistics
|
||||
self._stats = {
|
||||
'messages_received': 0,
|
||||
'messages_processed': 0,
|
||||
'errors': 0,
|
||||
'restarts': 0,
|
||||
'last_message_time': None,
|
||||
'connection_uptime': None,
|
||||
'last_error': None,
|
||||
'last_restart_time': None
|
||||
}
|
||||
|
||||
self.logger.info(f"Initialized {self.exchange_name} data collector for symbols: {', '.join(symbols)}")
|
||||
|
||||
@abstractmethod
|
||||
async def connect(self) -> bool:
|
||||
"""
|
||||
Establish connection to the data source.
|
||||
|
||||
Returns:
|
||||
True if connection successful, False otherwise
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def disconnect(self) -> None:
|
||||
"""Disconnect from the data source."""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def subscribe_to_data(self, symbols: List[str], data_types: List[DataType]) -> bool:
|
||||
"""
|
||||
Subscribe to data streams for specified symbols and data types.
|
||||
|
||||
Args:
|
||||
symbols: Trading symbols to subscribe to
|
||||
data_types: Types of data to subscribe to
|
||||
|
||||
Returns:
|
||||
True if subscription successful, False otherwise
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def unsubscribe_from_data(self, symbols: List[str], data_types: List[DataType]) -> bool:
|
||||
"""
|
||||
Unsubscribe from data streams.
|
||||
|
||||
Args:
|
||||
symbols: Trading symbols to unsubscribe from
|
||||
data_types: Types of data to unsubscribe from
|
||||
|
||||
Returns:
|
||||
True if unsubscription successful, False otherwise
|
||||
"""
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
async def _process_message(self, message: Any) -> Optional[MarketDataPoint]:
|
||||
"""
|
||||
Process incoming message from the data source.
|
||||
|
||||
Args:
|
||||
message: Raw message from the data source
|
||||
|
||||
Returns:
|
||||
Processed MarketDataPoint or None if message should be ignored
|
||||
"""
|
||||
pass
|
||||
|
||||
async def start(self) -> bool:
|
||||
"""
|
||||
Start the data collector.
|
||||
|
||||
Returns:
|
||||
True if started successfully, False otherwise
|
||||
"""
|
||||
if self.status in [CollectorStatus.RUNNING, CollectorStatus.STARTING]:
|
||||
self.logger.warning("Data collector is already running or starting")
|
||||
return True
|
||||
|
||||
self.logger.info(f"Starting {self.exchange_name} data collector")
|
||||
self.status = CollectorStatus.STARTING
|
||||
self._should_be_running = True
|
||||
|
||||
try:
|
||||
# Connect to data source
|
||||
if not await self.connect():
|
||||
self.status = CollectorStatus.ERROR
|
||||
self.logger.error("Failed to connect to data source")
|
||||
return False
|
||||
|
||||
# Subscribe to data streams
|
||||
if not await self.subscribe_to_data(list(self.symbols), self.data_types):
|
||||
self.status = CollectorStatus.ERROR
|
||||
self.logger.error("Failed to subscribe to data streams")
|
||||
await self.disconnect()
|
||||
return False
|
||||
|
||||
# Start message processing
|
||||
self._running = True
|
||||
self.status = CollectorStatus.RUNNING
|
||||
self._stats['connection_uptime'] = datetime.now(timezone.utc)
|
||||
self._last_heartbeat = datetime.now(timezone.utc)
|
||||
|
||||
# Create background task for message processing
|
||||
message_task = asyncio.create_task(self._message_loop())
|
||||
self._tasks.add(message_task)
|
||||
message_task.add_done_callback(self._tasks.discard)
|
||||
|
||||
# Start health monitoring
|
||||
if self.auto_restart:
|
||||
health_task = asyncio.create_task(self._health_monitor())
|
||||
self._tasks.add(health_task)
|
||||
health_task.add_done_callback(self._tasks.discard)
|
||||
|
||||
self.logger.info(f"{self.exchange_name} data collector started successfully")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.status = CollectorStatus.ERROR
|
||||
self._stats['last_error'] = str(e)
|
||||
self.logger.error(f"Failed to start data collector: {e}")
|
||||
await self.disconnect()
|
||||
return False
|
||||
|
||||
async def stop(self, force: bool = False) -> None:
|
||||
"""
|
||||
Stop the data collector.
|
||||
|
||||
Args:
|
||||
force: If True, don't restart automatically even if auto_restart is enabled
|
||||
"""
|
||||
if self.status == CollectorStatus.STOPPED:
|
||||
self.logger.warning("Data collector is already stopped")
|
||||
return
|
||||
|
||||
self.logger.info(f"Stopping {self.exchange_name} data collector")
|
||||
self.status = CollectorStatus.STOPPING
|
||||
self._running = False
|
||||
|
||||
if force:
|
||||
self._should_be_running = False
|
||||
|
||||
try:
|
||||
# Cancel all tasks
|
||||
for task in list(self._tasks):
|
||||
task.cancel()
|
||||
|
||||
# Wait for tasks to complete
|
||||
if self._tasks:
|
||||
await asyncio.gather(*self._tasks, return_exceptions=True)
|
||||
|
||||
# Unsubscribe and disconnect
|
||||
await self.unsubscribe_from_data(list(self.symbols), self.data_types)
|
||||
await self.disconnect()
|
||||
|
||||
self.status = CollectorStatus.STOPPED
|
||||
self.logger.info(f"{self.exchange_name} data collector stopped")
|
||||
|
||||
except Exception as e:
|
||||
self.status = CollectorStatus.ERROR
|
||||
self._stats['last_error'] = str(e)
|
||||
self.logger.error(f"Error stopping data collector: {e}")
|
||||
|
||||
async def restart(self) -> bool:
|
||||
"""
|
||||
Restart the data collector.
|
||||
|
||||
Returns:
|
||||
True if restart successful, False otherwise
|
||||
"""
|
||||
self.logger.info(f"Restarting {self.exchange_name} data collector")
|
||||
self._stats['restarts'] += 1
|
||||
self._stats['last_restart_time'] = datetime.now(timezone.utc)
|
||||
|
||||
# Stop without disabling auto-restart
|
||||
await self.stop(force=False)
|
||||
|
||||
# Wait a bit before restart
|
||||
await asyncio.sleep(2.0)
|
||||
|
||||
# Reset reconnection attempts
|
||||
self._reconnect_attempts = 0
|
||||
|
||||
# Start again
|
||||
return await self.start()
|
||||
|
||||
async def _message_loop(self) -> None:
|
||||
"""Main message processing loop."""
|
||||
self.logger.debug("Starting message processing loop")
|
||||
|
||||
while self._running:
|
||||
try:
|
||||
# This should be implemented by subclasses to handle their specific message loop
|
||||
await self._handle_messages()
|
||||
|
||||
# Update heartbeat
|
||||
self._last_heartbeat = datetime.now(timezone.utc)
|
||||
|
||||
except asyncio.CancelledError:
|
||||
self.logger.debug("Message loop cancelled")
|
||||
break
|
||||
except Exception as e:
|
||||
self._stats['errors'] += 1
|
||||
self._stats['last_error'] = str(e)
|
||||
self.logger.error(f"Error in message loop: {e}")
|
||||
|
||||
# Attempt reconnection if connection lost
|
||||
if not await self._handle_connection_error():
|
||||
break
|
||||
|
||||
await asyncio.sleep(1) # Brief pause before retrying
|
||||
|
||||
async def _health_monitor(self) -> None:
|
||||
"""Monitor collector health and restart if needed."""
|
||||
self.logger.debug("Starting health monitor")
|
||||
|
||||
while self._running and self.auto_restart:
|
||||
try:
|
||||
await asyncio.sleep(self.health_check_interval)
|
||||
|
||||
# Check if we should be running but aren't
|
||||
if self._should_be_running and not self._running:
|
||||
self.logger.warning("Collector should be running but isn't - restarting")
|
||||
await self.restart()
|
||||
continue
|
||||
|
||||
# Check heartbeat freshness
|
||||
time_since_heartbeat = datetime.now(timezone.utc) - self._last_heartbeat
|
||||
if time_since_heartbeat > timedelta(seconds=self.health_check_interval * 2):
|
||||
self.logger.warning(f"No heartbeat for {time_since_heartbeat.total_seconds():.1f}s - restarting")
|
||||
self.status = CollectorStatus.UNHEALTHY
|
||||
await self.restart()
|
||||
continue
|
||||
|
||||
# Check data freshness (if we've received data before)
|
||||
if self._last_data_received:
|
||||
time_since_data = datetime.now(timezone.utc) - self._last_data_received
|
||||
if time_since_data > self._max_silence_duration:
|
||||
self.logger.warning(f"No data received for {time_since_data.total_seconds():.1f}s - restarting")
|
||||
self.status = CollectorStatus.UNHEALTHY
|
||||
await self.restart()
|
||||
continue
|
||||
|
||||
# Check if status indicates failure
|
||||
if self.status in [CollectorStatus.ERROR, CollectorStatus.UNHEALTHY]:
|
||||
self.logger.warning(f"Collector in {self.status.value} status - restarting")
|
||||
await self.restart()
|
||||
continue
|
||||
|
||||
except asyncio.CancelledError:
|
||||
self.logger.debug("Health monitor cancelled")
|
||||
break
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error in health monitor: {e}")
|
||||
await asyncio.sleep(self.health_check_interval)
|
||||
|
||||
@abstractmethod
|
||||
async def _handle_messages(self) -> None:
|
||||
"""
|
||||
Handle incoming messages from the data source.
|
||||
This method should be implemented by subclasses to handle their specific message format.
|
||||
"""
|
||||
pass
|
||||
|
||||
async def _handle_connection_error(self) -> bool:
|
||||
"""
|
||||
Handle connection errors and attempt reconnection.
|
||||
|
||||
Returns:
|
||||
True if reconnection successful, False if max attempts exceeded
|
||||
"""
|
||||
if self._reconnect_attempts >= self._max_reconnect_attempts:
|
||||
self.logger.error(f"Max reconnection attempts ({self._max_reconnect_attempts}) exceeded")
|
||||
self.status = CollectorStatus.ERROR
|
||||
return False
|
||||
|
||||
self._reconnect_attempts += 1
|
||||
self.status = CollectorStatus.RECONNECTING
|
||||
|
||||
self.logger.warning(f"Connection lost. Attempting reconnection {self._reconnect_attempts}/{self._max_reconnect_attempts}")
|
||||
|
||||
await asyncio.sleep(self._reconnect_delay)
|
||||
|
||||
try:
|
||||
if await self.connect():
|
||||
if await self.subscribe_to_data(list(self.symbols), self.data_types):
|
||||
self.status = CollectorStatus.RUNNING
|
||||
self._reconnect_attempts = 0
|
||||
self._stats['connection_uptime'] = datetime.now(timezone.utc)
|
||||
self.logger.info("Reconnection successful")
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
self._stats['last_error'] = str(e)
|
||||
self.logger.error(f"Reconnection attempt failed: {e}")
|
||||
return False
|
||||
|
||||
def add_data_callback(self, data_type: DataType, callback: Callable[[MarketDataPoint], None]) -> None:
|
||||
"""
|
||||
Add a callback function to be called when data of specified type is received.
|
||||
|
||||
Args:
|
||||
data_type: Type of data to register callback for
|
||||
callback: Function to call with MarketDataPoint data
|
||||
"""
|
||||
self._data_callbacks[data_type].append(callback)
|
||||
self.logger.debug(f"Added callback for {data_type.value} data")
|
||||
|
||||
def remove_data_callback(self, data_type: DataType, callback: Callable[[MarketDataPoint], None]) -> None:
|
||||
"""
|
||||
Remove a data callback.
|
||||
|
||||
Args:
|
||||
data_type: Type of data to remove callback for
|
||||
callback: Callback function to remove
|
||||
"""
|
||||
if callback in self._data_callbacks[data_type]:
|
||||
self._data_callbacks[data_type].remove(callback)
|
||||
self.logger.debug(f"Removed callback for {data_type.value} data")
|
||||
|
||||
async def _notify_callbacks(self, data_point: MarketDataPoint) -> None:
|
||||
"""
|
||||
Notify all registered callbacks for the data type.
|
||||
|
||||
Args:
|
||||
data_point: Market data to send to callbacks
|
||||
"""
|
||||
# Update data received timestamp
|
||||
self._last_data_received = datetime.now(timezone.utc)
|
||||
self._stats['last_message_time'] = self._last_data_received
|
||||
|
||||
callbacks = self._data_callbacks.get(data_point.data_type, [])
|
||||
|
||||
for callback in callbacks:
|
||||
try:
|
||||
if asyncio.iscoroutinefunction(callback):
|
||||
await callback(data_point)
|
||||
else:
|
||||
callback(data_point)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error in data callback: {e}")
|
||||
|
||||
def get_status(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get current collector status and statistics.
|
||||
|
||||
Returns:
|
||||
Dictionary containing status information
|
||||
"""
|
||||
uptime_seconds = None
|
||||
if self._stats['connection_uptime']:
|
||||
uptime_seconds = (datetime.now(timezone.utc) - self._stats['connection_uptime']).total_seconds()
|
||||
|
||||
time_since_heartbeat = None
|
||||
if self._last_heartbeat:
|
||||
time_since_heartbeat = (datetime.now(timezone.utc) - self._last_heartbeat).total_seconds()
|
||||
|
||||
time_since_data = None
|
||||
if self._last_data_received:
|
||||
time_since_data = (datetime.now(timezone.utc) - self._last_data_received).total_seconds()
|
||||
|
||||
return {
|
||||
'exchange': self.exchange_name,
|
||||
'status': self.status.value,
|
||||
'should_be_running': self._should_be_running,
|
||||
'symbols': list(self.symbols),
|
||||
'data_types': [dt.value for dt in self.data_types],
|
||||
'auto_restart': self.auto_restart,
|
||||
'health': {
|
||||
'time_since_heartbeat': time_since_heartbeat,
|
||||
'time_since_data': time_since_data,
|
||||
'max_silence_duration': self._max_silence_duration.total_seconds()
|
||||
},
|
||||
'statistics': {
|
||||
**self._stats,
|
||||
'uptime_seconds': uptime_seconds,
|
||||
'reconnect_attempts': self._reconnect_attempts
|
||||
}
|
||||
}
|
||||
|
||||
def get_health_status(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get detailed health status for monitoring.
|
||||
|
||||
Returns:
|
||||
Dictionary containing health information
|
||||
"""
|
||||
now = datetime.now(timezone.utc)
|
||||
|
||||
is_healthy = True
|
||||
health_issues = []
|
||||
|
||||
# Check if should be running but isn't
|
||||
if self._should_be_running and not self._running:
|
||||
is_healthy = False
|
||||
health_issues.append("Should be running but is stopped")
|
||||
|
||||
# Check heartbeat
|
||||
if self._last_heartbeat:
|
||||
time_since_heartbeat = now - self._last_heartbeat
|
||||
if time_since_heartbeat > timedelta(seconds=self.health_check_interval * 2):
|
||||
is_healthy = False
|
||||
health_issues.append(f"No heartbeat for {time_since_heartbeat.total_seconds():.1f}s")
|
||||
|
||||
# Check data freshness
|
||||
if self._last_data_received:
|
||||
time_since_data = now - self._last_data_received
|
||||
if time_since_data > self._max_silence_duration:
|
||||
is_healthy = False
|
||||
health_issues.append(f"No data for {time_since_data.total_seconds():.1f}s")
|
||||
|
||||
# Check status
|
||||
if self.status in [CollectorStatus.ERROR, CollectorStatus.UNHEALTHY]:
|
||||
is_healthy = False
|
||||
health_issues.append(f"Status: {self.status.value}")
|
||||
|
||||
return {
|
||||
'is_healthy': is_healthy,
|
||||
'issues': health_issues,
|
||||
'status': self.status.value,
|
||||
'last_heartbeat': self._last_heartbeat.isoformat() if self._last_heartbeat else None,
|
||||
'last_data_received': self._last_data_received.isoformat() if self._last_data_received else None,
|
||||
'should_be_running': self._should_be_running,
|
||||
'is_running': self._running
|
||||
}
|
||||
|
||||
def add_symbol(self, symbol: str) -> None:
|
||||
"""
|
||||
Add a new symbol to collect data for.
|
||||
|
||||
Args:
|
||||
symbol: Trading symbol to add
|
||||
"""
|
||||
if symbol not in self.symbols:
|
||||
self.symbols.add(symbol)
|
||||
self.logger.info(f"Added symbol: {symbol}")
|
||||
|
||||
def remove_symbol(self, symbol: str) -> None:
|
||||
"""
|
||||
Remove a symbol from data collection.
|
||||
|
||||
Args:
|
||||
symbol: Trading symbol to remove
|
||||
"""
|
||||
if symbol in self.symbols:
|
||||
self.symbols.remove(symbol)
|
||||
self.logger.info(f"Removed symbol: {symbol}")
|
||||
|
||||
def validate_ohlcv_data(self, data: Dict[str, Any], symbol: str, timeframe: str) -> OHLCVData:
|
||||
"""
|
||||
Validate and convert raw OHLCV data to standardized format.
|
||||
|
||||
Args:
|
||||
data: Raw OHLCV data dictionary
|
||||
symbol: Trading symbol
|
||||
timeframe: Timeframe (e.g., '1m', '5m', '1h')
|
||||
|
||||
Returns:
|
||||
Validated OHLCVData object
|
||||
|
||||
Raises:
|
||||
DataValidationError: If data validation fails
|
||||
"""
|
||||
required_fields = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
|
||||
|
||||
# Check required fields
|
||||
for field in required_fields:
|
||||
if field not in data:
|
||||
raise DataValidationError(f"Missing required field: {field}")
|
||||
|
||||
try:
|
||||
# Parse timestamp
|
||||
timestamp = data['timestamp']
|
||||
if isinstance(timestamp, (int, float)):
|
||||
# Assume Unix timestamp in milliseconds
|
||||
timestamp = datetime.fromtimestamp(timestamp / 1000, tz=timezone.utc)
|
||||
elif isinstance(timestamp, str):
|
||||
timestamp = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
|
||||
elif not isinstance(timestamp, datetime):
|
||||
raise DataValidationError(f"Invalid timestamp format: {type(timestamp)}")
|
||||
|
||||
return OHLCVData(
|
||||
symbol=symbol,
|
||||
timeframe=timeframe,
|
||||
timestamp=timestamp,
|
||||
open=Decimal(str(data['open'])),
|
||||
high=Decimal(str(data['high'])),
|
||||
low=Decimal(str(data['low'])),
|
||||
close=Decimal(str(data['close'])),
|
||||
volume=Decimal(str(data['volume'])),
|
||||
trades_count=data.get('trades_count')
|
||||
)
|
||||
|
||||
except (ValueError, TypeError, KeyError) as e:
|
||||
raise DataValidationError(f"Invalid OHLCV data for {symbol}: {e}")
|
||||
|
||||
def __repr__(self) -> str:
|
||||
"""String representation of the collector."""
|
||||
return f"<{self.__class__.__name__}({self.exchange_name}, {len(self.symbols)} symbols, {self.status.value})>"
|
||||
529
data/collector_manager.py
Normal file
529
data/collector_manager.py
Normal file
@@ -0,0 +1,529 @@
|
||||
"""
|
||||
Data Collector Manager for supervising and managing multiple data collectors.
|
||||
|
||||
This module provides centralized management of data collectors with health monitoring,
|
||||
auto-recovery, and coordinated lifecycle management.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from typing import Dict, List, Optional, Any, Set
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
from utils.logger import get_logger
|
||||
from .base_collector import BaseDataCollector, CollectorStatus
|
||||
|
||||
|
||||
class ManagerStatus(Enum):
|
||||
"""Status of the collector manager."""
|
||||
STOPPED = "stopped"
|
||||
STARTING = "starting"
|
||||
RUNNING = "running"
|
||||
STOPPING = "stopping"
|
||||
ERROR = "error"
|
||||
|
||||
|
||||
@dataclass
|
||||
class CollectorConfig:
|
||||
"""Configuration for a data collector."""
|
||||
name: str
|
||||
exchange: str
|
||||
symbols: List[str]
|
||||
data_types: List[str]
|
||||
auto_restart: bool = True
|
||||
health_check_interval: float = 30.0
|
||||
enabled: bool = True
|
||||
|
||||
|
||||
class CollectorManager:
|
||||
"""
|
||||
Manages multiple data collectors with health monitoring and auto-recovery.
|
||||
|
||||
The manager is responsible for:
|
||||
- Starting and stopping collectors
|
||||
- Health monitoring and auto-restart
|
||||
- Coordinated lifecycle management
|
||||
- Status reporting and metrics
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
manager_name: str = "collector_manager",
|
||||
global_health_check_interval: float = 60.0,
|
||||
restart_delay: float = 5.0):
|
||||
"""
|
||||
Initialize the collector manager.
|
||||
|
||||
Args:
|
||||
manager_name: Name for logging
|
||||
global_health_check_interval: Seconds between global health checks
|
||||
restart_delay: Delay between restart attempts
|
||||
"""
|
||||
self.manager_name = manager_name
|
||||
self.global_health_check_interval = global_health_check_interval
|
||||
self.restart_delay = restart_delay
|
||||
|
||||
# Initialize logger
|
||||
self.logger = get_logger(f"data_collector_manager", verbose=True)
|
||||
|
||||
# Manager state
|
||||
self.status = ManagerStatus.STOPPED
|
||||
self._running = False
|
||||
self._tasks: Set[asyncio.Task] = set()
|
||||
|
||||
# Collector management
|
||||
self._collectors: Dict[str, BaseDataCollector] = {}
|
||||
self._collector_configs: Dict[str, CollectorConfig] = {}
|
||||
self._enabled_collectors: Set[str] = set()
|
||||
|
||||
# Health monitoring
|
||||
self._last_global_check = datetime.now(timezone.utc)
|
||||
self._global_health_task = None
|
||||
|
||||
# Statistics
|
||||
self._stats = {
|
||||
'total_collectors': 0,
|
||||
'running_collectors': 0,
|
||||
'failed_collectors': 0,
|
||||
'restarts_performed': 0,
|
||||
'last_global_check': None,
|
||||
'uptime_start': None
|
||||
}
|
||||
|
||||
self.logger.info(f"Initialized collector manager: {manager_name}")
|
||||
|
||||
def add_collector(self,
|
||||
collector: BaseDataCollector,
|
||||
config: Optional[CollectorConfig] = None) -> None:
|
||||
"""
|
||||
Add a collector to be managed.
|
||||
|
||||
Args:
|
||||
collector: Data collector instance
|
||||
config: Optional configuration (will create default if not provided)
|
||||
"""
|
||||
# Use a more unique name to avoid duplicates
|
||||
collector_name = f"{collector.exchange_name}_{int(time.time() * 1000000) % 1000000}"
|
||||
|
||||
# Ensure unique name
|
||||
counter = 1
|
||||
base_name = collector_name
|
||||
while collector_name in self._collectors:
|
||||
collector_name = f"{base_name}_{counter}"
|
||||
counter += 1
|
||||
|
||||
if config is None:
|
||||
config = CollectorConfig(
|
||||
name=collector_name,
|
||||
exchange=collector.exchange_name,
|
||||
symbols=list(collector.symbols),
|
||||
data_types=[dt.value for dt in collector.data_types],
|
||||
auto_restart=collector.auto_restart,
|
||||
health_check_interval=collector.health_check_interval
|
||||
)
|
||||
|
||||
self._collectors[collector_name] = collector
|
||||
self._collector_configs[collector_name] = config
|
||||
|
||||
if config.enabled:
|
||||
self._enabled_collectors.add(collector_name)
|
||||
|
||||
self._stats['total_collectors'] = len(self._collectors)
|
||||
|
||||
self.logger.info(f"Added collector: {collector_name} ({collector.exchange_name}) - "
|
||||
f"Symbols: {', '.join(collector.symbols)} - Enabled: {config.enabled}")
|
||||
|
||||
def remove_collector(self, collector_name: str) -> bool:
|
||||
"""
|
||||
Remove a collector from management.
|
||||
|
||||
Args:
|
||||
collector_name: Name of the collector to remove
|
||||
|
||||
Returns:
|
||||
True if removed successfully, False if not found
|
||||
"""
|
||||
if collector_name not in self._collectors:
|
||||
self.logger.warning(f"Collector not found: {collector_name}")
|
||||
return False
|
||||
|
||||
# Stop the collector first (only if event loop is running)
|
||||
collector = self._collectors[collector_name]
|
||||
if collector.status != CollectorStatus.STOPPED:
|
||||
try:
|
||||
# Try to create task only if event loop is running
|
||||
asyncio.create_task(collector.stop(force=True))
|
||||
except RuntimeError:
|
||||
# No event loop running, just log
|
||||
self.logger.info(f"Collector {collector_name} will be removed without stopping (no event loop)")
|
||||
|
||||
# Remove from management
|
||||
del self._collectors[collector_name]
|
||||
del self._collector_configs[collector_name]
|
||||
self._enabled_collectors.discard(collector_name)
|
||||
|
||||
self._stats['total_collectors'] = len(self._collectors)
|
||||
|
||||
self.logger.info(f"Removed collector: {collector_name}")
|
||||
return True
|
||||
|
||||
def enable_collector(self, collector_name: str) -> bool:
|
||||
"""
|
||||
Enable a collector (will be started if manager is running).
|
||||
|
||||
Args:
|
||||
collector_name: Name of the collector to enable
|
||||
|
||||
Returns:
|
||||
True if enabled successfully, False if not found
|
||||
"""
|
||||
if collector_name not in self._collectors:
|
||||
self.logger.warning(f"Collector not found: {collector_name}")
|
||||
return False
|
||||
|
||||
self._enabled_collectors.add(collector_name)
|
||||
self._collector_configs[collector_name].enabled = True
|
||||
|
||||
# Start the collector if manager is running (only if event loop is running)
|
||||
if self._running:
|
||||
try:
|
||||
asyncio.create_task(self._start_collector(collector_name))
|
||||
except RuntimeError:
|
||||
# No event loop running, will be started when manager starts
|
||||
self.logger.debug(f"Collector {collector_name} enabled but will start when manager starts")
|
||||
|
||||
self.logger.info(f"Enabled collector: {collector_name}")
|
||||
return True
|
||||
|
||||
def disable_collector(self, collector_name: str) -> bool:
|
||||
"""
|
||||
Disable a collector (will be stopped if running).
|
||||
|
||||
Args:
|
||||
collector_name: Name of the collector to disable
|
||||
|
||||
Returns:
|
||||
True if disabled successfully, False if not found
|
||||
"""
|
||||
if collector_name not in self._collectors:
|
||||
self.logger.warning(f"Collector not found: {collector_name}")
|
||||
return False
|
||||
|
||||
self._enabled_collectors.discard(collector_name)
|
||||
self._collector_configs[collector_name].enabled = False
|
||||
|
||||
# Stop the collector (only if event loop is running)
|
||||
collector = self._collectors[collector_name]
|
||||
try:
|
||||
asyncio.create_task(collector.stop(force=True))
|
||||
except RuntimeError:
|
||||
# No event loop running, just log
|
||||
self.logger.debug(f"Collector {collector_name} disabled but cannot stop (no event loop)")
|
||||
|
||||
self.logger.info(f"Disabled collector: {collector_name}")
|
||||
return True
|
||||
|
||||
async def start(self) -> bool:
|
||||
"""
|
||||
Start the collector manager and all enabled collectors.
|
||||
|
||||
Returns:
|
||||
True if started successfully, False otherwise
|
||||
"""
|
||||
if self.status in [ManagerStatus.RUNNING, ManagerStatus.STARTING]:
|
||||
self.logger.warning("Collector manager is already running or starting")
|
||||
return True
|
||||
|
||||
self.logger.info("Starting collector manager")
|
||||
self.status = ManagerStatus.STARTING
|
||||
|
||||
try:
|
||||
self._running = True
|
||||
self._stats['uptime_start'] = datetime.now(timezone.utc)
|
||||
|
||||
# Start all enabled collectors
|
||||
start_tasks = []
|
||||
for collector_name in self._enabled_collectors:
|
||||
task = asyncio.create_task(self._start_collector(collector_name))
|
||||
start_tasks.append(task)
|
||||
|
||||
# Wait for all collectors to start (with timeout)
|
||||
if start_tasks:
|
||||
try:
|
||||
await asyncio.wait_for(asyncio.gather(*start_tasks, return_exceptions=True), timeout=30.0)
|
||||
except asyncio.TimeoutError:
|
||||
self.logger.warning("Some collectors took too long to start")
|
||||
|
||||
# Start global health monitoring
|
||||
health_task = asyncio.create_task(self._global_health_monitor())
|
||||
self._tasks.add(health_task)
|
||||
health_task.add_done_callback(self._tasks.discard)
|
||||
|
||||
self.status = ManagerStatus.RUNNING
|
||||
self.logger.info(f"Collector manager started - Managing {len(self._enabled_collectors)} collectors")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
self.status = ManagerStatus.ERROR
|
||||
self.logger.error(f"Failed to start collector manager: {e}")
|
||||
return False
|
||||
|
||||
async def stop(self) -> None:
|
||||
"""Stop the collector manager and all collectors."""
|
||||
if self.status == ManagerStatus.STOPPED:
|
||||
self.logger.warning("Collector manager is already stopped")
|
||||
return
|
||||
|
||||
self.logger.info("Stopping collector manager")
|
||||
self.status = ManagerStatus.STOPPING
|
||||
self._running = False
|
||||
|
||||
try:
|
||||
# Cancel manager tasks
|
||||
for task in list(self._tasks):
|
||||
task.cancel()
|
||||
|
||||
if self._tasks:
|
||||
await asyncio.gather(*self._tasks, return_exceptions=True)
|
||||
|
||||
# Stop all collectors
|
||||
stop_tasks = []
|
||||
for collector in self._collectors.values():
|
||||
task = asyncio.create_task(collector.stop(force=True))
|
||||
stop_tasks.append(task)
|
||||
|
||||
# Wait for all collectors to stop (with timeout)
|
||||
if stop_tasks:
|
||||
try:
|
||||
await asyncio.wait_for(asyncio.gather(*stop_tasks, return_exceptions=True), timeout=30.0)
|
||||
except asyncio.TimeoutError:
|
||||
self.logger.warning("Some collectors took too long to stop")
|
||||
|
||||
self.status = ManagerStatus.STOPPED
|
||||
self.logger.info("Collector manager stopped")
|
||||
|
||||
except Exception as e:
|
||||
self.status = ManagerStatus.ERROR
|
||||
self.logger.error(f"Error stopping collector manager: {e}")
|
||||
|
||||
async def restart_collector(self, collector_name: str) -> bool:
|
||||
"""
|
||||
Restart a specific collector.
|
||||
|
||||
Args:
|
||||
collector_name: Name of the collector to restart
|
||||
|
||||
Returns:
|
||||
True if restarted successfully, False otherwise
|
||||
"""
|
||||
if collector_name not in self._collectors:
|
||||
self.logger.warning(f"Collector not found: {collector_name}")
|
||||
return False
|
||||
|
||||
collector = self._collectors[collector_name]
|
||||
self.logger.info(f"Restarting collector: {collector_name}")
|
||||
|
||||
try:
|
||||
success = await collector.restart()
|
||||
if success:
|
||||
self._stats['restarts_performed'] += 1
|
||||
self.logger.info(f"Successfully restarted collector: {collector_name}")
|
||||
else:
|
||||
self.logger.error(f"Failed to restart collector: {collector_name}")
|
||||
return success
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error restarting collector {collector_name}: {e}")
|
||||
return False
|
||||
|
||||
async def _start_collector(self, collector_name: str) -> bool:
|
||||
"""
|
||||
Start a specific collector.
|
||||
|
||||
Args:
|
||||
collector_name: Name of the collector to start
|
||||
|
||||
Returns:
|
||||
True if started successfully, False otherwise
|
||||
"""
|
||||
if collector_name not in self._collectors:
|
||||
self.logger.warning(f"Collector not found: {collector_name}")
|
||||
return False
|
||||
|
||||
collector = self._collectors[collector_name]
|
||||
|
||||
try:
|
||||
success = await collector.start()
|
||||
if success:
|
||||
self.logger.info(f"Started collector: {collector_name}")
|
||||
else:
|
||||
self.logger.error(f"Failed to start collector: {collector_name}")
|
||||
return success
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error starting collector {collector_name}: {e}")
|
||||
return False
|
||||
|
||||
async def _global_health_monitor(self) -> None:
|
||||
"""Global health monitoring for all collectors."""
|
||||
self.logger.debug("Starting global health monitor")
|
||||
|
||||
while self._running:
|
||||
try:
|
||||
await asyncio.sleep(self.global_health_check_interval)
|
||||
|
||||
self._last_global_check = datetime.now(timezone.utc)
|
||||
self._stats['last_global_check'] = self._last_global_check
|
||||
|
||||
# Check each enabled collector
|
||||
running_count = 0
|
||||
failed_count = 0
|
||||
|
||||
for collector_name in self._enabled_collectors:
|
||||
collector = self._collectors[collector_name]
|
||||
health_status = collector.get_health_status()
|
||||
|
||||
if health_status['is_healthy'] and collector.status == CollectorStatus.RUNNING:
|
||||
running_count += 1
|
||||
elif not health_status['is_healthy']:
|
||||
failed_count += 1
|
||||
self.logger.warning(f"Collector {collector_name} is unhealthy: {health_status['issues']}")
|
||||
|
||||
# Auto-restart if needed and not already restarting
|
||||
if (collector.auto_restart and
|
||||
collector.status not in [CollectorStatus.STARTING, CollectorStatus.STOPPING]):
|
||||
self.logger.info(f"Auto-restarting unhealthy collector: {collector_name}")
|
||||
asyncio.create_task(self.restart_collector(collector_name))
|
||||
|
||||
# Update global statistics
|
||||
self._stats['running_collectors'] = running_count
|
||||
self._stats['failed_collectors'] = failed_count
|
||||
|
||||
self.logger.debug(f"Health check complete - Running: {running_count}, Failed: {failed_count}")
|
||||
|
||||
except asyncio.CancelledError:
|
||||
self.logger.debug("Global health monitor cancelled")
|
||||
break
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error in global health monitor: {e}")
|
||||
await asyncio.sleep(self.global_health_check_interval)
|
||||
|
||||
def get_status(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get manager status and statistics.
|
||||
|
||||
Returns:
|
||||
Dictionary containing status information
|
||||
"""
|
||||
uptime_seconds = None
|
||||
if self._stats['uptime_start']:
|
||||
uptime_seconds = (datetime.now(timezone.utc) - self._stats['uptime_start']).total_seconds()
|
||||
|
||||
# Get individual collector statuses
|
||||
collector_statuses = {}
|
||||
for name, collector in self._collectors.items():
|
||||
collector_statuses[name] = {
|
||||
'status': collector.status.value,
|
||||
'enabled': name in self._enabled_collectors,
|
||||
'health': collector.get_health_status()
|
||||
}
|
||||
|
||||
return {
|
||||
'manager_status': self.status.value,
|
||||
'uptime_seconds': uptime_seconds,
|
||||
'statistics': self._stats,
|
||||
'collectors': collector_statuses,
|
||||
'enabled_collectors': list(self._enabled_collectors),
|
||||
'total_collectors': len(self._collectors)
|
||||
}
|
||||
|
||||
def get_collector_status(self, collector_name: str) -> Optional[Dict[str, Any]]:
|
||||
"""
|
||||
Get status for a specific collector.
|
||||
|
||||
Args:
|
||||
collector_name: Name of the collector
|
||||
|
||||
Returns:
|
||||
Collector status dict or None if not found
|
||||
"""
|
||||
if collector_name not in self._collectors:
|
||||
return None
|
||||
|
||||
collector = self._collectors[collector_name]
|
||||
return {
|
||||
'name': collector_name,
|
||||
'config': self._collector_configs[collector_name].__dict__,
|
||||
'status': collector.get_status(),
|
||||
'health': collector.get_health_status()
|
||||
}
|
||||
|
||||
def list_collectors(self) -> List[str]:
|
||||
"""
|
||||
List all managed collector names.
|
||||
|
||||
Returns:
|
||||
List of collector names
|
||||
"""
|
||||
return list(self._collectors.keys())
|
||||
|
||||
def get_running_collectors(self) -> List[str]:
|
||||
"""
|
||||
Get names of currently running collectors.
|
||||
|
||||
Returns:
|
||||
List of running collector names
|
||||
"""
|
||||
running = []
|
||||
for name, collector in self._collectors.items():
|
||||
if collector.status == CollectorStatus.RUNNING:
|
||||
running.append(name)
|
||||
return running
|
||||
|
||||
def get_failed_collectors(self) -> List[str]:
|
||||
"""
|
||||
Get names of failed or unhealthy collectors.
|
||||
|
||||
Returns:
|
||||
List of failed collector names
|
||||
"""
|
||||
failed = []
|
||||
for name, collector in self._collectors.items():
|
||||
health_status = collector.get_health_status()
|
||||
if not health_status['is_healthy']:
|
||||
failed.append(name)
|
||||
return failed
|
||||
|
||||
async def restart_all_collectors(self) -> Dict[str, bool]:
|
||||
"""
|
||||
Restart all enabled collectors.
|
||||
|
||||
Returns:
|
||||
Dictionary mapping collector names to restart success status
|
||||
"""
|
||||
self.logger.info("Restarting all enabled collectors")
|
||||
|
||||
results = {}
|
||||
restart_tasks = []
|
||||
|
||||
for collector_name in self._enabled_collectors:
|
||||
task = asyncio.create_task(self.restart_collector(collector_name))
|
||||
restart_tasks.append((collector_name, task))
|
||||
|
||||
# Wait for all restarts to complete
|
||||
for collector_name, task in restart_tasks:
|
||||
try:
|
||||
results[collector_name] = await task
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error restarting {collector_name}: {e}")
|
||||
results[collector_name] = False
|
||||
|
||||
successful_restarts = sum(1 for success in results.values() if success)
|
||||
self.logger.info(f"Restart complete - {successful_restarts}/{len(results)} collectors restarted successfully")
|
||||
|
||||
return results
|
||||
|
||||
def __repr__(self) -> str:
|
||||
"""String representation of the manager."""
|
||||
return f"<CollectorManager({self.manager_name}, {len(self._collectors)} collectors, {self.status.value})>"
|
||||
Reference in New Issue
Block a user