Implement enhanced data collection system with health monitoring and management

- Introduced `BaseDataCollector` and `CollectorManager` classes for standardized data collection and centralized management. - Added health monitoring features, including auto-restart capabilities and detailed status reporting for collectors. - Updated `env.template` to include new logging and health check configurations. - Enhanced documentation in `docs/data_collectors.md` to provide comprehensive guidance on the new data collection system. - Added unit tests for `BaseDataCollector` and `CollectorManager` to ensure reliability and functionality.
2025-05-30 20:33:56 +08:00
parent b7263b023f
commit 4936e5cd73
13 changed files with 4036 additions and 1 deletions
--- a/data/init.py
+++ b/data/init.py
@@ -0,0 +1,25 @@
+"""
+Data collection and processing package for the Crypto Trading Bot Platform.
+
+This package contains modules for collecting market data from various exchanges,
+processing and validating the data, and storing it in the database.
+"""
+
+from .base_collector import (
+    BaseDataCollector, DataCollectorError, DataValidationError, 
+    DataType, CollectorStatus, MarketDataPoint, OHLCVData
+)
+from .collector_manager import CollectorManager, ManagerStatus, CollectorConfig
+
+# Names re-exported as the public API of this package; each one is imported
+# from .base_collector or .collector_manager by the statements above.
+__all__ = [
+    'BaseDataCollector',
+    'DataCollectorError', 
+    'DataValidationError',
+    'DataType',
+    'CollectorStatus',
+    'MarketDataPoint',
+    'OHLCVData',
+    'CollectorManager',
+    'ManagerStatus',
+    'CollectorConfig'
+] 
--- a/data/base_collector.py
+++ b/data/base_collector.py
@@ -0,0 +1,667 @@
+"""
+Abstract base class for data collectors.
+
+This module provides a common interface for all data collection implementations,
+ensuring consistency across different exchange connectors and data sources.
+"""
+
+import asyncio
+from abc import ABC, abstractmethod
+from datetime import datetime, timezone, timedelta
+from decimal import Decimal
+from typing import Dict, List, Optional, Any, Callable, Set
+from dataclasses import dataclass
+from enum import Enum
+
+from utils.logger import get_logger
+
+
+class DataType(Enum):
+    """
+    Types of data that can be collected.
+
+    Each member's value is the lowercase string form of its name. Members are
+    used as keys of the per-type callback registry in BaseDataCollector, and
+    CANDLE is the default when a collector is created without data_types.
+    """
+    TICKER = "ticker"
+    TRADE = "trade"
+    ORDERBOOK = "orderbook"
+    CANDLE = "candle"
+    BALANCE = "balance"
+
+
+class CollectorStatus(Enum):
+    """
+    Status of the data collector.
+
+    The value of each member is the string reported by
+    BaseDataCollector.get_status() and get_health_status().
+    """
+    # Initial state; also set by stop() once shutdown completed without error
+    STOPPED = "stopped"
+    # Set by start() before connecting to the data source
+    STARTING = "starting"
+    # Set by start() after connect + subscribe succeed, and after a
+    # successful reconnection
+    RUNNING = "running"
+    # Set by stop() while tasks are cancelled and the connection is closed
+    STOPPING = "stopping"
+    # Set when start()/stop() fail or reconnection attempts are exhausted
+    ERROR = "error"
+    # Set by _handle_connection_error() while a reconnection attempt runs
+    RECONNECTING = "reconnecting"
+    # Set by the health monitor when the heartbeat or data stream is stale
+    UNHEALTHY = "unhealthy"  # Added for health monitoring
+
+
+@dataclass
+class MarketDataPoint:
+    """Standardized market data structure."""
+    exchange: str
+    symbol: str
+    timestamp: datetime
+    data_type: DataType
+    data: Dict[str, Any]
+    
+    def __post_init__(self):
+        """Validate data after initialization."""
+        if not self.timestamp.tzinfo:
+            self.timestamp = self.timestamp.replace(tzinfo=timezone.utc)
+
+
+@dataclass
+class OHLCVData:
+    """OHLCV (Open, High, Low, Close, Volume) data structure."""
+    symbol: str
+    timeframe: str
+    timestamp: datetime
+    open: Decimal
+    high: Decimal
+    low: Decimal
+    close: Decimal
+    volume: Decimal
+    trades_count: Optional[int] = None
+    
+    def __post_init__(self):
+        """Validate OHLCV data after initialization."""
+        if not self.timestamp.tzinfo:
+            self.timestamp = self.timestamp.replace(tzinfo=timezone.utc)
+        
+        # Validate price data
+        if not all(isinstance(price, (Decimal, float, int)) for price in [self.open, self.high, self.low, self.close]):
+            raise DataValidationError("All OHLCV prices must be numeric")
+        
+        if not isinstance(self.volume, (Decimal, float, int)):
+            raise DataValidationError("Volume must be numeric")
+        
+        # Convert to Decimal for precision
+        self.open = Decimal(str(self.open))
+        self.high = Decimal(str(self.high))
+        self.low = Decimal(str(self.low))
+        self.close = Decimal(str(self.close))
+        self.volume = Decimal(str(self.volume))
+        
+        # Validate price relationships
+        if not (self.low <= self.open <= self.high and self.low <= self.close <= self.high):
+            raise DataValidationError(f"Invalid OHLCV data: prices don't match expected relationships for {self.symbol}")
+
+
+class DataCollectorError(Exception):
+    """
+    Base exception for data collector errors.
+
+    DataValidationError and this module's ConnectionError derive from it, so
+    catching DataCollectorError catches both.
+    """
+    pass
+
+
+class DataValidationError(DataCollectorError):
+    """
+    Exception raised when data validation fails.
+
+    Raised by OHLCVData.__post_init__ and
+    BaseDataCollector.validate_ohlcv_data.
+    """
+    pass
+
+
+class ConnectionError(DataCollectorError):
+    """
+    Exception raised when connection to data source fails.
+
+    NOTE(review): this name shadows Python's builtin ConnectionError inside
+    this module; it derives from DataCollectorError, not from OSError, so an
+    `except ConnectionError` written here will not catch socket-level errors.
+    No code visible in this module raises it - presumably concrete collectors
+    do; confirm before relying on it.
+    """
+    pass
+
+
+class BaseDataCollector(ABC):
+    """
+    Abstract base class for all data collectors.
+    
+    This class defines the interface that all data collection implementations
+    must follow, providing consistency across different exchanges and data sources.
+    """
+    
+    def __init__(self, 
+                 exchange_name: str,
+                 symbols: List[str],
+                 data_types: Optional[List[DataType]] = None,
+                 component_name: Optional[str] = None,
+                 auto_restart: bool = True,
+                 health_check_interval: float = 30.0):
+        """
+        Initialize the base data collector.
+        
+        Args:
+            exchange_name: Name of the exchange (e.g., 'okx', 'binance')
+            symbols: List of trading symbols to collect data for
+            data_types: Types of data to collect (default: [DataType.CANDLE])
+            component_name: Name for logging (default: based on exchange_name)
+            auto_restart: Enable automatic restart on failures (default: True)
+            health_check_interval: Seconds between health checks (default: 30.0)
+        """
+        self.exchange_name = exchange_name.lower()
+        self.symbols = set(symbols)
+        self.data_types = data_types or [DataType.CANDLE]
+        self.auto_restart = auto_restart
+        self.health_check_interval = health_check_interval
+        
+        # Initialize logger
+        component = component_name or f"{self.exchange_name}_collector"
+        self.logger = get_logger(component, verbose=True)
+        
+        # Collector state
+        self.status = CollectorStatus.STOPPED
+        self._running = False
+        self._should_be_running = False  # Track desired state
+        self._tasks: Set[asyncio.Task] = set()
+        
+        # Data callbacks
+        self._data_callbacks: Dict[DataType, List[Callable]] = {
+            data_type: [] for data_type in DataType
+        }
+        
+        # Connection management
+        self._connection = None
+        self._reconnect_attempts = 0
+        self._max_reconnect_attempts = 5
+        self._reconnect_delay = 5.0  # seconds
+        
+        # Health monitoring
+        self._last_heartbeat = datetime.now(timezone.utc)
+        self._last_data_received = None
+        self._health_check_task = None
+        self._max_silence_duration = timedelta(minutes=5)  # Max time without data before unhealthy
+        
+        # Statistics
+        self._stats = {
+            'messages_received': 0,
+            'messages_processed': 0,
+            'errors': 0,
+            'restarts': 0,
+            'last_message_time': None,
+            'connection_uptime': None,
+            'last_error': None,
+            'last_restart_time': None
+        }
+        
+        self.logger.info(f"Initialized {self.exchange_name} data collector for symbols: {', '.join(symbols)}")
+    
+    @abstractmethod
+    async def connect(self) -> bool:
+        """
+        Establish connection to the data source.
+
+        Awaited by start() and by _handle_connection_error(). A falsy result
+        makes start() set the ERROR status and return False.
+
+        Returns:
+            True if connection successful, False otherwise
+        """
+        pass
+    
+    @abstractmethod
+    async def disconnect(self) -> None:
+        """
+        Disconnect from the data source.
+
+        Awaited by stop(), and by start() when subscribing fails or an
+        exception is raised during startup.
+        """
+        pass
+    
+    @abstractmethod
+    async def subscribe_to_data(self, symbols: List[str], data_types: List[DataType]) -> bool:
+        """
+        Subscribe to data streams for specified symbols and data types.
+
+        start() and _handle_connection_error() call this with
+        list(self.symbols) and self.data_types after a successful connect().
+
+        Args:
+            symbols: Trading symbols to subscribe to
+            data_types: Types of data to subscribe to
+            
+        Returns:
+            True if subscription successful, False otherwise
+        """
+        pass
+    
+    @abstractmethod
+    async def unsubscribe_from_data(self, symbols: List[str], data_types: List[DataType]) -> bool:
+        """
+        Unsubscribe from data streams.
+
+        stop() awaits this with list(self.symbols) and self.data_types before
+        disconnecting; its return value is not inspected there.
+
+        Args:
+            symbols: Trading symbols to unsubscribe from
+            data_types: Types of data to unsubscribe from
+            
+        Returns:
+            True if unsubscription successful, False otherwise
+        """
+        pass
+    
+    @abstractmethod
+    async def _process_message(self, message: Any) -> Optional[MarketDataPoint]:
+        """
+        Process incoming message from the data source.
+
+        The base class never calls this method itself; presumably a subclass's
+        _handle_messages() invokes it per raw message - confirm against the
+        concrete collectors.
+
+        Args:
+            message: Raw message from the data source
+            
+        Returns:
+            Processed MarketDataPoint or None if message should be ignored
+        """
+        pass
+    
+    async def start(self) -> bool:
+        """
+        Start the data collector.
+
+        Connects, subscribes to list(self.symbols) / self.data_types, then
+        launches the message loop as a background task. The health monitor
+        task is launched only when auto_restart is enabled. Calling start()
+        while the status is RUNNING or STARTING is a no-op that returns True.
+
+        On any failure the status becomes ERROR and False is returned;
+        _should_be_running stays True in that case.
+
+        Returns:
+            True if started successfully, False otherwise
+        """
+        if self.status in [CollectorStatus.RUNNING, CollectorStatus.STARTING]:
+            self.logger.warning("Data collector is already running or starting")
+            return True
+
+        self.logger.info(f"Starting {self.exchange_name} data collector")
+        self.status = CollectorStatus.STARTING
+        # Record the desired state before attempting the connection
+        self._should_be_running = True
+
+        try:
+            # Connect to data source
+            if not await self.connect():
+                self.status = CollectorStatus.ERROR
+                self.logger.error("Failed to connect to data source")
+                return False
+
+            # Subscribe to data streams
+            if not await self.subscribe_to_data(list(self.symbols), self.data_types):
+                self.status = CollectorStatus.ERROR
+                self.logger.error("Failed to subscribe to data streams")
+                # The connection was opened above, so close it again
+                await self.disconnect()
+                return False
+
+            # Start message processing
+            self._running = True
+            self.status = CollectorStatus.RUNNING
+            # 'connection_uptime' holds the connect time; get_status() turns it
+            # into elapsed seconds
+            self._stats['connection_uptime'] = datetime.now(timezone.utc)
+            self._last_heartbeat = datetime.now(timezone.utc)
+
+            # Create background task for message processing; the done callback
+            # drops the task from the tracking set once it finishes
+            message_task = asyncio.create_task(self._message_loop())
+            self._tasks.add(message_task)
+            message_task.add_done_callback(self._tasks.discard)
+
+            # Start health monitoring
+            if self.auto_restart:
+                health_task = asyncio.create_task(self._health_monitor())
+                self._tasks.add(health_task)
+                health_task.add_done_callback(self._tasks.discard)
+
+            self.logger.info(f"{self.exchange_name} data collector started successfully")
+            return True
+
+        except Exception as e:
+            self.status = CollectorStatus.ERROR
+            self._stats['last_error'] = str(e)
+            self.logger.error(f"Failed to start data collector: {e}")
+            # NOTE(review): an exception raised by disconnect() here propagates
+            # to the caller instead of producing a False return
+            await self.disconnect()
+            return False
+    
+    async def stop(self, force: bool = False) -> None:
+        """
+        Stop the data collector.
+
+        Cancels every tracked background task, waits for them to finish, then
+        unsubscribes and disconnects. Does nothing (apart from a warning) when
+        the status is already STOPPED. Any exception raised during shutdown is
+        logged and leaves the status at ERROR instead of STOPPED.
+
+        NOTE(review): every task in self._tasks is cancelled, so if stop() is
+        awaited from inside one of those tasks the caller cancels itself.
+
+        Args:
+            force: If True, don't restart automatically even if auto_restart is enabled
+        """
+        if self.status == CollectorStatus.STOPPED:
+            self.logger.warning("Data collector is already stopped")
+            return
+
+        self.logger.info(f"Stopping {self.exchange_name} data collector")
+        self.status = CollectorStatus.STOPPING
+        self._running = False
+
+        # Only a forced stop clears the desired-state flag
+        if force:
+            self._should_be_running = False
+
+        try:
+            # Cancel all tasks (iterate over a copy: done callbacks mutate the set)
+            for task in list(self._tasks):
+                task.cancel()
+
+            # Wait for tasks to complete; return_exceptions=True keeps their
+            # CancelledError from propagating out of gather()
+            if self._tasks:
+                await asyncio.gather(*self._tasks, return_exceptions=True)
+
+            # Unsubscribe and disconnect
+            await self.unsubscribe_from_data(list(self.symbols), self.data_types)
+            await self.disconnect()
+
+            self.status = CollectorStatus.STOPPED
+            self.logger.info(f"{self.exchange_name} data collector stopped")
+
+        except Exception as e:
+            self.status = CollectorStatus.ERROR
+            self._stats['last_error'] = str(e)
+            self.logger.error(f"Error stopping data collector: {e}")
+    
+    async def restart(self) -> bool:
+        """
+        Restart the data collector.
+        
+        Returns:
+            True if restart successful, False otherwise
+        """
+        self.logger.info(f"Restarting {self.exchange_name} data collector")
+        self._stats['restarts'] += 1
+        self._stats['last_restart_time'] = datetime.now(timezone.utc)
+        
+        # Stop without disabling auto-restart
+        await self.stop(force=False)
+        
+        # Wait a bit before restart
+        await asyncio.sleep(2.0)
+        
+        # Reset reconnection attempts
+        self._reconnect_attempts = 0
+        
+        # Start again
+        return await self.start()
+    
+    async def _message_loop(self) -> None:
+        """
+        Main message processing loop.
+
+        Repeatedly awaits _handle_messages() while the collector is running.
+        The heartbeat is refreshed only after _handle_messages() returns, so
+        an implementation that blocks for longer than twice
+        health_check_interval will look stale to the health monitor.
+
+        Any exception other than cancellation is counted in the statistics and
+        triggers a reconnection attempt; the loop ends when that attempt
+        fails.
+        """
+        self.logger.debug("Starting message processing loop")
+
+        while self._running:
+            try:
+                # This should be implemented by subclasses to handle their specific message loop
+                await self._handle_messages()
+
+                # Update heartbeat
+                self._last_heartbeat = datetime.now(timezone.utc)
+
+            except asyncio.CancelledError:
+                # Raised when stop() cancels this task
+                self.logger.debug("Message loop cancelled")
+                break
+            except Exception as e:
+                self._stats['errors'] += 1
+                self._stats['last_error'] = str(e)
+                self.logger.error(f"Error in message loop: {e}")
+
+                # Attempt reconnection if connection lost
+                # (every exception is treated as a connection problem here)
+                if not await self._handle_connection_error():
+                    break
+
+                await asyncio.sleep(1)  # Brief pause before retrying
+    
+    async def _health_monitor(self) -> None:
+        """Monitor collector health and restart if needed."""
+        self.logger.debug("Starting health monitor")
+        
+        while self._running and self.auto_restart:
+            try:
+                await asyncio.sleep(self.health_check_interval)
+                
+                # Check if we should be running but aren't
+                if self._should_be_running and not self._running:
+                    self.logger.warning("Collector should be running but isn't - restarting")
+                    await self.restart()
+                    continue
+                
+                # Check heartbeat freshness
+                time_since_heartbeat = datetime.now(timezone.utc) - self._last_heartbeat
+                if time_since_heartbeat > timedelta(seconds=self.health_check_interval * 2):
+                    self.logger.warning(f"No heartbeat for {time_since_heartbeat.total_seconds():.1f}s - restarting")
+                    self.status = CollectorStatus.UNHEALTHY
+                    await self.restart()
+                    continue
+                
+                # Check data freshness (if we've received data before)
+                if self._last_data_received:
+                    time_since_data = datetime.now(timezone.utc) - self._last_data_received
+                    if time_since_data > self._max_silence_duration:
+                        self.logger.warning(f"No data received for {time_since_data.total_seconds():.1f}s - restarting")
+                        self.status = CollectorStatus.UNHEALTHY
+                        await self.restart()
+                        continue
+                
+                # Check if status indicates failure
+                if self.status in [CollectorStatus.ERROR, CollectorStatus.UNHEALTHY]:
+                    self.logger.warning(f"Collector in {self.status.value} status - restarting")
+                    await self.restart()
+                    continue
+                
+            except asyncio.CancelledError:
+                self.logger.debug("Health monitor cancelled")
+                break
+            except Exception as e:
+                self.logger.error(f"Error in health monitor: {e}")
+                await asyncio.sleep(self.health_check_interval)
+    
+    @abstractmethod
+    async def _handle_messages(self) -> None:
+        """
+        Handle incoming messages from the data source.
+        This method should be implemented by subclasses to handle their specific message format.
+
+        _message_loop() awaits this repeatedly while the collector is running
+        and refreshes the heartbeat after each return. An exception raised
+        here (other than cancellation) is logged and triggers a reconnection
+        attempt.
+        """
+        pass
+    
+    async def _handle_connection_error(self) -> bool:
+        """
+        Handle connection errors and attempt reconnection.
+        
+        Returns:
+            True if reconnection successful, False if max attempts exceeded
+        """
+        if self._reconnect_attempts >= self._max_reconnect_attempts:
+            self.logger.error(f"Max reconnection attempts ({self._max_reconnect_attempts}) exceeded")
+            self.status = CollectorStatus.ERROR
+            return False
+        
+        self._reconnect_attempts += 1
+        self.status = CollectorStatus.RECONNECTING
+        
+        self.logger.warning(f"Connection lost. Attempting reconnection {self._reconnect_attempts}/{self._max_reconnect_attempts}")
+        
+        await asyncio.sleep(self._reconnect_delay)
+        
+        try:
+            if await self.connect():
+                if await self.subscribe_to_data(list(self.symbols), self.data_types):
+                    self.status = CollectorStatus.RUNNING
+                    self._reconnect_attempts = 0
+                    self._stats['connection_uptime'] = datetime.now(timezone.utc)
+                    self.logger.info("Reconnection successful")
+                    return True
+            
+            return False
+            
+        except Exception as e:
+            self._stats['last_error'] = str(e)
+            self.logger.error(f"Reconnection attempt failed: {e}")
+            return False
+    
+    def add_data_callback(self, data_type: DataType, callback: Callable[[MarketDataPoint], None]) -> None:
+        """
+        Add a callback function to be called when data of specified type is received.
+        
+        Args:
+            data_type: Type of data to register callback for
+            callback: Function to call with MarketDataPoint data
+        """
+        self._data_callbacks[data_type].append(callback)
+        self.logger.debug(f"Added callback for {data_type.value} data")
+    
+    def remove_data_callback(self, data_type: DataType, callback: Callable[[MarketDataPoint], None]) -> None:
+        """
+        Remove a data callback.
+        
+        Args:
+            data_type: Type of data to remove callback for
+            callback: Callback function to remove
+        """
+        if callback in self._data_callbacks[data_type]:
+            self._data_callbacks[data_type].remove(callback)
+            self.logger.debug(f"Removed callback for {data_type.value} data")
+    
+    async def _notify_callbacks(self, data_point: MarketDataPoint) -> None:
+        """
+        Notify all registered callbacks for the data type.
+        
+        Args:
+            data_point: Market data to send to callbacks
+        """
+        # Update data received timestamp
+        self._last_data_received = datetime.now(timezone.utc)
+        self._stats['last_message_time'] = self._last_data_received
+        
+        callbacks = self._data_callbacks.get(data_point.data_type, [])
+        
+        for callback in callbacks:
+            try:
+                if asyncio.iscoroutinefunction(callback):
+                    await callback(data_point)
+                else:
+                    callback(data_point)
+            except Exception as e:
+                self.logger.error(f"Error in data callback: {e}")
+    
+    def get_status(self) -> Dict[str, Any]:
+        """
+        Get current collector status and statistics.
+        
+        Returns:
+            Dictionary containing status information
+        """
+        uptime_seconds = None
+        if self._stats['connection_uptime']:
+            uptime_seconds = (datetime.now(timezone.utc) - self._stats['connection_uptime']).total_seconds()
+        
+        time_since_heartbeat = None
+        if self._last_heartbeat:
+            time_since_heartbeat = (datetime.now(timezone.utc) - self._last_heartbeat).total_seconds()
+        
+        time_since_data = None
+        if self._last_data_received:
+            time_since_data = (datetime.now(timezone.utc) - self._last_data_received).total_seconds()
+        
+        return {
+            'exchange': self.exchange_name,
+            'status': self.status.value,
+            'should_be_running': self._should_be_running,
+            'symbols': list(self.symbols),
+            'data_types': [dt.value for dt in self.data_types],
+            'auto_restart': self.auto_restart,
+            'health': {
+                'time_since_heartbeat': time_since_heartbeat,
+                'time_since_data': time_since_data,
+                'max_silence_duration': self._max_silence_duration.total_seconds()
+            },
+            'statistics': {
+                **self._stats,
+                'uptime_seconds': uptime_seconds,
+                'reconnect_attempts': self._reconnect_attempts
+            }
+        }
+    
+    def get_health_status(self) -> Dict[str, Any]:
+        """
+        Get detailed health status for monitoring.
+        
+        Returns:
+            Dictionary containing health information
+        """
+        now = datetime.now(timezone.utc)
+        
+        is_healthy = True
+        health_issues = []
+        
+        # Check if should be running but isn't
+        if self._should_be_running and not self._running:
+            is_healthy = False
+            health_issues.append("Should be running but is stopped")
+        
+        # Check heartbeat
+        if self._last_heartbeat:
+            time_since_heartbeat = now - self._last_heartbeat
+            if time_since_heartbeat > timedelta(seconds=self.health_check_interval * 2):
+                is_healthy = False
+                health_issues.append(f"No heartbeat for {time_since_heartbeat.total_seconds():.1f}s")
+        
+        # Check data freshness
+        if self._last_data_received:
+            time_since_data = now - self._last_data_received
+            if time_since_data > self._max_silence_duration:
+                is_healthy = False
+                health_issues.append(f"No data for {time_since_data.total_seconds():.1f}s")
+        
+        # Check status
+        if self.status in [CollectorStatus.ERROR, CollectorStatus.UNHEALTHY]:
+            is_healthy = False
+            health_issues.append(f"Status: {self.status.value}")
+        
+        return {
+            'is_healthy': is_healthy,
+            'issues': health_issues,
+            'status': self.status.value,
+            'last_heartbeat': self._last_heartbeat.isoformat() if self._last_heartbeat else None,
+            'last_data_received': self._last_data_received.isoformat() if self._last_data_received else None,
+            'should_be_running': self._should_be_running,
+            'is_running': self._running
+        }
+    
+    def add_symbol(self, symbol: str) -> None:
+        """
+        Add a new symbol to collect data for.
+        
+        Args:
+            symbol: Trading symbol to add
+        """
+        if symbol not in self.symbols:
+            self.symbols.add(symbol)
+            self.logger.info(f"Added symbol: {symbol}")
+    
+    def remove_symbol(self, symbol: str) -> None:
+        """
+        Remove a symbol from data collection.
+        
+        Args:
+            symbol: Trading symbol to remove
+        """
+        if symbol in self.symbols:
+            self.symbols.remove(symbol)
+            self.logger.info(f"Removed symbol: {symbol}")
+    
+    def validate_ohlcv_data(self, data: Dict[str, Any], symbol: str, timeframe: str) -> OHLCVData:
+        """
+        Validate and convert raw OHLCV data to standardized format.
+        
+        Args:
+            data: Raw OHLCV data dictionary
+            symbol: Trading symbol
+            timeframe: Timeframe (e.g., '1m', '5m', '1h')
+            
+        Returns:
+            Validated OHLCVData object
+            
+        Raises:
+            DataValidationError: If data validation fails
+        """
+        required_fields = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
+        
+        # Check required fields
+        for field in required_fields:
+            if field not in data:
+                raise DataValidationError(f"Missing required field: {field}")
+        
+        try:
+            # Parse timestamp
+            timestamp = data['timestamp']
+            if isinstance(timestamp, (int, float)):
+                # Assume Unix timestamp in milliseconds
+                timestamp = datetime.fromtimestamp(timestamp / 1000, tz=timezone.utc)
+            elif isinstance(timestamp, str):
+                timestamp = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
+            elif not isinstance(timestamp, datetime):
+                raise DataValidationError(f"Invalid timestamp format: {type(timestamp)}")
+            
+            return OHLCVData(
+                symbol=symbol,
+                timeframe=timeframe,
+                timestamp=timestamp,
+                open=Decimal(str(data['open'])),
+                high=Decimal(str(data['high'])),
+                low=Decimal(str(data['low'])),
+                close=Decimal(str(data['close'])),
+                volume=Decimal(str(data['volume'])),
+                trades_count=data.get('trades_count')
+            )
+            
+        except (ValueError, TypeError, KeyError) as e:
+            raise DataValidationError(f"Invalid OHLCV data for {symbol}: {e}")
+    
+    def __repr__(self) -> str:
+        """String representation of the collector."""
+        return f"<{self.__class__.__name__}({self.exchange_name}, {len(self.symbols)} symbols, {self.status.value})>" 
--- a/data/collector_manager.py
+++ b/data/collector_manager.py
@@ -0,0 +1,529 @@
+"""
+Data Collector Manager for supervising and managing multiple data collectors.
+
+This module provides centralized management of data collectors with health monitoring,
+auto-recovery, and coordinated lifecycle management.
+"""
+
+import asyncio
+import time
+from datetime import datetime, timezone, timedelta
+from typing import Dict, List, Optional, Any, Set
+from dataclasses import dataclass
+from enum import Enum
+
+from utils.logger import get_logger
+from .base_collector import BaseDataCollector, CollectorStatus
+
+
+class ManagerStatus(Enum):
+    """
+    Lifecycle states of the collector manager.
+
+    The string values are what ``CollectorManager.get_status()`` reports under
+    ``'manager_status'`` and what ``CollectorManager.__repr__`` prints.
+    """
+    STOPPED = "stopped"    # Initial state; also set once stop() has finished
+    STARTING = "starting"  # Set at the beginning of start(), before collectors are launched
+    RUNNING = "running"    # Set once start() has launched the global health monitor
+    STOPPING = "stopping"  # Set at the beginning of stop()
+    ERROR = "error"        # Set when start() or stop() catches an unexpected exception
+
+
+@dataclass
+class CollectorConfig:
+    """
+    Configuration record kept by the manager alongside each data collector.
+
+    ``CollectorManager.add_collector()`` builds one from the collector's own
+    attributes when the caller does not supply a config.
+    """
+    # Identifier for the collector; the auto-built config uses the generated registry name
+    name: str
+    # Exchange name (the auto-built config copies collector.exchange_name)
+    exchange: str
+    # Trading symbols handled by the collector
+    symbols: List[str]
+    # Data type identifiers as strings (the auto-built config stores each DataType's .value)
+    data_types: List[str]
+    # NOTE(review): the manager's health monitor reads collector.auto_restart,
+    # not this field - confirm which one is meant to be authoritative
+    auto_restart: bool = True
+    # Seconds between health checks; NOTE(review): not read by the manager in
+    # this module - presumably mirrors the collector's own interval, confirm
+    health_check_interval: float = 30.0
+    # When True, add_collector() registers the collector as enabled;
+    # enable_collector()/disable_collector() update this flag afterwards
+    enabled: bool = True
+
+
+class CollectorManager:
+    """
+    Manages multiple data collectors with health monitoring and auto-recovery.
+    
+    The manager is responsible for:
+    - Starting and stopping collectors
+    - Health monitoring and auto-restart
+    - Coordinated lifecycle management
+    - Status reporting and metrics
+    """
+    
+    def __init__(self,
+                 manager_name: str = "collector_manager",
+                 global_health_check_interval: float = 60.0,
+                 restart_delay: float = 5.0):
+        """
+        Initialize the collector manager.
+        
+        Args:
+            manager_name: Name for logging
+            global_health_check_interval: Seconds between global health checks
+            restart_delay: Delay between restart attempts
+        """
+        self.manager_name = manager_name
+        self.global_health_check_interval = global_health_check_interval
+        self.restart_delay = restart_delay
+        
+        # Initialize logger
+        self.logger = get_logger(f"data_collector_manager", verbose=True)
+        
+        # Manager state
+        self.status = ManagerStatus.STOPPED
+        self._running = False
+        self._tasks: Set[asyncio.Task] = set()
+        
+        # Collector management
+        self._collectors: Dict[str, BaseDataCollector] = {}
+        self._collector_configs: Dict[str, CollectorConfig] = {}
+        self._enabled_collectors: Set[str] = set()
+        
+        # Health monitoring
+        self._last_global_check = datetime.now(timezone.utc)
+        self._global_health_task = None
+        
+        # Statistics
+        self._stats = {
+            'total_collectors': 0,
+            'running_collectors': 0,
+            'failed_collectors': 0,
+            'restarts_performed': 0,
+            'last_global_check': None,
+            'uptime_start': None
+        }
+        
+        self.logger.info(f"Initialized collector manager: {manager_name}")
+    
+    def add_collector(self, 
+                     collector: BaseDataCollector, 
+                     config: Optional[CollectorConfig] = None) -> None:
+        """
+        Add a collector to be managed.
+        
+        Args:
+            collector: Data collector instance
+            config: Optional configuration (will create default if not provided)
+        """
+        # Use a more unique name to avoid duplicates
+        collector_name = f"{collector.exchange_name}_{int(time.time() * 1000000) % 1000000}"
+        
+        # Ensure unique name
+        counter = 1
+        base_name = collector_name
+        while collector_name in self._collectors:
+            collector_name = f"{base_name}_{counter}"
+            counter += 1
+        
+        if config is None:
+            config = CollectorConfig(
+                name=collector_name,
+                exchange=collector.exchange_name,
+                symbols=list(collector.symbols),
+                data_types=[dt.value for dt in collector.data_types],
+                auto_restart=collector.auto_restart,
+                health_check_interval=collector.health_check_interval
+            )
+        
+        self._collectors[collector_name] = collector
+        self._collector_configs[collector_name] = config
+        
+        if config.enabled:
+            self._enabled_collectors.add(collector_name)
+        
+        self._stats['total_collectors'] = len(self._collectors)
+        
+        self.logger.info(f"Added collector: {collector_name} ({collector.exchange_name}) - "
+                        f"Symbols: {', '.join(collector.symbols)} - Enabled: {config.enabled}")
+    
+    def remove_collector(self, collector_name: str) -> bool:
+        """
+        Remove a collector from management.
+        
+        Args:
+            collector_name: Name of the collector to remove
+            
+        Returns:
+            True if removed successfully, False if not found
+        """
+        if collector_name not in self._collectors:
+            self.logger.warning(f"Collector not found: {collector_name}")
+            return False
+        
+        # Stop the collector first (only if event loop is running)
+        collector = self._collectors[collector_name]
+        if collector.status != CollectorStatus.STOPPED:
+            try:
+                # Try to create task only if event loop is running
+                asyncio.create_task(collector.stop(force=True))
+            except RuntimeError:
+                # No event loop running, just log
+                self.logger.info(f"Collector {collector_name} will be removed without stopping (no event loop)")
+        
+        # Remove from management
+        del self._collectors[collector_name]
+        del self._collector_configs[collector_name]
+        self._enabled_collectors.discard(collector_name)
+        
+        self._stats['total_collectors'] = len(self._collectors)
+        
+        self.logger.info(f"Removed collector: {collector_name}")
+        return True
+    
+    def enable_collector(self, collector_name: str) -> bool:
+        """
+        Enable a collector (will be started if manager is running).
+        
+        Args:
+            collector_name: Name of the collector to enable
+            
+        Returns:
+            True if enabled successfully, False if not found
+        """
+        if collector_name not in self._collectors:
+            self.logger.warning(f"Collector not found: {collector_name}")
+            return False
+        
+        self._enabled_collectors.add(collector_name)
+        self._collector_configs[collector_name].enabled = True
+        
+        # Start the collector if manager is running (only if event loop is running)
+        if self._running:
+            try:
+                asyncio.create_task(self._start_collector(collector_name))
+            except RuntimeError:
+                # No event loop running, will be started when manager starts
+                self.logger.debug(f"Collector {collector_name} enabled but will start when manager starts")
+        
+        self.logger.info(f"Enabled collector: {collector_name}")
+        return True
+    
+    def disable_collector(self, collector_name: str) -> bool:
+        """
+        Disable a collector (will be stopped if running).
+        
+        Args:
+            collector_name: Name of the collector to disable
+            
+        Returns:
+            True if disabled successfully, False if not found
+        """
+        if collector_name not in self._collectors:
+            self.logger.warning(f"Collector not found: {collector_name}")
+            return False
+        
+        self._enabled_collectors.discard(collector_name)
+        self._collector_configs[collector_name].enabled = False
+        
+        # Stop the collector (only if event loop is running)
+        collector = self._collectors[collector_name]
+        try:
+            asyncio.create_task(collector.stop(force=True))
+        except RuntimeError:
+            # No event loop running, just log
+            self.logger.debug(f"Collector {collector_name} disabled but cannot stop (no event loop)")
+        
+        self.logger.info(f"Disabled collector: {collector_name}")
+        return True
+    
+    async def start(self) -> bool:
+        """
+        Start the collector manager and all enabled collectors.
+        
+        Returns:
+            True if started successfully, False otherwise
+        """
+        if self.status in [ManagerStatus.RUNNING, ManagerStatus.STARTING]:
+            self.logger.warning("Collector manager is already running or starting")
+            return True
+        
+        self.logger.info("Starting collector manager")
+        self.status = ManagerStatus.STARTING
+        
+        try:
+            self._running = True
+            self._stats['uptime_start'] = datetime.now(timezone.utc)
+            
+            # Start all enabled collectors
+            start_tasks = []
+            for collector_name in self._enabled_collectors:
+                task = asyncio.create_task(self._start_collector(collector_name))
+                start_tasks.append(task)
+            
+            # Wait for all collectors to start (with timeout)
+            if start_tasks:
+                try:
+                    await asyncio.wait_for(asyncio.gather(*start_tasks, return_exceptions=True), timeout=30.0)
+                except asyncio.TimeoutError:
+                    self.logger.warning("Some collectors took too long to start")
+            
+            # Start global health monitoring
+            health_task = asyncio.create_task(self._global_health_monitor())
+            self._tasks.add(health_task)
+            health_task.add_done_callback(self._tasks.discard)
+            
+            self.status = ManagerStatus.RUNNING
+            self.logger.info(f"Collector manager started - Managing {len(self._enabled_collectors)} collectors")
+            return True
+            
+        except Exception as e:
+            self.status = ManagerStatus.ERROR
+            self.logger.error(f"Failed to start collector manager: {e}")
+            return False
+    
+    async def stop(self) -> None:
+        """Stop the collector manager and all collectors."""
+        if self.status == ManagerStatus.STOPPED:
+            self.logger.warning("Collector manager is already stopped")
+            return
+        
+        self.logger.info("Stopping collector manager")
+        self.status = ManagerStatus.STOPPING
+        self._running = False
+        
+        try:
+            # Cancel manager tasks
+            for task in list(self._tasks):
+                task.cancel()
+            
+            if self._tasks:
+                await asyncio.gather(*self._tasks, return_exceptions=True)
+            
+            # Stop all collectors
+            stop_tasks = []
+            for collector in self._collectors.values():
+                task = asyncio.create_task(collector.stop(force=True))
+                stop_tasks.append(task)
+            
+            # Wait for all collectors to stop (with timeout)
+            if stop_tasks:
+                try:
+                    await asyncio.wait_for(asyncio.gather(*stop_tasks, return_exceptions=True), timeout=30.0)
+                except asyncio.TimeoutError:
+                    self.logger.warning("Some collectors took too long to stop")
+            
+            self.status = ManagerStatus.STOPPED
+            self.logger.info("Collector manager stopped")
+            
+        except Exception as e:
+            self.status = ManagerStatus.ERROR
+            self.logger.error(f"Error stopping collector manager: {e}")
+    
+    async def restart_collector(self, collector_name: str) -> bool:
+        """
+        Restart a specific collector.
+        
+        Args:
+            collector_name: Name of the collector to restart
+            
+        Returns:
+            True if restarted successfully, False otherwise
+        """
+        if collector_name not in self._collectors:
+            self.logger.warning(f"Collector not found: {collector_name}")
+            return False
+        
+        collector = self._collectors[collector_name]
+        self.logger.info(f"Restarting collector: {collector_name}")
+        
+        try:
+            success = await collector.restart()
+            if success:
+                self._stats['restarts_performed'] += 1
+                self.logger.info(f"Successfully restarted collector: {collector_name}")
+            else:
+                self.logger.error(f"Failed to restart collector: {collector_name}")
+            return success
+            
+        except Exception as e:
+            self.logger.error(f"Error restarting collector {collector_name}: {e}")
+            return False
+    
+    async def _start_collector(self, collector_name: str) -> bool:
+        """
+        Start a specific collector.
+        
+        Args:
+            collector_name: Name of the collector to start
+            
+        Returns:
+            True if started successfully, False otherwise
+        """
+        if collector_name not in self._collectors:
+            self.logger.warning(f"Collector not found: {collector_name}")
+            return False
+        
+        collector = self._collectors[collector_name]
+        
+        try:
+            success = await collector.start()
+            if success:
+                self.logger.info(f"Started collector: {collector_name}")
+            else:
+                self.logger.error(f"Failed to start collector: {collector_name}")
+            return success
+            
+        except Exception as e:
+            self.logger.error(f"Error starting collector {collector_name}: {e}")
+            return False
+    
+    async def _global_health_monitor(self) -> None:
+        """Global health monitoring for all collectors."""
+        self.logger.debug("Starting global health monitor")
+        
+        while self._running:
+            try:
+                await asyncio.sleep(self.global_health_check_interval)
+                
+                self._last_global_check = datetime.now(timezone.utc)
+                self._stats['last_global_check'] = self._last_global_check
+                
+                # Check each enabled collector
+                running_count = 0
+                failed_count = 0
+                
+                for collector_name in self._enabled_collectors:
+                    collector = self._collectors[collector_name]
+                    health_status = collector.get_health_status()
+                    
+                    if health_status['is_healthy'] and collector.status == CollectorStatus.RUNNING:
+                        running_count += 1
+                    elif not health_status['is_healthy']:
+                        failed_count += 1
+                        self.logger.warning(f"Collector {collector_name} is unhealthy: {health_status['issues']}")
+                        
+                        # Auto-restart if needed and not already restarting
+                        if (collector.auto_restart and 
+                            collector.status not in [CollectorStatus.STARTING, CollectorStatus.STOPPING]):
+                            self.logger.info(f"Auto-restarting unhealthy collector: {collector_name}")
+                            asyncio.create_task(self.restart_collector(collector_name))
+                
+                # Update global statistics
+                self._stats['running_collectors'] = running_count
+                self._stats['failed_collectors'] = failed_count
+                
+                self.logger.debug(f"Health check complete - Running: {running_count}, Failed: {failed_count}")
+                
+            except asyncio.CancelledError:
+                self.logger.debug("Global health monitor cancelled")
+                break
+            except Exception as e:
+                self.logger.error(f"Error in global health monitor: {e}")
+                await asyncio.sleep(self.global_health_check_interval)
+    
+    def get_status(self) -> Dict[str, Any]:
+        """
+        Get manager status and statistics.
+        
+        Returns:
+            Dictionary containing status information
+        """
+        uptime_seconds = None
+        if self._stats['uptime_start']:
+            uptime_seconds = (datetime.now(timezone.utc) - self._stats['uptime_start']).total_seconds()
+        
+        # Get individual collector statuses
+        collector_statuses = {}
+        for name, collector in self._collectors.items():
+            collector_statuses[name] = {
+                'status': collector.status.value,
+                'enabled': name in self._enabled_collectors,
+                'health': collector.get_health_status()
+            }
+        
+        return {
+            'manager_status': self.status.value,
+            'uptime_seconds': uptime_seconds,
+            'statistics': self._stats,
+            'collectors': collector_statuses,
+            'enabled_collectors': list(self._enabled_collectors),
+            'total_collectors': len(self._collectors)
+        }
+    
+    def get_collector_status(self, collector_name: str) -> Optional[Dict[str, Any]]:
+        """
+        Get status for a specific collector.
+        
+        Args:
+            collector_name: Name of the collector
+            
+        Returns:
+            Collector status dict or None if not found
+        """
+        if collector_name not in self._collectors:
+            return None
+        
+        collector = self._collectors[collector_name]
+        return {
+            'name': collector_name,
+            'config': self._collector_configs[collector_name].__dict__,
+            'status': collector.get_status(),
+            'health': collector.get_health_status()
+        }
+    
+    def list_collectors(self) -> List[str]:
+        """
+        List all managed collector names.
+        
+        Returns:
+            List of collector names
+        """
+        return list(self._collectors.keys())
+    
+    def get_running_collectors(self) -> List[str]:
+        """
+        Get names of currently running collectors.
+        
+        Returns:
+            List of running collector names
+        """
+        running = []
+        for name, collector in self._collectors.items():
+            if collector.status == CollectorStatus.RUNNING:
+                running.append(name)
+        return running
+    
+    def get_failed_collectors(self) -> List[str]:
+        """
+        Get names of failed or unhealthy collectors.
+        
+        Returns:
+            List of failed collector names
+        """
+        failed = []
+        for name, collector in self._collectors.items():
+            health_status = collector.get_health_status()
+            if not health_status['is_healthy']:
+                failed.append(name)
+        return failed
+    
+    async def restart_all_collectors(self) -> Dict[str, bool]:
+        """
+        Restart all enabled collectors.
+        
+        Returns:
+            Dictionary mapping collector names to restart success status
+        """
+        self.logger.info("Restarting all enabled collectors")
+        
+        results = {}
+        restart_tasks = []
+        
+        for collector_name in self._enabled_collectors:
+            task = asyncio.create_task(self.restart_collector(collector_name))
+            restart_tasks.append((collector_name, task))
+        
+        # Wait for all restarts to complete
+        for collector_name, task in restart_tasks:
+            try:
+                results[collector_name] = await task
+            except Exception as e:
+                self.logger.error(f"Error restarting {collector_name}: {e}")
+                results[collector_name] = False
+        
+        successful_restarts = sum(1 for success in results.values() if success)
+        self.logger.info(f"Restart complete - {successful_restarts}/{len(results)} collectors restarted successfully")
+        
+        return results
+    
+    def __repr__(self) -> str:
+        """String representation of the manager."""
+        return f"<CollectorManager({self.manager_name}, {len(self._collectors)} collectors, {self.status.value})>" 
--- a/docs/README.md
+++ b/docs/README.md
@@ -0,0 +1,228 @@
+# TCP Dashboard Documentation
+
+Welcome to the **TCP Dashboard** (Trading Crypto Platform) documentation. This platform provides a comprehensive solution for cryptocurrency trading bot development, backtesting, and portfolio management.
+
+## 📚 Documentation Index
+
+### 🏗️ **Architecture & Design**
+
+- **[Architecture Overview](architecture.md)** - High-level system architecture and component design
+- **[Project Specification](specification.md)** - Technical specifications and requirements
+- **[Crypto Bot PRD](crypto-bot-prd.md)** - Product Requirements Document for the crypto trading bot platform
+
+### 🚀 **Setup & Installation**
+
+- **[Setup Guide](setup.md)** - Comprehensive setup instructions for new machines and environments
+  - Environment configuration
+  - Database setup with Docker
+  - Development workflow
+  - Production deployment
+
+### 🔧 **Core Systems**
+
+#### Data Collection System
+
+- **[Data Collectors Documentation](data_collectors.md)** - *Comprehensive guide to the enhanced data collector system*
+  - **BaseDataCollector** abstract class with health monitoring
+  - **CollectorManager** for centralized management
+  - Auto-restart and failure recovery
+  - Health monitoring and alerting
+  - Performance optimization
+  - Integration examples
+  - Troubleshooting guide
+
+#### Logging System
+
+- **[Enhanced Logging System](logging.md)** - Unified logging framework
+  - Multi-level logging with automatic cleanup
+  - Console and file output with formatting
+  - Performance monitoring
+  - Integration across all components
+
+## 🎯 **Quick Start**
+
+1. **New to the platform?** Start with the [Setup Guide](setup.md)
+2. **Implementing data collectors?** See [Data Collectors Documentation](data_collectors.md)
+3. **Understanding the architecture?** Read [Architecture Overview](architecture.md)
+4. **Troubleshooting?** Check component-specific documentation
+
+## 🏛️ **System Components**
+
+### Core Infrastructure
+- **Database Layer**: PostgreSQL with SQLAlchemy models
+- **Real-time Messaging**: Redis pub/sub for data distribution
+- **Configuration Management**: Pydantic-based settings
+- **Containerization**: Docker and docker-compose setup
+
+### Data Collection & Processing
+- **Abstract Base Collectors**: Standardized interface for all exchange connectors
+- **Health Monitoring**: Automatic failure detection and recovery
+- **Data Validation**: Comprehensive validation for market data
+- **Multi-Exchange Support**: OKX, Binance, and extensible framework
+
+### Trading & Strategy Engine
+- **Strategy Framework**: Base strategy classes and implementations
+- **Bot Management**: Lifecycle management with JSON configuration
+- **Backtesting Engine**: Historical strategy testing with performance metrics
+- **Portfolio Management**: Virtual trading with P&L tracking
+
+### User Interface
+- **Dashboard**: Dash-based web interface with Mantine UI
+- **Real-time Charts**: Interactive price charts with technical indicators
+- **Bot Controls**: Start/stop/configure trading bots
+- **Performance Analytics**: Portfolio visualization and trade analytics
+
+## 📋 **Task Progress**
+
+The platform follows a structured development approach with clearly defined tasks:
+
+- ✅ **Database Foundation** - Complete
+- ✅ **Enhanced Data Collectors** - Complete with health monitoring
+- ⏳ **Market Data Collection** - In progress (OKX connector next)
+- ⏳ **Basic Dashboard** - Planned
+- ⏳ **Strategy Engine** - Planned
+- ⏳ **Advanced Features** - Planned
+
+For detailed task tracking, see [tasks/tasks-crypto-bot-prd.md](../tasks/tasks-crypto-bot-prd.md).
+
+## 🛠️ **Development Workflow**
+
+### Setting Up Development Environment
+
+```bash
+# Clone and setup
+git clone <repository>
+cd TCPDashboard
+
+# Install dependencies with UV
+uv sync
+
+# Setup environment
+cp env.template .env
+# Edit .env with your configuration
+
+# Start services
+docker-compose up -d
+
+# Initialize database
+uv run python scripts/init_database.py
+
+# Run tests
+uv run pytest
+```
+
+### Key Development Tools
+
+- **UV**: Modern Python package management
+- **pytest**: Testing framework with async support
+- **SQLAlchemy**: Database ORM with migration support
+- **Dash + Mantine**: Modern web UI framework
+- **Docker**: Containerized development environment
+
+## 🔍 **Testing**
+
+The platform includes comprehensive test coverage:
+
+- **Unit Tests**: Individual component testing
+- **Integration Tests**: Cross-component functionality
+- **Performance Tests**: Load and stress testing
+- **End-to-End Tests**: Full system workflows
+
+```bash
+# Run all tests
+uv run pytest
+
+# Run specific test files
+uv run pytest tests/test_base_collector.py
+uv run pytest tests/test_collector_manager.py
+
+# Run with coverage
+uv run pytest --cov=data --cov-report=html
+```
+
+## 📊 **Monitoring & Observability**
+
+### Logging
+- **Structured Logging**: JSON-formatted logs with automatic cleanup
+- **Multiple Levels**: Debug, Info, Warning, Error with configurable output
+- **Component Isolation**: Separate loggers for different system components
+
+### Health Monitoring
+- **Collector Health**: Real-time status and performance metrics
+- **Auto-Recovery**: Automatic restart on failures
+- **Performance Tracking**: Message rates, uptime, error rates
+
+### Metrics Integration
+- **Prometheus Support**: Built-in metrics collection
+- **Custom Dashboards**: System performance visualization
+- **Alerting**: Configurable alerts for system health
+
+## 🔐 **Security & Best Practices**
+
+### Configuration Management
+- **Environment Variables**: All sensitive data via `.env` files
+- **No Hardcoded Secrets**: Clean separation of configuration and code
+- **Validation**: Pydantic-based configuration validation
+
+### Data Handling
+- **Input Validation**: Comprehensive validation for all external data
+- **Error Handling**: Robust error handling with proper logging
+- **Resource Management**: Proper cleanup and resource management
+
+### Code Quality
+- **Type Hints**: Full type annotation coverage
+- **Documentation**: Comprehensive docstrings and comments
+- **Testing**: High test coverage with multiple test types
+- **Code Standards**: Consistent formatting and patterns
+
+## 🤝 **Contributing**
+
+### Development Guidelines
+1. Follow existing code patterns and architecture
+2. Add comprehensive tests for new functionality
+3. Update documentation for API changes
+4. Use type hints and proper error handling
+5. Follow the existing logging patterns
+
+### Code Review Process
+1. Create feature branches from main
+2. Write tests before implementing features
+3. Ensure all tests pass and maintain coverage
+4. Update relevant documentation
+5. Submit pull requests with clear descriptions
+
+## 📞 **Support**
+
+### Getting Help
+1. **Documentation**: Check relevant component documentation
+2. **Logs**: Review system logs in `./logs/` directory  
+3. **Status**: Use built-in status and health check methods
+4. **Tests**: Run test suite to verify system integrity
+
+### Common Issues
+- **Database Connection**: Check Docker services and environment variables
+- **Collector Failures**: Review collector health status and logs
+- **Performance Issues**: Monitor system resources and optimize accordingly
+
+---
+
+## 📁 **File Structure**
+
+```
+TCPDashboard/
+├── docs/                    # Documentation (you are here)
+├── data/                    # Data collection system
+├── database/                # Database models and utilities  
+├── utils/                   # Shared utilities (logging, etc.)
+├── tests/                   # Test suite
+├── examples/                # Usage examples
+├── config/                  # Configuration files
+├── logs/                    # Application logs
+└── scripts/                 # Utility scripts
+```
+
+---
+
+*Last updated: 2025-05-30*
+
+For the most current information, refer to the individual component documentation linked above. 
--- a/docs/data_collectors.md
+++ b/docs/data_collectors.md
--- a/env.template
+++ b/env.template
@@ -35,4 +35,15 @@ DEFAULT_VIRTUAL_BALANCE=10000
 # Data Configuration
 MARKET_DATA_SYMBOLS=BTC-USDT,ETH-USDT,LTC-USDT
 HISTORICAL_DATA_DAYS=30
-CHART_UPDATE_INTERVAL=2000  # milliseconds 
+CHART_UPDATE_INTERVAL=2000  # milliseconds 
+
+# Logging
+VERBOSE_LOGGING=true
+LOG_CLEANUP=true                  # Enable automatic log cleanup
+LOG_MAX_FILES=30                  # Maximum log files to retain
+
+# Health monitoring
+DEFAULT_HEALTH_CHECK_INTERVAL=30  # Default health check interval (seconds)
+MAX_SILENCE_DURATION=300          # Max time without data (seconds)
+MAX_RECONNECT_ATTEMPTS=5          # Maximum reconnection attempts
+RECONNECT_DELAY=5                 # Delay between reconnect attempts (seconds)
--- a/examples/collector_demo.py
+++ b/examples/collector_demo.py
@@ -0,0 +1,309 @@
+"""
+Demonstration of the enhanced data collector system with health monitoring and auto-restart.
+
+This example shows how to:
+1. Create data collectors with health monitoring
+2. Use the collector manager for coordinated management
+3. Monitor collector health and handle failures
+4. Enable/disable collectors dynamically
+"""
+
+import asyncio
+from datetime import datetime, timezone
+from typing import Any, Optional
+
+from data import (
+    BaseDataCollector, DataType, CollectorStatus, MarketDataPoint,
+    CollectorManager, CollectorConfig
+)
+
+
+class DemoDataCollector(BaseDataCollector):
+    """
+    Simulated ticker collector used by the demo scripts.
+
+    Fabricates one ticker message per symbol on each pass and can raise on
+    every Nth message so the auto-restart path can be observed.
+    """
+
+    def __init__(self,
+                 exchange_name: str,
+                 symbols: list,
+                 fail_every_n_messages: int = 0,
+                 connection_delay: float = 0.1):
+        """
+        Set up the simulated collector.
+
+        Args:
+            exchange_name: Label used for the exchange in output and data points
+            symbols: Trading symbols to fabricate ticker data for
+            fail_every_n_messages: Raise on every Nth message (0 disables this)
+            connection_delay: Seconds slept in connect(); disconnect() sleeps half
+        """
+        super().__init__(exchange_name, symbols, [DataType.TICKER])
+        self.fail_every_n_messages = fail_every_n_messages
+        self.connection_delay = connection_delay
+        self.message_count = 0
+        self.connected = False
+        self.subscribed = False
+
+    async def connect(self) -> bool:
+        """Pretend to open a connection; always succeeds after the delay."""
+        print(f"[{self.exchange_name}] Connecting...")
+        await asyncio.sleep(self.connection_delay)
+        self.connected = True
+        print(f"[{self.exchange_name}] Connected successfully")
+        return True
+
+    async def disconnect(self) -> None:
+        """Pretend to close the connection and drop the subscription flag."""
+        print(f"[{self.exchange_name}] Disconnecting...")
+        await asyncio.sleep(self.connection_delay / 2)
+        self.connected = False
+        self.subscribed = False
+        print(f"[{self.exchange_name}] Disconnected")
+
+    async def subscribe_to_data(self, symbols: list, data_types: list) -> bool:
+        """Mark the collector subscribed; returns False when not connected."""
+        if not self.connected:
+            return False
+
+        print(f"[{self.exchange_name}] Subscribing to {len(symbols)} symbols: {', '.join(symbols)}")
+        await asyncio.sleep(0.05)
+        self.subscribed = True
+        return True
+
+    async def unsubscribe_from_data(self, symbols: list, data_types: list) -> bool:
+        """Clear the subscription flag; always reports success."""
+        print(f"[{self.exchange_name}] Unsubscribing from data streams")
+        self.subscribed = False
+        return True
+
+    async def _process_message(self, message: Any) -> Optional[MarketDataPoint]:
+        """Turn a simulated message dict into a ticker MarketDataPoint."""
+        self.message_count += 1
+
+        # Simulate periodic failures if configured
+        if (self.fail_every_n_messages > 0 and
+            self.message_count % self.fail_every_n_messages == 0):
+            raise Exception(f"Simulated failure after {self.message_count} messages")
+
+        # Read the clock once so the point's timestamp and the ISO string
+        # stored in its payload describe the same instant.
+        now = datetime.now(timezone.utc)
+        return MarketDataPoint(
+            exchange=self.exchange_name,
+            symbol=message['symbol'],
+            timestamp=now,
+            data_type=DataType.TICKER,
+            data={
+                'price': message['price'],
+                'volume': message.get('volume', 100),
+                'timestamp': now.isoformat()
+            }
+        )
+
+    async def _handle_messages(self) -> None:
+        """
+        Fabricate and process one ticker message per symbol, then pause 1s.
+
+        Exceptions raised while processing are deliberately not caught here,
+        so a simulated failure reaches the caller unchanged.
+        """
+        if not (self.connected and self.subscribed):
+            await asyncio.sleep(0.1)
+            return
+
+        # Simulate receiving data for each symbol
+        for symbol in self.symbols:
+            simulated_message = {
+                'symbol': symbol,
+                'price': 50000 + (self.message_count % 1000),  # Fake price that changes
+                'volume': 1.5
+            }
+
+            # Not wrapped in try/except: a simulated failure must propagate to
+            # the caller (intended to trigger the base class reconnection logic).
+            data_point = await self._process_message(simulated_message)
+            if data_point:
+                self._stats['messages_processed'] += 1
+                await self._notify_callbacks(data_point)
+
+        # Simulate processing delay
+        await asyncio.sleep(1.0)
+
+
+async def data_callback(data_point: MarketDataPoint):
+    """Print the exchange, symbol, price and HH:MM:SS time of a data point."""
+    price, clock = data_point.data.get('price'), data_point.timestamp.strftime('%H:%M:%S')
+    print(f"📊 Data received: {data_point.exchange} - {data_point.symbol} - Price: {price} at {clock}")
+
+
+async def monitor_collectors(manager: CollectorManager, duration: int = 30):
+    """Print a manager summary every 5 seconds for `duration` seconds."""
+    print(f"\n🔍 Starting monitoring for {duration} seconds...")
+
+    for i in range(duration):
+        await asyncio.sleep(1)
+        if i % 5 != 0:
+            continue  # query the manager only on ticks that print
+        # Print status every 5 seconds
+        status = manager.get_status()
+        running = len(manager.get_running_collectors())
+        failed = len(manager.get_failed_collectors())
+        print(f"⏰ Status at {i+1}s: {running} running, {failed} failed, "
+              f"{status['statistics']['restarts_performed']} restarts")
+
+    print("🏁 Monitoring complete")
+
+
+async def demo_basic_usage():
+    """Run one stable collector for five seconds and print its statistics."""
+    banner = "=" * 60
+    print(banner)
+    print("🚀 Demo 1: Basic Data Collector Usage")
+    print(banner)
+
+    # With default arguments the collector never simulates a failure
+    collector = DemoDataCollector("demo_exchange", ["BTC-USDT", "ETH-USDT"])
+    # Register the printing callback for ticker data
+    collector.add_data_callback(DataType.TICKER, data_callback)
+
+    print("Starting collector...")
+    started = await collector.start()
+    if not started:
+        print("❌ Failed to start collector")
+        return
+
+    print("✅ Collector started successfully")
+
+    # Give the collector a few seconds to produce data
+    await asyncio.sleep(5)
+
+    # Report what was processed during that window
+    statistics = collector.get_status()['statistics']
+    print(f"📈 Messages processed: {statistics['messages_processed']}")
+    print(f"⏱️  Uptime: {statistics['uptime_seconds']:.1f}s")
+
+    # Shut the collector down again
+    await collector.stop()
+    print("✅ Collector stopped")
+
+
+async def demo_manager_usage():
+    """Supervise a stable and a periodically failing collector via a manager."""
+    rule = "=" * 60
+    print("\n" + rule)
+    print("🎛️  Demo 2: Collector Manager Usage")
+    print(rule)
+
+    # Manager with a 3-second global health check interval
+    manager = CollectorManager("demo_manager", global_health_check_interval=3.0)
+
+    # One collector that never fails, one that raises on every 5th message
+    collectors = (
+        DemoDataCollector("stable_exchange", ["BTC-USDT"]),
+        DemoDataCollector("failing_exchange", ["ETH-USDT"], fail_every_n_messages=5),
+    )
+
+    # Attach the print callback to each collector
+    for collector in collectors:
+        collector.add_data_callback(DataType.TICKER, data_callback)
+
+    # Hand the collectors over to the manager
+    for collector in collectors:
+        manager.add_collector(collector)
+
+    print(f"📝 Added {len(manager.list_collectors())} collectors to manager")
+
+    started = await manager.start()
+    if not started:
+        print("❌ Failed to start manager")
+        return
+
+    print("✅ Manager started successfully")
+    # Watch the manager for a while
+    await monitor_collectors(manager, duration=15)
+
+    # Summarise the outcome
+    status = manager.get_status()
+    print(f"\n📊 Final Statistics:")
+    print(f"   - Total restarts: {status['statistics']['restarts_performed']}")
+    print(f"   - Running collectors: {len(manager.get_running_collectors())}")
+    print(f"   - Failed collectors: {len(manager.get_failed_collectors())}")
+
+    await manager.stop()
+    print("✅ Manager stopped")
+
+
+async def demo_dynamic_management():
+    """Add, disable and re-enable collectors while the manager is running."""
+    print("\n" + "=" * 60)
+    print("🔄 Demo 3: Dynamic Collector Management")
+    print("=" * 60)
+
+    manager = CollectorManager("dynamic_manager", global_health_check_interval=2.0)
+
+    # Start with one collector
+    collector1 = DemoDataCollector("exchange_1", ["BTC-USDT"])
+    collector1.add_data_callback(DataType.TICKER, data_callback)
+    manager.add_collector(collector1)
+
+    # Bail out on a failed start, as the other demos in this file do
+    if not await manager.start():
+        print("❌ Failed to start manager")
+        return
+
+    print("✅ Started with 1 collector")
+    await asyncio.sleep(3)
+
+    # Add second collector
+    collector2 = DemoDataCollector("exchange_2", ["ETH-USDT"])
+    collector2.add_data_callback(DataType.TICKER, data_callback)
+    manager.add_collector(collector2)
+
+    print("➕ Added second collector")
+    await asyncio.sleep(3)
+
+    # Disable first collector
+    collector_names = manager.list_collectors()
+    manager.disable_collector(collector_names[0])
+
+    print("⏸️  Disabled first collector")
+    await asyncio.sleep(3)
+
+    # Re-enable first collector
+    manager.enable_collector(collector_names[0])
+
+    print("▶️  Re-enabled first collector")
+    await asyncio.sleep(3)
+
+    print(f"📊 Final state: {len(manager.get_running_collectors())} running collectors")
+
+    await manager.stop()
+    print("✅ Dynamic demo complete")
+
+
+async def main():
+    """Run the three demos in order; print any error with its traceback."""
+    print("🎯 Data Collector System Demonstration")
+    print("This demo shows health monitoring and auto-restart capabilities\n")
+
+    try:
+        # Each demo must finish before the next one starts
+        for run_demo in (demo_basic_usage, demo_manager_usage, demo_dynamic_management):
+            await run_demo()
+
+        divider = "=" * 60
+        print("\n" + divider)
+        print("🎉 All demonstrations completed successfully!")
+        print(divider)
+    except Exception as exc:
+        # Top-level boundary of the script: report the failure instead of crashing
+        print(f"❌ Demo failed with error: {exc}")
+        import traceback
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    asyncio.run(main()) 
--- a/examples/parallel_collectors_demo.py
+++ b/examples/parallel_collectors_demo.py
@@ -0,0 +1,412 @@
+"""
+Demonstration of running multiple data collectors in parallel.
+
+This example shows how to set up and manage multiple collectors simultaneously,
+each collecting data from different exchanges or different symbols.
+"""
+
+import asyncio
+from datetime import datetime, timezone
+from typing import Dict, Any
+
+from data import (
+    BaseDataCollector, DataType, CollectorStatus, MarketDataPoint,
+    CollectorManager, CollectorConfig
+)
+
+
+class DemoExchangeCollector(BaseDataCollector):
+    """Simulated ticker collector standing in for one exchange in the demo."""
+
+    def __init__(self,
+                 exchange_name: str,
+                 symbols: list,
+                 message_interval: float = 1.0,
+                 base_price: float = 50000):
+        """
+        Set up the simulated exchange collector.
+
+        Args:
+            exchange_name: Name of the exchange (okx, binance, coinbase, etc.)
+            symbols: Trading symbols to collect
+            message_interval: Seconds slept after each batch of simulated messages
+            base_price: Price the simulated quotes vary around
+        """
+        super().__init__(exchange_name, symbols, [DataType.TICKER])
+        self.message_interval = message_interval
+        self.base_price = base_price
+        self.connected = False
+        self.subscribed = False
+        self.message_count = 0
+
+    async def connect(self) -> bool:
+        """Pretend to connect; always succeeds after a 0.2s pause."""
+        print(f"🔌 [{self.exchange_name.upper()}] Connecting...")
+        await asyncio.sleep(0.2)  # Simulate connection delay
+        self.connected = True
+        print(f"✅ [{self.exchange_name.upper()}] Connected successfully")
+        return True
+
+    async def disconnect(self) -> None:
+        """Pretend to disconnect and clear the connection and subscription flags."""
+        print(f"🔌 [{self.exchange_name.upper()}] Disconnecting...")
+        await asyncio.sleep(0.1)
+        self.connected = False
+        self.subscribed = False
+        print(f"❌ [{self.exchange_name.upper()}] Disconnected")
+
+    async def subscribe_to_data(self, symbols: list, data_types: list) -> bool:
+        """Mark the collector subscribed; returns False when not connected."""
+        if not self.connected:
+            return False
+
+        print(f"📡 [{self.exchange_name.upper()}] Subscribing to {len(symbols)} symbols")
+        await asyncio.sleep(0.1)
+        self.subscribed = True
+        return True
+
+    async def unsubscribe_from_data(self, symbols: list, data_types: list) -> bool:
+        """Clear the subscription flag; always reports success."""
+        print(f"📡 [{self.exchange_name.upper()}] Unsubscribing from data streams")
+        self.subscribed = False
+        return True
+
+    async def _process_message(self, message: Any) -> MarketDataPoint:
+        """Build a ticker MarketDataPoint for the symbol named in `message`."""
+        self.message_count += 1
+
+        # Price walks from base_price - 500 up to base_price + 490 in steps
+        # of 10, repeating every 100 messages
+        current_price = self.base_price + (self.message_count % 100 - 50) * 10
+
+        # Read the clock once so both timestamps describe the same instant
+        now = datetime.now(timezone.utc)
+        return MarketDataPoint(
+            exchange=self.exchange_name,
+            symbol=message['symbol'],
+            timestamp=now,
+            data_type=DataType.TICKER,
+            data={
+                'price': current_price,
+                'volume': message.get('volume', 1.0 + (self.message_count % 10) * 0.1),
+                'bid': current_price - 0.5,
+                'ask': current_price + 0.5,
+                'timestamp': now.isoformat()
+            }
+        )
+
+    async def _handle_messages(self) -> None:
+        """Fabricate and process one message per symbol, then wait message_interval."""
+        if not (self.connected and self.subscribed):
+            await asyncio.sleep(0.1)
+            return
+
+        # Process each symbol
+        for symbol in self.symbols:
+            try:
+                # Create simulated message
+                simulated_message = {
+                    'symbol': symbol,
+                    'volume': 1.5 + (self.message_count % 5) * 0.2
+                }
+
+                # Process the message
+                data_point = await self._process_message(simulated_message)
+                if data_point:
+                    self._stats['messages_processed'] += 1
+                    await self._notify_callbacks(data_point)
+
+            except Exception as e:
+                self.logger.error(f"Error processing message for {symbol}: {e}")
+                raise  # bare raise re-raises the active exception as-is
+
+        # Wait before next batch of messages
+        await asyncio.sleep(self.message_interval)
+
+
+def create_data_callback(exchange_name: str):
+    """Build a callback printing one aligned line per data point, tagged with the exchange."""
+
+    def data_callback(data_point: MarketDataPoint):
+        payload = data_point.data
+        stamp = data_point.timestamp.strftime('%H:%M:%S')
+        print(f"📊 {exchange_name.upper():8} | {data_point.symbol:10} | "
+              f"${payload.get('price', 0):8.2f} | Vol: {payload.get('volume', 0):.2f} | {stamp}")
+
+    return data_callback
+
+
+async def demo_parallel_collectors():
+    """Run four simulated exchange collectors under one manager for 30 seconds."""
+    print("=" * 80)
+    print("🚀 PARALLEL COLLECTORS DEMONSTRATION")
+    print("=" * 80)
+    print("Running multiple exchange collectors simultaneously...")
+    print()
+
+    # Create manager
+    manager = CollectorManager(
+        "parallel_demo_manager",
+        global_health_check_interval=10.0  # Check every 10 seconds
+    )
+
+    # One entry per simulated exchange: symbols, message interval, base price
+    exchange_configs = [
+        {
+            'name': 'okx',
+            'symbols': ['BTC-USDT', 'ETH-USDT'],
+            'interval': 1.0,
+            'base_price': 45000
+        },
+        {
+            'name': 'binance',
+            'symbols': ['BTC-USDT', 'ETH-USDT', 'SOL-USDT'],
+            'interval': 1.5,
+            'base_price': 45100
+        },
+        {
+            'name': 'coinbase',
+            'symbols': ['BTC-USD', 'ETH-USD'],
+            'interval': 2.0,
+            'base_price': 44900
+        },
+        {
+            'name': 'kraken',
+            'symbols': ['XBTUSD', 'ETHUSD'],
+            'interval': 1.2,
+            'base_price': 45050
+        }
+    ]
+
+    # Create and configure collectors
+    for config in exchange_configs:
+        # Create collector
+        collector = DemoExchangeCollector(
+            exchange_name=config['name'],
+            symbols=config['symbols'],
+            message_interval=config['interval'],
+            base_price=config['base_price']
+        )
+
+        # Add data callback
+        callback = create_data_callback(config['name'])
+        collector.add_data_callback(DataType.TICKER, callback)
+
+        # Add to manager with configuration
+        collector_config = CollectorConfig(
+            name=f"{config['name']}_collector",
+            exchange=config['name'],
+            symbols=config['symbols'],
+            data_types=['ticker'],
+            auto_restart=True,
+            health_check_interval=15.0,
+            enabled=True
+        )
+
+        manager.add_collector(collector, collector_config)
+        print(f"➕ Added {config['name'].upper()} collector with {len(config['symbols'])} symbols")
+
+    print(f"\n📝 Total collectors added: {len(manager.list_collectors())}")
+    print()
+
+    # Start all collectors in parallel
+    print("🏁 Starting all collectors...")
+    start_time = asyncio.get_running_loop().time()  # running loop's monotonic clock
+
+    success = await manager.start()
+    if not success:
+        print("❌ Failed to start collector manager")
+        return
+
+    startup_time = asyncio.get_running_loop().time() - start_time
+    print(f"✅ All collectors started in {startup_time:.2f} seconds")
+    print()
+
+    print("📊 DATA STREAM (All exchanges running in parallel):")
+    print("-" * 80)
+
+    # Monitor for a period
+    monitoring_duration = 30  # seconds
+    for i in range(monitoring_duration):
+        await asyncio.sleep(1)
+
+        # Print status every 10 seconds
+        if i % 10 == 0 and i > 0:
+            status = manager.get_status()
+            print()
+            print(f"⏰ STATUS UPDATE ({i}s):")
+            print(f"   Running collectors: {len(manager.get_running_collectors())}")
+            print(f"   Failed collectors: {len(manager.get_failed_collectors())}")
+            print(f"   Total restarts: {status['statistics']['restarts_performed']}")
+            print("-" * 80)
+
+    # Final status report
+    print()
+    print("📈 FINAL STATUS REPORT:")
+    print("=" * 80)
+
+    status = manager.get_status()
+    print(f"Manager Status: {status['manager_status']}")
+    print(f"Total Collectors: {status['total_collectors']}")
+    print(f"Running Collectors: {len(manager.get_running_collectors())}")
+    print(f"Failed Collectors: {len(manager.get_failed_collectors())}")
+    print(f"Total Restarts: {status['statistics']['restarts_performed']}")
+
+    # Individual collector statistics
+    print("\n📊 INDIVIDUAL COLLECTOR STATS:")
+    for collector_name in manager.list_collectors():
+        collector_status = manager.get_collector_status(collector_name)
+        if collector_status:
+            stats = collector_status['status']['statistics']
+            health = collector_status['health']
+
+            print(f"\n{collector_name.upper()}:")
+            print(f"  Status: {collector_status['status']['status']}")
+            print(f"  Messages Processed: {stats['messages_processed']}")
+            print(f"  Uptime: {stats.get('uptime_seconds', 0):.1f}s")
+            print(f"  Errors: {stats['errors']}")
+            print(f"  Healthy: {health['is_healthy']}")
+
+    # Stop all collectors
+    print("\n🛑 Stopping all collectors...")
+    await manager.stop()
+    print("✅ All collectors stopped successfully")
+
+
+async def demo_dynamic_management():
+    """Grow a running manager to three collectors, then pause and resume one."""
+    print("\n" + "=" * 80)
+    print("🔄 DYNAMIC COLLECTOR MANAGEMENT")
+    print("=" * 80)
+
+    manager = CollectorManager("dynamic_manager")
+
+    def register(exchange: str, symbol: str, interval: float) -> None:
+        """Create a single-symbol demo collector and hand it to the manager."""
+        collector = DemoExchangeCollector(exchange, [symbol], interval)
+        collector.add_data_callback(DataType.TICKER, create_data_callback(exchange))
+        manager.add_collector(collector)
+
+    # First collector is registered before the manager starts
+    register("exchange_a", "BTC-USDT", 1.0)
+
+    await manager.start()
+    print("✅ Started with 1 collector")
+    await asyncio.sleep(3)
+
+    # The remaining collectors join while the manager is already running
+    register("exchange_b", "ETH-USDT", 1.5)
+
+    print("➕ Added second collector while running")
+    await asyncio.sleep(3)
+
+    register("exchange_c", "SOL-USDT", 2.0)
+
+    print("➕ Added third collector")
+    await asyncio.sleep(5)
+
+    # Show current status
+    print(f"\n📊 Current Status: {len(manager.get_running_collectors())} collectors running")
+
+    # Pause and resume the second listed collector; the name list is not
+    # modified in between, so a single length check covers both steps
+    collectors = manager.list_collectors()
+    if len(collectors) > 1:
+        target = collectors[1]
+
+        manager.disable_collector(target)
+        print(f"⏸️  Disabled collector: {target}")
+        await asyncio.sleep(3)
+
+        manager.enable_collector(target)
+        print(f"▶️  Re-enabled collector: {target}")
+        await asyncio.sleep(3)
+
+    print(f"\n📊 Final Status: {len(manager.get_running_collectors())} collectors running")
+
+    await manager.stop()
+    print("✅ Dynamic management demo complete")
+
+
+async def demo_performance_monitoring():
+    """Run three collectors at different rates and print four 5-second snapshots."""
+    print("\n" + "=" * 80)
+    print("📈 PERFORMANCE MONITORING")
+    print("=" * 80)
+
+    manager = CollectorManager("performance_monitor", global_health_check_interval=5.0)
+
+    # (exchange, symbols, seconds between message batches)
+    configs = [
+        ("fast_exchange", ["BTC-USDT"], 0.5),
+        ("medium_exchange", ["ETH-USDT"], 1.0),
+        ("slow_exchange", ["SOL-USDT"], 2.0),
+    ]
+
+    for exchange, symbols, interval in configs:
+        collector = DemoExchangeCollector(exchange, symbols, interval)
+        collector.add_data_callback(DataType.TICKER, create_data_callback(exchange))
+        manager.add_collector(collector)
+
+    await manager.start()
+    print("✅ Started performance monitoring demo")
+
+    # Monitor performance for 20 seconds
+    for elapsed in range(5, 25, 5):
+        await asyncio.sleep(5)
+
+        print(f"\n📊 PERFORMANCE SNAPSHOT ({elapsed}s):")
+        print("-" * 60)
+
+        for collector_name in manager.list_collectors():
+            status = manager.get_collector_status(collector_name)
+            if not status:
+                continue
+
+            stats = status['status']['statistics']
+            health = status['health']
+            processed = stats['messages_processed']
+            # Clamp the divisor to at least 1 second
+            msg_rate = processed / max(stats.get('uptime_seconds', 1), 1)
+            mark = '✅' if health['is_healthy'] else '❌'
+            print(f"{collector_name:15} | Rate: {msg_rate:5.1f}/s | Total: {processed:4d} | "
+                  f"Errors: {stats['errors']:2d} | Health: {mark}")
+
+    await manager.stop()
+    print("\n✅ Performance monitoring demo complete")
+
+
+async def main():
+    """Run the parallel, dynamic and performance demos; report any failure."""
+    print("🎯 MULTIPLE COLLECTORS PARALLEL EXECUTION DEMO")
+    print("This demonstration shows the CollectorManager running multiple collectors simultaneously\n")
+
+    try:
+        # The demos run strictly one after another
+        await demo_parallel_collectors()
+        await demo_dynamic_management()
+        await demo_performance_monitoring()
+
+        divider = "=" * 80
+        print("\n" + divider)
+        print("🎉 ALL PARALLEL EXECUTION DEMOS COMPLETED!")
+        print(divider)
+
+        # Closing summary printed only when every demo finished
+        print("\nKey takeaways:")
+        print("✅ Multiple collectors run truly in parallel")
+        print("✅ Each collector operates independently")
+        print("✅ Collectors can be added/removed while system is running")
+        print("✅ Centralized health monitoring across all collectors")
+        print("✅ Individual performance tracking per collector")
+        print("✅ Coordinated lifecycle management")
+
+    except Exception as exc:
+        # Top-level boundary of the script: report the failure instead of crashing
+        print(f"❌ Demo failed with error: {exc}")
+        import traceback
+        traceback.print_exc()
+
+
+if __name__ == "__main__":
+    asyncio.run(main()) 
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -69,3 +69,8 @@ python_version = "3.10"
 warn_return_any = true
 warn_unused_configs = true
 disallow_untyped_defs = true
+
+[dependency-groups]
+dev = [
+    "pytest-asyncio>=1.0.0",
+]
--- a/tasks/tasks-crypto-bot-prd.md
+++ b/tasks/tasks-crypto-bot-prd.md
@@ -10,6 +10,9 @@
 - `database/migrations/` - Alembic migration system for database schema versioning and updates
 - `database/init/init.sql` - Docker initialization script for automatic database setup
 - `database/init/schema_clean.sql` - Copy of clean schema for Docker initialization
+- `data/base_collector.py` - Abstract base class for all data collectors with standardized interface, error handling, data validation, health monitoring, and auto-restart capabilities
+- `data/collector_manager.py` - Centralized collector management with health monitoring, auto-recovery, and coordinated lifecycle management
+- `data/__init__.py` - Data collection package initialization
 - `data/okx_collector.py` - OKX API integration for real-time market data collection
 - `data/aggregator.py` - OHLCV candle aggregation and processing
 - `strategies/base_strategy.py` - Base strategy class and interface
@@ -31,6 +34,8 @@
 - `tests/test_strategies.py` - Unit tests for strategy implementations
 - `tests/test_bot_manager.py` - Unit tests for bot management functionality
 - `tests/test_data_collection.py` - Unit tests for data collection and aggregation
+- `tests/test_base_collector.py` - Comprehensive unit tests for the BaseDataCollector abstract class (13 tests)
+- `tests/test_collector_manager.py` - Comprehensive unit tests for the CollectorManager with health monitoring (14 tests)
 - `tests/test_logging_enhanced.py` - Comprehensive unit tests for enhanced logging features (16 tests)
 - `docs/setup.md` - Comprehensive setup guide for new machines and environments
 - `docs/logging.md` - Complete documentation for the enhanced unified logging system
@@ -49,6 +54,9 @@
  - [x] 1.9 Add unified logging system we can use for all components

 - [ ] 2.0 Market Data Collection and Processing System
+  - [x] 2.0.1 Create abstract base class for data collectors with standardized interface, error handling, and data validation
+  - [x] 2.0.2 Enhance data collectors with health monitoring, heartbeat system, and auto-restart capabilities
+  - [x] 2.0.3 Create collector manager for supervising multiple data collectors with coordinated lifecycle management
  - [ ] 2.1 Implement OKX WebSocket API connector for real-time data
  - [ ] 2.2 Create OHLCV candle aggregation logic with multiple timeframes (1m, 5m, 15m, 1h, 4h, 1d)
  - [ ] 2.3 Build data validation and error handling for market data
--- a/tests/test_base_collector.py
+++ b/tests/test_base_collector.py
@@ -0,0 +1,333 @@
+"""
+Unit tests for the BaseDataCollector abstract class.
+"""
+
+import asyncio
+import pytest
+from datetime import datetime, timezone
+from decimal import Decimal
+from unittest.mock import AsyncMock, MagicMock
+
+from data.base_collector import (
+    BaseDataCollector, DataType, CollectorStatus, MarketDataPoint, 
+    OHLCVData, DataValidationError, DataCollectorError
+)
+
+
+class TestDataCollector(BaseDataCollector):
+    """In-memory BaseDataCollector double for the tests below (a helper, not a test case)."""
+    __test__ = False  # "Test*" name plus __init__: opt out of pytest collection (avoids a warning)
+
+    def __init__(self, exchange_name: str, symbols: list, data_types=None):
+        super().__init__(exchange_name, symbols, data_types)
+        self.connected = self.subscribed = False
+        self.messages = []  # FIFO queue; _handle_messages() consumes one entry per call
+
+    async def connect(self) -> bool:
+        await asyncio.sleep(0.01)  # Simulate connection delay
+        self.connected = True
+        return True
+
+    async def disconnect(self) -> None:
+        await asyncio.sleep(0.01)  # Simulate disconnection delay
+        self.connected = False
+        self.subscribed = False
+
+    async def subscribe_to_data(self, symbols: list, data_types: list) -> bool:
+        if not self.connected:
+            return False  # subscribing requires a prior successful connect()
+        self.subscribed = True
+        return True
+
+    async def unsubscribe_from_data(self, symbols: list, data_types: list) -> bool:
+        self.subscribed = False
+        return True
+
+    async def _process_message(self, message) -> MarketDataPoint:
+        self._stats['messages_received'] += 1
+        return MarketDataPoint(
+            exchange=self.exchange_name,
+            symbol=message.get('symbol', 'BTC-USDT'),  # default when the message has no symbol
+            timestamp=datetime.now(timezone.utc),
+            data_type=DataType.TICKER,
+            data=message
+        )
+
+    async def _handle_messages(self) -> None:
+        # Deliver one queued message to the callbacks, or idle briefly if the queue is empty
+        if self.messages:
+            message = self.messages.pop(0)
+            data_point = await self._process_message(message)
+            self._stats['messages_processed'] += 1
+            self._stats['last_message_time'] = datetime.now(timezone.utc)
+            await self._notify_callbacks(data_point)
+        else:
+            await asyncio.sleep(0.1)  # Wait for messages
+
+    def add_test_message(self, message: dict):
+        """Queue a message for a later _handle_messages() call to process."""
+        self.messages.append(message)
+
+
+class TestBaseDataCollector:
+    """Behavioural tests for BaseDataCollector, exercised through TestDataCollector."""
+    
+    @pytest.fixture
+    def collector(self):
+        """Return a fresh, unstarted "okx" collector for BTC-USDT/ETH-USDT ticker data."""
+        return TestDataCollector("okx", ["BTC-USDT", "ETH-USDT"], [DataType.TICKER])
+    
+    def test_initialization(self, collector):
+        """A new collector keeps its constructor arguments and starts out STOPPED."""
+        assert collector.exchange_name == "okx"
+        assert collector.symbols == {"BTC-USDT", "ETH-USDT"}
+        assert collector.data_types == [DataType.TICKER]
+        assert collector.status == CollectorStatus.STOPPED
+        assert not collector._running
+    
+    @pytest.mark.asyncio
+    async def test_start_stop_cycle(self, collector):
+        """start() connects, subscribes and reports RUNNING; stop() undoes all of it."""
+        # Test start
+        success = await collector.start()
+        assert success
+        assert collector.status == CollectorStatus.RUNNING
+        assert collector.connected
+        assert collector.subscribed
+        assert collector._running
+        
+        # Wait a bit for the message loop to start
+        await asyncio.sleep(0.1)
+        
+        # Test stop
+        await collector.stop()
+        assert collector.status == CollectorStatus.STOPPED
+        assert not collector._running
+        assert not collector.connected
+        assert not collector.subscribed
+    
+    @pytest.mark.asyncio
+    async def test_message_processing(self, collector):
+        """A queued message reaches the registered TICKER callback and updates the stats."""
+        received_data = []
+        
+        def callback(data_point: MarketDataPoint):
+            received_data.append(data_point)
+        
+        collector.add_data_callback(DataType.TICKER, callback)
+        
+        await collector.start()
+        
+        # Add test message
+        test_message = {"symbol": "BTC-USDT", "price": "50000"}
+        collector.add_test_message(test_message)
+        
+        # Wait for message processing
+        await asyncio.sleep(0.2)
+        
+        await collector.stop()
+        
+        # Verify message was processed
+        assert len(received_data) == 1
+        assert received_data[0].symbol == "BTC-USDT"
+        assert received_data[0].data_type == DataType.TICKER
+        assert collector._stats['messages_received'] == 1
+        assert collector._stats['messages_processed'] == 1
+    
+    def test_symbol_management(self, collector):
+        """add_symbol()/remove_symbol() update the symbol set without creating duplicates."""
+        initial_count = len(collector.symbols)
+        
+        # Add new symbol
+        collector.add_symbol("LTC-USDT")
+        assert "LTC-USDT" in collector.symbols
+        assert len(collector.symbols) == initial_count + 1
+        
+        # Remove symbol
+        collector.remove_symbol("BTC-USDT")
+        assert "BTC-USDT" not in collector.symbols
+        assert len(collector.symbols) == initial_count
+        
+        # Try to add existing symbol (should not duplicate)
+        collector.add_symbol("ETH-USDT")
+        assert len(collector.symbols) == initial_count
+    
+    def test_callback_management(self, collector):
+        """Callbacks are tracked per data type and can be removed individually."""
+        def callback1(data): pass
+        def callback2(data): pass
+        
+        # Add callbacks
+        collector.add_data_callback(DataType.TICKER, callback1)
+        collector.add_data_callback(DataType.TICKER, callback2)
+        assert len(collector._data_callbacks[DataType.TICKER]) == 2
+        
+        # Remove callback
+        collector.remove_data_callback(DataType.TICKER, callback1)
+        assert len(collector._data_callbacks[DataType.TICKER]) == 1
+        assert callback2 in collector._data_callbacks[DataType.TICKER]
+    
+    def test_get_status(self, collector):
+        """get_status() of an unstarted collector reports its configuration and zeroed stats."""
+        status = collector.get_status()
+        
+        assert status['exchange'] == 'okx'
+        assert status['status'] == 'stopped'
+        assert set(status['symbols']) == {"BTC-USDT", "ETH-USDT"}
+        assert status['data_types'] == ['ticker']
+        assert 'statistics' in status
+        assert status['statistics']['messages_received'] == 0
+
+
+class TestOHLCVData:
+    """Construction-time checks of the OHLCVData candle type."""
+
+    def test_valid_ohlcv_data(self):
+        """A consistent candle is accepted and exposes the values it was given."""
+        candle = OHLCVData(
+            symbol="BTC-USDT",
+            timeframe="1m",
+            timestamp=datetime.now(timezone.utc),
+            open=Decimal("50000"),
+            high=Decimal("50100"),
+            low=Decimal("49900"),
+            close=Decimal("50050"),
+            volume=Decimal("1.5"),
+            trades_count=100
+        )
+
+        assert candle.symbol == "BTC-USDT"
+        assert candle.timeframe == "1m"
+        assert isinstance(candle.open, Decimal)
+        assert candle.trades_count == 100
+
+    def test_invalid_ohlcv_relationships(self):
+        """A candle whose high lies below its open is rejected with DataValidationError."""
+        inconsistent_fields = dict(
+            symbol="BTC-USDT", timeframe="1m",
+            timestamp=datetime.now(timezone.utc),
+            open=Decimal("50000"),
+            high=Decimal("49000"),  # deliberately below the open price
+            low=Decimal("49900"),
+            close=Decimal("50050"),
+            volume=Decimal("1.5")
+        )
+        with pytest.raises(DataValidationError):
+            OHLCVData(**inconsistent_fields)
+
+    def test_ohlcv_decimal_conversion(self):
+        """Float and int inputs come back as Decimal on every price/volume field."""
+        candle = OHLCVData(
+            symbol="BTC-USDT",
+            timeframe="1m",
+            timestamp=datetime.now(timezone.utc),
+            open=50000.0,  # float
+            high=50100,    # int
+            low=49900,     # int
+            close=50050.0, # float
+            volume=1.5     # float
+        )
+
+        # Check the five numeric fields in the same order as the constructor arguments
+        numeric_fields = ("open", "high", "low", "close", "volume")
+        for field_name in numeric_fields:
+            converted = getattr(candle, field_name)
+            assert isinstance(converted, Decimal)
+
+
+class TestDataValidation:
+    """Checks of BaseDataCollector.validate_ohlcv_data() on raw dictionaries."""
+
+    def test_validate_ohlcv_data_success(self):
+        """A complete payload is turned into an OHLCVData carrying symbol and timeframe."""
+        validator = TestDataCollector("test", ["BTC-USDT"])
+
+        payload = dict(
+            timestamp=1609459200000,  # 2021-01-01T00:00:00Z as Unix milliseconds
+            open="50000",
+            high="50100",
+            low="49900",
+            close="50050",
+            volume="1.5",
+            trades_count=100
+        )
+
+        candle = validator.validate_ohlcv_data(payload, "BTC-USDT", "1m")
+
+        assert candle.symbol == "BTC-USDT"
+        assert candle.timeframe == "1m"
+        assert candle.trades_count == 100
+        assert isinstance(candle.open, Decimal)
+
+    def test_validate_ohlcv_data_missing_field(self):
+        """Omitting a required key is reported by name in the DataValidationError."""
+        validator = TestDataCollector("test", ["BTC-USDT"])
+
+        payload = dict(
+            timestamp=1609459200000,
+            open="50000",
+            high="50100",
+            # 'low' is left out on purpose
+            close="50050",
+            volume="1.5"
+        )
+
+        with pytest.raises(DataValidationError, match="Missing required field: low"):
+            validator.validate_ohlcv_data(payload, "BTC-USDT", "1m")
+
+    def test_validate_ohlcv_data_invalid_timestamp(self):
+        """A timestamp that is not numeric is rejected with DataValidationError."""
+        validator = TestDataCollector("test", ["BTC-USDT"])
+
+        payload = dict(
+            timestamp="invalid_timestamp",
+            open="50000",
+            high="50100",
+            low="49900",
+            close="50050",
+            volume="1.5"
+        )
+
+        with pytest.raises(DataValidationError):
+            validator.validate_ohlcv_data(payload, "BTC-USDT", "1m")
+
+
+@pytest.mark.asyncio
+async def test_connection_error_handling():
+    """A failed start() leaves the collector in ERROR; a later start() succeeds once connect() works."""
+    
+    class FailingCollector(TestDataCollector):  # connect() can be made to fail on demand
+        def __init__(self):
+            super().__init__("test", ["BTC-USDT"])
+            self.connect_attempts = 0  # incremented on every connect() call
+            self.should_fail = True  # while True, attempts 1 and 2 return False
+        
+        async def connect(self) -> bool:
+            self.connect_attempts += 1
+            if self.should_fail and self.connect_attempts < 3:
+                return False  # Fail first 2 attempts
+            return await super().connect()
+    
+    collector = FailingCollector()
+    
+    # First start should fail
+    success = await collector.start()
+    assert not success
+    assert collector.status == CollectorStatus.ERROR
+    
+    # Reset for retry and allow success
+    collector._reconnect_attempts = 0
+    collector.status = CollectorStatus.STOPPED
+    collector.connect_attempts = 0  # Reset connection attempts
+    collector.should_fail = False   # Allow connection to succeed
+    
+    # This attempt should succeed
+    success = await collector.start()
+    assert success
+    assert collector.status == CollectorStatus.RUNNING
+    
+    await collector.stop()
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])  # executed directly: run only this module, verbosely
--- a/tests/test_collector_manager.py
+++ b/tests/test_collector_manager.py
@@ -0,0 +1,341 @@
+"""
+Unit tests for the CollectorManager class.
+"""
+
+import asyncio
+import pytest
+from datetime import datetime, timezone
+from unittest.mock import AsyncMock, MagicMock
+
+from data.collector_manager import CollectorManager, ManagerStatus, CollectorConfig
+from data.base_collector import BaseDataCollector, DataType, CollectorStatus
+
+
+class MockDataCollector(BaseDataCollector):
+    """Ticker-only collector double whose connect/subscribe failures can be scripted."""
+
+    def __init__(self, exchange_name: str, symbols: list, auto_restart: bool = True):
+        super().__init__(exchange_name, symbols, [DataType.TICKER], auto_restart=auto_restart)
+        self.connected = self.subscribed = False
+        # Failure switches; tests flip these before starting the collector
+        self.should_fail_connect = self.should_fail_subscribe = False
+        self.fail_count = 0  # number of connect() calls rejected so far
+
+    async def connect(self) -> bool:
+        """Reject the first two attempts while should_fail_connect is set, then succeed."""
+        if self.should_fail_connect and self.fail_count < 2:
+            self.fail_count += 1
+            return False
+        await asyncio.sleep(0.01)
+        self.connected = True
+        return True
+
+    async def disconnect(self) -> None:
+        """Drop the simulated connection together with any subscription."""
+        await asyncio.sleep(0.01)
+        self.connected = self.subscribed = False
+
+    async def subscribe_to_data(self, symbols: list, data_types: list) -> bool:
+        """Succeed only when connected and not instructed to fail."""
+        if self.should_fail_subscribe or not self.connected:
+            return False
+        self.subscribed = True
+        return True
+
+    async def unsubscribe_from_data(self, symbols: list, data_types: list) -> bool:
+        """Clear the subscribed flag; always reports success."""
+        self.subscribed = False
+        return True
+
+    async def _process_message(self, message) -> None:
+        """Intentionally a no-op: this mock never produces data points."""
+        return None
+
+    async def _handle_messages(self) -> None:
+        """Stand in for message handling by idling for 0.1 s."""
+        await asyncio.sleep(0.1)
+
+
+class TestCollectorManager:
+    """Test cases for CollectorManager, driven through MockDataCollector instances."""
+
+    @pytest.fixture
+    def manager(self):
+        """Create a stopped manager with a 1-second global health-check interval."""
+        return CollectorManager("test_manager", global_health_check_interval=1.0)
+
+    @pytest.fixture
+    def mock_collector(self):
+        """Create a mock "okx" collector for BTC-USDT and ETH-USDT."""
+        return MockDataCollector("okx", ["BTC-USDT", "ETH-USDT"])
+
+    def test_initialization(self, manager):
+        """Test manager initialization."""
+        assert manager.manager_name == "test_manager"
+        assert manager.status == ManagerStatus.STOPPED
+        assert len(manager._collectors) == 0
+        assert len(manager._enabled_collectors) == 0
+
+    def test_add_collector(self, manager, mock_collector):
+        """Test adding a collector to the manager."""
+        # Add collector
+        manager.add_collector(mock_collector)
+
+        assert len(manager._collectors) == 1
+        assert len(manager._enabled_collectors) == 1
+
+        # Verify collector is in the collections
+        collector_names = manager.list_collectors()
+        assert len(collector_names) == 1
+        assert collector_names[0].startswith("okx_")
+
+        # Test with custom config using a different collector instance
+        mock_collector2 = MockDataCollector("binance", ["ETH-USDT"])
+        config = CollectorConfig(
+            name="custom_collector",
+            exchange="binance",
+            symbols=["ETH-USDT"],
+            data_types=["ticker"],
+            enabled=False
+        )
+        manager.add_collector(mock_collector2, config)
+        assert len(manager._collectors) == 2
+        assert len(manager._enabled_collectors) == 1  # Still 1 since second is disabled
+
+    def test_remove_collector(self, manager, mock_collector):
+        """Test removing a collector from the manager."""
+        # Add then remove
+        manager.add_collector(mock_collector)
+        collector_names = manager.list_collectors()
+        collector_name = collector_names[0]
+
+        success = manager.remove_collector(collector_name)
+        assert success
+        assert len(manager._collectors) == 0
+        assert len(manager._enabled_collectors) == 0
+
+        # Test removing non-existent collector
+        success = manager.remove_collector("non_existent")
+        assert not success
+
+    def test_enable_disable_collector(self, manager, mock_collector):
+        """Test enabling and disabling collectors."""
+        manager.add_collector(mock_collector)
+        collector_name = manager.list_collectors()[0]
+
+        # Initially enabled
+        assert collector_name in manager._enabled_collectors
+
+        # Disable
+        success = manager.disable_collector(collector_name)
+        assert success
+        assert collector_name not in manager._enabled_collectors
+
+        # Enable again
+        success = manager.enable_collector(collector_name)
+        assert success
+        assert collector_name in manager._enabled_collectors
+
+        # Test with non-existent collector
+        success = manager.enable_collector("non_existent")
+        assert not success
+
+    @pytest.mark.asyncio
+    async def test_start_stop_manager(self, manager, mock_collector):
+        """Test starting and stopping the manager."""
+        # Add a collector
+        manager.add_collector(mock_collector)
+
+        # Start manager
+        success = await manager.start()
+        assert success
+        assert manager.status == ManagerStatus.RUNNING
+
+        # Wait a bit for collectors to start
+        await asyncio.sleep(0.2)
+
+        # Check collector is running
+        running_collectors = manager.get_running_collectors()
+        assert len(running_collectors) == 1
+
+        # Stop manager
+        await manager.stop()
+        assert manager.status == ManagerStatus.STOPPED
+
+        # Check collector is stopped
+        running_collectors = manager.get_running_collectors()
+        assert len(running_collectors) == 0
+
+    @pytest.mark.asyncio
+    async def test_restart_collector(self, manager, mock_collector):
+        """Test restarting a specific collector."""
+        manager.add_collector(mock_collector)
+        await manager.start()
+
+        collector_name = manager.list_collectors()[0]
+
+        # Wait for collector to start
+        await asyncio.sleep(0.2)
+
+        # Restart the collector
+        success = await manager.restart_collector(collector_name)
+        assert success
+
+        # Check statistics
+        status = manager.get_status()
+        assert status['statistics']['restarts_performed'] >= 1
+
+        await manager.stop()
+
+    @pytest.mark.asyncio
+    async def test_health_monitoring(self, manager):
+        """Test that the manager stays queryable while a collector fails to connect."""
+        # Create a collector that will fail initially
+        failing_collector = MockDataCollector("test", ["BTC-USDT"], auto_restart=True)
+        failing_collector.should_fail_connect = True
+
+        manager.add_collector(failing_collector)
+        await manager.start()
+
+        # Wait for health checks
+        await asyncio.sleep(2.5)  # More than health check interval
+
+        # Snapshot the reports, then stop first so a failing assertion cannot leak tasks
+        status = manager.get_status()
+        failed_collectors = manager.get_failed_collectors()
+        await manager.stop()
+
+        assert 'restarts_performed' in status['statistics']
+        assert len(failed_collectors) <= 1  # 0 if the only collector already recovered
+        assert failing_collector.fail_count >= 1  # the scripted connect failure really happened
+
+    def test_get_status(self, manager, mock_collector):
+        """Test status reporting."""
+        manager.add_collector(mock_collector)
+
+        status = manager.get_status()
+
+        assert status['manager_status'] == 'stopped'
+        assert status['total_collectors'] == 1
+        assert len(status['enabled_collectors']) == 1
+        assert 'statistics' in status
+        assert 'collectors' in status
+
+    def test_get_collector_status(self, manager, mock_collector):
+        """Test getting individual collector status."""
+        manager.add_collector(mock_collector)
+        collector_name = manager.list_collectors()[0]
+
+        collector_status = manager.get_collector_status(collector_name)
+
+        assert collector_status is not None
+        assert collector_status['name'] == collector_name
+        assert 'config' in collector_status
+        assert 'status' in collector_status
+        assert 'health' in collector_status
+
+        # Test non-existent collector
+        non_existent_status = manager.get_collector_status("non_existent")
+        assert non_existent_status is None
+
+    @pytest.mark.asyncio
+    async def test_restart_all_collectors(self, manager):
+        """Test restarting all collectors."""
+        # Add multiple collectors
+        collector1 = MockDataCollector("okx", ["BTC-USDT"])
+        collector2 = MockDataCollector("binance", ["ETH-USDT"])
+
+        manager.add_collector(collector1)
+        manager.add_collector(collector2)
+
+        await manager.start()
+        await asyncio.sleep(0.2)  # Let them start
+
+        # Restart all
+        results = await manager.restart_all_collectors()
+
+        assert len(results) == 2
+        assert all(success for success in results.values())
+
+        await manager.stop()
+
+    def test_get_running_and_failed_collectors(self, manager, mock_collector):
+        """Test getting running and failed collector lists."""
+        manager.add_collector(mock_collector)
+
+        # Initially no running collectors
+        running = manager.get_running_collectors()
+        failed = manager.get_failed_collectors()
+
+        assert len(running) == 0
+        assert len(failed) <= 1  # may be empty: the only collector was never started
+
+    def test_collector_config(self):
+        """Test CollectorConfig dataclass."""
+        config = CollectorConfig(
+            name="test_collector",
+            exchange="okx",
+            symbols=["BTC-USDT", "ETH-USDT"],
+            data_types=["ticker", "trade"],
+            auto_restart=True,
+            health_check_interval=30.0,
+            enabled=True
+        )
+
+        assert config.name == "test_collector"
+        assert config.exchange == "okx"
+        assert len(config.symbols) == 2
+        assert len(config.data_types) == 2
+        assert config.auto_restart is True
+        assert config.enabled is True
+
+
+@pytest.mark.asyncio
+async def test_manager_with_connection_failures():
+    """Test that the manager starts and keeps reporting when a collector cannot connect."""
+    manager = CollectorManager("test_manager", global_health_check_interval=0.5)
+
+    # Create a collector that fails connection initially
+    failing_collector = MockDataCollector("failing_exchange", ["BTC-USDT"])
+    failing_collector.should_fail_connect = True
+
+    manager.add_collector(failing_collector)
+
+    # Start manager
+    success = await manager.start()
+    assert success  # Manager should start even if collectors fail
+
+    # Wait for some health checks
+    await asyncio.sleep(1.5)
+
+    # Snapshot the reports, then stop first so a failing assertion cannot leak tasks
+    failed_collectors = manager.get_failed_collectors()
+    status = manager.get_status()
+    await manager.stop()
+
+    assert len(failed_collectors) <= 1  # only one collector is registered
+    assert 'restarts_performed' in status['statistics']
+    assert failing_collector.fail_count >= 1  # the scripted connect failure really happened
+
+
+@pytest.mark.asyncio
+async def test_manager_graceful_shutdown():
+    """Stopping a manager that supervises several collectors ends in STOPPED."""
+    supervisor = CollectorManager("test_manager")
+
+    # Register three independent mock collectors, one per fake exchange
+    exchange_names = [f"exchange_{index}" for index in range(3)]
+    for exchange_name in exchange_names:
+        supervisor.add_collector(MockDataCollector(exchange_name, ["BTC-USDT"]))
+
+    await supervisor.start()
+    await asyncio.sleep(0.2)
+
+    # stop() has to return even though every collector needs time to disconnect
+    await supervisor.stop()
+
+    assert supervisor.status == ManagerStatus.STOPPED
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])  # executed directly: run only this module, verbosely
--- a/uv.lock
+++ b/uv.lock
@@ -428,6 +428,11 @@ dev = [
    { name = "pytest-mock" },
 ]

+[package.dev-dependencies]
+dev = [
+    { name = "pytest-asyncio" },
+]
+
 [package.metadata]
 requires-dist = [
    { name = "aiohttp", specifier = ">=3.8.0" },
@@ -462,6 +467,9 @@ requires-dist = [
 ]
 provides-extras = ["dev"]

+[package.metadata.requires-dev]
+dev = [{ name = "pytest-asyncio", specifier = ">=1.0.0" }]
+
 [[package]]
 name = "distlib"
 version = "0.3.9"