Refactor BaseDataCollector to utilize CollectorStateAndTelemetry for improved state management
- Introduced a new `CollectorStateAndTelemetry` class to encapsulate the status, health checks, and statistics of the data collector, promoting modularity and separation of concerns. - Updated `BaseDataCollector` to replace direct status management with calls to the new telemetry class, enhancing maintainability and readability. - Refactored logging methods to utilize the telemetry class, ensuring consistent logging practices. - Modified the `OKXCollector` to integrate with the new telemetry system for improved status reporting and error handling. - Added comprehensive tests for the `CollectorStateAndTelemetry` class to ensure functionality and reliability. These changes streamline the data collector's architecture, aligning with project standards for maintainability and performance.
This commit is contained in:
223
data/collector/collector_state_telemetry.py
Normal file
223
data/collector/collector_state_telemetry.py
Normal file
@@ -0,0 +1,223 @@
|
||||
"""
|
||||
Module for managing collector status, health, and telemetry.
|
||||
|
||||
This module provides a dedicated class to encapsulate the status,
|
||||
health checks, and statistical data for a data collector, promoting
|
||||
modularity and separation of concerns.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
|
||||
class CollectorStatus(Enum):
|
||||
"""Status of the data collector."""
|
||||
STOPPED = "stopped"
|
||||
STARTING = "starting"
|
||||
RUNNING = "running"
|
||||
STOPPING = "stopping"
|
||||
ERROR = "error"
|
||||
RECONNECTING = "reconnecting"
|
||||
UNHEALTHY = "unhealthy" # Added for health monitoring
|
||||
|
||||
|
||||
class CollectorStateAndTelemetry:
|
||||
"""
|
||||
Manages the operational state, health, and performance statistics
|
||||
of a data collector.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
exchange_name: str,
|
||||
component_name: str,
|
||||
health_check_interval: float = 30.0,
|
||||
max_silence_duration: timedelta = timedelta(minutes=5),
|
||||
logger=None,
|
||||
log_errors_only: bool = False):
|
||||
self.exchange_name = exchange_name
|
||||
self.component_name = component_name
|
||||
self.health_check_interval = health_check_interval
|
||||
self._max_silence_duration = max_silence_duration
|
||||
self.logger = logger
|
||||
self.log_errors_only = log_errors_only
|
||||
|
||||
# Collector state
|
||||
self.status = CollectorStatus.STOPPED
|
||||
self._running = False
|
||||
self._should_be_running = False # Track desired state
|
||||
|
||||
# Health monitoring
|
||||
self._last_heartbeat = datetime.now(timezone.utc)
|
||||
self._last_data_received = None
|
||||
|
||||
# Statistics
|
||||
self._stats = {
|
||||
'messages_received': 0,
|
||||
'messages_processed': 0,
|
||||
'errors': 0,
|
||||
'restarts': 0,
|
||||
'last_message_time': None,
|
||||
'connection_uptime': None,
|
||||
'last_error': None,
|
||||
'last_restart_time': None
|
||||
}
|
||||
|
||||
def _log_debug(self, message: str) -> None:
|
||||
"""Log debug message if logger is available and not in errors-only mode."""
|
||||
if self.logger and not self.log_errors_only:
|
||||
self.logger.debug(message)
|
||||
|
||||
def _log_info(self, message: str) -> None:
|
||||
"""Log info message if logger is available and not in errors-only mode."""
|
||||
if self.logger and not self.log_errors_only:
|
||||
self.logger.info(message)
|
||||
|
||||
def _log_warning(self, message: str) -> None:
|
||||
"""Log warning message if logger is available and not in errors-only mode."""
|
||||
if self.logger and not self.log_errors_only:
|
||||
self.logger.warning(message)
|
||||
|
||||
def _log_error(self, message: str, exc_info: bool = False) -> None:
|
||||
"""Log error message if logger is available (always logs errors regardless of log_errors_only)."""
|
||||
if self.logger:
|
||||
self.logger.error(message, exc_info=exc_info)
|
||||
|
||||
def _log_critical(self, message: str, exc_info: bool = False) -> None:
|
||||
"""Log critical message if logger is available (always logs critical regardless of log_errors_only)."""
|
||||
if self.logger:
|
||||
self.logger.critical(message, exc_info=exc_info)
|
||||
|
||||
def update_status(self, new_status: CollectorStatus) -> None:
|
||||
"""Update the collector's operational status."""
|
||||
self.status = new_status
|
||||
self._log_debug(f"Collector status updated to: {new_status.value}")
|
||||
|
||||
def set_running_state(self, is_running: bool) -> None:
|
||||
"""Set the internal running state."""
|
||||
self._running = is_running
|
||||
self._log_debug(f"Collector internal running state set to: {is_running}")
|
||||
|
||||
def set_should_be_running(self, should_run: bool) -> None:
|
||||
"""Set the desired running state."""
|
||||
self._should_be_running = should_run
|
||||
self._log_debug(f"Collector desired running state set to: {should_run}")
|
||||
|
||||
def update_heartbeat(self) -> None:
|
||||
"""Update the last heartbeat timestamp."""
|
||||
self._last_heartbeat = datetime.now(timezone.utc)
|
||||
self._log_debug("Heartbeat updated")
|
||||
|
||||
def update_data_received_timestamp(self) -> None:
|
||||
"""Update the last data received timestamp."""
|
||||
self._last_data_received = datetime.now(timezone.utc)
|
||||
self._log_debug("Last data received timestamp updated")
|
||||
|
||||
def increment_messages_received(self) -> None:
|
||||
"""Increment the count of messages received."""
|
||||
self._stats['messages_received'] += 1
|
||||
self._log_debug(f"Messages received: {self._stats['messages_received']}")
|
||||
|
||||
def increment_messages_processed(self) -> None:
|
||||
"""Increment the count of messages processed."""
|
||||
self._stats['messages_processed'] += 1
|
||||
self._log_debug(f"Messages processed: {self._stats['messages_processed']}")
|
||||
|
||||
def increment_errors(self, error_message: str) -> None:
|
||||
"""Increment error count and store the last error message."""
|
||||
self._stats['errors'] += 1
|
||||
self._stats['last_error'] = error_message
|
||||
self._log_error(f"Error count: {self._stats['errors']}, Last error: {error_message}")
|
||||
|
||||
def increment_restarts(self) -> None:
|
||||
"""Increment restart count and update last restart time."""
|
||||
self._stats['restarts'] += 1
|
||||
self._stats['last_restart_time'] = datetime.now(timezone.utc)
|
||||
self._log_info(f"Collector restarts: {self._stats['restarts']}")
|
||||
|
||||
def set_connection_uptime_start(self) -> None:
|
||||
"""Set the connection uptime start time."""
|
||||
self._stats['connection_uptime'] = datetime.now(timezone.utc)
|
||||
self._log_debug("Connection uptime start set")
|
||||
|
||||
def get_status(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get current collector status and statistics.
|
||||
|
||||
Returns:
|
||||
Dictionary containing status information
|
||||
"""
|
||||
uptime_seconds = None
|
||||
if self._stats['connection_uptime']:
|
||||
uptime_seconds = (datetime.now(timezone.utc) - self._stats['connection_uptime']).total_seconds()
|
||||
|
||||
time_since_heartbeat = None
|
||||
if self._last_heartbeat:
|
||||
time_since_heartbeat = (datetime.now(timezone.utc) - self._last_heartbeat).total_seconds()
|
||||
|
||||
time_since_data = None
|
||||
if self._last_data_received:
|
||||
time_since_data = (datetime.now(timezone.utc) - self._last_data_received).total_seconds()
|
||||
|
||||
return {
|
||||
'exchange': self.exchange_name,
|
||||
'status': self.status.value,
|
||||
'should_be_running': self._should_be_running,
|
||||
'auto_restart': True, # This will be managed by the base collector, not state
|
||||
'health': {
|
||||
'time_since_heartbeat': time_since_heartbeat,
|
||||
'time_since_data': time_since_data,
|
||||
'max_silence_duration': self._max_silence_duration.total_seconds()
|
||||
},
|
||||
'statistics': {
|
||||
**self._stats,
|
||||
'uptime_seconds': uptime_seconds,
|
||||
'reconnect_attempts': 0 # This will be managed by connection manager
|
||||
}
|
||||
}
|
||||
|
||||
def get_health_status(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get detailed health status for monitoring.
|
||||
|
||||
Returns:
|
||||
Dictionary containing health information
|
||||
"""
|
||||
now = datetime.now(timezone.utc)
|
||||
|
||||
is_healthy = True
|
||||
health_issues = []
|
||||
|
||||
# Check if should be running but isn't
|
||||
if self._should_be_running and not self._running:
|
||||
is_healthy = False
|
||||
health_issues.append("Should be running but is stopped")
|
||||
|
||||
# Check heartbeat
|
||||
if self._last_heartbeat:
|
||||
time_since_heartbeat = now - self._last_heartbeat
|
||||
if time_since_heartbeat > timedelta(seconds=self.health_check_interval * 2):
|
||||
is_healthy = False
|
||||
health_issues.append(f"No heartbeat for {time_since_heartbeat.total_seconds():.1f}s")
|
||||
|
||||
# Check data freshness
|
||||
if self._last_data_received:
|
||||
time_since_data = now - self._last_data_received
|
||||
if time_since_data > self._max_silence_duration:
|
||||
is_healthy = False
|
||||
health_issues.append(f"No data for {time_since_data.total_seconds():.1f}s")
|
||||
|
||||
# Check for error status
|
||||
if self.status in [CollectorStatus.ERROR, CollectorStatus.UNHEALTHY]:
|
||||
is_healthy = False
|
||||
health_issues.append(f"Status: {self.status.value}")
|
||||
|
||||
return {
|
||||
'is_healthy': is_healthy,
|
||||
'issues': health_issues,
|
||||
'status': self.status.value,
|
||||
'last_heartbeat': self._last_heartbeat.isoformat() if self._last_heartbeat else None,
|
||||
'last_data_received': self._last_data_received.isoformat() if self._last_data_received else None,
|
||||
'should_be_running': self._should_be_running,
|
||||
'is_running': self._running
|
||||
}
|
||||
Reference in New Issue
Block a user