TCPDashboard/data/collector/collector_state_telemetry.py
Vasily.onl 60434afd5d Refactor BaseDataCollector to utilize CollectorStateAndTelemetry for improved state management
- Introduced a new `CollectorStateAndTelemetry` class to encapsulate the status, health checks, and statistics of the data collector, promoting modularity and separation of concerns.
- Updated `BaseDataCollector` to replace direct status management with calls to the new telemetry class, enhancing maintainability and readability.
- Refactored logging methods to utilize the telemetry class, ensuring consistent logging practices.
- Modified the `OKXCollector` to integrate with the new telemetry system for improved status reporting and error handling.
- Added comprehensive tests for the `CollectorStateAndTelemetry` class to ensure functionality and reliability.

These changes streamline the data collector's architecture, aligning with project standards for maintainability and performance.
2025-06-09 17:27:29 +08:00

223 lines
8.8 KiB
Python

"""
Module for managing collector status, health, and telemetry.
This module provides a dedicated class to encapsulate the status,
health checks, and statistical data for a data collector, promoting
modularity and separation of concerns.
"""
from datetime import datetime, timezone, timedelta
from enum import Enum
from typing import Any, Dict, Optional
class CollectorStatus(Enum):
    """Lifecycle and health states a data collector can report.

    Each member's value is the lowercase string form of its name.
    """

    # Steady and transitional lifecycle states
    STOPPED = "stopped"
    STARTING = "starting"
    RUNNING = "running"
    STOPPING = "stopping"

    # Failure and recovery states
    ERROR = "error"
    RECONNECTING = "reconnecting"
    UNHEALTHY = "unhealthy"  # Used by health monitoring
class CollectorStateAndTelemetry:
    """
    Hold a data collector's lifecycle state, liveness timestamps and counters.

    The object keeps three groups of data:

    * state: ``status`` plus the actual (``_running``) and desired
      (``_should_be_running``) running flags;
    * liveness: the last heartbeat and last data-received timestamps,
      both taken from the UTC clock;
    * statistics: the ``_stats`` dictionary of counters and markers.

    It also wraps an optional logger. Debug, info and warning messages are
    dropped when ``log_errors_only`` is set; error and critical messages are
    emitted whenever a logger is present.
    """

    def __init__(self,
                 exchange_name: str,
                 component_name: str,
                 health_check_interval: float = 30.0,
                 max_silence_duration: timedelta = timedelta(minutes=5),
                 logger=None,
                 log_errors_only: bool = False):
        """
        Create the telemetry holder in the STOPPED state with zeroed counters.

        Args:
            exchange_name: Reported under the 'exchange' key of get_status().
            component_name: Stored on the instance; not read by this class.
            health_check_interval: Seconds; get_health_status() flags a
                heartbeat older than twice this value.
            max_silence_duration: Longest tolerated gap since the last data
                timestamp before get_health_status() reports an issue.
            logger: Optional logger-like object; when falsy nothing is logged.
            log_errors_only: When true, only error/critical messages are logged.
        """
        # Identity and configuration
        self.exchange_name = exchange_name
        self.component_name = component_name
        self.health_check_interval = health_check_interval
        self._max_silence_duration = max_silence_duration
        self.logger = logger
        self.log_errors_only = log_errors_only

        # Lifecycle: reported status, actual running flag, desired running flag
        self.status = CollectorStatus.STOPPED
        self._running = False
        self._should_be_running = False

        # Liveness: the heartbeat starts at construction time, data has not
        # been seen yet
        self._last_heartbeat = datetime.now(timezone.utc)
        self._last_data_received: Optional[datetime] = None

        # Statistics: integer counters first, then markers that stay None
        # until something sets them. Key order is kept because get_status()
        # spreads this dict into its output.
        counters = dict.fromkeys(
            ('messages_received', 'messages_processed', 'errors', 'restarts'), 0)
        markers = dict.fromkeys(
            ('last_message_time', 'connection_uptime', 'last_error', 'last_restart_time'))
        self._stats: Dict[str, Any] = {**counters, **markers}

    # ------------------------------------------------------------------
    # Logging helpers
    # ------------------------------------------------------------------

    def _log_debug(self, message: str) -> None:
        """Emit a debug message unless there is no logger or errors-only mode is on."""
        if not self.logger or self.log_errors_only:
            return
        self.logger.debug(message)

    def _log_info(self, message: str) -> None:
        """Emit an info message unless there is no logger or errors-only mode is on."""
        if not self.logger or self.log_errors_only:
            return
        self.logger.info(message)

    def _log_warning(self, message: str) -> None:
        """Emit a warning message unless there is no logger or errors-only mode is on."""
        if not self.logger or self.log_errors_only:
            return
        self.logger.warning(message)

    def _log_error(self, message: str, exc_info: bool = False) -> None:
        """Emit an error message whenever a logger exists (errors-only mode is ignored)."""
        if not self.logger:
            return
        self.logger.error(message, exc_info=exc_info)

    def _log_critical(self, message: str, exc_info: bool = False) -> None:
        """Emit a critical message whenever a logger exists (errors-only mode is ignored)."""
        if not self.logger:
            return
        self.logger.critical(message, exc_info=exc_info)

    # ------------------------------------------------------------------
    # State setters
    # ------------------------------------------------------------------

    def update_status(self, new_status: CollectorStatus) -> None:
        """Store *new_status* as the current status and log its value at debug level."""
        self.status = new_status
        status_text = new_status.value
        self._log_debug(f"Collector status updated to: {status_text}")

    def set_running_state(self, is_running: bool) -> None:
        """Record whether the collector is actually running right now."""
        self._running = is_running
        self._log_debug(f"Collector internal running state set to: {is_running}")

    def set_should_be_running(self, should_run: bool) -> None:
        """Record whether the collector is supposed to be running."""
        self._should_be_running = should_run
        self._log_debug(f"Collector desired running state set to: {should_run}")

    # ------------------------------------------------------------------
    # Liveness timestamps
    # ------------------------------------------------------------------

    def update_heartbeat(self) -> None:
        """Stamp the heartbeat with the current UTC time."""
        self._last_heartbeat = datetime.now(timezone.utc)
        self._log_debug("Heartbeat updated")

    def update_data_received_timestamp(self) -> None:
        """Stamp the last-data-received marker with the current UTC time."""
        self._last_data_received = datetime.now(timezone.utc)
        self._log_debug("Last data received timestamp updated")

    # ------------------------------------------------------------------
    # Counters
    # ------------------------------------------------------------------

    def increment_messages_received(self) -> None:
        """Add one to the received-message counter and log the new total."""
        total = self._stats['messages_received'] + 1
        self._stats['messages_received'] = total
        self._log_debug(f"Messages received: {total}")

    def increment_messages_processed(self) -> None:
        """Add one to the processed-message counter and log the new total."""
        total = self._stats['messages_processed'] + 1
        self._stats['messages_processed'] = total
        self._log_debug(f"Messages processed: {total}")

    def increment_errors(self, error_message: str) -> None:
        """Add one to the error counter, remember *error_message*, and log both."""
        total = self._stats['errors'] + 1
        self._stats['errors'] = total
        self._stats['last_error'] = error_message
        self._log_error(f"Error count: {total}, Last error: {error_message}")

    def increment_restarts(self) -> None:
        """Add one to the restart counter and stamp the restart time (UTC)."""
        total = self._stats['restarts'] + 1
        self._stats['restarts'] = total
        self._stats['last_restart_time'] = datetime.now(timezone.utc)
        self._log_info(f"Collector restarts: {total}")

    def set_connection_uptime_start(self) -> None:
        """Mark the current UTC time as the start of the connection uptime."""
        self._stats['connection_uptime'] = datetime.now(timezone.utc)
        self._log_debug("Connection uptime start set")

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------

    @staticmethod
    def _seconds_since(moment: Optional[datetime]) -> Optional[float]:
        """Return the seconds elapsed from *moment* to now (UTC), or None if *moment* is unset."""
        if not moment:
            return None
        return (datetime.now(timezone.utc) - moment).total_seconds()

    @staticmethod
    def _iso_or_none(moment: Optional[datetime]) -> Optional[str]:
        """Return *moment* in ISO 8601 form, or None if *moment* is unset."""
        if not moment:
            return None
        return moment.isoformat()

    def get_status(self) -> Dict[str, Any]:
        """
        Build a snapshot of the collector's status, health timings and statistics.

        Returns:
            A dictionary with the keys 'exchange', 'status',
            'should_be_running', 'auto_restart', 'health' and 'statistics'.
            Elapsed times are in seconds and are None when the matching
            timestamp has never been set. 'auto_restart' is always True and
            'reconnect_attempts' is always 0 here; they are placeholders for
            values owned by other components.
        """
        # Elapsed values are computed in the order uptime, heartbeat, data;
        # each one reads the clock separately.
        uptime_seconds = self._seconds_since(self._stats['connection_uptime'])
        heartbeat_age = self._seconds_since(self._last_heartbeat)
        data_age = self._seconds_since(self._last_data_received)

        report: Dict[str, Any] = {
            'exchange': self.exchange_name,
            'status': self.status.value,
            'should_be_running': self._should_be_running,
            'auto_restart': True,  # Owned by the base collector, not by this class
        }
        report['health'] = {
            'time_since_heartbeat': heartbeat_age,
            'time_since_data': data_age,
            'max_silence_duration': self._max_silence_duration.total_seconds(),
        }
        report['statistics'] = {
            **self._stats,
            'uptime_seconds': uptime_seconds,
            'reconnect_attempts': 0,  # Owned by the connection manager
        }
        return report

    def get_health_status(self) -> Dict[str, Any]:
        """
        Evaluate the collector's health and list every problem found.

        Four checks run against a single UTC clock reading: desired versus
        actual running state, heartbeat age (limit: twice the health check
        interval), data age (limit: the maximum silence duration, skipped
        while no data has been received), and an ERROR/UNHEALTHY status.

        Returns:
            A dictionary with 'is_healthy', 'issues', 'status',
            'last_heartbeat', 'last_data_received', 'should_be_running'
            and 'is_running'. Timestamps are ISO 8601 strings or None.
        """
        now = datetime.now(timezone.utc)
        problems = []

        # Desired state says "run" but the collector is not running
        if self._should_be_running and not self._running:
            problems.append("Should be running but is stopped")

        # Heartbeat older than two health-check intervals
        if self._last_heartbeat:
            heartbeat_age = now - self._last_heartbeat
            if heartbeat_age > timedelta(seconds=self.health_check_interval * 2):
                problems.append(f"No heartbeat for {heartbeat_age.total_seconds():.1f}s")

        # Data stream silent for longer than allowed
        if self._last_data_received:
            data_age = now - self._last_data_received
            if data_age > self._max_silence_duration:
                problems.append(f"No data for {data_age.total_seconds():.1f}s")

        # Status itself signals a failure
        if self.status in (CollectorStatus.ERROR, CollectorStatus.UNHEALTHY):
            problems.append(f"Status: {self.status.value}")

        return {
            # Every failed check appends exactly one entry, so an empty list
            # means healthy.
            'is_healthy': not problems,
            'issues': problems,
            'status': self.status.value,
            'last_heartbeat': self._iso_or_none(self._last_heartbeat),
            'last_data_received': self._iso_or_none(self._last_data_received),
            'should_be_running': self._should_be_running,
            'is_running': self._running,
        }