"""
Module for managing collector status, health, and telemetry.

This module provides a dedicated class to encapsulate the status,
health checks, and statistical data for a data collector, promoting
modularity and separation of concerns.
"""

from datetime import datetime, timezone, timedelta
from enum import Enum
from typing import Any, Dict, Optional


class CollectorStatus(Enum):
    """Status of the data collector."""
    STOPPED = "stopped"
    STARTING = "starting"
    RUNNING = "running"
    STOPPING = "stopping"
    ERROR = "error"
    RECONNECTING = "reconnecting"
    UNHEALTHY = "unhealthy"  # Added for health monitoring


class CollectorStateAndTelemetry:
    """
    Manages the operational state, health, and performance statistics
    of a data collector.
    """

    def __init__(self,
                 exchange_name: str,
                 component_name: str,
                 health_check_interval: float = 30.0,
                 max_silence_duration: timedelta = timedelta(minutes=5),
                 logger=None,
                 log_errors_only: bool = False):
        self.exchange_name = exchange_name
        self.component_name = component_name
        self.health_check_interval = health_check_interval
        self._max_silence_duration = max_silence_duration
        self.logger = logger
        self.log_errors_only = log_errors_only

        # Collector state
        self.status = CollectorStatus.STOPPED
        self._running = False
        self._should_be_running = False  # Track desired state

        # Health monitoring
        self._last_heartbeat = datetime.now(timezone.utc)
        self._last_data_received = None

        # Statistics
        self._stats = {
            'messages_received': 0,
            'messages_processed': 0,
            'errors': 0,
            'restarts': 0,
            'last_message_time': None,
            'connection_uptime': None,
            'last_error': None,
            'last_restart_time': None
        }

    def _log_debug(self, message: str) -> None:
        """Log debug message if logger is available and not in errors-only mode."""
        if self.logger and not self.log_errors_only:
            self.logger.debug(message)

    def _log_info(self, message: str) -> None:
        """Log info message if logger is available and not in errors-only mode."""
        if self.logger and not self.log_errors_only:
            self.logger.info(message)

    def _log_warning(self, message: str) -> None:
        """Log warning message if logger is available and not in errors-only mode."""
        if self.logger and not self.log_errors_only:
            self.logger.warning(message)

    def _log_error(self, message: str, exc_info: bool = False) -> None:
        """Log error message if logger is available (always logs errors regardless of log_errors_only)."""
        if self.logger:
            self.logger.error(message, exc_info=exc_info)

    def _log_critical(self, message: str, exc_info: bool = False) -> None:
        """Log critical message if logger is available (always logs critical regardless of log_errors_only)."""
        if self.logger:
            self.logger.critical(message, exc_info=exc_info)

    def update_status(self, new_status: CollectorStatus) -> None:
        """Update the collector's operational status."""
        self.status = new_status
        self._log_debug(f"Collector status updated to: {new_status.value}")

    def set_running_state(self, is_running: bool) -> None:
        """Set the internal running state."""
        self._running = is_running
        self._log_debug(f"Collector internal running state set to: {is_running}")

    def set_should_be_running(self, should_run: bool) -> None:
        """Set the desired running state."""
        self._should_be_running = should_run
        self._log_debug(f"Collector desired running state set to: {should_run}")

    def update_heartbeat(self) -> None:
        """Update the last heartbeat timestamp."""
        self._last_heartbeat = datetime.now(timezone.utc)
        self._log_debug("Heartbeat updated")

    def update_data_received_timestamp(self) -> None:
        """Update the last data received timestamp."""
        self._last_data_received = datetime.now(timezone.utc)
        self._log_debug("Last data received timestamp updated")

    def increment_messages_received(self) -> None:
        """Increment the count of messages received."""
        self._stats['messages_received'] += 1
        self._log_debug(f"Messages received: {self._stats['messages_received']}")

    def increment_messages_processed(self) -> None:
        """Increment the count of messages processed."""
        self._stats['messages_processed'] += 1
        self._log_debug(f"Messages processed: {self._stats['messages_processed']}")

    def increment_errors(self, error_message: str) -> None:
        """Increment error count and store the last error message."""
        self._stats['errors'] += 1
        self._stats['last_error'] = error_message
        self._log_error(f"Error count: {self._stats['errors']}, Last error: {error_message}")

    def increment_restarts(self) -> None:
        """Increment restart count and update last restart time."""
        self._stats['restarts'] += 1
        self._stats['last_restart_time'] = datetime.now(timezone.utc)
        self._log_info(f"Collector restarts: {self._stats['restarts']}")

    def set_connection_uptime_start(self) -> None:
        """Set the connection uptime start time."""
        self._stats['connection_uptime'] = datetime.now(timezone.utc)
        self._log_debug("Connection uptime start set")

    def get_status(self) -> Dict[str, Any]:
        """
        Get current collector status and statistics.

        Returns:
            Dictionary containing status information
        """
        uptime_seconds = None
        if self._stats['connection_uptime']:
            uptime_seconds = (datetime.now(timezone.utc) - self._stats['connection_uptime']).total_seconds()

        time_since_heartbeat = None
        if self._last_heartbeat:
            time_since_heartbeat = (datetime.now(timezone.utc) - self._last_heartbeat).total_seconds()

        time_since_data = None
        if self._last_data_received:
            time_since_data = (datetime.now(timezone.utc) - self._last_data_received).total_seconds()

        return {
            'exchange': self.exchange_name,
            'status': self.status.value,
            'should_be_running': self._should_be_running,
            'auto_restart': True,  # This will be managed by the base collector, not state
            'health': {
                'time_since_heartbeat': time_since_heartbeat,
                'time_since_data': time_since_data,
                'max_silence_duration': self._max_silence_duration.total_seconds()
            },
            'statistics': {
                **self._stats,
                'uptime_seconds': uptime_seconds,
                'reconnect_attempts': 0  # This will be managed by connection manager
            }
        }

    def get_health_status(self) -> Dict[str, Any]:
        """
        Get detailed health status for monitoring.

        Returns:
            Dictionary containing health information
        """
        now = datetime.now(timezone.utc)

        is_healthy = True
        health_issues = []

        # Check if should be running but isn't
        if self._should_be_running and not self._running:
            is_healthy = False
            health_issues.append("Should be running but is stopped")

        # Check heartbeat
        if self._last_heartbeat:
            time_since_heartbeat = now - self._last_heartbeat
            if time_since_heartbeat > timedelta(seconds=self.health_check_interval * 2):
                is_healthy = False
                health_issues.append(f"No heartbeat for {time_since_heartbeat.total_seconds():.1f}s")

        # Check data freshness
        if self._last_data_received:
            time_since_data = now - self._last_data_received
            if time_since_data > self._max_silence_duration:
                is_healthy = False
                health_issues.append(f"No data for {time_since_data.total_seconds():.1f}s")

        # Check for error status
        if self.status in [CollectorStatus.ERROR, CollectorStatus.UNHEALTHY]:
            is_healthy = False
            health_issues.append(f"Status: {self.status.value}")

        return {
            'is_healthy': is_healthy,
            'issues': health_issues,
            'status': self.status.value,
            'last_heartbeat': self._last_heartbeat.isoformat() if self._last_heartbeat else None,
            'last_data_received': self._last_data_received.isoformat() if self._last_data_received else None,
            'should_be_running': self._should_be_running,
            'is_running': self._running
        }
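

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): a minimal
# example, assuming a hypothetical collector that drives this telemetry
# object directly. The exchange and component names below are placeholders
# chosen for the demo.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import logging

    logging.basicConfig(level=logging.DEBUG)
    telemetry = CollectorStateAndTelemetry(
        exchange_name="example_exchange",    # hypothetical name
        component_name="trades_collector",   # hypothetical name
        health_check_interval=30.0,
        max_silence_duration=timedelta(minutes=5),
        logger=logging.getLogger("collector"),
    )

    # Typical start-up sequence a collector might perform.
    telemetry.set_should_be_running(True)
    telemetry.update_status(CollectorStatus.STARTING)
    telemetry.set_running_state(True)
    telemetry.update_status(CollectorStatus.RUNNING)
    telemetry.set_connection_uptime_start()

    # Record a single simulated message.
    telemetry.update_heartbeat()
    telemetry.update_data_received_timestamp()
    telemetry.increment_messages_received()
    telemetry.increment_messages_processed()

    print(telemetry.get_status())
    print(telemetry.get_health_status())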