Add system health monitoring features with modular callbacks

- Introduced a new `system_health_constants.py` file to define thresholds and constants for system health metrics.
- Refactored existing system health callbacks into modular components, enhancing maintainability and clarity.
- Implemented dynamic loading of time range options in `charts.py`, improving flexibility in time range selection.
- Added detailed documentation for new callback functions, ensuring clarity on their purpose and usage.
- Enhanced error handling and logging practices across the new modules to ensure robust monitoring and debugging capabilities.

These changes significantly improve the architecture and maintainability of the system health monitoring features, aligning with project standards for modularity and performance.
This commit is contained in:
Vasily.onl
2025-06-11 19:33:08 +08:00
parent 3e0e89b826
commit d5db9402e8
11 changed files with 800 additions and 632 deletions

View File

@@ -0,0 +1,26 @@
from utils.logger import get_logger
from database.connection import DatabaseManager
from database.redis_manager import get_sync_redis_manager
import psutil
from datetime import datetime, timedelta
import dash_bootstrap_components as dbc
from dash import html
logger = get_logger("default_logger")
def _check_data_collection_service_running() -> bool:
    """Report whether a data collection service process appears to be alive.

    Walks the processes exposed by ``psutil.process_iter`` and looks for one
    whose joined command line mentions ``start_data_collection.py`` or
    ``collection_service``.

    Returns:
        True as soon as a matching process is seen. False when no process
        matches, or when the scan itself raises (the error is logged).
    """
    markers = ('start_data_collection.py', 'collection_service')
    try:
        for process in psutil.process_iter(['pid', 'name', 'cmdline']):
            try:
                argv = process.info['cmdline']
                if not argv:
                    # No command line available for this process; skip it.
                    continue
                joined = ' '.join(argv)
                if any(marker in joined for marker in markers):
                    return True
            except (psutil.NoSuchProcess, psutil.AccessDenied) as e:
                # A single unreadable/vanished process must not abort the scan.
                logger.warning(f"Access or process error checking service: {e}")
    except Exception as e:
        logger.error(f"Error checking data collection service running status: {e}")
        return False
    return False

View File

@@ -0,0 +1,234 @@
from dash import Output, Input, State, html, callback_context, no_update
import dash_bootstrap_components as dbc
from utils.logger import get_logger
from database.connection import DatabaseManager
from datetime import datetime, timedelta
from dashboard.callbacks.system_health_modules.common_health_utils import _check_data_collection_service_running
from config.constants.system_health_constants import (
DATA_FRESHNESS_RECENT_MINUTES,
DATA_FRESHNESS_STALE_HOURS
)
logger = get_logger("default_logger")
def register_data_collection_callbacks(app):
    """Attach the data collection callbacks to a Dash application.

    Five callbacks are registered: service status + metrics, individual
    collector status, the details modal, the logs modal, and the logs modal
    close button.

    Args:
        app: Dash application whose ``callback`` decorator is used for
            registration.
    """

    # --- Service status and metrics (interval tick or manual refresh) ---
    @app.callback(
        [Output('data-collection-service-status', 'children'),
         Output('data-collection-metrics', 'children')],
        [Input('interval-component', 'n_intervals'),
         Input('refresh-data-status-btn', 'n_clicks')]
    )
    def update_data_collection_status(n_intervals, refresh_clicks):
        """Render the service status panel and the metrics panel."""
        try:
            return (
                _get_data_collection_service_status(),
                _get_data_collection_metrics(),
            )
        except Exception as e:
            logger.error(f"Error updating data collection status: {e}")
            # The same alert is shown in both panels.
            failure = dbc.Alert(f"Error: {str(e)}", color="danger", dismissable=True)
            return failure, failure

    # --- Individual collectors (interval tick or manual refresh) ---
    @app.callback(
        Output('individual-collectors-status', 'children'),
        [Input('interval-component', 'n_intervals'),
         Input('refresh-data-status-btn', 'n_clicks')]
    )
    def update_individual_collectors_status(n_intervals, refresh_clicks):
        """Render the individual collector health panel."""
        try:
            return _get_individual_collectors_status()
        except Exception as e:
            logger.error(f"Error updating individual collectors status: {e}")
            return dbc.Alert(f"Error: {str(e)}", color="danger", dismissable=True)

    # --- Details modal ---
    @app.callback(
        [Output("collection-details-modal", "is_open"),
         Output("collection-details-content", "children")],
        [Input("view-collection-details-btn", "n_clicks")],
        [State("collection-details-modal", "is_open")]
    )
    def toggle_collection_details_modal(n_clicks, is_open):
        """Flip the details modal and fill it when the button was clicked."""
        if not n_clicks:
            # No click yet: leave the modal state and its content untouched.
            return is_open, no_update
        return not is_open, _get_collection_details_content()

    # --- Logs modal (open / refresh) ---
    @app.callback(
        [Output("collection-logs-modal", "is_open"),
         Output("collection-logs-content", "children")],
        [Input("view-collection-logs-btn", "n_clicks"),
         Input("refresh-logs-btn", "n_clicks")],
        [State("collection-logs-modal", "is_open")],
        prevent_initial_call=True
    )
    def toggle_collection_logs_modal(logs_clicks, refresh_clicks, is_open):
        """Open the logs modal with fresh content on view or refresh."""
        if not callback_context.triggered:
            return is_open, no_update
        if callback_context.triggered_id not in ("view-collection-logs-btn", "refresh-logs-btn"):
            return is_open, no_update
        # Both buttons force the modal open and reload its content.
        return True, _get_collection_logs_content()

    # --- Logs modal (close button) ---
    @app.callback(
        Output("collection-logs-modal", "is_open", allow_duplicate=True),
        Input("close-logs-modal", "n_clicks"),
        State("collection-logs-modal", "is_open"),
        prevent_initial_call=True
    )
    def close_logs_modal(n_clicks, is_open):
        """Invert the logs modal state once the close button has been clicked."""
        return (not is_open) if n_clicks else is_open
def _get_data_collection_service_status() -> html.Div:
    """Build the detailed status panel for the data collection service.

    Returns:
        A ``html.Div`` holding a status badge, the local time of the check,
        a status sentence and (when stopped) a start-up hint. If anything
        raises while building it, a danger ``dbc.Alert`` is returned instead.
    """
    try:
        running = _check_data_collection_service_running()
        checked_at = datetime.now().strftime('%H:%M:%S')

        if not running:
            badge = dbc.Badge("Service Stopped", color="danger", className="me-2")
            message = html.P("Data collection service is not running.", className="text-danger")
            # Tell the operator how to bring the service back up.
            extra = html.Div([
                html.P("To start the service, run:", className="mt-2 mb-1"),
                html.Code("python scripts/start_data_collection.py")
            ])
        else:
            badge = dbc.Badge("Service Running", color="success", className="me-2")
            message = html.P("Data collection service is actively collecting market data.", className="mb-0")
            extra = html.Div()

        checked_label = html.P(f"Checked: {checked_at}", className="text-muted mb-0")
        header = dbc.Row(
            [dbc.Col(badge, width="auto"), dbc.Col(checked_label, width="auto")],
            align="center",
            className="mb-2",
        )
        return html.Div([header, message, extra])
    except Exception as e:
        return dbc.Alert(f"Error checking status: {e}", color="danger")
def _to_naive_utc(timestamp):
    """Return *timestamp* as a naive datetime expressed in UTC.

    Aware values are converted to UTC before their tzinfo is dropped, so a
    non-UTC offset is not silently discarded. Naive values are returned
    unchanged and treated as already being UTC.
    """
    from datetime import timezone

    if timestamp.tzinfo is not None:
        return timestamp.astimezone(timezone.utc).replace(tzinfo=None)
    return timestamp


def _build_freshness_badge(age):
    """Return the freshness badge for data that is *age* (a timedelta) old."""
    # total_seconds() covers the whole span; ``.seconds`` alone drops days.
    total_seconds = age.total_seconds()
    minutes = int(total_seconds // 60)
    if age < timedelta(minutes=DATA_FRESHNESS_RECENT_MINUTES):
        return dbc.Badge(f"Fresh ({minutes}m ago)", color="success")
    if age < timedelta(hours=DATA_FRESHNESS_STALE_HOURS):
        return dbc.Badge(f"Recent ({minutes}m ago)", color="warning")
    # True division so the one-decimal format carries real information.
    return dbc.Badge(f"Stale ({total_seconds / 3600:.1f}h ago)", color="danger")


def _get_data_collection_metrics() -> html.Div:
    """Build the data collection metrics panel.

    Queries the row count of ``market_data``, the count of ``raw_trades``
    rows with ``data_type = 'ticker'``, and the newest ``timestamp`` in each
    table, then renders the counts together with a data freshness badge.

    Returns:
        A ``html.Div`` with three rows (candles, tickers, freshness). If
        anything raises, the error is logged and a danger ``dbc.Alert`` is
        returned instead.
    """
    try:
        from datetime import timezone
        from sqlalchemy import text

        db_manager = DatabaseManager()
        db_manager.initialize()
        with db_manager.get_session() as session:
            candles_count = session.execute(text("SELECT COUNT(*) FROM market_data")).scalar() or 0
            tickers_count = session.execute(text("SELECT COUNT(*) FROM raw_trades WHERE data_type = 'ticker'")).scalar() or 0
            latest_market_data = session.execute(text("SELECT MAX(timestamp) FROM market_data")).scalar()
            latest_raw_data = session.execute(text("SELECT MAX(timestamp) FROM raw_trades")).scalar()

        # Normalise before comparing: max() over a mix of naive and aware
        # datetimes raises TypeError.
        known_timestamps = [
            _to_naive_utc(ts)
            for ts in (latest_market_data, latest_raw_data)
            if ts is not None
        ]

        if known_timestamps:
            now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
            # Clamp at zero so a timestamp slightly in the future (clock
            # skew) is reported as "0m ago" instead of a wrapped value.
            age = max(now_utc - max(known_timestamps), timedelta(0))
            freshness_badge = _build_freshness_badge(age)
        else:
            freshness_badge = dbc.Badge("No data", color="secondary")

        return html.Div([
            dbc.Row([
                dbc.Col(html.Strong("Candles:")),
                dbc.Col(f"{candles_count:,}", className="text-end")
            ]),
            dbc.Row([
                dbc.Col(html.Strong("Tickers:")),
                dbc.Col(f"{tickers_count:,}", className="text-end")
            ]),
            dbc.Row([
                dbc.Col(html.Strong("Data Freshness:")),
                dbc.Col(freshness_badge, className="text-end")
            ])
        ])
    except Exception as e:
        # Log as well as display, matching the other health modules.
        logger.error(f"Error loading data collection metrics: {e}")
        return dbc.Alert(f"Error loading metrics: {e}", color="danger")
def _get_individual_collectors_status() -> html.Div:
    """Build the individual collectors panel.

    The panel is currently a static informational alert; no collector data
    is read.

    Returns:
        An info ``dbc.Alert`` with the placeholder text and start-up command,
        or a danger ``dbc.Alert`` if building it raises.
    """
    try:
        placeholder = html.P(
            "Individual collector health data will be displayed here when the data collection service is running.",
            className="mb-2",
        )
        instructions = html.P("To start monitoring, run the following command:", className="mb-1")
        command = html.Code("python scripts/start_data_collection.py")
        return dbc.Alert([placeholder, html.Hr(), instructions, command], color="info")
    except Exception as e:
        return dbc.Alert(f"Error checking collector status: {e}", color="danger")
def _get_collection_details_content() -> html.Div:
    """Build the body of the collection details modal.

    The content is static placeholder text: a title, an introduction, and
    three headed sections.

    Returns:
        A ``html.Div`` with the placeholder content, or a danger
        ``dbc.Alert`` if building it raises.
    """
    try:
        children = [
            html.H5("Data Collection Service Details"),
            html.P("Comprehensive data collection service information would be displayed here."),
            html.Hr(),
        ]
        # Each section is a heading followed by one placeholder paragraph.
        sections = (
            ("Configuration", "Service configuration details..."),
            ("Performance Metrics", "Detailed performance analytics..."),
            ("Health Status", "Individual collector health information..."),
        )
        for heading, body in sections:
            children.append(html.H6(heading))
            children.append(html.P(body))
        return html.Div(children)
    except Exception as e:
        return dbc.Alert(f"Error loading details: {e}", color="danger")
def _get_collection_logs_content() -> str:
    """Build the text shown in the collection logs modal.

    The text is a static placeholder stamped with the current local time;
    no log file is read here.

    Returns:
        The placeholder log text, or an ``Error loading logs: ...`` message
        if building the text raises.
    """
    try:
        # Local wall-clock time, used only for the placeholder's header line.
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return f"""[{current_time}] INFO - Data Collection Service Logs
Recent log entries would be displayed here from the data collection service.
This would include:
- Service startup/shutdown events
- Collector connection status changes
- Data collection statistics
- Error messages and warnings
- Performance metrics
To view real logs, check the logs/ directory or configure log file monitoring.
"""
    except Exception as e:
        # The failure is reported as plain text so the modal still renders.
        return f"Error loading logs: {str(e)}"

View File

@@ -0,0 +1,102 @@
from dash import Output, Input, html
import dash_bootstrap_components as dbc
from utils.logger import get_logger
from database.connection import DatabaseManager
from datetime import datetime, timedelta
from sqlalchemy import text
from config.constants.system_health_constants import (
DATABASE_RECENT_ACTIVITY_HOURS,
DATABASE_LARGEST_TABLES_LIMIT
)
from database.operations import get_database_operations
logger = get_logger("default_logger")
def register_database_callbacks(app):
    """Attach the database status/statistics callback to a Dash application.

    Args:
        app: Dash application whose ``callback`` decorator is used for
            registration.
    """

    @app.callback(
        [Output('database-status', 'children'),
         Output('database-stats', 'children')],
        Input('interval-component', 'n_intervals')
    )
    def update_database_status(n_intervals):
        """Render the database status panel and statistics panel per tick."""
        try:
            return _get_database_status(), _get_database_statistics()
        except Exception as e:
            logger.error(f"Error updating database status: {e}")
            # The same alert is shown in both panels.
            failure = dbc.Alert(f"Error: {str(e)}", color="danger", dismissable=True)
            return failure, failure
def _get_database_status() -> html.Div:
    """Build the detailed database status panel.

    Uses ``health_check()`` on the object returned by
    ``get_database_operations`` to decide between a connected and a
    disconnected presentation.

    Returns:
        A ``html.Div`` with a status badge, the local time of the check and
        a detail sentence. If the health check raises, the error is logged
        and a danger ``dbc.Alert`` is returned instead.
    """
    # Obtained outside the try block: a failure here propagates to the caller.
    operations = get_database_operations(logger)
    try:
        connected = operations.health_check()
        checked_at = datetime.now().strftime('%H:%M:%S')

        if not connected:
            badge = dbc.Badge("Database Disconnected", color="danger")
            detail = html.P("Could not connect to the database.", className="mb-0")
        else:
            badge = dbc.Badge("Database Connected", color="success")
            # Version/connection details are left to the statistics section.
            detail = html.P("Details available in Database Statistics section.", className="mb-0")

        checked_label = html.P(f"Checked: {checked_at}", className="text-muted")
        header = dbc.Row(
            [dbc.Col(badge, width="auto"), dbc.Col(checked_label, width="auto")],
            align="center",
            className="mb-2",
        )
        return html.Div([header, detail])
    except Exception as e:
        logger.error(f"Error connecting to database: {e}")
        return dbc.Alert(f"Error connecting to database: {e}", color="danger")
def _get_database_statistics() -> html.Div:
    """Build the database statistics panel.

    Reads the mapping returned by ``get_stats()`` and shows the values under
    ``bot_count``, ``candle_count`` and ``raw_trade_count`` (``N/A`` when a
    key is missing).

    Returns:
        A ``html.Div`` of label/value rows; a warning ``dbc.Alert`` when the
        stats are not flagged ``healthy``; or a danger ``dbc.Alert`` (after
        logging) when reading the stats raises.
    """
    # Obtained outside the try block: a failure here propagates to the caller.
    operations = get_database_operations(logger)
    try:
        stats = operations.get_stats()
        if not stats.get('healthy'):
            return dbc.Alert(f"Database statistics unavailable: {stats.get('error', 'Connection failed')}", color="warning")

        displayed = (
            ("Bots:", 'bot_count'),
            ("Candles:", 'candle_count'),
            ("Raw Trades:", 'raw_trade_count'),
        )
        # TODO: Integrate detailed table stats, recent activity from `database.operations` if available
        # Currently, `get_stats` does not provide this granular data directly.
        rows = [
            dbc.Row([
                dbc.Col(html.Strong(label)),
                dbc.Col(f"{stats.get(key, 'N/A')}", className="text-end")
            ])
            for label, key in displayed
        ]
        return html.Div(rows)
    except Exception as e:
        logger.error(f"Error loading database stats: {e}")
        return dbc.Alert(f"Error loading database stats: {e}", color="danger")

View File

@@ -0,0 +1,120 @@
import asyncio
import json
import subprocess
import psutil
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, List
from dash import Output, Input, State, html, callback_context, no_update
import dash_bootstrap_components as dbc
from utils.logger import get_logger
from database.connection import DatabaseManager
from database.redis_manager import get_sync_redis_manager
from config.constants.system_health_constants import (
CPU_GOOD_THRESHOLD, CPU_WARNING_THRESHOLD,
MEMORY_GOOD_THRESHOLD, MEMORY_WARNING_THRESHOLD
)
logger = get_logger("default_logger")
def register_quick_status_callbacks(app):
    """Attach the quick status (top cards) callback to a Dash application.

    Args:
        app: Dash application whose ``callback`` decorator is used for
            registration.
    """
    card_ids = (
        'data-collection-quick-status',
        'database-quick-status',
        'redis-quick-status',
        'performance-quick-status',
    )

    @app.callback(
        [Output(card_id, 'children') for card_id in card_ids],
        Input('interval-component', 'n_intervals')
    )
    def update_quick_status(n_intervals):
        """Refresh all four quick status badges on every interval tick."""
        try:
            return (
                _get_data_collection_quick_status(),
                _get_database_quick_status(),
                _get_redis_quick_status(),
                _get_performance_quick_status(),
            )
        except Exception as e:
            logger.error(f"Error updating quick status: {e}")
            # One shared error badge is shown on every card.
            failure_badge = dbc.Badge("🔴 Error", color="danger", className="me-1")
            return (failure_badge,) * 4
def _get_data_collection_quick_status() -> dbc.Badge:
    """Return the quick status badge for the data collection service.

    Returns:
        A green "Active" badge when the service check is truthy, a red
        "Stopped" badge otherwise, or a yellow "Unknown" badge (after
        logging) if the check raises.
    """
    try:
        if _check_data_collection_service_running():
            label, color = "Active", "success"
        else:
            label, color = "Stopped", "danger"
        return dbc.Badge(label, color=color, className="me-1")
    except Exception as e:
        logger.error(f"Error checking data collection quick status: {e}")
        return dbc.Badge("Unknown", color="warning", className="me-1")
def _get_database_quick_status() -> dbc.Badge:
    """Return the quick status badge for the database.

    Creates and initializes a ``DatabaseManager`` and calls its
    ``test_connection()``.

    Returns:
        A green "Connected" badge when the test is truthy; a red "Error"
        badge when it is falsy or when anything raises (the exception is
        logged).
    """
    try:
        manager = DatabaseManager()
        manager.initialize()
        if not manager.test_connection():
            return dbc.Badge("Error", color="danger", className="me-1")
        return dbc.Badge("Connected", color="success", className="me-1")
    except Exception as e:
        logger.error(f"Error checking database quick status: {e}")
        return dbc.Badge("Error", color="danger", className="me-1")
def _get_redis_quick_status() -> dbc.Badge:
    """Return the quick status badge for Redis.

    Obtains the sync Redis manager, initializes it and pings its client.

    Returns:
        A green "Connected" badge when the ping is truthy; a red "Error"
        badge when it is falsy or when anything raises (the exception is
        logged).
    """
    try:
        manager = get_sync_redis_manager()
        manager.initialize()
        if not manager.client.ping():
            return dbc.Badge("Error", color="danger", className="me-1")
        return dbc.Badge("Connected", color="success", className="me-1")
    except Exception as e:
        logger.error(f"Redis quick status check failed: {e}")
        return dbc.Badge("Error", color="danger", className="me-1")
def _get_performance_quick_status() -> dbc.Badge:
    """Return the quick status badge for CPU and memory load.

    Samples CPU usage over a 0.1 s interval and reads the virtual memory
    usage percentage, then grades the pair against the CPU_* and MEMORY_*
    thresholds.

    Returns:
        "Good" when both values are below their good thresholds, "Warning"
        when both are below their warning thresholds, "High" otherwise, or
        a grey "Unknown" badge (after logging) if sampling raises.
    """
    try:
        cpu_percent = psutil.cpu_percent(interval=0.1)
        memory = psutil.virtual_memory()

        all_good = cpu_percent < CPU_GOOD_THRESHOLD and memory.percent < MEMORY_GOOD_THRESHOLD
        if all_good:
            label, color = "Good", "success"
        elif cpu_percent < CPU_WARNING_THRESHOLD and memory.percent < MEMORY_WARNING_THRESHOLD:
            label, color = "Warning", "warning"
        else:
            label, color = "High", "danger"
        return dbc.Badge(label, color=color, className="me-1")
    except Exception as e:
        logger.error(f"Error checking performance quick status: {e}")
        return dbc.Badge("Unknown", color="secondary", className="me-1")
def _check_data_collection_service_running() -> bool:
    """Report whether a data collection service process appears to be alive.

    This module previously carried its own copy of the process scan that
    also lives in ``common_health_utils``. The name is kept for local
    callers, but the work is delegated to the shared helper so the two
    implementations cannot drift apart.

    Returns:
        True when the shared helper finds a matching process, else False.
    """
    # Imported locally under an alias so the shared helper does not clash
    # with this module-level wrapper of the same name.
    from dashboard.callbacks.system_health_modules.common_health_utils import (
        _check_data_collection_service_running as _shared_service_check,
    )

    return _shared_service_check()

View File

@@ -0,0 +1,82 @@
from dash import Output, Input, html
import dash_bootstrap_components as dbc
from utils.logger import get_logger
from database.redis_manager import get_sync_redis_manager
logger = get_logger("default_logger")
def register_redis_callbacks(app):
    """Attach the Redis status/statistics callback to a Dash application.

    Args:
        app: Dash application whose ``callback`` decorator is used for
            registration.
    """

    @app.callback(
        [Output('redis-status', 'children'),
         Output('redis-stats', 'children')],
        Input('interval-component', 'n_intervals')
    )
    def update_redis_status(n_intervals):
        """Render the Redis status panel and statistics panel per tick."""
        try:
            return _get_redis_status(), _get_redis_statistics()
        except Exception as e:
            logger.error(f"Error updating Redis status: {e}")
            # The same alert is shown in both panels.
            failure = dbc.Alert(f"Error: {str(e)}", color="danger", dismissable=True)
            return failure, failure
def _get_redis_status() -> html.Div:
    """Build the detailed Redis status panel.

    Pings the Redis client and, when it responds, reads ``redis_version``
    and ``redis_mode`` from its ``info()`` mapping.

    Returns:
        A ``html.Div`` with a "Connected" badge, version and mode; or, after
        logging, a ``html.Div`` with an "Error" badge and an alert carrying
        the exception text.
    """
    try:
        manager = get_sync_redis_manager()
        manager.initialize()
        if not manager.client.ping():
            # A falsy ping is routed through the same error path as a failure.
            raise ConnectionError("Redis server is not responding.")
        server_info = manager.client.info()

        children = [
            html.H5("Redis Status"),
            dbc.Badge("Connected", color="success", className="me-1"),
            html.P(f"Version: {server_info.get('redis_version', 'N/A')}"),
            html.P(f"Mode: {server_info.get('redis_mode', 'N/A')}"),
        ]
        return html.Div(children)
    except Exception as e:
        logger.error(f"Failed to get Redis status: {e}")
        error_children = [
            html.H5("Redis Status"),
            dbc.Badge("Error", color="danger", className="me-1"),
            dbc.Alert(f"Error: {e}", color="danger", dismissable=True),
        ]
        return html.Div(error_children)
def _get_redis_statistics() -> html.Div:
    """Build the Redis statistics panel.

    Pings the Redis client and, when it responds, reads
    ``connected_clients``, ``used_memory_human`` and
    ``total_commands_processed`` from its ``info()`` mapping (``N/A`` when a
    key is missing).

    Returns:
        A ``html.Div`` with a heading and one paragraph per statistic, or a
        dismissable danger ``dbc.Alert`` (after logging) when anything raises.
    """
    try:
        manager = get_sync_redis_manager()
        manager.initialize()
        if not manager.client.ping():
            # A falsy ping is routed through the same error path as a failure.
            raise ConnectionError("Redis server is not responding.")
        server_info = manager.client.info()

        clients = server_info.get('connected_clients', 'N/A')
        memory_used = server_info.get('used_memory_human', 'N/A')
        commands = server_info.get('total_commands_processed', 'N/A')
        return html.Div([
            html.H5("Redis Statistics"),
            html.P(f"Connected Clients: {clients}"),
            html.P(f"Memory Used: {memory_used}"),
            html.P(f"Total Commands Processed: {commands}"),
        ])
    except Exception as e:
        logger.error(f"Failed to get Redis statistics: {e}")
        return dbc.Alert(f"Error: {e}", color="danger", dismissable=True)

View File

@@ -0,0 +1,75 @@
from dash import Output, Input, html
import dash_bootstrap_components as dbc
from utils.logger import get_logger
import psutil
from config.constants.system_health_constants import (
CAPACITY_GOOD_THRESHOLD, CAPACITY_WARNING_THRESHOLD,
CPU_GOOD_THRESHOLD, CPU_WARNING_THRESHOLD,
MEMORY_GOOD_THRESHOLD, MEMORY_WARNING_THRESHOLD,
DISK_GOOD_THRESHOLD, DISK_WARNING_THRESHOLD,
BYTE_TO_GB
)
logger = get_logger("default_logger")
def register_system_performance_callbacks(app):
    """Attach the system performance callback to a Dash application.

    Args:
        app: Dash application whose ``callback`` decorator is used for
            registration.
    """

    @app.callback(
        Output('system-performance-metrics', 'children'),
        Input('interval-component', 'n_intervals')
    )
    def update_system_performance(n_intervals):
        """Render the system performance panel on every interval tick."""
        try:
            panel = _get_system_performance_metrics()
        except Exception as e:
            logger.error(f"Error updating system performance: {e}")
            return dbc.Alert(f"Error: {str(e)}", color="danger", dismissable=True)
        return panel
def _get_system_performance_metrics() -> html.Div:
    """Build the CPU / memory / disk usage panel.

    Samples CPU usage over a 0.1 s interval, reads virtual memory usage and
    the disk usage of ``/``, and renders each as a coloured badge with a
    detail note followed by a progress bar.

    Returns:
        A ``html.Div`` with three badge + progress bar pairs, or a danger
        ``dbc.Alert`` if sampling or rendering raises.
    """
    try:
        cpu_percent = psutil.cpu_percent(interval=0.1)
        cpu_count = psutil.cpu_count()
        memory = psutil.virtual_memory()
        disk = psutil.disk_usage('/')

        # NOTE(review): all three metrics are graded with the generic
        # CAPACITY_* thresholds; the CPU_/MEMORY_/DISK_ constants imported by
        # this module are not used here - confirm that this is intended.
        def severity(percent):
            grades = (
                (CAPACITY_GOOD_THRESHOLD, "success"),
                (CAPACITY_WARNING_THRESHOLD, "warning"),
            )
            for limit, color in grades:
                if percent < limit:
                    return color
            return "danger"

        cpu_color = severity(cpu_percent)
        memory_color = severity(memory.percent)
        disk_color = severity(disk.percent)

        cpu_summary = html.Div([
            html.Strong("CPU Usage: "),
            dbc.Badge(f"{cpu_percent:.1f}%", color=cpu_color),
            html.Span(f" ({cpu_count} cores)", className="text-muted ms-1")
        ], className="mb-2")
        cpu_bar = dbc.Progress(value=cpu_percent, color=cpu_color, style={"height": "10px"}, className="mb-3")

        memory_summary = html.Div([
            html.Strong("Memory Usage: "),
            dbc.Badge(f"{memory.percent:.1f}%", color=memory_color),
            html.Span(f" ({memory.used / BYTE_TO_GB:.1f} / {memory.total / BYTE_TO_GB:.1f} GB)", className="text-muted ms-1")
        ], className="mb-2")
        memory_bar = dbc.Progress(value=memory.percent, color=memory_color, style={"height": "10px"}, className="mb-3")

        disk_summary = html.Div([
            html.Strong("Disk Usage: "),
            dbc.Badge(f"{disk.percent:.1f}%", color=disk_color),
            html.Span(f" ({disk.used / BYTE_TO_GB:.1f} / {disk.total / BYTE_TO_GB:.1f} GB)", className="text-muted ms-1")
        ], className="mb-2")
        # The last bar carries no bottom-margin class.
        disk_bar = dbc.Progress(value=disk.percent, color=disk_color, style={"height": "10px"})

        return html.Div([
            cpu_summary, cpu_bar,
            memory_summary, memory_bar,
            disk_summary, disk_bar,
        ])
    except Exception as e:
        return dbc.Alert(f"Error loading performance metrics: {e}", color="danger")