# TCPDashboard/dashboard/callbacks/system_health.py

"""
Enhanced system health callbacks for the dashboard.
"""
import asyncio
import json
import subprocess
import psutil
from datetime import datetime, timedelta
from typing import Dict, Any, Optional, List
from dash import Output, Input, State, html, callback_context, no_update
import dash_mantine_components as dmc
from utils.logger import get_logger
from database.connection import DatabaseManager
from database.redis_manager import RedisManager
logger = get_logger("system_health_callbacks")
def register_system_health_callbacks(app):
    """Register the enhanced system health callbacks on ``app``.

    The interval- and refresh-driven callbacks delegate to the module-level
    ``_get_*`` helpers and turn any exception into a red badge or alert.
    Two further callbacks open and populate the collection details and
    collection logs modals.

    Args:
        app: Application object exposing a Dash-style ``callback`` decorator.
    """

    # Quick Status Updates (Top Cards)
    @app.callback(
        [Output('data-collection-quick-status', 'children'),
         Output('database-quick-status', 'children'),
         Output('redis-quick-status', 'children'),
         Output('performance-quick-status', 'children')],
        Input('interval-component', 'n_intervals')
    )
    def update_quick_status(n_intervals):
        """Refresh the four quick-status badges on every interval tick."""
        try:
            dc_status = _get_data_collection_quick_status()
            db_status = _get_database_quick_status()
            redis_status = _get_redis_quick_status()
            perf_status = _get_performance_quick_status()
            return dc_status, db_status, redis_status, perf_status
        except Exception as e:
            logger.error(f"Error updating quick status: {e}")
            error_status = dmc.Badge("🔴 Error", color="red", variant="light")
            return error_status, error_status, error_status, error_status

    # Detailed Data Collection Service Status
    @app.callback(
        [Output('data-collection-service-status', 'children'),
         Output('data-collection-metrics', 'children')],
        [Input('interval-component', 'n_intervals'),
         Input('refresh-data-status-btn', 'n_clicks')]
    )
    def update_data_collection_status(n_intervals, refresh_clicks):
        """Update detailed data collection service status and metrics."""
        try:
            service_status = _get_data_collection_service_status()
            metrics = _get_data_collection_metrics()
            return service_status, metrics
        except Exception as e:
            logger.error(f"Error updating data collection status: {e}")
            error_div = dmc.Alert(
                f"Error: {str(e)}",
                title="🔴 Status Check Failed",
                color="red",
                variant="light"
            )
            return error_div, error_div

    # Individual Collectors Status
    @app.callback(
        Output('individual-collectors-status', 'children'),
        [Input('interval-component', 'n_intervals'),
         Input('refresh-data-status-btn', 'n_clicks')]
    )
    def update_individual_collectors_status(n_intervals, refresh_clicks):
        """Update individual data collector health status."""
        try:
            return _get_individual_collectors_status()
        except Exception as e:
            logger.error(f"Error updating individual collectors status: {e}")
            return dmc.Alert(
                f"Error: {str(e)}",
                title="🔴 Collectors Check Failed",
                color="red",
                variant="light"
            )

    # Database Status and Statistics
    @app.callback(
        [Output('database-status', 'children'),
         Output('database-stats', 'children')],
        Input('interval-component', 'n_intervals')
    )
    def update_database_status(n_intervals):
        """Update database connection status and statistics."""
        try:
            db_status = _get_database_status()
            db_stats = _get_database_statistics()
            return db_status, db_stats
        except Exception as e:
            logger.error(f"Error updating database status: {e}")
            error_alert = dmc.Alert(
                f"Error: {str(e)}",
                title="🔴 Database Check Failed",
                color="red",
                variant="light"
            )
            return error_alert, error_alert

    # Redis Status and Statistics
    @app.callback(
        [Output('redis-status', 'children'),
         Output('redis-stats', 'children')],
        Input('interval-component', 'n_intervals')
    )
    def update_redis_status(n_intervals):
        """Update Redis connection status and statistics."""
        try:
            redis_status = _get_redis_status()
            redis_stats = _get_redis_statistics()
            return redis_status, redis_stats
        except Exception as e:
            logger.error(f"Error updating Redis status: {e}")
            error_alert = dmc.Alert(
                f"Error: {str(e)}",
                title="🔴 Redis Check Failed",
                color="red",
                variant="light"
            )
            return error_alert, error_alert

    # System Performance Metrics
    @app.callback(
        Output('system-performance-metrics', 'children'),
        Input('interval-component', 'n_intervals')
    )
    def update_system_performance(n_intervals):
        """Update system performance metrics."""
        try:
            return _get_system_performance_metrics()
        except Exception as e:
            logger.error(f"Error updating system performance: {e}")
            return dmc.Alert(
                f"Error: {str(e)}",
                title="🔴 Performance Check Failed",
                color="red",
                variant="light"
            )

    # Data Collection Details Modal
    @app.callback(
        [Output("collection-details-modal", "opened"),
         Output("collection-details-content", "children")],
        [Input("view-collection-details-btn", "n_clicks")],
        State("collection-details-modal", "opened")
    )
    def toggle_collection_details_modal(details_clicks, is_open):
        """Open the collection details modal and load its content on click."""
        if details_clicks:
            # Load detailed collection information
            details_content = _get_collection_details_content()
            return True, details_content
        return is_open, no_update

    # Collection Logs Modal
    @app.callback(
        [Output("collection-logs-modal", "opened"),
         Output("collection-logs-content", "children")],
        [Input("view-collection-logs-btn", "n_clicks"),
         Input("refresh-logs-btn", "n_clicks"),
         Input("close-logs-modal", "n_clicks")],
        State("collection-logs-modal", "opened")
    )
    def toggle_collection_logs_modal(logs_clicks, refresh_clicks, close_clicks, is_open):
        """Open, refresh or close the collection logs modal.

        ``n_clicks`` counters are cumulative and stay truthy after the first
        click, so testing them directly would keep re-opening the modal when
        the close button is pressed. The button that actually fired is
        therefore read from ``callback_context.triggered``.
        """
        # Ids of the inputs that fired with a real click; entries whose value
        # is None/0 (e.g. the initial call) are ignored.
        fired = {
            entry['prop_id'].rsplit('.', 1)[0]
            for entry in callback_context.triggered
            if entry.get('value')
        }
        if "close-logs-modal" in fired:
            return False, no_update
        if fired & {"view-collection-logs-btn", "refresh-logs-btn"}:
            # Load recent logs
            return True, _get_collection_logs_content()
        return is_open, no_update

    logger.info("Enhanced system health callbacks registered successfully")
# Helper Functions
def _get_data_collection_quick_status() -> dmc.Badge:
    """Return a quick-status badge for the data collection service.

    Returns:
        A green "Active" badge when ``_check_data_collection_service_running``
        reports True, a red "Stopped" badge when it reports False, and a
        yellow "Unknown" badge if anything in the check raises.
    """
    try:
        # Check if data collection service is running (simplified check)
        if _check_data_collection_service_running():
            return dmc.Badge("🟢 Active", color="green", variant="light")
        return dmc.Badge("🔴 Stopped", color="red", variant="light")
    except Exception as e:
        # ``Exception`` rather than a bare except so KeyboardInterrupt and
        # SystemExit still propagate; the failure is logged instead of hidden.
        logger.error(f"Error getting data collection quick status: {e}")
        return dmc.Badge("🟡 Unknown", color="yellow", variant="light")
def _get_database_quick_status() -> dmc.Badge:
    """Return a quick-status badge for the database connection.

    Creates and initializes a ``DatabaseManager`` and calls its
    ``test_connection()``.

    Returns:
        A green "Connected" badge when ``test_connection()`` is truthy,
        otherwise (falsy result or any exception) a red "Error" badge.
    """
    try:
        db_manager = DatabaseManager()
        db_manager.initialize()  # Initialize the database manager
        if db_manager.test_connection():
            return dmc.Badge("🟢 Connected", color="green", variant="light")
        return dmc.Badge("🔴 Error", color="red", variant="light")
    except Exception as e:
        # ``Exception`` rather than a bare except so KeyboardInterrupt and
        # SystemExit still propagate; the failure is logged instead of hidden.
        logger.error(f"Error getting database quick status: {e}")
        return dmc.Badge("🔴 Error", color="red", variant="light")
def _get_redis_quick_status() -> dmc.Badge:
    """Return a quick-status badge for the Redis connection.

    Creates and initializes a ``RedisManager`` and calls its
    ``test_connection()``.

    Returns:
        A green "Connected" badge when ``test_connection()`` is truthy,
        otherwise (falsy result or any exception) a red "Error" badge.
    """
    try:
        redis_manager = RedisManager()
        redis_manager.initialize()  # Initialize the Redis manager
        if redis_manager.test_connection():
            return dmc.Badge("🟢 Connected", color="green", variant="light")
        return dmc.Badge("🔴 Error", color="red", variant="light")
    except Exception as e:
        # ``Exception`` rather than a bare except so KeyboardInterrupt and
        # SystemExit still propagate; the failure is logged instead of hidden.
        logger.error(f"Error getting Redis quick status: {e}")
        return dmc.Badge("🔴 Error", color="red", variant="light")
def _get_performance_quick_status() -> dmc.Badge:
    """Return a quick-status badge summarising CPU and memory load.

    Returns:
        "Good" (green) when both CPU and memory usage are below 80%,
        "Warning" (yellow) when both are below 90%, "High" (red) otherwise,
        and a gray "Unknown" badge if the measurement raises.
    """
    try:
        # interval=0.1 makes psutil sample CPU usage over a 0.1 s window.
        cpu_percent = psutil.cpu_percent(interval=0.1)
        memory = psutil.virtual_memory()
        if cpu_percent < 80 and memory.percent < 80:
            return dmc.Badge("🟢 Good", color="green", variant="light")
        if cpu_percent < 90 and memory.percent < 90:
            return dmc.Badge("🟡 Warning", color="yellow", variant="light")
        return dmc.Badge("🔴 High", color="red", variant="light")
    except Exception as e:
        # ``Exception`` rather than a bare except so KeyboardInterrupt and
        # SystemExit still propagate; the failure is logged instead of hidden.
        logger.error(f"Error getting performance quick status: {e}")
        return dmc.Badge("❓ Unknown", color="gray", variant="light")
def _get_data_collection_service_status() -> html.Div:
    """Build the detailed status panel for the data collection service.

    Returns:
        A stack with a running/stopped badge, the time of the check and a
        short description (plus the start command when stopped), or a red
        alert if building the panel raises.
    """
    try:
        running = _check_data_collection_service_running()
        checked_at = datetime.now()

        if not running:
            stopped_header = dmc.Group([
                dmc.Badge("🔴 Service Stopped", color="red", variant="light"),
                dmc.Text(f"Checked: {checked_at.strftime('%H:%M:%S')}", size="xs", c="dimmed")
            ], justify="space-between")
            stopped_children = [
                stopped_header,
                dmc.Text("Data collection service is not running.", size="sm", c="#e74c3c"),
                dmc.Code("python scripts/start_data_collection.py", style={'margin-top': '5px'}),
            ]
            return dmc.Stack(stopped_children, gap="xs")

        running_header = dmc.Group([
            dmc.Badge("🟢 Service Running", color="green", variant="light"),
            dmc.Text(f"Checked: {checked_at.strftime('%H:%M:%S')}", size="xs", c="dimmed")
        ], justify="space-between")
        running_children = [
            running_header,
            dmc.Text("Data collection service is actively collecting market data.",
                     size="sm", c="#2c3e50"),
        ]
        return dmc.Stack(running_children, gap="xs")
    except Exception as e:
        return dmc.Alert(
            f"Error: {str(e)}",
            title="🔴 Status Check Failed",
            color="red",
            variant="light"
        )
def _get_data_collection_metrics() -> html.Div:
    """Build the data collection metrics panel (row counts and freshness).

    Queries ``market_data`` and ``raw_trades`` for row counts and the latest
    ``timestamp``, then renders the counts together with a freshness badge
    derived from the most recent timestamp of either table.

    Returns:
        A stack with the candle/ticker counts and the freshness badge, or a
        red alert if any step raises.
    """
    try:
        from datetime import timezone
        from sqlalchemy import text

        # Get database statistics for collected data
        db_manager = DatabaseManager()
        db_manager.initialize()  # Initialize the database manager

        with db_manager.get_session() as session:
            # Count OHLCV candles from market_data table
            candles_count = session.execute(
                text("SELECT COUNT(*) FROM market_data")
            ).scalar() or 0
            # Count raw tickers from raw_trades table
            tickers_count = session.execute(
                text("SELECT COUNT(*) FROM raw_trades WHERE data_type = 'ticker'")
            ).scalar() or 0
            # Get latest data timestamp from both tables
            latest_market_data = session.execute(
                text("SELECT MAX(timestamp) FROM market_data")
            ).scalar()
            latest_raw_data = session.execute(
                text("SELECT MAX(timestamp) FROM raw_trades")
            ).scalar()

        # Normalise before comparing: max() on one aware and one naive
        # datetime would raise TypeError.
        candidates = [
            _to_naive_utc(ts)
            for ts in (latest_market_data, latest_raw_data)
            if ts is not None
        ]

        if candidates:
            # Naive "now" in UTC, without the deprecated datetime.utcnow().
            now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
            data_freshness_badge = _build_data_freshness_badge(now_utc - max(candidates))
        else:
            data_freshness_badge = dmc.Badge("No data", color="gray", variant="light")

        return dmc.Stack([
            dmc.Group([
                dmc.Text(f"Candles: {candles_count:,}", fw=500),
                dmc.Text(f"Tickers: {tickers_count:,}", fw=500)
            ], justify="space-between"),
            dmc.Group([
                dmc.Text("Data Freshness:", fw=500),
                data_freshness_badge
            ], justify="space-between")
        ], gap="xs")
    except Exception as e:
        return dmc.Alert(
            f"Error: {str(e)}",
            title="🔴 Metrics Unavailable",
            color="red",
            variant="light"
        )


def _to_naive_utc(ts):
    """Return ``ts`` as a naive datetime expressed in UTC.

    Aware datetimes are converted to UTC before their tzinfo is dropped.
    Naive datetimes are returned unchanged.
    NOTE(review): naive values are assumed to already be in UTC — confirm
    against the column types of ``market_data`` / ``raw_trades``.
    """
    from datetime import timezone

    if ts.tzinfo is not None:
        return ts.astimezone(timezone.utc).replace(tzinfo=None)
    return ts


def _build_data_freshness_badge(age: timedelta) -> dmc.Badge:
    """Return the freshness badge for data that is ``age`` old.

    Under 5 minutes is "Fresh", under 1 hour is "Recent", anything older is
    "Stale". Negative ages (timestamps slightly in the future) are displayed
    as 0 rather than wrapping around.
    """
    # total_seconds() covers whole days too, unlike ``timedelta.seconds``.
    age_seconds = max(age.total_seconds(), 0.0)
    minutes = int(age_seconds // 60)

    if age < timedelta(minutes=5):
        return dmc.Badge(f"🟢 Fresh ({minutes}m ago)", color="green", variant="light")
    if age < timedelta(hours=1):
        return dmc.Badge(f"🟡 Recent ({minutes}m ago)", color="yellow", variant="light")
    # True division so the one-decimal format shows fractional hours.
    return dmc.Badge(f"🔴 Stale ({age_seconds / 3600:.1f}h ago)", color="red", variant="light")
def _get_individual_collectors_status() -> html.Div:
    """Build the (placeholder) individual collector health panel.

    Returns:
        A blue informational alert explaining how to start the data
        collection service, or a red alert if building it raises.
    """
    try:
        # Placeholder: no live collector data is queried here yet.
        placeholder_note = dmc.Text(
            "Individual collector health data would be displayed here when the data collection service is running.",
            size="sm",
        )
        start_hint = dmc.Group([
            dmc.Text("To start monitoring:", size="sm"),
            dmc.Code("python scripts/start_data_collection.py")
        ])
        alert_children = [placeholder_note, dmc.Space(h="sm"), start_hint]
        return dmc.Alert(
            alert_children,
            title="📊 Collector Health Monitoring",
            color="blue",
            variant="light",
        )
    except Exception as e:
        return dmc.Alert(
            f"Error: {str(e)}",
            title="🔴 Collector Status Check Failed",
            color="red",
            variant="light"
        )
def _get_database_status() -> html.Div:
    """Build the detailed database status panel.

    Runs ``SELECT version()`` and counts the rows of ``pg_stat_activity``
    through a session from a freshly initialized ``DatabaseManager``.

    Returns:
        A stack with a "Connected" badge, the check time, the PostgreSQL
        version token and the active connection count, or a red alert if
        any step raises.
    """
    try:
        from sqlalchemy import text

        manager = DatabaseManager()
        manager.initialize()  # Initialize the database manager

        with manager.get_session() as session:
            # Test connection and get basic info
            version_row = session.execute(text("SELECT version()")).fetchone()
            if version_row:
                version = version_row[0]
            else:
                version = "Unknown"

            # Get connection count
            connection_total = session.execute(
                text("SELECT count(*) FROM pg_stat_activity")
            ).scalar()
            connections = connection_total or 0

            # The second whitespace-separated token of the version string is
            # shown as the version number.
            if 'PostgreSQL' in version:
                version_label = version.split()[1]
            else:
                version_label = 'Unknown'

            header = dmc.Group([
                dmc.Badge("🟢 Database Connected", color="green", variant="light"),
                dmc.Text(f"Checked: {datetime.now().strftime('%H:%M:%S')}", size="xs", c="dimmed")
            ], justify="space-between")
            version_line = dmc.Text(f"Version: PostgreSQL {version_label}", size="xs", c="dimmed")
            connections_line = dmc.Text(f"Active connections: {connections}", size="xs", c="dimmed")

            return dmc.Stack([header, version_line, connections_line], gap="xs")
    except Exception as e:
        return dmc.Alert(
            f"Error: {str(e)}",
            title="🔴 Database Connection Failed",
            color="red",
            variant="light"
        )
def _get_database_statistics() -> html.Div:
    """Build the database statistics panel.

    Shows the number of ``market_data`` and ``raw_trades`` rows whose
    ``timestamp`` falls within the last hour, followed by the five largest
    non-system tables by total relation size.

    Returns:
        A stack of the statistics rows, or a red alert if any query raises.
    """
    try:
        from sqlalchemy import text

        db_manager = DatabaseManager()
        db_manager.initialize()  # Initialize the database manager

        with db_manager.get_session() as session:
            # Get table sizes. quote_ident() keeps the schema.table reference
            # valid for identifiers that need quoting (mixed case, spaces);
            # raw concatenation would fail the whole query for such tables.
            table_stats = session.execute(text("""
                SELECT
                    schemaname,
                    tablename,
                    pg_size_pretty(pg_total_relation_size(
                        quote_ident(schemaname) || '.' || quote_ident(tablename)
                    )) AS size
                FROM pg_tables
                WHERE schemaname NOT IN ('information_schema', 'pg_catalog')
                ORDER BY pg_total_relation_size(
                    quote_ident(schemaname) || '.' || quote_ident(tablename)
                ) DESC
                LIMIT 5
            """)).fetchall()

            # Get recent activity from both main data tables
            market_data_activity = session.execute(
                text("SELECT COUNT(*) FROM market_data WHERE timestamp > NOW() - INTERVAL '1 hour'")
            ).scalar() or 0
            raw_data_activity = session.execute(
                text("SELECT COUNT(*) FROM raw_trades WHERE timestamp > NOW() - INTERVAL '1 hour'")
            ).scalar() or 0

        total_recent_activity = market_data_activity + raw_data_activity

        stats_components = [
            dmc.Group([
                dmc.Text("Recent Activity (1h):", fw=500),
                dmc.Text(f"{total_recent_activity:,} records", c="#2c3e50")
            ], justify="space-between"),
            dmc.Group([
                dmc.Text("• Market Data:", fw=400),
                dmc.Text(f"{market_data_activity:,}", c="#7f8c8d")
            ], justify="space-between"),
            dmc.Group([
                dmc.Text("• Raw Data:", fw=400),
                dmc.Text(f"{raw_data_activity:,}", c="#7f8c8d")
            ], justify="space-between")
        ]

        if table_stats:
            stats_components.append(dmc.Text("Largest Tables:", fw=500))
            # The schema column is selected but not displayed.
            stats_components.extend(
                dmc.Text(f"{table}: {size}", size="xs", c="dimmed", style={'margin-left': '10px'})
                for _schema, table, size in table_stats
            )

        return dmc.Stack(stats_components, gap="xs")
    except Exception as e:
        return dmc.Alert(
            f"Error: {str(e)}",
            title="🔴 Statistics Unavailable",
            color="red",
            variant="light"
        )
def _get_redis_status() -> html.Div:
    """Build the Redis connection status panel.

    Returns:
        A stack with a "Connected" badge, the check time and the configured
        host and port, or a red alert if initialization or the info call
        raises.
    """
    try:
        manager = RedisManager()
        manager.initialize()  # Initialize the Redis manager

        # The returned info is not displayed here; the call runs before the
        # "Connected" badge is built, so an exception from it leads to the
        # error alert below instead.
        manager.get_info()

        checked_label = f"Checked: {datetime.now().strftime('%H:%M:%S')}"
        endpoint_label = f"Host: {manager.config.host}:{manager.config.port}"

        header = dmc.Group(
            [
                dmc.Badge("🟢 Redis Connected", color="green", variant="light"),
                dmc.Text(checked_label, size="xs", c="dimmed"),
            ],
            justify="space-between",
        )
        return dmc.Stack(
            [header, dmc.Text(endpoint_label, size="xs", c="dimmed")],
            gap="xs",
        )
    except Exception as e:
        return dmc.Alert(
            f"Error: {str(e)}",
            title="🔴 Redis Connection Failed",
            color="red",
            variant="light"
        )
def _get_redis_statistics() -> html.Div:
    """Build the Redis statistics panel.

    Reads ``used_memory_human``, ``connected_clients`` and
    ``uptime_in_seconds`` from ``RedisManager.get_info()``.

    Returns:
        A stack with one label/value row per statistic (uptime shown in
        whole hours), or a red alert if any step raises.
    """
    try:
        manager = RedisManager()
        manager.initialize()  # Initialize the Redis manager

        # Get Redis info
        info = manager.get_info()

        rows = [
            ("Memory Used:", f"{info.get('used_memory_human', 'Unknown')}"),
            ("Connected Clients:", f"{info.get('connected_clients', 'Unknown')}"),
            ("Uptime:", f"{info.get('uptime_in_seconds', 0) // 3600}h"),
        ]

        return dmc.Stack(
            [
                dmc.Group(
                    [dmc.Text(label, fw=500), dmc.Text(value, c="#2c3e50")],
                    justify="space-between",
                )
                for label, value in rows
            ],
            gap="xs",
        )
    except Exception as e:
        return dmc.Alert(
            f"Error: {str(e)}",
            title="🔴 Statistics Unavailable",
            color="red",
            variant="light"
        )
def _get_system_performance_metrics() -> html.Div:
    """Build the system performance panel (CPU, memory, disk, network).

    Returns:
        A stack with one row per metric. Usage badges are green below 70%,
        yellow below 85% and red otherwise. Network totals show "N/A" when
        they cannot be read. A red alert is returned if the panel as a whole
        cannot be built.
    """

    def _usage_color(percent):
        """Map a usage percentage to the badge colour used in this panel."""
        if percent < 70:
            return "green"
        if percent < 85:
            return "yellow"
        return "red"

    try:
        # CPU usage (sampled over a 0.1 s window)
        cpu_percent = psutil.cpu_percent(interval=0.1)
        cpu_count = psutil.cpu_count()
        # Memory usage
        memory = psutil.virtual_memory()
        # Disk usage of the root path
        disk = psutil.disk_usage('/')

        # Network I/O (if available)
        try:
            network = psutil.net_io_counters()
            network_sent = f"{network.bytes_sent / (1024**3):.2f} GB"
            network_recv = f"{network.bytes_recv / (1024**3):.2f} GB"
        except Exception:
            # Best effort: any failure reading the counters degrades to "N/A".
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            network_sent = "N/A"
            network_recv = "N/A"

        return dmc.Stack([
            dmc.Group([
                dmc.Text("CPU Usage:", fw=500),
                dmc.Badge(f"{cpu_percent:.1f}%", color=_usage_color(cpu_percent), variant="light"),
                dmc.Text(f"({cpu_count} cores)", size="xs", c="dimmed")
            ], justify="space-between"),
            dmc.Group([
                dmc.Text("Memory:", fw=500),
                dmc.Badge(f"{memory.percent:.1f}%", color=_usage_color(memory.percent), variant="light"),
                dmc.Text(f"{memory.used // (1024**3)} GB / {memory.total // (1024**3)} GB",
                         size="xs", c="dimmed")
            ], justify="space-between"),
            dmc.Group([
                dmc.Text("Disk Usage:", fw=500),
                dmc.Badge(f"{disk.percent:.1f}%", color=_usage_color(disk.percent), variant="light"),
                dmc.Text(f"{disk.used // (1024**3)} GB / {disk.total // (1024**3)} GB",
                         size="xs", c="dimmed")
            ], justify="space-between"),
            dmc.Group([
                dmc.Text("Network I/O:", fw=500),
                # Sent and received totals are separated and labelled; they
                # were previously concatenated with nothing between them.
                dmc.Text(f"↑ {network_sent} ↓ {network_recv}", size="xs", c="dimmed")
            ], justify="space-between")
        ], gap="sm")
    except Exception as e:
        return dmc.Alert(
            f"Error: {str(e)}",
            title="🔴 Performance Metrics Unavailable",
            color="red",
            variant="light"
        )
def _get_collection_details_content() -> html.Div:
    """Build the (placeholder) content of the collection details modal.

    Returns:
        A stack of static titles and descriptive text, or a red alert if
        building it raises.
    """
    try:
        # Detailed service and collector information
        intro = [
            dmc.Title("📊 Data Collection Service Details", order=5),
            dmc.Text("Comprehensive data collection service information would be displayed here."),
            dmc.Divider(),
        ]
        # (section heading, placeholder text) pairs, rendered in this order.
        section_specs = (
            ("Configuration", "Service configuration details..."),
            ("Performance Metrics", "Detailed performance analytics..."),
            ("Health Status", "Individual collector health information..."),
        )
        sections = []
        for heading, blurb in section_specs:
            sections.append(dmc.Title(heading, order=6))
            sections.append(dmc.Text(blurb))
        return dmc.Stack(intro + sections, gap="md")
    except Exception as e:
        return dmc.Alert(
            f"Error: {str(e)}",
            title="🔴 Error Loading Details",
            color="red",
            variant="light"
        )
def _get_collection_logs_content() -> str:
"""Get recent collection service logs."""
try:
# This would read from actual log files
# For now, return a placeholder
current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
return f"""[{current_time}] INFO - Data Collection Service Logs
Recent log entries would be displayed here from the data collection service.
This would include:
- Service startup/shutdown events
- Collector connection status changes
- Data collection statistics
- Error messages and warnings
- Performance metrics
To view real logs, check the logs/ directory or configure log file monitoring.
"""
except Exception as e:
return f"Error loading logs: {str(e)}"
def _check_data_collection_service_running() -> bool:
    """Report whether a data collection process appears to be running.

    Scans the command lines of all processes for ``start_data_collection.py``
    or ``collection_service``. This is a simplified substring check, so any
    process whose command line contains one of those markers counts.

    Returns:
        True if a matching process is found; False if none is found or the
        scan itself fails (the failure is logged).
    """
    markers = ('start_data_collection.py', 'collection_service')
    try:
        # Check for running processes (simplified)
        for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
            try:
                cmdline_parts = proc.info['cmdline']
                if not cmdline_parts:
                    continue
                cmdline = ' '.join(cmdline_parts)
                if any(marker in cmdline for marker in markers):
                    return True
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                # Process vanished or is inaccessible; skip it.
                continue
        return False
    except Exception as e:
        # ``Exception`` rather than a bare except so KeyboardInterrupt and
        # SystemExit still propagate; the failure is logged instead of hidden.
        logger.error(f"Error checking data collection service process: {e}")
        return False