Reverted to sequential computation for features; added a prediction error distribution plot

Simon Moisy 2025-05-30 15:54:48 +08:00
parent 2f98463df8
commit ced64825bd
2 changed files with 65 additions and 39 deletions

View File

@@ -6,13 +6,8 @@ import numpy as np
 from sklearn.model_selection import train_test_split
 from custom_xgboost import CustomXGBoostGPU
 from sklearn.metrics import mean_squared_error
-from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns, plot_predicted_vs_actual_prices
-import ta
+from plot_results import plot_predicted_vs_actual_prices, plot_prediction_error_distribution
 from cycles.supertrend import Supertrends
-from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator
-from ta.momentum import ROCIndicator, KAMAIndicator, UltimateOscillator, StochasticOscillator, WilliamsRIndicator
-from ta.volatility import KeltnerChannel, DonchianChannel
-from ta.others import DailyReturnIndicator
 import time
 from numba import njit
@@ -516,8 +511,7 @@ if __name__ == '__main__':
                 np.save(sub_feature_file, values.values)
                 print(f'Saved feature: {sub_feature_file}')
-    # Prepare jobs for lags, rolling stats, log returns, and volatility
-    feature_jobs = []
+    # Prepare lags, rolling stats, log returns, and volatility features sequentially
     # Lags
     for col in ohlcv_cols:
         for lag in range(1, lags + 1):
@@ -527,8 +521,11 @@ if __name__ == '__main__':
                 print(f'C Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Adding lag feature job: {feature_name}')
-                feature_jobs.append((feature_name, compute_lag, col, lag))
+                print(f'Computing lag feature: {feature_name}')
+                result = compute_lag(df, col, lag)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
     # Rolling statistics
     for col in ohlcv_cols:
         for window in window_sizes:
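Each feature block in this commit repeats the same cache-or-compute pattern. Distilled into a standalone helper, it looks roughly like this (a sketch, not part of the commit; the ./data/{csv_prefix}_{feature_name}.npy naming is taken from the surrounding code, and the csv_prefix default here is illustrative):

import os
import numpy as np

def cached_feature(feature_name, compute_fn, *args, csv_prefix='ohlcv'):
    feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
    if os.path.exists(feature_file):
        # Reuse the array saved by a previous run.
        return np.load(feature_file)
    # Otherwise compute the pandas Series, persist it, and return its values.
    result = compute_fn(*args)
    np.save(feature_file, result.values)
    return result.values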
@@ -547,8 +544,11 @@ if __name__ == '__main__':
                     print(f'D Loading cached feature: {feature_file}')
                     features_dict[feature_name] = np.load(feature_file)
                 else:
-                    print(f'Adding rolling stat feature job: {feature_name}')
-                    feature_jobs.append((feature_name, compute_rolling, col, stat, window))
+                    print(f'Computing rolling stat feature: {feature_name}')
+                    result = compute_rolling(df, col, stat, window)
+                    features_dict[feature_name] = result
+                    np.save(feature_file, result.values)
+                    print(f'Saved feature: {feature_file}')
     # Log returns for different horizons
     for horizon in [5, 15, 30]:
         feature_name = f'log_return_{horizon}'
@@ -557,8 +557,11 @@ if __name__ == '__main__':
             print(f'E Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
-            print(f'Adding log return feature job: {feature_name}')
-            feature_jobs.append((feature_name, compute_log_return, horizon))
+            print(f'Computing log return feature: {feature_name}')
+            result = compute_log_return(df, horizon)
+            features_dict[feature_name] = result
+            np.save(feature_file, result.values)
+            print(f'Saved feature: {feature_file}')
     # Volatility
     for window in window_sizes:
         feature_name = f'volatility_{window}'
@@ -567,22 +570,11 @@ if __name__ == '__main__':
             print(f'F Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
-            print(f'Adding volatility feature job: {feature_name}')
-            feature_jobs.append((feature_name, compute_volatility, window))
-    # Sequential computation for all non-cached features
-    if feature_jobs:
-        print(f'Computing {len(feature_jobs)} features sequentially...')
-        for job in feature_jobs:
-            print(f'Computing feature job: {job[0]}')
-            feature_name, result = run_feature_job(job, df)
+            print(f'Computing volatility feature: {feature_name}')
+            result = compute_volatility(df, window)
             features_dict[feature_name] = result
-            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
             np.save(feature_file, result.values)
-            print(f'Saved computed feature: {feature_file}')
-        print('All features computed.')
-    else:
-        print('All features loaded from cache.')
+            print(f'Saved feature: {feature_file}')
     # Concatenate all new features at once
     print('Concatenating all new features to DataFrame...')
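The sequential path relies on four helpers (compute_lag, compute_rolling, compute_log_return, compute_volatility) that are defined earlier in the script and do not appear in this diff. A minimal sketch of what they plausibly look like, assuming df is a pandas DataFrame with a 'Close' column; the bodies below are assumptions, not the commit's code:

import numpy as np
import pandas as pd

def compute_lag(df, col, lag):
    # Value of `col` from `lag` rows earlier.
    return df[col].shift(lag)

def compute_rolling(df, col, stat, window):
    # Rolling statistic such as 'mean' or 'std' over `window` rows.
    return getattr(df[col].rolling(window), stat)()

def compute_log_return(df, horizon):
    # Log return of Close over `horizon` rows.
    return np.log(df['Close'] / df['Close'].shift(horizon))

def compute_volatility(df, window):
    # Rolling standard deviation of one-step log returns.
    return np.log(df['Close'] / df['Close'].shift(1)).rolling(window).std()

Each returns a pandas Series, which matches how the diff stores result in features_dict and saves result.values.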
@@ -633,17 +625,11 @@ if __name__ == '__main__':
     df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
     df['hour'] = df['Timestamp'].dt.hour
-    # Drop NaNs after all feature engineering
-    print('Dropping NaNs after feature engineering...')
-    df = df.dropna().reset_index(drop=True)
-    # Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
-    print('Selecting feature columns...')
-    exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
-    feature_cols = [col for col in df.columns if col not in exclude_cols]
-    # Print the features used for training
-    print("Features used for training:", feature_cols)
+    # Impute NaNs after all feature engineering
+    print('Imputing NaNs after feature engineering (using column means)...')
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
+    # If you want to impute non-numeric columns differently, add logic here
     # Drop excluded columns to save memory
     print('Dropping excluded columns to save memory...')
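This hunk swaps df.dropna() for column-mean imputation, presumably so that the leading NaNs produced by every lag and rolling window no longer cost whole rows. A toy illustration of the difference (not from the commit):

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [4.0, 5.0, np.nan]})
numeric_cols = df.select_dtypes(include=[np.number]).columns
imputed = df.copy()
imputed[numeric_cols] = imputed[numeric_cols].fillna(imputed[numeric_cols].mean())
print(len(df.dropna()), len(imputed))  # 1 vs 3: imputation keeps every row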
@@ -721,6 +707,9 @@ if __name__ == '__main__':
     print('Plotting predicted vs actual prices...')
     plot_predicted_vs_actual_prices(actual_prices, predicted_prices, test_timestamps)
+    print('Plotting distribution of signed prediction errors...')
+    plot_prediction_error_distribution(predicted_prices, actual_prices)
+    print("Final features used for training:", feature_cols)
     print("Shape of X:", X.shape)

View File

@@ -167,3 +167,40 @@ def plot_predicted_vs_actual_prices(actual_prices, predicted_prices, timestamps=
     )
     fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
     pyo.plot(fig_scatter, filename='price_scatter_plot.html')
+
+
+def plot_prediction_error_distribution(predicted_prices, actual_prices, nbins=100):
+    """
+    Plots the distribution of signed prediction errors between predicted and actual prices,
+    coloring negative errors (under-prediction) and positive errors (over-prediction) differently.
+    """
+    import plotly.offline as pyo
+    import plotly.graph_objs as go
+
+    errors = np.array(predicted_prices) - np.array(actual_prices)
+    # Separate negative and positive errors
+    neg_errors = errors[errors < 0]
+    pos_errors = errors[errors >= 0]
+    trace_neg = go.Histogram(
+        x=neg_errors,
+        nbinsx=nbins,
+        opacity=0.75,
+        marker=dict(color='blue'),
+        name='Negative Error (Under-prediction)'
+    )
+    trace_pos = go.Histogram(
+        x=pos_errors,
+        nbinsx=nbins,
+        opacity=0.75,
+        marker=dict(color='orange'),
+        name='Positive Error (Over-prediction)'
+    )
+    layout = go.Layout(
+        title='Distribution of Prediction Errors (Signed)',
+        xaxis=dict(title='Prediction Error (Predicted - Actual)'),
+        yaxis=dict(title='Frequency'),
+        barmode='overlay',
+        bargap=0.05
+    )
+    fig = go.Figure(data=[trace_neg, trace_pos], layout=layout)
+    pyo.plot(fig, filename='prediction_error_distribution.html')
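For reference, a hypothetical call to the new function with synthetic inputs (the script itself passes its predicted_prices and actual_prices arrays):

import numpy as np
from plot_results import plot_prediction_error_distribution

rng = np.random.default_rng(0)
actual = rng.normal(100.0, 5.0, size=1000)
predicted = actual + rng.normal(0.0, 1.0, size=1000)
# Writes prediction_error_distribution.html and opens it in the browser.
plot_prediction_error_distribution(predicted, actual, nbins=80)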