Reverted to sequential computing for features; added one distribution visualization graph

Simon Moisy 2025-05-30 15:54:48 +08:00
parent 2f98463df8
commit ced64825bd
2 changed files with 65 additions and 39 deletions


@@ -6,13 +6,8 @@ import numpy as np
 from sklearn.model_selection import train_test_split
 from custom_xgboost import CustomXGBoostGPU
 from sklearn.metrics import mean_squared_error
-from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns, plot_predicted_vs_actual_prices
-import ta
+from plot_results import plot_predicted_vs_actual_prices, plot_prediction_error_distribution
 from cycles.supertrend import Supertrends
-from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator
-from ta.momentum import ROCIndicator, KAMAIndicator, UltimateOscillator, StochasticOscillator, WilliamsRIndicator
-from ta.volatility import KeltnerChannel, DonchianChannel
-from ta.others import DailyReturnIndicator
 import time
 from numba import njit
@@ -516,8 +511,7 @@ if __name__ == '__main__':
         np.save(sub_feature_file, values.values)
         print(f'Saved feature: {sub_feature_file}')
-    # Prepare jobs for lags, rolling stats, log returns, and volatility
-    feature_jobs = []
+    # Prepare lags, rolling stats, log returns, and volatility features sequentially
     # Lags
     for col in ohlcv_cols:
         for lag in range(1, lags + 1):
@@ -527,8 +521,11 @@ if __name__ == '__main__':
                 print(f'C Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Adding lag feature job: {feature_name}')
-                feature_jobs.append((feature_name, compute_lag, col, lag))
+                print(f'Computing lag feature: {feature_name}')
+                result = compute_lag(df, col, lag)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
     # Rolling statistics
     for col in ohlcv_cols:
         for window in window_sizes:
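Note: compute_lag is defined outside this hunk; the sequential path only assumes it returns a pandas Series, so result.values can be passed to np.save. A minimal sketch under that assumption (the f'{col}_lag_{lag}' name pattern is hypothetical):

import pandas as pd

def compute_lag(df: pd.DataFrame, col: str, lag: int) -> pd.Series:
    # Shift the column down by `lag` rows so each row carries the value
    # observed `lag` steps earlier; the first `lag` rows become NaN.
    return df[col].shift(lag).rename(f'{col}_lag_{lag}')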
@@ -547,8 +544,11 @@ if __name__ == '__main__':
                 print(f'D Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Adding rolling stat feature job: {feature_name}')
-                feature_jobs.append((feature_name, compute_rolling, col, stat, window))
+                print(f'Computing rolling stat feature: {feature_name}')
+                result = compute_rolling(df, col, stat, window)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
     # Log returns for different horizons
     for horizon in [5, 15, 30]:
         feature_name = f'log_return_{horizon}'
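As with compute_lag, compute_rolling is defined elsewhere; a sketch assuming stat is a pandas rolling-method name such as 'mean' or 'std' (the getattr dispatch and the name pattern are hypothetical):

import pandas as pd

def compute_rolling(df: pd.DataFrame, col: str, stat: str, window: int) -> pd.Series:
    # Look up the requested statistic ('mean', 'std', 'min', 'max', ...)
    # on the rolling-window object and evaluate it.
    return getattr(df[col].rolling(window), stat)().rename(f'{col}_{stat}_{window}')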
@@ -557,8 +557,11 @@ if __name__ == '__main__':
             print(f'E Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
-            print(f'Adding log return feature job: {feature_name}')
-            feature_jobs.append((feature_name, compute_log_return, horizon))
+            print(f'Computing log return feature: {feature_name}')
+            result = compute_log_return(df, horizon)
+            features_dict[feature_name] = result
+            np.save(feature_file, result.values)
+            print(f'Saved feature: {feature_file}')
     # Volatility
     for window in window_sizes:
         feature_name = f'volatility_{window}'
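The log_return_{horizon} columns are later excluded from the feature set as targets, so the shift direction in compute_log_return matters. This sketch assumes a trailing return over Close; a forward-looking target variant would use shift(-horizon) instead:

import numpy as np
import pandas as pd

def compute_log_return(df: pd.DataFrame, horizon: int) -> pd.Series:
    # Trailing log return over `horizon` rows: ln(Close_t / Close_{t-horizon}).
    return np.log(df['Close'] / df['Close'].shift(horizon)).rename(f'log_return_{horizon}')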
@@ -567,22 +570,11 @@ if __name__ == '__main__':
             print(f'F Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
-            print(f'Adding volatility feature job: {feature_name}')
-            feature_jobs.append((feature_name, compute_volatility, window))
-    # Sequential computation for all non-cached features
-    if feature_jobs:
-        print(f'Computing {len(feature_jobs)} features sequentially...')
-        for job in feature_jobs:
-            print(f'Computing feature job: {job[0]}')
-            feature_name, result = run_feature_job(job, df)
-            features_dict[feature_name] = result
-            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
-            np.save(feature_file, result.values)
-            print(f'Saved computed feature: {feature_file}')
-        print('All features computed.')
-    else:
-        print('All features loaded from cache.')
+            print(f'Computing volatility feature: {feature_name}')
+            result = compute_volatility(df, window)
+            features_dict[feature_name] = result
+            np.save(feature_file, result.values)
+            print(f'Saved feature: {feature_file}')
     # Concatenate all new features at once
     print('Concatenating all new features to DataFrame...')
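compute_volatility is likewise not shown in the diff; one common definition, assumed here, is the rolling standard deviation of one-step log returns:

import numpy as np
import pandas as pd

def compute_volatility(df: pd.DataFrame, window: int) -> pd.Series:
    # Rolling standard deviation of one-step log returns over `window` rows.
    log_ret = np.log(df['Close'] / df['Close'].shift(1))
    return log_ret.rolling(window).std().rename(f'volatility_{window}')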
@@ -633,17 +625,11 @@ if __name__ == '__main__':
     df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
     df['hour'] = df['Timestamp'].dt.hour
-    # Drop NaNs after all feature engineering
-    print('Dropping NaNs after feature engineering...')
-    df = df.dropna().reset_index(drop=True)
-    # Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
-    print('Selecting feature columns...')
-    exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
-    feature_cols = [col for col in df.columns if col not in exclude_cols]
-    # Print the features used for training
-    print("Features used for training:", feature_cols)
+    # Impute NaNs after all feature engineering
+    print('Imputing NaNs after feature engineering (using column means)...')
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
+    # If you want to impute non-numeric columns differently, add logic here
     # Drop excluded columns to save memory
     print('Dropping excluded columns to save memory...')
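The switch from dropna() to column-mean imputation preserves row count, which matters once long lags and windows put NaNs at the head of every derived column. A toy illustration of the committed pattern (column names are illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'Timestamp': ['t0', 't1', 't2']})
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
print(df['a'].tolist())  # [1.0, 2.0, 3.0] -- all 3 rows kept; dropna() would leave 2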
@@ -721,6 +707,9 @@ if __name__ == '__main__':
     print('Plotting predicted vs actual prices...')
     plot_predicted_vs_actual_prices(actual_prices, predicted_prices, test_timestamps)
+    print('Plotting distribution of signed prediction errors...')
+    plot_prediction_error_distribution(predicted_prices, actual_prices)
     print("Final features used for training:", feature_cols)
     print("Shape of X:", X.shape)

plot_results.py

@@ -167,3 +167,40 @@ def plot_predicted_vs_actual_prices(actual_prices, predicted_prices, timestamps=
     )
     fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
     pyo.plot(fig_scatter, filename='price_scatter_plot.html')
+
+def plot_prediction_error_distribution(predicted_prices, actual_prices, nbins=100):
+    """
+    Plots the distribution of signed prediction errors between predicted and actual prices,
+    coloring negative errors (under-prediction) and positive errors (over-prediction) differently.
+    """
+    import plotly.offline as pyo
+    import plotly.graph_objs as go
+    errors = np.array(predicted_prices) - np.array(actual_prices)
+    # Separate negative and positive errors
+    neg_errors = errors[errors < 0]
+    pos_errors = errors[errors >= 0]
+    trace_neg = go.Histogram(
+        x=neg_errors,
+        nbinsx=nbins,
+        opacity=0.75,
+        marker=dict(color='blue'),
+        name='Negative Error (Under-prediction)'
+    )
+    trace_pos = go.Histogram(
+        x=pos_errors,
+        nbinsx=nbins,
+        opacity=0.75,
+        marker=dict(color='orange'),
+        name='Positive Error (Over-prediction)'
+    )
+    layout = go.Layout(
+        title='Distribution of Prediction Errors (Signed)',
+        xaxis=dict(title='Prediction Error (Predicted - Actual)'),
+        yaxis=dict(title='Frequency'),
+        barmode='overlay',
+        bargap=0.05
+    )
+    fig = go.Figure(data=[trace_neg, trace_pos], layout=layout)
+    pyo.plot(fig, filename='prediction_error_distribution.html')
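A quick smoke test for the new helper using synthetic data (the array values are illustrative; note the function body relies on numpy already being imported as np at plot_results.py module scope):

import numpy as np
from plot_results import plot_prediction_error_distribution

rng = np.random.default_rng(0)
actual = rng.uniform(30_000.0, 31_000.0, size=1_000)    # stand-in "actual" prices
predicted = actual + rng.normal(0.0, 50.0, size=1_000)  # add noise to mimic model error
# Writes prediction_error_distribution.html and opens it in a browser by default.
plot_prediction_error_distribution(predicted, actual)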