From ced64825bdabe54e5109d7e141763816be0b27bb Mon Sep 17 00:00:00 2001
From: Simon Moisy
Date: Fri, 30 May 2025 15:54:48 +0800
Subject: [PATCH] Revert to sequential feature computation; add a prediction
 error distribution plot

---
 xgboost/main.py         | 71 ++++++++++++++++++-----------------------
 xgboost/plot_results.py | 37 +++++++++++++++++++++++
 2 files changed, 69 insertions(+), 39 deletions(-)
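
Note (below the fold): each feature family below now follows the same
compute-or-load sequence instead of queueing jobs for a worker. A minimal
sketch of that pattern, assuming the compute_* helpers, csv_prefix, and
features_dict from main.py; the cache_or_compute wrapper itself is
illustrative and does not exist in this patch:

    import os
    import numpy as np

    def cache_or_compute(csv_prefix, feature_name, compute_fn, *args):
        # Reuse the cached ndarray when present; otherwise compute and persist.
        feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
        if os.path.exists(feature_file):
            return np.load(feature_file)      # cache hit: ndarray
        result = compute_fn(*args)            # cache miss: pandas Series
        np.save(feature_file, result.values)  # persist for the next run
        return result

    # e.g. features_dict['Close_lag_3'] = \
    #     cache_or_compute(csv_prefix, 'Close_lag_3', compute_lag, df, 'Close', 3)

As in the patch itself, cache hits yield NumPy arrays while freshly
computed features stay pandas Series.
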
diff --git a/xgboost/main.py b/xgboost/main.py
index cf1a462..0b59a32 100644
--- a/xgboost/main.py
+++ b/xgboost/main.py
@@ -6,13 +6,8 @@
 import numpy as np
 from sklearn.model_selection import train_test_split
 from custom_xgboost import CustomXGBoostGPU
 from sklearn.metrics import mean_squared_error
-from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns, plot_predicted_vs_actual_prices
-import ta
+from plot_results import plot_predicted_vs_actual_prices, plot_prediction_error_distribution
 from cycles.supertrend import Supertrends
-from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator
-from ta.momentum import ROCIndicator, KAMAIndicator, UltimateOscillator, StochasticOscillator, WilliamsRIndicator
-from ta.volatility import KeltnerChannel, DonchianChannel
-from ta.others import DailyReturnIndicator
 import time
 from numba import njit
@@ -516,8 +511,7 @@ if __name__ == '__main__':
                 np.save(sub_feature_file, values.values)
                 print(f'Saved feature: {sub_feature_file}')
 
-    # Prepare jobs for lags, rolling stats, log returns, and volatility
-    feature_jobs = []
+    # Compute lags, rolling stats, log returns, and volatility features sequentially
     # Lags
     for col in ohlcv_cols:
         for lag in range(1, lags + 1):
@@ -527,8 +521,11 @@ if __name__ == '__main__':
                 print(f'C Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Adding lag feature job: {feature_name}')
-                feature_jobs.append((feature_name, compute_lag, col, lag))
+                print(f'Computing lag feature: {feature_name}')
+                result = compute_lag(df, col, lag)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
     # Rolling statistics
     for col in ohlcv_cols:
         for window in window_sizes:
@@ -547,8 +544,11 @@ if __name__ == '__main__':
                 print(f'D Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Adding rolling stat feature job: {feature_name}')
-                feature_jobs.append((feature_name, compute_rolling, col, stat, window))
+                print(f'Computing rolling stat feature: {feature_name}')
+                result = compute_rolling(df, col, stat, window)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
     # Log returns for different horizons
     for horizon in [5, 15, 30]:
         feature_name = f'log_return_{horizon}'
@@ -557,8 +557,11 @@ if __name__ == '__main__':
                 print(f'E Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Adding log return feature job: {feature_name}')
-                feature_jobs.append((feature_name, compute_log_return, horizon))
+                print(f'Computing log return feature: {feature_name}')
+                result = compute_log_return(df, horizon)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
     # Volatility
     for window in window_sizes:
         feature_name = f'volatility_{window}'
@@ -567,22 +570,11 @@ if __name__ == '__main__':
             print(f'F Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
-            print(f'Adding volatility feature job: {feature_name}')
-            feature_jobs.append((feature_name, compute_volatility, window))
-
-    # Sequential computation for all non-cached features
-    if feature_jobs:
-        print(f'Computing {len(feature_jobs)} features sequentially...')
-        for job in feature_jobs:
-            print(f'Computing feature job: {job[0]}')
-            feature_name, result = run_feature_job(job, df)
+            print(f'Computing volatility feature: {feature_name}')
+            result = compute_volatility(df, window)
             features_dict[feature_name] = result
-            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
             np.save(feature_file, result.values)
-            print(f'Saved computed feature: {feature_file}')
-        print('All features computed.')
-    else:
-        print('All features loaded from cache.')
+            print(f'Saved feature: {feature_file}')
 
     # Concatenate all new features at once
     print('Concatenating all new features to DataFrame...')
@@ -633,17 +625,15 @@ if __name__ == '__main__':
         df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
         df['hour'] = df['Timestamp'].dt.hour
 
-    # Drop NaNs after all feature engineering
-    print('Dropping NaNs after feature engineering...')
-    df = df.dropna().reset_index(drop=True)
-
-    # Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
-    print('Selecting feature columns...')
-    exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
-    feature_cols = [col for col in df.columns if col not in exclude_cols]
-
-    # Print the features used for training
-    print("Features used for training:", feature_cols)
+    # Impute NaNs after all feature engineering
+    print('Imputing NaNs after feature engineering (using column means)...')
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
+    # If you want to impute non-numeric columns differently, add logic here
+
+    # Keep the feature-column selection: exclude_cols and feature_cols are still used below
+    exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
+    feature_cols = [col for col in df.columns if col not in exclude_cols]
 
     # Drop excluded columns to save memory
     print('Dropping excluded columns to save memory...')
@@ -721,6 +711,9 @@ if __name__ == '__main__':
     print('Plotting predicted vs actual prices...')
     plot_predicted_vs_actual_prices(actual_prices, predicted_prices, test_timestamps)
 
+    print('Plotting distribution of signed prediction errors...')
+    plot_prediction_error_distribution(predicted_prices, actual_prices)
+
     print("Final features used for training:", feature_cols)
 
     print("Shape of X:", X.shape)
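
A note on the NaN handling above: df.dropna() is replaced by column-mean
imputation, so the row count (and timestamp alignment) is preserved. A toy
example of the new behaviour; the frame below is invented for illustration:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'close': [1.0, np.nan, 3.0], 'hour': [0.0, 1.0, np.nan]})
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    # close -> [1.0, 2.0, 3.0], hour -> [0.0, 1.0, 0.5]; every row survives,
    # whereas the old df.dropna() would have kept only the first row.

One caveat: the means are computed over the full frame, so values from the
test period leak into earlier rows; computing them on the training split
only would avoid that.
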
diff --git a/xgboost/plot_results.py b/xgboost/plot_results.py
index 6acf429..3217c6b 100644
--- a/xgboost/plot_results.py
+++ b/xgboost/plot_results.py
@@ -167,3 +167,40 @@ def plot_predicted_vs_actual_prices(actual_prices, predicted_prices, timestamps=
     )
     fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
     pyo.plot(fig_scatter, filename='price_scatter_plot.html')
+
+def plot_prediction_error_distribution(predicted_prices, actual_prices, nbins=100):
+    """
+    Plots the distribution of signed prediction errors between predicted and actual prices,
+    coloring negative errors (under-prediction) and positive errors (over-prediction) differently.
+    """
+    import plotly.offline as pyo
+    import plotly.graph_objs as go
+    errors = np.array(predicted_prices) - np.array(actual_prices)
+
+    # Separate negative and positive errors
+    neg_errors = errors[errors < 0]
+    pos_errors = errors[errors >= 0]
+
+    trace_neg = go.Histogram(
+        x=neg_errors,
+        nbinsx=nbins,
+        opacity=0.75,
+        marker=dict(color='blue'),
+        name='Negative Error (Under-prediction)'
+    )
+    trace_pos = go.Histogram(
+        x=pos_errors,
+        nbinsx=nbins,
+        opacity=0.75,
+        marker=dict(color='orange'),
+        name='Positive Error (Over-prediction)'
+    )
+    layout = go.Layout(
+        title='Distribution of Prediction Errors (Signed)',
+        xaxis=dict(title='Prediction Error (Predicted - Actual)'),
+        yaxis=dict(title='Frequency'),
+        barmode='overlay',
+        bargap=0.05
+    )
+    fig = go.Figure(data=[trace_neg, trace_pos], layout=layout)
+    pyo.plot(fig, filename='prediction_error_distribution.html')
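
Usage note: a hypothetical smoke test for the new helper; the price series
are synthetic, and only the function name and signature come from this patch:

    import numpy as np
    from plot_results import plot_prediction_error_distribution

    rng = np.random.default_rng(0)
    actual = 100 + rng.normal(0, 1, 500).cumsum()   # synthetic price path
    predicted = actual + rng.normal(0, 0.5, 500)    # add small signed errors
    # Writes prediction_error_distribution.html to the working directory
    plot_prediction_error_distribution(predicted, actual, nbins=80)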