From ced64825bdabe54e5109d7e141763816be0b27bb Mon Sep 17 00:00:00 2001
From: Simon Moisy
Date: Fri, 30 May 2025 15:54:48 +0800
Subject: [PATCH] Revert to sequential feature computation; add a prediction
 error distribution plot

---
 xgboost/main.py         | 71 ++++++++++++++++++-----------------------
 xgboost/plot_results.py | 37 +++++++++++++++++++++++
 2 files changed, 69 insertions(+), 39 deletions(-)
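
Note (below the fold): each feature family below now follows the same
compute-or-load sequence instead of queueing jobs for a worker. A minimal
sketch of that pattern, assuming the compute_* helpers, csv_prefix, and
features_dict from main.py; the cache_or_compute wrapper itself is
illustrative and does not exist in this patch:

    import os
    import numpy as np

    def cache_or_compute(csv_prefix, feature_name, compute_fn, *args):
        # Reuse the cached ndarray when present; otherwise compute and persist.
        feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
        if os.path.exists(feature_file):
            return np.load(feature_file)      # cache hit: ndarray
        result = compute_fn(*args)            # cache miss: pandas Series
        np.save(feature_file, result.values)  # persist for the next run
        return result

    # e.g. features_dict['Close_lag_3'] = \
    #     cache_or_compute(csv_prefix, 'Close_lag_3', compute_lag, df, 'Close', 3)

As in the patch itself, cache hits yield NumPy arrays while freshly
computed features stay pandas Series.
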
diff --git a/xgboost/main.py b/xgboost/main.py
index cf1a462..0b59a32 100644
--- a/xgboost/main.py
+++ b/xgboost/main.py
@@ -6,13 +6,8 @@
 import numpy as np
 from sklearn.model_selection import train_test_split
 from custom_xgboost import CustomXGBoostGPU
 from sklearn.metrics import mean_squared_error
-from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns, plot_predicted_vs_actual_prices
-import ta
+from plot_results import plot_predicted_vs_actual_prices, plot_prediction_error_distribution
 from cycles.supertrend import Supertrends
-from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator
-from ta.momentum import ROCIndicator, KAMAIndicator, UltimateOscillator, StochasticOscillator, WilliamsRIndicator
-from ta.volatility import KeltnerChannel, DonchianChannel
-from ta.others import DailyReturnIndicator
 import time
 from numba import njit
@@ -516,8 +511,7 @@ if __name__ == '__main__':
                 np.save(sub_feature_file, values.values)
                 print(f'Saved feature: {sub_feature_file}')
 
-    # Prepare jobs for lags, rolling stats, log returns, and volatility
-    feature_jobs = []
+    # Compute lags, rolling stats, log returns, and volatility features sequentially
     # Lags
     for col in ohlcv_cols:
         for lag in range(1, lags + 1):
@@ -527,8 +521,11 @@ if __name__ == '__main__':
                 print(f'C Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Adding lag feature job: {feature_name}')
-                feature_jobs.append((feature_name, compute_lag, col, lag))
+                print(f'Computing lag feature: {feature_name}')
+                result = compute_lag(df, col, lag)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
     # Rolling statistics
     for col in ohlcv_cols:
         for window in window_sizes:
@@ -547,8 +544,11 @@ if __name__ == '__main__':
                 print(f'D Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Adding rolling stat feature job: {feature_name}')
-                feature_jobs.append((feature_name, compute_rolling, col, stat, window))
+                print(f'Computing rolling stat feature: {feature_name}')
+                result = compute_rolling(df, col, stat, window)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
     # Log returns for different horizons
     for horizon in [5, 15, 30]:
         feature_name = f'log_return_{horizon}'
@@ -557,8 +557,11 @@ if __name__ == '__main__':
                 print(f'E Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Adding log return feature job: {feature_name}')
-                feature_jobs.append((feature_name, compute_log_return, horizon))
+                print(f'Computing log return feature: {feature_name}')
+                result = compute_log_return(df, horizon)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
     # Volatility
     for window in window_sizes:
         feature_name = f'volatility_{window}'
@@ -567,22 +570,11 @@ if __name__ == '__main__':
             print(f'F Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
-            print(f'Adding volatility feature job: {feature_name}')
-            feature_jobs.append((feature_name, compute_volatility, window))
-
-    # Sequential computation for all non-cached features
-    if feature_jobs:
-        print(f'Computing {len(feature_jobs)} features sequentially...')
-        for job in feature_jobs:
-            print(f'Computing feature job: {job[0]}')
-            feature_name, result = run_feature_job(job, df)
+            print(f'Computing volatility feature: {feature_name}')
+            result = compute_volatility(df, window)
             features_dict[feature_name] = result
-            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
             np.save(feature_file, result.values)
-            print(f'Saved computed feature: {feature_file}')
-        print('All features computed.')
-    else:
-        print('All features loaded from cache.')
+            print(f'Saved feature: {feature_file}')
 
     # Concatenate all new features at once
     print('Concatenating all new features to DataFrame...')
@@ -633,17 +625,15 @@ if __name__ == '__main__':
         df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
         df['hour'] = df['Timestamp'].dt.hour
 
-    # Drop NaNs after all feature engineering
-    print('Dropping NaNs after feature engineering...')
-    df = df.dropna().reset_index(drop=True)
-
-    # Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
-    print('Selecting feature columns...')
-    exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
-    feature_cols = [col for col in df.columns if col not in exclude_cols]
-
-    # Print the features used for training
-    print("Features used for training:", feature_cols)
+    # Impute NaNs after all feature engineering
+    print('Imputing NaNs after feature engineering (using column means)...')
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
+    # If you want to impute non-numeric columns differently, add logic here
+
+    # Keep the feature-column selection: exclude_cols and feature_cols are still used below
+    exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
+    feature_cols = [col for col in df.columns if col not in exclude_cols]
 
     # Drop excluded columns to save memory
     print('Dropping excluded columns to save memory...')
@@ -721,6 +711,9 @@ if __name__ == '__main__':
     print('Plotting predicted vs actual prices...')
     plot_predicted_vs_actual_prices(actual_prices, predicted_prices, test_timestamps)
 
+    print('Plotting distribution of signed prediction errors...')
+    plot_prediction_error_distribution(predicted_prices, actual_prices)
+
     print("Final features used for training:", feature_cols)
 
     print("Shape of X:", X.shape)
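
A note on the NaN handling above: df.dropna() is replaced by column-mean
imputation, so the row count (and timestamp alignment) is preserved. A toy
example of the new behaviour; the frame below is invented for illustration:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'close': [1.0, np.nan, 3.0], 'hour': [0.0, 1.0, np.nan]})
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    # close -> [1.0, 2.0, 3.0], hour -> [0.0, 1.0, 0.5]; every row survives,
    # whereas the old df.dropna() would have kept only the first row.

One caveat: the means are computed over the full frame, so values from the
test period leak into earlier rows; computing them on the training split
only would avoid that.
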
diff --git a/xgboost/plot_results.py b/xgboost/plot_results.py
index 6acf429..3217c6b 100644
--- a/xgboost/plot_results.py
+++ b/xgboost/plot_results.py
@@ -167,3 +167,40 @@ def plot_predicted_vs_actual_prices(actual_prices, predicted_prices, timestamps=
     )
     fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
     pyo.plot(fig_scatter, filename='price_scatter_plot.html')
+
+def plot_prediction_error_distribution(predicted_prices, actual_prices, nbins=100):
+    """
+    Plots the distribution of signed prediction errors between predicted and actual prices,
+    coloring negative errors (under-prediction) and positive errors (over-prediction) differently.
+    """
+    import plotly.offline as pyo
+    import plotly.graph_objs as go
+    errors = np.array(predicted_prices) - np.array(actual_prices)
+
+    # Separate negative and positive errors
+    neg_errors = errors[errors < 0]
+    pos_errors = errors[errors >= 0]
+
+    trace_neg = go.Histogram(
+        x=neg_errors,
+        nbinsx=nbins,
+        opacity=0.75,
+        marker=dict(color='blue'),
+        name='Negative Error (Under-prediction)'
+    )
+    trace_pos = go.Histogram(
+        x=pos_errors,
+        nbinsx=nbins,
+        opacity=0.75,
+        marker=dict(color='orange'),
+        name='Positive Error (Over-prediction)'
+    )
+    layout = go.Layout(
+        title='Distribution of Prediction Errors (Signed)',
+        xaxis=dict(title='Prediction Error (Predicted - Actual)'),
+        yaxis=dict(title='Frequency'),
+        barmode='overlay',
+        bargap=0.05
+    )
+    fig = go.Figure(data=[trace_neg, trace_pos], layout=layout)
+    pyo.plot(fig, filename='prediction_error_distribution.html')
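
Usage note: a hypothetical smoke test for the new helper; the price series
are synthetic, and only the function name and signature come from this patch:

    import numpy as np
    from plot_results import plot_prediction_error_distribution

    rng = np.random.default_rng(0)
    actual = 100 + rng.normal(0, 1, 500).cumsum()   # synthetic price path
    predicted = actual + rng.normal(0, 0.5, 500)    # add small signed errors
    # Writes prediction_error_distribution.html to the working directory
    plot_prediction_error_distribution(predicted, actual, nbins=80)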