Reverted to sequential computing for features; added one distribution visualization graph

Simon Moisy 2025-05-30 15:54:48 +08:00
parent 2f98463df8
commit ced64825bd
2 changed files with 65 additions and 39 deletions


@@ -6,13 +6,8 @@ import numpy as np
 from sklearn.model_selection import train_test_split
 from custom_xgboost import CustomXGBoostGPU
 from sklearn.metrics import mean_squared_error
-from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns, plot_predicted_vs_actual_prices
-import ta
+from plot_results import plot_predicted_vs_actual_prices, plot_prediction_error_distribution
 from cycles.supertrend import Supertrends
-from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator
-from ta.momentum import ROCIndicator, KAMAIndicator, UltimateOscillator, StochasticOscillator, WilliamsRIndicator
-from ta.volatility import KeltnerChannel, DonchianChannel
-from ta.others import DailyReturnIndicator
 import time
 from numba import njit
@@ -516,8 +511,7 @@ if __name__ == '__main__':
         np.save(sub_feature_file, values.values)
         print(f'Saved feature: {sub_feature_file}')
-    # Prepare jobs for lags, rolling stats, log returns, and volatility
-    feature_jobs = []
+    # Prepare lags, rolling stats, log returns, and volatility features sequentially
     # Lags
     for col in ohlcv_cols:
         for lag in range(1, lags + 1):
@@ -527,8 +521,11 @@ if __name__ == '__main__':
                 print(f'C Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Adding lag feature job: {feature_name}')
-                feature_jobs.append((feature_name, compute_lag, col, lag))
+                print(f'Computing lag feature: {feature_name}')
+                result = compute_lag(df, col, lag)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
     # Rolling statistics
     for col in ohlcv_cols:
         for window in window_sizes:
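Note: compute_lag is defined outside this hunk; the sequential path only assumes it returns a pandas Series, so result.values can be passed to np.save. A minimal sketch under that assumption (the f'{col}_lag_{lag}' name pattern is hypothetical):

import pandas as pd

def compute_lag(df: pd.DataFrame, col: str, lag: int) -> pd.Series:
    # Shift the column down by `lag` rows so each row carries the value
    # observed `lag` steps earlier; the first `lag` rows become NaN.
    return df[col].shift(lag).rename(f'{col}_lag_{lag}')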
@@ -547,8 +544,11 @@ if __name__ == '__main__':
                 print(f'D Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Adding rolling stat feature job: {feature_name}')
-                feature_jobs.append((feature_name, compute_rolling, col, stat, window))
+                print(f'Computing rolling stat feature: {feature_name}')
+                result = compute_rolling(df, col, stat, window)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
     # Log returns for different horizons
     for horizon in [5, 15, 30]:
         feature_name = f'log_return_{horizon}'
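As with compute_lag, compute_rolling is defined elsewhere; a sketch assuming stat is a pandas rolling-method name such as 'mean' or 'std' (the getattr dispatch and the name pattern are hypothetical):

import pandas as pd

def compute_rolling(df: pd.DataFrame, col: str, stat: str, window: int) -> pd.Series:
    # Look up the requested statistic ('mean', 'std', 'min', 'max', ...)
    # on the rolling-window object and evaluate it.
    return getattr(df[col].rolling(window), stat)().rename(f'{col}_{stat}_{window}')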
@@ -557,8 +557,11 @@ if __name__ == '__main__':
             print(f'E Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
-            print(f'Adding log return feature job: {feature_name}')
-            feature_jobs.append((feature_name, compute_log_return, horizon))
+            print(f'Computing log return feature: {feature_name}')
+            result = compute_log_return(df, horizon)
+            features_dict[feature_name] = result
+            np.save(feature_file, result.values)
+            print(f'Saved feature: {feature_file}')
     # Volatility
     for window in window_sizes:
         feature_name = f'volatility_{window}'
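The log_return_{horizon} columns are later excluded from the feature set as targets, so the shift direction in compute_log_return matters. This sketch assumes a trailing return over Close; a forward-looking target variant would use shift(-horizon) instead:

import numpy as np
import pandas as pd

def compute_log_return(df: pd.DataFrame, horizon: int) -> pd.Series:
    # Trailing log return over `horizon` rows: ln(Close_t / Close_{t-horizon}).
    return np.log(df['Close'] / df['Close'].shift(horizon)).rename(f'log_return_{horizon}')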
@@ -567,22 +570,11 @@ if __name__ == '__main__':
             print(f'F Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
-            print(f'Adding volatility feature job: {feature_name}')
-            feature_jobs.append((feature_name, compute_volatility, window))
-    # Sequential computation for all non-cached features
-    if feature_jobs:
-        print(f'Computing {len(feature_jobs)} features sequentially...')
-        for job in feature_jobs:
-            print(f'Computing feature job: {job[0]}')
-            feature_name, result = run_feature_job(job, df)
-            features_dict[feature_name] = result
-            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
-            np.save(feature_file, result.values)
-            print(f'Saved computed feature: {feature_file}')
-        print('All features computed.')
-    else:
-        print('All features loaded from cache.')
+            print(f'Computing volatility feature: {feature_name}')
+            result = compute_volatility(df, window)
+            features_dict[feature_name] = result
+            np.save(feature_file, result.values)
+            print(f'Saved feature: {feature_file}')
     # Concatenate all new features at once
     print('Concatenating all new features to DataFrame...')
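compute_volatility is likewise not shown in the diff; one common definition, assumed here, is the rolling standard deviation of one-step log returns:

import numpy as np
import pandas as pd

def compute_volatility(df: pd.DataFrame, window: int) -> pd.Series:
    # Rolling standard deviation of one-step log returns over `window` rows.
    log_ret = np.log(df['Close'] / df['Close'].shift(1))
    return log_ret.rolling(window).std().rename(f'volatility_{window}')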
@@ -633,17 +625,11 @@ if __name__ == '__main__':
     df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
     df['hour'] = df['Timestamp'].dt.hour
-    # Drop NaNs after all feature engineering
-    print('Dropping NaNs after feature engineering...')
-    df = df.dropna().reset_index(drop=True)
-    # Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
-    print('Selecting feature columns...')
-    exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
-    feature_cols = [col for col in df.columns if col not in exclude_cols]
-    # Print the features used for training
-    print("Features used for training:", feature_cols)
+    # Impute NaNs after all feature engineering
+    print('Imputing NaNs after feature engineering (using column means)...')
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
+    # If you want to impute non-numeric columns differently, add logic here
     # Drop excluded columns to save memory
     print('Dropping excluded columns to save memory...')
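The switch from dropna() to column-mean imputation preserves row count, which matters once long lags and windows put NaNs at the head of every derived column. A toy illustration of the committed pattern (column names are illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'Timestamp': ['t0', 't1', 't2']})
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
print(df['a'].tolist())  # [1.0, 2.0, 3.0] -- all 3 rows kept; dropna() would leave 2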
@@ -721,6 +707,9 @@ if __name__ == '__main__':
     print('Plotting predicted vs actual prices...')
     plot_predicted_vs_actual_prices(actual_prices, predicted_prices, test_timestamps)
+    print('Plotting distribution of signed prediction errors...')
+    plot_prediction_error_distribution(predicted_prices, actual_prices)
     print("Final features used for training:", feature_cols)
     print("Shape of X:", X.shape)

plot_results.py

@@ -167,3 +167,40 @@ def plot_predicted_vs_actual_prices(actual_prices, predicted_prices, timestamps=
     )
     fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
     pyo.plot(fig_scatter, filename='price_scatter_plot.html')
+
+def plot_prediction_error_distribution(predicted_prices, actual_prices, nbins=100):
+    """
+    Plots the distribution of signed prediction errors between predicted and actual prices,
+    coloring negative errors (under-prediction) and positive errors (over-prediction) differently.
+    """
+    import plotly.offline as pyo
+    import plotly.graph_objs as go
+    errors = np.array(predicted_prices) - np.array(actual_prices)
+    # Separate negative and positive errors
+    neg_errors = errors[errors < 0]
+    pos_errors = errors[errors >= 0]
+    trace_neg = go.Histogram(
+        x=neg_errors,
+        nbinsx=nbins,
+        opacity=0.75,
+        marker=dict(color='blue'),
+        name='Negative Error (Under-prediction)'
+    )
+    trace_pos = go.Histogram(
+        x=pos_errors,
+        nbinsx=nbins,
+        opacity=0.75,
+        marker=dict(color='orange'),
+        name='Positive Error (Over-prediction)'
+    )
+    layout = go.Layout(
+        title='Distribution of Prediction Errors (Signed)',
+        xaxis=dict(title='Prediction Error (Predicted - Actual)'),
+        yaxis=dict(title='Frequency'),
+        barmode='overlay',
+        bargap=0.05
+    )
+    fig = go.Figure(data=[trace_neg, trace_pos], layout=layout)
+    pyo.plot(fig, filename='prediction_error_distribution.html')
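A quick smoke test for the new helper using synthetic data (the array values are illustrative; note the function body relies on numpy already being imported as np at plot_results.py module scope):

import numpy as np
from plot_results import plot_prediction_error_distribution

rng = np.random.default_rng(0)
actual = rng.uniform(30_000.0, 31_000.0, size=1_000)    # stand-in "actual" prices
predicted = actual + rng.normal(0.0, 50.0, size=1_000)  # add noise to mimic model error
# Writes prediction_error_distribution.html and opens it in a browser by default.
plot_prediction_error_distribution(predicted, actual)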