reverted to sequential computing for features, added one distribution visualization graph
commit ced64825bd
parent 2f98463df8
@@ -6,13 +6,8 @@ import numpy as np
 from sklearn.model_selection import train_test_split
 from custom_xgboost import CustomXGBoostGPU
 from sklearn.metrics import mean_squared_error
-from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns, plot_predicted_vs_actual_prices
-import ta
+from plot_results import plot_predicted_vs_actual_prices, plot_prediction_error_distribution
 from cycles.supertrend import Supertrends
-from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator
-from ta.momentum import ROCIndicator, KAMAIndicator, UltimateOscillator, StochasticOscillator, WilliamsRIndicator
-from ta.volatility import KeltnerChannel, DonchianChannel
-from ta.others import DailyReturnIndicator
 import time
 from numba import njit
 
@@ -516,8 +511,7 @@ if __name__ == '__main__':
             np.save(sub_feature_file, values.values)
             print(f'Saved feature: {sub_feature_file}')
 
-    # Prepare jobs for lags, rolling stats, log returns, and volatility
-    feature_jobs = []
+    # Prepare lags, rolling stats, log returns, and volatility features sequentially
     # Lags
     for col in ohlcv_cols:
         for lag in range(1, lags + 1):
@@ -527,8 +521,11 @@ if __name__ == '__main__':
                 print(f'C Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Adding lag feature job: {feature_name}')
-                feature_jobs.append((feature_name, compute_lag, col, lag))
+                print(f'Computing lag feature: {feature_name}')
+                result = compute_lag(df, col, lag)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
     # Rolling statistics
     for col in ohlcv_cols:
         for window in window_sizes:
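Each of the four feature families in this commit repeats the same inline load-or-compute-and-save pattern shown in the hunk above. As a reading aid only (this helper does not appear in the commit), the else-branch is equivalent to a small hypothetical wrapper:

    import os
    import numpy as np

    def cached_feature(feature_file, compute_fn, *args):
        # Reuse the persisted .npy array from a previous run if present;
        # otherwise compute the feature and persist it for next time.
        if os.path.exists(feature_file):
            print(f'Loading cached feature: {feature_file}')
            return np.load(feature_file)
        result = compute_fn(*args)  # e.g. compute_lag(df, col, lag)
        np.save(feature_file, result.values)  # assumes a pandas Series result
        print(f'Saved feature: {feature_file}')
        return result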
@@ -547,8 +544,11 @@ if __name__ == '__main__':
                 print(f'D Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Adding rolling stat feature job: {feature_name}')
-                feature_jobs.append((feature_name, compute_rolling, col, stat, window))
+                print(f'Computing rolling stat feature: {feature_name}')
+                result = compute_rolling(df, col, stat, window)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
     # Log returns for different horizons
     for horizon in [5, 15, 30]:
         feature_name = f'log_return_{horizon}'
@@ -557,8 +557,11 @@ if __name__ == '__main__':
             print(f'E Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
-            print(f'Adding log return feature job: {feature_name}')
-            feature_jobs.append((feature_name, compute_log_return, horizon))
+            print(f'Computing log return feature: {feature_name}')
+            result = compute_log_return(df, horizon)
+            features_dict[feature_name] = result
+            np.save(feature_file, result.values)
+            print(f'Saved feature: {feature_file}')
     # Volatility
     for window in window_sizes:
         feature_name = f'volatility_{window}'
@@ -567,22 +570,11 @@ if __name__ == '__main__':
             print(f'F Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
-            print(f'Adding volatility feature job: {feature_name}')
-            feature_jobs.append((feature_name, compute_volatility, window))
-
-    # Sequential computation for all non-cached features
-    if feature_jobs:
-        print(f'Computing {len(feature_jobs)} features sequentially...')
-        for job in feature_jobs:
-            print(f'Computing feature job: {job[0]}')
-            feature_name, result = run_feature_job(job, df)
+            print(f'Computing volatility feature: {feature_name}')
+            result = compute_volatility(df, window)
             features_dict[feature_name] = result
-            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
             np.save(feature_file, result.values)
-            print(f'Saved computed feature: {feature_file}')
-        print('All features computed.')
-    else:
-        print('All features loaded from cache.')
+            print(f'Saved feature: {feature_file}')
 
     # Concatenate all new features at once
     print('Concatenating all new features to DataFrame...')
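With run_feature_job gone, one asymmetry both versions share is worth noting: a cache hit stores the raw ndarray from np.load(...) in features_dict, while a fresh computation stores the pandas Series returned by the compute_* functions. If the later concatenation step expects a single type, a small normalization pass (not part of this commit; sketch only) would make the dict uniform:

    import numpy as np
    import pandas as pd

    def as_array(value):
        # Cached entries are ndarrays, fresh ones are Series; normalize to ndarray.
        return value.values if isinstance(value, pd.Series) else np.asarray(value)

    # e.g. features_dict = {name: as_array(v) for name, v in features_dict.items()}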
@@ -633,17 +625,11 @@ if __name__ == '__main__':
     df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
     df['hour'] = df['Timestamp'].dt.hour
 
-    # Drop NaNs after all feature engineering
-    print('Dropping NaNs after feature engineering...')
-    df = df.dropna().reset_index(drop=True)
-    # Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
-    print('Selecting feature columns...')
-    exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
-    feature_cols = [col for col in df.columns if col not in exclude_cols]
-
-    # Print the features used for training
-    print("Features used for training:", feature_cols)
-
+    # Impute NaNs after all feature engineering
+    print('Imputing NaNs after feature engineering (using column means)...')
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
+    # If you want to impute non-numeric columns differently, add logic here
 
     # Drop excluded columns to save memory
     print('Dropping excluded columns to save memory...')
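Switching from dropna to column-mean imputation keeps every row, but df[numeric_cols].mean() is computed over the whole frame, so with a chronological train/test split the fill values also reflect the test period. A variant that avoids this (sketch only; split_idx is illustrative, not from the commit) would fit the means on the training slice alone:

    import numpy as np

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    split_idx = int(len(df) * 0.8)  # illustrative chronological split point
    train_means = df[numeric_cols].iloc[:split_idx].mean()  # means from training rows only
    df[numeric_cols] = df[numeric_cols].fillna(train_means)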
@@ -721,6 +707,9 @@ if __name__ == '__main__':
     print('Plotting predicted vs actual prices...')
     plot_predicted_vs_actual_prices(actual_prices, predicted_prices, test_timestamps)
 
+    print('Plotting distribution of absolute prediction errors...')
+    plot_prediction_error_distribution(predicted_prices, actual_prices)
+
     print("Final features used for training:", feature_cols)
 
     print("Shape of X:", X.shape)
plot_results.py:

@@ -167,3 +167,40 @@ def plot_predicted_vs_actual_prices(actual_prices, predicted_prices, timestamps=
     )
     fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
     pyo.plot(fig_scatter, filename='price_scatter_plot.html')
+
+
+def plot_prediction_error_distribution(predicted_prices, actual_prices, nbins=100):
+    """
+    Plots the distribution of signed prediction errors between predicted and actual prices,
+    coloring negative errors (under-prediction) and positive errors (over-prediction) differently.
+    """
+    import plotly.offline as pyo
+    import plotly.graph_objs as go
+    errors = np.array(predicted_prices) - np.array(actual_prices)
+
+    # Separate negative and positive errors
+    neg_errors = errors[errors < 0]
+    pos_errors = errors[errors >= 0]
+
+    trace_neg = go.Histogram(
+        x=neg_errors,
+        nbinsx=nbins,
+        opacity=0.75,
+        marker=dict(color='blue'),
+        name='Negative Error (Under-prediction)'
+    )
+    trace_pos = go.Histogram(
+        x=pos_errors,
+        nbinsx=nbins,
+        opacity=0.75,
+        marker=dict(color='orange'),
+        name='Positive Error (Over-prediction)'
+    )
+    layout = go.Layout(
+        title='Distribution of Prediction Errors (Signed)',
+        xaxis=dict(title='Prediction Error (Predicted - Actual)'),
+        yaxis=dict(title='Frequency'),
+        barmode='overlay',
+        bargap=0.05
+    )
+    fig = go.Figure(data=[trace_neg, trace_pos], layout=layout)
+    pyo.plot(fig, filename='prediction_error_distribution.html')
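A minimal usage sketch for the new function, with synthetic data (the series below are illustrative, not from the repository); it assumes plot_results imports numpy as np at module level, as its existing functions suggest. Note also that the caller logs 'absolute prediction errors' while the function, per its docstring, plots signed errors:

    import numpy as np
    from plot_results import plot_prediction_error_distribution

    # Synthetic random-walk prices and a noisy 'prediction' of them.
    rng = np.random.default_rng(0)
    actual = 30000 + np.cumsum(rng.normal(0, 50, size=1000))
    predicted = actual + rng.normal(0, 75, size=1000)

    # Writes prediction_error_distribution.html to the current working directory.
    plot_prediction_error_distribution(predicted, actual)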