Add evaluation module with walk-forward CV and metrics computation. Enhance feature engineering with additional OHLCV features and rolling statistics. Update main script to integrate walk-forward CV for feature importance and pruning.
parent a419764fff
commit add3fbcf19

evaluation.py (new file, +77 lines)
@@ -0,0 +1,77 @@
import numpy as np
from typing import Dict, List, Tuple

try:
    from .custom_xgboost import CustomXGBoostGPU
except ImportError:
    from custom_xgboost import CustomXGBoostGPU

from sklearn.metrics import mean_squared_error, r2_score


def _compute_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[float, float, float, float]:
    """Compute RMSE, MAPE, R2, and directional accuracy.

    Returns:
        (rmse, mape, r2, directional_accuracy)
    """
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    with np.errstate(divide='ignore', invalid='ignore'):
        mape_arr = np.abs((y_true - y_pred) / np.where(y_true == 0, np.nan, y_true))
        mape = float(np.nanmean(mape_arr) * 100.0)
    r2 = float(r2_score(y_true, y_pred))
    direction_actual = np.sign(np.diff(y_true))
    direction_pred = np.sign(np.diff(y_pred))
    min_len = min(len(direction_actual), len(direction_pred))
    if min_len == 0:
        dir_acc = 0.0
    else:
        dir_acc = float((direction_actual[:min_len] == direction_pred[:min_len]).mean())
    return rmse, mape, r2, dir_acc


def walk_forward_cv(
    X: np.ndarray,
    y: np.ndarray,
    feature_names: List[str],
    n_splits: int = 5,
) -> Tuple[Dict[str, float], Dict[str, float]]:
    """Run simple walk-forward CV and aggregate metrics and feature importances.

    Returns:
        metrics_avg: Average metrics across folds {rmse, mape, r2, dir_acc}
        importance_avg: Average feature importance across folds {feature -> importance}
    """
    num_samples = len(X)
    fold_size = num_samples // (n_splits + 1)
    if fold_size <= 0:
        raise ValueError("Not enough samples for walk-forward CV")

    metrics_accum = {"rmse": [], "mape": [], "r2": [], "dir_acc": []}
    importance_sum = {name: 0.0 for name in feature_names}

    for i in range(1, n_splits + 1):
        train_end = i * fold_size
        test_end = (i + 1) * fold_size if i < n_splits else num_samples
        X_train, y_train = X[:train_end], y[:train_end]
        X_test, y_test = X[train_end:test_end], y[train_end:test_end]
        if len(X_test) == 0:
            continue

        model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
        model.train(eval_metric='rmse')

        preds = model.predict(X_test)
        rmse, mape, r2, dir_acc = _compute_metrics(y_test, preds)
        metrics_accum["rmse"].append(rmse)
        metrics_accum["mape"].append(mape)
        metrics_accum["r2"].append(r2)
        metrics_accum["dir_acc"].append(dir_acc)

        fold_importance = model.get_feature_importance(feature_names)
        for name, val in fold_importance.items():
            importance_sum[name] += float(val)

    metrics_avg = {k: float(np.mean(v)) if len(v) > 0 else float('nan') for k, v in metrics_accum.items()}
    importance_avg = {k: (importance_sum[k] / n_splits) for k in feature_names}
    return metrics_avg, importance_avg
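A minimal usage sketch (not part of the commit) showing how the new module can be driven; the synthetic X, y, and feature_cols below stand in for the arrays that main.py builds from the feature pipeline. With 600 rows and n_splits=5, fold_size is 100, so the folds train on the first 100/200/.../500 rows and test on the following 100 (the last fold runs to the end of the data):

import numpy as np
from evaluation import walk_forward_cv  # assumes evaluation.py is importable

rng = np.random.default_rng(0)
feature_cols = [f'feat_{i}' for i in range(8)]                    # illustrative names
X = rng.normal(size=(600, len(feature_cols))).astype(np.float32)
y = rng.normal(size=600).astype(np.float32)

# Expanding-window CV: metrics_avg is the fold average of rmse/mape/r2/dir_acc,
# importance_avg maps each feature name to its importance averaged over folds.
metrics_avg, importance_avg = walk_forward_cv(X, y, feature_cols, n_splits=5)
print(metrics_avg)
print(sorted(importance_avg, key=importance_avg.get, reverse=True)[:3])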
@@ -433,4 +433,157 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
            np.save(st_file, features_dict[st_name].values)
            np.save(st_trend_file, features_dict[st_trend_name].values)

    # --- OHLCV-only additional features ---
    # Helper for caching single-series features using the same pattern as above
    def _save_or_load_feature(name, series):
        if csv_prefix:
            feature_file = f'../data/{csv_prefix}_{name}.npy'
            if os.path.exists(feature_file):
                arr = np.load(feature_file)
                features_dict[name] = pd.Series(arr, index=df.index)
            else:
                # Ensure pandas Series with correct index
                series = pd.Series(series, index=df.index)
                features_dict[name] = series
                np.save(feature_file, series.values)
        else:
            series = pd.Series(series, index=df.index)
            features_dict[name] = series

    eps = 1e-9

    # Candle shape/position
    body = (df['Close'] - df['Open']).abs()
    rng = (df['High'] - df['Low'])
    upper_wick = df['High'] - df[['Open', 'Close']].max(axis=1)
    lower_wick = df[['Open', 'Close']].min(axis=1) - df['Low']

    _save_or_load_feature('candle_body', body)
    _save_or_load_feature('candle_upper_wick', upper_wick)
    _save_or_load_feature('candle_lower_wick', lower_wick)
    _save_or_load_feature('candle_body_to_range', body / (rng + eps))
    _save_or_load_feature('candle_upper_wick_to_range', upper_wick / (rng + eps))
    _save_or_load_feature('candle_lower_wick_to_range', lower_wick / (rng + eps))
    _save_or_load_feature('close_pos_in_bar', (df['Close'] - df['Low']) / (rng + eps))

    for w in window_sizes:
        roll_max = df['High'].rolling(w).max()
        roll_min = df['Low'].rolling(w).min()
        close_pos_roll = (df['Close'] - roll_min) / ((roll_max - roll_min) + eps)
        _save_or_load_feature(f'close_pos_in_roll_{w}', close_pos_roll)

    # Range-based volatility (Parkinson, Garman–Klass, Rogers–Satchell, Yang–Zhang)
    log_hl = np.log((df['High'] / df['Low']).replace(0, np.nan))
    log_co = np.log((df['Close'] / df['Open']).replace(0, np.nan))
    log_close = np.log(df['Close'].replace(0, np.nan))
    ret1 = log_close.diff()

    for w in window_sizes:
        # Parkinson
        parkinson_var = (log_hl.pow(2)).rolling(w).mean() / (4.0 * np.log(2.0))
        _save_or_load_feature(f'park_vol_{w}', np.sqrt(parkinson_var.clip(lower=0)))

        # Garman–Klass
        gk_var = 0.5 * (log_hl.pow(2)).rolling(w).mean() - (2.0 * np.log(2.0) - 1.0) * (log_co.pow(2)).rolling(w).mean()
        _save_or_load_feature(f'gk_vol_{w}', np.sqrt(gk_var.clip(lower=0)))

        # Rogers–Satchell
        u = np.log((df['High'] / df['Close']).replace(0, np.nan))
        d = np.log((df['Low'] / df['Close']).replace(0, np.nan))
        uo = np.log((df['High'] / df['Open']).replace(0, np.nan))
        do = np.log((df['Low'] / df['Open']).replace(0, np.nan))
        rs_term = u * uo + d * do
        rs_var = rs_term.rolling(w).mean()
        _save_or_load_feature(f'rs_vol_{w}', np.sqrt(rs_var.clip(lower=0)))

        # Yang–Zhang
        g = np.log((df['Open'] / df['Close'].shift(1)).replace(0, np.nan))
        u_yz = np.log((df['High'] / df['Open']).replace(0, np.nan))
        d_yz = np.log((df['Low'] / df['Open']).replace(0, np.nan))
        c_yz = np.log((df['Close'] / df['Open']).replace(0, np.nan))
        sigma_g2 = g.rolling(w).var()
        sigma_c2 = c_yz.rolling(w).var()
        sigma_rs = (u_yz * (u_yz - c_yz) + d_yz * (d_yz - c_yz)).rolling(w).mean()
        k = 0.34 / (1.34 + (w + 1.0) / max(w - 1.0, 1.0))
        yz_var = sigma_g2 + k * sigma_c2 + (1.0 - k) * sigma_rs
        _save_or_load_feature(f'yz_vol_{w}', np.sqrt(yz_var.clip(lower=0)))

    # Trend strength: rolling linear-regression slope and R² of log price
    def _linreg_slope(arr):
        y = np.asarray(arr, dtype=float)
        n = y.size
        x = np.arange(n, dtype=float)
        xmean = (n - 1.0) / 2.0
        ymean = np.nanmean(y)
        xm = x - xmean
        ym = y - ymean
        cov = np.nansum(xm * ym)
        varx = np.nansum(xm * xm) + eps
        return cov / varx

    def _linreg_r2(arr):
        y = np.asarray(arr, dtype=float)
        n = y.size
        x = np.arange(n, dtype=float)
        xmean = (n - 1.0) / 2.0
        ymean = np.nanmean(y)
        slope = _linreg_slope(arr)
        intercept = ymean - slope * xmean
        yhat = slope * x + intercept
        ss_tot = np.nansum((y - ymean) ** 2)
        ss_res = np.nansum((y - yhat) ** 2)
        return 1.0 - ss_res / (ss_tot + eps)

    for w in window_sizes:
        _save_or_load_feature(f'lr_slope_log_close_{w}', log_close.rolling(w).apply(_linreg_slope, raw=True))
        _save_or_load_feature(f'lr_r2_log_close_{w}', log_close.rolling(w).apply(_linreg_r2, raw=True))

    # EMA(7), EMA(21), their slopes and spread
    ema_7 = df['Close'].ewm(span=7, adjust=False).mean()
    ema_21 = df['Close'].ewm(span=21, adjust=False).mean()
    _save_or_load_feature('ema_7', ema_7)
    _save_or_load_feature('ema_21', ema_21)
    _save_or_load_feature('ema_7_slope', ema_7.pct_change())
    _save_or_load_feature('ema_21_slope', ema_21.pct_change())
    _save_or_load_feature('ema_7_21_spread', ema_7 - ema_21)

    # VWAP over windows and distance of Close from VWAP
    tp = (df['High'] + df['Low'] + df['Close']) / 3.0
    for w in window_sizes:
        vwap_w = (tp * df['Volume']).rolling(w).sum() / (df['Volume'].rolling(w).sum() + eps)
        _save_or_load_feature(f'vwap_{w}', vwap_w)
        _save_or_load_feature(f'vwap_dist_{w}', (df['Close'] - vwap_w) / (vwap_w + eps))

    # Autocorrelation of log returns at lags 1–5 (rolling window 30)
    for lag in range(1, 6):
        ac = ret1.rolling(30).corr(ret1.shift(lag))
        _save_or_load_feature(f'ret_autocorr_lag{lag}_30', ac)

    # Rolling skewness and kurtosis of returns (15, 30)
    for w in [15, 30]:
        _save_or_load_feature(f'ret_skew_{w}', ret1.rolling(w).skew())
        _save_or_load_feature(f'ret_kurt_{w}', ret1.rolling(w).kurt())

    # Volume z-score and return-volume rolling correlation (15, 30)
    for w in [15, 30]:
        vol_mean = df['Volume'].rolling(w).mean()
        vol_std = df['Volume'].rolling(w).std()
        _save_or_load_feature(f'volume_zscore_{w}', (df['Volume'] - vol_mean) / (vol_std + eps))
        _save_or_load_feature(f'ret_vol_corr_{w}', ret1.rolling(w).corr(df['Volume']))

    # Cyclical time features and relative volume vs hour-of-day average
    try:
        hours = pd.to_datetime(df['Timestamp']).dt.hour
    except Exception:
        try:
            hours = pd.to_datetime(df['Timestamp'], unit='s', errors='coerce').dt.hour
        except Exception:
            hours = pd.Series(np.nan, index=df.index)

    _save_or_load_feature('sin_hour', np.sin(2.0 * np.pi * (hours.fillna(0)) / 24.0))
    _save_or_load_feature('cos_hour', np.cos(2.0 * np.pi * (hours.fillna(0)) / 24.0))

    hourly_mean_vol = df['Volume'].groupby(hours).transform('mean')
    _save_or_load_feature('relative_volume_hour', df['Volume'] / (hourly_mean_vol + eps))

    return features_dict
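For reference, the rolling estimators above follow the standard range-based variance formulas; a sketch, with the overline denoting the rolling mean over window w and O, H, L, C the bar's open, high, low, and close:

\sigma_P^2    = \frac{1}{4\ln 2}\,\overline{\left(\ln\frac{H_t}{L_t}\right)^{2}}
\sigma_{GK}^2 = \frac{1}{2}\,\overline{\left(\ln\frac{H_t}{L_t}\right)^{2}} - (2\ln 2 - 1)\,\overline{\left(\ln\frac{C_t}{O_t}\right)^{2}}
\sigma_{RS}^2 = \overline{\ln\frac{H_t}{C_t}\,\ln\frac{H_t}{O_t} + \ln\frac{L_t}{C_t}\,\ln\frac{L_t}{O_t}}
\sigma_{YZ}^2 = \sigma_o^2 + k\,\sigma_c^2 + (1-k)\,\sigma_{RS}^2, \qquad k = \frac{0.34}{1.34 + \frac{w+1}{w-1}}

where \sigma_o^2 is the rolling variance of the overnight log return \ln(O_t/C_{t-1}) and \sigma_c^2 that of the open-to-close return \ln(C_t/O_t); the code clips negative variances to zero and stores the square root as the volatility feature.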
main.py (67 lines changed)
@@ -4,6 +4,7 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 import pandas as pd
 import numpy as np
 from custom_xgboost import CustomXGBoostGPU
+from evaluation import walk_forward_cv
 from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
 from plot_results import plot_prediction_error_distribution, plot_direction_transition_heatmap
 import time
@@ -193,7 +194,8 @@ if __name__ == '__main__':
     if not os.path.exists(results_csv):
         with open(results_csv, 'w', newline='') as f:
             writer = csv.writer(f)
-            writer.writerow(['num_features', 'added feature', 'rmse', 'mae', 'r2', 'mape', 'directional_accuracy', 'feature_importance'])
+            # Header aligned with what we actually write: feature, rmse, mape, r2, directional_accuracy, feature_importance
+            writer.writerow(['feature', 'rmse', 'mape', 'r2', 'directional_accuracy', 'feature_importance'])

     try:
         X = df[feature_cols].values.astype(np.float32)
@@ -203,7 +205,64 @@
         y_train, y_test = y[:split_idx], y[split_idx:]
         test_timestamps = df['Timestamp'].values[split_idx:]

-        model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
+        # Optional: walk-forward CV to obtain averaged importance for pruning
+        DO_WALK_FORWARD_CV = True
+        AUTO_PRUNE = True
+        TOP_K = 150  # keep top-K features by averaged importance
+
+        feature_importance_avg = None
+        if DO_WALK_FORWARD_CV:
+            print('Running walk-forward CV for metrics and averaged feature importance...')
+            metrics_avg, importance_avg = walk_forward_cv(X, y, feature_cols, n_splits=5)
+            print(f"CV metrics: RMSE={metrics_avg['rmse']:.6f}, MAPE={metrics_avg['mape']:.4f}%, R2={metrics_avg['r2']:.6f}, DirAcc={metrics_avg['dir_acc']:.4f}")
+            feature_importance_avg = importance_avg
+
+        # Auto-prune low-importance and known low-value features
+        prune_set = set()
+        # Known low-value from analysis
+        KNOWN_LOW = [
+            'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0',
+            'supertrend_trend_12_3.0', 'supertrend_trend_10_1.0', 'supertrend_trend_11_2.0',
+        ]
+        # Prefer cyclical hour; drop raw hour
+        KNOWN_LOW += ['hour']
+
+        if feature_importance_avg is not None:
+            # Keep top-K by averaged importance
+            sorted_feats = sorted(feature_importance_avg.items(), key=lambda kv: kv[1], reverse=True)
+            keep_names = set(name for name, _ in sorted_feats[:TOP_K])
+            for name in feature_cols:
+                if name not in keep_names:
+                    prune_set.add(name)
+
+        for name in KNOWN_LOW:
+            if name in feature_cols:
+                prune_set.add(name)
+
+        # Drop alternative vol estimators if Parkinson present at same window
+        for w in [5, 15, 30]:
+            park = f'park_vol_{w}'
+            gk = f'gk_vol_{w}'
+            rs = f'rs_vol_{w}'
+            yz = f'yz_vol_{w}'
+            if park in feature_cols:
+                for alt in [gk, rs, yz]:
+                    if alt in feature_cols:
+                        prune_set.add(alt)
+
+        # Apply pruning to training set
+        if AUTO_PRUNE and prune_set:
+            print(f'Pruning {len(prune_set)} features before training...')
+            kept_feature_cols = [c for c in feature_cols if c not in prune_set]
+        else:
+            kept_feature_cols = feature_cols
+
+        model = CustomXGBoostGPU(
+            df[kept_feature_cols].values.astype(np.float32)[:split_idx],
+            df[kept_feature_cols].values.astype(np.float32)[split_idx:],
+            y[:split_idx],
+            y[split_idx:]
+        )
         booster = model.train(eval_metric='rmse')
         # colsample_bytree=1.0,
         # learning_rate=0.05,
@@ -239,10 +298,10 @@
         mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100

         # Save results to CSV for all features used in this run
-        feature_importance_dict = model.get_feature_importance(feature_cols)
+        feature_importance_dict = model.get_feature_importance(kept_feature_cols)
         with open(results_csv, 'a', newline='') as f:
             writer = csv.writer(f)
-            for feature in feature_cols:
+            for feature in kept_feature_cols:
                 importance = feature_importance_dict.get(feature, 0.0)
                 fi_str = format(importance, ".6f")
                 row = [feature]
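A condensed, illustrative restatement of the pruning rules above (the helper name, signature, and defaults are not from main.py; the 5/15/30 windows mirror the loop in the diff):

def select_features(feature_cols, importance_avg, known_low, windows=(5, 15, 30), top_k=150):
    """Sketch of the pruning: keep the top-K features by averaged importance,
    drop known low-value names, and drop gk/rs/yz vols wherever a Parkinson
    vol exists for the same window."""
    prune = set()
    if importance_avg is not None:
        ranked = sorted(importance_avg.items(), key=lambda kv: kv[1], reverse=True)
        keep = {name for name, _ in ranked[:top_k]}
        prune |= {name for name in feature_cols if name not in keep}
    prune |= {name for name in known_low if name in feature_cols}
    for w in windows:
        if f'park_vol_{w}' in feature_cols:
            prune |= {f'{est}_vol_{w}' for est in ('gk', 'rs', 'yz')
                      if f'{est}_vol_{w}' in feature_cols}
    return [c for c in feature_cols if c not in prune]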