import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import pandas as pd
import numpy as np
from custom_xgboost import CustomXGBoostGPU
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from plot_results import plot_prediction_error_distribution, plot_direction_transition_heatmap
from cycles.supertrend import Supertrends
import time
from numba import njit
import itertools
import csv
import pandas_ta as ta


def run_indicator(func, *args):
    return func(*args)


def run_indicator_job(job):
    import time
    func, *args = job
    indicator_name = func.__name__
    start = time.time()
    result = func(*args)
    elapsed = time.time() - start
    print(f'Indicator {indicator_name} computed in {elapsed:.4f} seconds')
    return result


def calc_rsi(close):
    from ta.momentum import RSIIndicator
    return ('rsi', RSIIndicator(close, window=14).rsi())


def calc_macd(close):
    from ta.trend import MACD
    return ('macd', MACD(close).macd())


def calc_bollinger(close):
    from ta.volatility import BollingerBands
    bb = BollingerBands(close=close, window=20, window_dev=2)
    return [
        ('bb_bbm', bb.bollinger_mavg()),
        ('bb_bbh', bb.bollinger_hband()),
        ('bb_bbl', bb.bollinger_lband()),
        ('bb_bb_width', bb.bollinger_hband() - bb.bollinger_lband())
    ]


def calc_stochastic(high, low, close):
    from ta.momentum import StochasticOscillator
    stoch = StochasticOscillator(high=high, low=low, close=close, window=14, smooth_window=3)
    return [
        ('stoch_k', stoch.stoch()),
        ('stoch_d', stoch.stoch_signal())
    ]


def calc_atr(high, low, close):
    from ta.volatility import AverageTrueRange
    atr = AverageTrueRange(high=high, low=low, close=close, window=14)
    return ('atr', atr.average_true_range())


def calc_cci(high, low, close):
    from ta.trend import CCIIndicator
    cci = CCIIndicator(high=high, low=low, close=close, window=20)
    return ('cci', cci.cci())


def calc_williamsr(high, low, close):
    from ta.momentum import WilliamsRIndicator
    willr = WilliamsRIndicator(high=high, low=low, close=close, lbp=14)
    return ('williams_r', willr.williams_r())


def calc_ema(close):
    from ta.trend import EMAIndicator
    ema = EMAIndicator(close=close, window=14)
    return ('ema_14', ema.ema_indicator())


def calc_obv(close, volume):
    from ta.volume import OnBalanceVolumeIndicator
    obv = OnBalanceVolumeIndicator(close=close, volume=volume)
    return ('obv', obv.on_balance_volume())


def calc_cmf(high, low, close, volume):
    from ta.volume import ChaikinMoneyFlowIndicator
    cmf = ChaikinMoneyFlowIndicator(high=high, low=low, close=close, volume=volume, window=20)
    return ('cmf', cmf.chaikin_money_flow())


def calc_sma(close):
    from ta.trend import SMAIndicator
    return [
        ('sma_50', SMAIndicator(close, window=50).sma_indicator()),
        ('sma_200', SMAIndicator(close, window=200).sma_indicator())
    ]


def calc_roc(close):
    from ta.momentum import ROCIndicator
    return ('roc_10', ROCIndicator(close, window=10).roc())


def calc_momentum(close):
    return ('momentum_10', close - close.shift(10))


def calc_psar(high, low, close):
    # Use the Numba-accelerated fast_psar function for speed
    psar_values = fast_psar(np.array(high), np.array(low), np.array(close))
    return [('psar', pd.Series(psar_values, index=close.index))]


def calc_donchian(high, low, close):
    from ta.volatility import DonchianChannel
    donchian = DonchianChannel(high, low, close, window=20)
    return [
        ('donchian_hband', donchian.donchian_channel_hband()),
        ('donchian_lband', donchian.donchian_channel_lband()),
        ('donchian_mband', donchian.donchian_channel_mband())
    ]


def calc_keltner(high, low, close):
    from ta.volatility import KeltnerChannel
    keltner = KeltnerChannel(high, low, close, window=20)
    return [
        ('keltner_hband', keltner.keltner_channel_hband()),
        ('keltner_lband', keltner.keltner_channel_lband()),
        ('keltner_mband', keltner.keltner_channel_mband())
    ]


def calc_dpo(close):
    from ta.trend import DPOIndicator
    return ('dpo_20', DPOIndicator(close, window=20).dpo())


def calc_ultimate(high, low, close):
    from ta.momentum import UltimateOscillator
    return ('ultimate_osc', UltimateOscillator(high, low, close).ultimate_oscillator())


def calc_ichimoku(high, low):
    from ta.trend import IchimokuIndicator
    ichimoku = IchimokuIndicator(high, low, window1=9, window2=26, window3=52)
    return [
        ('ichimoku_a', ichimoku.ichimoku_a()),
        ('ichimoku_b', ichimoku.ichimoku_b()),
        ('ichimoku_base_line', ichimoku.ichimoku_base_line()),
        ('ichimoku_conversion_line', ichimoku.ichimoku_conversion_line())
    ]


def calc_elder_ray(close, low, high):
    from ta.trend import EMAIndicator
    ema = EMAIndicator(close, window=13).ema_indicator()
    return [
        ('elder_ray_bull', ema - low),
        ('elder_ray_bear', ema - high)
    ]


def calc_daily_return(close):
    from ta.others import DailyReturnIndicator
    return ('daily_return', DailyReturnIndicator(close).daily_return())


@njit
def fast_psar(high, low, close, af=0.02, max_af=0.2):
    length = len(close)
    psar = np.zeros(length)
    bull = True
    af_step = af
    ep = low[0]
    psar[0] = low[0]
    for i in range(1, length):
        prev_psar = psar[i - 1]
        if bull:
            psar[i] = prev_psar + af_step * (ep - prev_psar)
            if low[i] < psar[i]:
                # Reversal: switch to a bear trend
                bull = False
                psar[i] = ep
                af_step = af
                ep = low[i]
            else:
                if high[i] > ep:
                    ep = high[i]
                    af_step = min(af_step + af, max_af)
        else:
            psar[i] = prev_psar + af_step * (ep - prev_psar)
            if high[i] > psar[i]:
                # Reversal: switch to a bull trend
                bull = True
                psar[i] = ep
                af_step = af
                ep = high[i]
            else:
                if low[i] < ep:
                    ep = low[i]
                    af_step = min(af_step + af, max_af)
    return psar


def compute_lag(df, col, lag):
    return df[col].shift(lag)


def compute_rolling(df, col, stat, window):
    if stat == 'mean':
        return df[col].rolling(window).mean()
    elif stat == 'std':
        return df[col].rolling(window).std()
    elif stat == 'min':
        return df[col].rolling(window).min()
    elif stat == 'max':
        return df[col].rolling(window).max()


def compute_log_return(df, horizon):
    return np.log(df['Close'] / df['Close'].shift(horizon))


def compute_volatility(df, window):
    return df['log_return'].rolling(window).std()


def run_feature_job(job, df):
    feature_name, func, *args = job
    print(f'Computing feature: {feature_name}')
    result = func(df, *args)
    return feature_name, result


def calc_adx(high, low, close):
    from ta.trend import ADXIndicator
    adx = ADXIndicator(high=high, low=low, close=close, window=14)
    return [
        ('adx', adx.adx()),
        ('adx_pos', adx.adx_pos()),
        ('adx_neg', adx.adx_neg())
    ]


def calc_trix(close):
    from ta.trend import TRIXIndicator
    trix = TRIXIndicator(close=close, window=15)
    return ('trix', trix.trix())


def calc_vortex(high, low, close):
    from ta.trend import VortexIndicator
    vortex = VortexIndicator(high=high, low=low, close=close, window=14)
    return [
        ('vortex_pos', vortex.vortex_indicator_pos()),
        ('vortex_neg', vortex.vortex_indicator_neg())
    ]


def calc_kama(close):
    import pandas_ta as ta
    kama = ta.kama(close, length=10)
    return ('kama', kama)


def calc_force_index(close, volume):
    from ta.volume import ForceIndexIndicator
    fi = ForceIndexIndicator(close=close, volume=volume, window=13)
    return ('force_index', fi.force_index())


def calc_eom(high, low, volume):
    from ta.volume import EaseOfMovementIndicator
    eom = EaseOfMovementIndicator(high=high, low=low, volume=volume, window=14)
    return ('eom', eom.ease_of_movement())
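
# A minimal smoke test for fast_psar (illustrative sketch, not invoked by the
# pipeline): the function operates on plain NumPy arrays and returns one PSAR
# level per bar, so it can be exercised on a few bars of synthetic data.
def _psar_smoke_test():
    high = np.array([2.0, 2.1, 2.3, 2.2, 2.5])
    low = np.array([1.8, 1.9, 2.0, 2.0, 2.2])
    close = np.array([1.9, 2.0, 2.2, 2.1, 2.4])
    psar = fast_psar(high, low, close)
    assert psar.shape == close.shape  # one PSAR value per input bar
    return psar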
def calc_mfi(high, low, close, volume):
    from ta.volume import MFIIndicator
    mfi = MFIIndicator(high=high, low=low, close=close, volume=volume, window=14)
    return ('mfi', mfi.money_flow_index())


def calc_adi(high, low, close, volume):
    from ta.volume import AccDistIndexIndicator
    adi = AccDistIndexIndicator(high=high, low=low, close=close, volume=volume)
    return ('adi', adi.acc_dist_index())


def calc_tema(close):
    import pandas_ta as ta
    tema = ta.tema(close, length=10)
    return ('tema', tema)


def calc_stochrsi(close):
    from ta.momentum import StochRSIIndicator
    stochrsi = StochRSIIndicator(close=close, window=14, smooth1=3, smooth2=3)
    return [
        ('stochrsi', stochrsi.stochrsi()),
        ('stochrsi_k', stochrsi.stochrsi_k()),
        ('stochrsi_d', stochrsi.stochrsi_d())
    ]


def calc_awesome_oscillator(high, low):
    from ta.momentum import AwesomeOscillatorIndicator
    ao = AwesomeOscillatorIndicator(high=high, low=low, window1=5, window2=34)
    return ('awesome_osc', ao.awesome_oscillator())


if __name__ == '__main__':
    IMPUTE_NANS = True  # Set to True to impute NaNs, False to drop rows with NaNs

    csv_path = './data/btcusd_1-min_data.csv'
    csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]

    print('Reading CSV and filtering data...')
    df = pd.read_csv(csv_path)
    df = df[df['Volume'] != 0]

    min_date = '2017-06-01'
    print('Converting Timestamp and filtering by date...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
    df = df[df['Timestamp'] >= min_date]

    lags = 3
    print('Calculating log returns as the new target...')
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))

    ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    window_sizes = [5, 15, 30]  # in minutes, adjust as needed
    features_dict = {}

    print('Starting feature computation...')
    feature_start_time = time.time()

    # --- Technical Indicator Features: Calculate or Load from Cache ---
    # Each single-column indicator follows the same load-or-compute pattern,
    # so they are driven from one table instead of repeated blocks.
    print('Calculating or loading technical indicator features...')
    single_indicators = [
        ('rsi', calc_rsi, (df['Close'],)),
        ('macd', calc_macd, (df['Close'],)),
        ('atr', calc_atr, (df['High'], df['Low'], df['Close'])),
        ('cci', calc_cci, (df['High'], df['Low'], df['Close'])),
        ('williams_r', calc_williamsr, (df['High'], df['Low'], df['Close'])),
        ('ema_14', calc_ema, (df['Close'],)),
        ('obv', calc_obv, (df['Close'], df['Volume'])),
        ('cmf', calc_cmf, (df['High'], df['Low'], df['Close'], df['Volume'])),
        ('roc_10', calc_roc, (df['Close'],)),
        ('dpo_20', calc_dpo, (df['Close'],)),
        ('ultimate_osc', calc_ultimate, (df['High'], df['Low'], df['Close'])),
        ('daily_return', calc_daily_return, (df['Close'],)),
    ]
    for name, func, args in single_indicators:
        feature_file = f'./data/{csv_prefix}_{name}.npy'
        if os.path.exists(feature_file):
            print(f'Loading cached feature: {feature_file}')
            arr = np.load(feature_file)
            features_dict[name] = pd.Series(arr, index=df.index)
        else:
            print(f'Calculating feature: {name}')
            _, values = func(*args)
            features_dict[name] = values
            np.save(feature_file, values.values)
            print(f'Saved feature: {feature_file}')
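
    # Note on the .npy cache: cached arrays are positional and keyed only by
    # csv_prefix + feature name, so they stay valid only while the source CSV,
    # the Volume filter, and min_date are unchanged. One way to make that
    # assumption explicit (hypothetical helper, not used by the code below)
    # would be to fold the filter settings into the cache path:
    def cache_path(name, prefix=csv_prefix, tag=min_date):
        # e.g. './data/btcusd_1-min_data_2017-06-01_rsi.npy'
        return f'./data/{prefix}_{tag}_{name}.npy'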
    # --- Multi-column indicators: each calc returns a list of (name, series) ---
    multi_indicators = [
        ('bollinger', calc_bollinger, (df['Close'],)),
        ('stochastic', calc_stochastic, (df['High'], df['Low'], df['Close'])),
        ('sma', calc_sma, (df['Close'],)),
        ('psar', calc_psar, (df['High'], df['Low'], df['Close'])),
        ('donchian', calc_donchian, (df['High'], df['Low'], df['Close'])),
        ('keltner', calc_keltner, (df['High'], df['Low'], df['Close'])),
        ('ichimoku', calc_ichimoku, (df['High'], df['Low'])),
        ('elder_ray', calc_elder_ray, (df['Close'], df['Low'], df['High'])),
    ]
    for name, func, args in multi_indicators:
        print(f'Calculating multi-column indicator: {name}')
        result = func(*args)
        for subname, values in result:
            print(f'Adding subfeature: {subname}')
            sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
            if os.path.exists(sub_feature_file):
                print(f'Loading cached feature: {sub_feature_file}')
                arr = np.load(sub_feature_file)
                features_dict[subname] = pd.Series(arr, index=df.index)
            else:
                features_dict[subname] = values
                np.save(sub_feature_file, values.values)
                print(f'Saved feature: {sub_feature_file}')

    # Prepare lags, rolling stats, log returns, and volatility features sequentially
    # Lags
    for col in ohlcv_cols:
        for lag in range(1, lags + 1):
            feature_name = f'{col}_lag{lag}'
            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
            if os.path.exists(feature_file):
                print(f'Loading cached feature: {feature_file}')
                features_dict[feature_name] = np.load(feature_file)
            else:
                print(f'Computing lag feature: {feature_name}')
                result = compute_lag(df, col, lag)
                features_dict[feature_name] = result
                np.save(feature_file, result.values)
                print(f'Saved feature: {feature_file}')

    # Rolling statistics (a few column/window pairs are deliberately skipped)
    skip_pairs = {('Open', 5), ('High', 5), ('High', 30), ('Low', 15)}
    for col in ohlcv_cols:
        for window in window_sizes:
            if (col, window) in skip_pairs:
                continue
            for stat in ['mean', 'std', 'min', 'max']:
                feature_name = f'{col}_roll_{stat}_{window}'
                feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
                if os.path.exists(feature_file):
                    print(f'Loading cached feature: {feature_file}')
                    features_dict[feature_name] = np.load(feature_file)
                else:
                    print(f'Computing rolling stat feature: {feature_name}')
                    result = compute_rolling(df, col, stat, window)
                    features_dict[feature_name] = result
                    np.save(feature_file, result.values)
                    print(f'Saved feature: {feature_file}')

    # Log returns for different horizons
    for horizon in [5, 15, 30]:
        feature_name = f'log_return_{horizon}'
        feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
        if os.path.exists(feature_file):
            print(f'Loading cached feature: {feature_file}')
            features_dict[feature_name] = np.load(feature_file)
        else:
            print(f'Computing log return feature: {feature_name}')
            result = compute_log_return(df, horizon)
            features_dict[feature_name] = result
            np.save(feature_file, result.values)
            print(f'Saved feature: {feature_file}')

    # Volatility
    for window in window_sizes:
        feature_name = f'volatility_{window}'
        feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
        if os.path.exists(feature_file):
            print(f'Loading cached feature: {feature_file}')
            features_dict[feature_name] = np.load(feature_file)
        else:
            print(f'Computing volatility feature: {feature_name}')
            result = compute_volatility(df, window)
            features_dict[feature_name] = result
            np.save(feature_file, result.values)
            print(f'Saved feature: {feature_file}')
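
    # For reference: volatility_{w} above is the rolling standard deviation of
    # 1-minute log returns over the last w bars. If an annualized figure were
    # ever wanted, the usual square-root-of-time scaling applies (illustrative
    # only; not added as a feature):
    #
    #   minutes_per_year = 365 * 24 * 60
    #   annualized_vol_30 = features_dict['volatility_30'] * np.sqrt(minutes_per_year)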
    # --- Additional Technical Indicator Features ---
    # ADX (three columns; recompute only if any cached file is missing)
    adx_names = ['adx', 'adx_pos', 'adx_neg']
    adx_files = [f'./data/{csv_prefix}_{name}.npy' for name in adx_names]
    if all(os.path.exists(f) for f in adx_files):
        print('Loading cached features: ADX')
        for name, f in zip(adx_names, adx_files):
            arr = np.load(f)
            features_dict[name] = pd.Series(arr, index=df.index)
    else:
        print('Calculating multi-column indicator: adx')
        result = calc_adx(df['High'], df['Low'], df['Close'])
        for subname, values in result:
            sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')

    # Force Index
    feature_file = f'./data/{csv_prefix}_force_index.npy'
    if os.path.exists(feature_file):
        print(f'Loading cached feature: {feature_file}')
        arr = np.load(feature_file)
        features_dict['force_index'] = pd.Series(arr, index=df.index)
    else:
        print('Calculating feature: force_index')
        _, values = calc_force_index(df['Close'], df['Volume'])
        features_dict['force_index'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # Supertrend indicators
    for period, multiplier in [(12, 3.0), (10, 1.0), (11, 2.0)]:
        st_name = f'supertrend_{period}_{multiplier}'
        st_trend_name = f'supertrend_trend_{period}_{multiplier}'
        st_file = f'./data/{csv_prefix}_{st_name}.npy'
        st_trend_file = f'./data/{csv_prefix}_{st_trend_name}.npy'
        if os.path.exists(st_file) and os.path.exists(st_trend_file):
            print(f'Loading cached features: {st_file}, {st_trend_file}')
            features_dict[st_name] = pd.Series(np.load(st_file), index=df.index)
            features_dict[st_trend_name] = pd.Series(np.load(st_trend_file), index=df.index)
        else:
            print(f'Calculating Supertrend indicator: {st_name}')
            st = ta.supertrend(df['High'], df['Low'], df['Close'], length=period, multiplier=multiplier)
            features_dict[st_name] = st[f'SUPERT_{period}_{multiplier}']
            features_dict[st_trend_name] = st[f'SUPERTd_{period}_{multiplier}']
            np.save(st_file, features_dict[st_name].values)
            np.save(st_trend_file, features_dict[st_trend_name].values)
            print(f'Saved features: {st_file}, {st_trend_file}')

    # Concatenate all new features at once
    print('Concatenating all new features to DataFrame...')
    features_df = pd.DataFrame(features_dict)
    print('Columns in features_df:', features_df.columns.tolist())
    print('All-NaN columns in features_df:', features_df.columns[features_df.isna().all()].tolist())
    df = pd.concat([df, features_df], axis=1)

    # Print all columns after concatenation
    print('All columns in df after concat:', df.columns.tolist())

    # Downcast all float columns to save memory
    print('Downcasting float columns to save memory...')
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col], downcast='float')
        except Exception:
            pass

    # Add time features (exclude 'dayofweek')
    print('Adding hour feature...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    df['hour'] = df['Timestamp'].dt.hour

    # Handle NaNs after all feature engineering
    if IMPUTE_NANS:
        print('Imputing NaNs after feature engineering (using mean imputation)...')
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            df[col] = df[col].fillna(df[col].mean())
        # If you want to impute non-numeric columns differently, add logic here
    else:
        print('Dropping NaNs after feature engineering...')
        df = df.dropna().reset_index(drop=True)

    # Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
    print('Selecting feature columns...')
    exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    print('Features used for training:', feature_cols)

    # Prepare CSV for results
    results_csv = './data/leave_one_out_results.csv'
    if not os.path.exists(results_csv):
        with open(results_csv, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['left_out_feature', 'used_features', 'rmse', 'mae', 'r2', 'mape', 'directional_accuracy'])

    total_features = len(feature_cols)
    never_leave_out = {'Open', 'High', 'Low', 'Close', 'Volume'}
    for idx, left_out in enumerate(feature_cols):
        if left_out in never_leave_out:
            continue
        used = [f for f in feature_cols if f != left_out]
        print(f'\n=== Leave-one-out {idx + 1}/{total_features}: left out {left_out} ===')
        try:
            # Prepare X and y for this combination
            X = df[used].values.astype(np.float32)
            y = df['log_return'].values.astype(np.float32)
            split_idx = int(len(X) * 0.8)
            X_train, X_test = X[:split_idx], X[split_idx:]
            y_train, y_test = y[:split_idx], y[split_idx:]
            test_timestamps = df['Timestamp'].values[split_idx:]

            model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
            booster = model.train()
            model.save_model(f'./data/xgboost_model_wo_{left_out}.json')
            test_preds = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, test_preds))

            # Reconstruct price series from log returns. y_test[0] is the return
            # *into* bar split_idx, so compounding starts from the previous close.
            if 'Close' in df.columns:
                close_prices = df['Close'].values
            else:
                close_prices = pd.read_csv(csv_path)['Close'].values
            start_price = close_prices[split_idx - 1]
            actual_prices = [start_price]
            for r_ in y_test:
                actual_prices.append(actual_prices[-1] * np.exp(r_))
            actual_prices = np.array(actual_prices[1:])
            predicted_prices = [start_price]
            for r_ in test_preds:
                predicted_prices.append(predicted_prices[-1] * np.exp(r_))
            predicted_prices = np.array(predicted_prices[1:])

            mae = mean_absolute_error(actual_prices, predicted_prices)
            r2 = r2_score(actual_prices, predicted_prices)
            direction_actual = np.sign(np.diff(actual_prices))
            direction_pred = np.sign(np.diff(predicted_prices))
            directional_accuracy = (direction_actual == direction_pred).mean()
            mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100

            # Save results to CSV
            with open(results_csv, 'a', newline='') as f:
                writer = csv.writer(f)
                writer.writerow([left_out, '|'.join(used), rmse, mae, r2, mape, directional_accuracy])
            print(f'Left out {left_out}: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}, '
                  f'MAPE={mape:.2f}%, DirAcc={directional_accuracy * 100:.2f}%')

            # Plotting for this run
            plot_prefix = f'loo_{left_out}'
            print('Plotting distribution of absolute prediction errors...')
            plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
            print('Plotting directional accuracy...')
            plot_direction_transition_heatmap(actual_prices, predicted_prices, prefix=plot_prefix)
        except Exception as e:
            print(f'Leave-one-out failed for {left_out}: {e}')

    print(f'All leave-one-out runs completed. Results saved to {results_csv}')
    sys.exit(0)
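
# Follow-up analysis sketch (not executed here; the script exits above): the
# per-feature results CSV lends itself to a quick ranking of which left-out
# feature degraded the model most, e.g.:
#
#   res = pd.read_csv('./data/leave_one_out_results.csv')
#   print(res.sort_values('rmse', ascending=False)[['left_out_feature', 'rmse']].head(10))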