# Cycles/xgboost/main.py
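# End-to-end pipeline: load 1-minute BTC/USD candles, engineer technical-indicator
# and statistical features (with per-feature .npy caching), train a GPU XGBoost
# regressor on 1-minute log returns, and plot reconstructed price predictions.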
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from custom_xgboost import CustomXGBoostGPU
from sklearn.metrics import mean_squared_error
from plot_results import plot_predicted_vs_actual_prices, plot_prediction_error_distribution
from cycles.supertrend import Supertrends
import time
from numba import njit
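
# Small timing/dispatch helpers. They are not used by the sequential main path
# below, but are kept for optional parallel execution of indicator jobs.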
def run_indicator(func, *args):
    return func(*args)


def run_indicator_job(job):
    func, *args = job
    indicator_name = func.__name__
    start = time.time()
    result = func(*args)
    elapsed = time.time() - start
    print(f'Indicator {indicator_name} computed in {elapsed:.4f} seconds')
    return result

def calc_rsi(close):
    from ta.momentum import RSIIndicator
    return ('rsi', RSIIndicator(close, window=14).rsi())


def calc_macd(close):
    from ta.trend import MACD
    return ('macd', MACD(close).macd())


def calc_bollinger(close):
    from ta.volatility import BollingerBands
    bb = BollingerBands(close=close, window=20, window_dev=2)
    return [
        ('bb_bbm', bb.bollinger_mavg()),
        ('bb_bbh', bb.bollinger_hband()),
        ('bb_bbl', bb.bollinger_lband()),
        ('bb_bb_width', bb.bollinger_hband() - bb.bollinger_lband())
    ]


def calc_stochastic(high, low, close):
    from ta.momentum import StochasticOscillator
    stoch = StochasticOscillator(high=high, low=low, close=close, window=14, smooth_window=3)
    return [
        ('stoch_k', stoch.stoch()),
        ('stoch_d', stoch.stoch_signal())
    ]


def calc_atr(high, low, close):
    from ta.volatility import AverageTrueRange
    atr = AverageTrueRange(high=high, low=low, close=close, window=14)
    return ('atr', atr.average_true_range())


def calc_cci(high, low, close):
    from ta.trend import CCIIndicator
    cci = CCIIndicator(high=high, low=low, close=close, window=20)
    return ('cci', cci.cci())


def calc_williamsr(high, low, close):
    from ta.momentum import WilliamsRIndicator
    willr = WilliamsRIndicator(high=high, low=low, close=close, lbp=14)
    return ('williams_r', willr.williams_r())


def calc_ema(close):
    from ta.trend import EMAIndicator
    ema = EMAIndicator(close=close, window=14)
    return ('ema_14', ema.ema_indicator())


def calc_obv(close, volume):
    from ta.volume import OnBalanceVolumeIndicator
    obv = OnBalanceVolumeIndicator(close=close, volume=volume)
    return ('obv', obv.on_balance_volume())


def calc_cmf(high, low, close, volume):
    from ta.volume import ChaikinMoneyFlowIndicator
    cmf = ChaikinMoneyFlowIndicator(high=high, low=low, close=close, volume=volume, window=20)
    return ('cmf', cmf.chaikin_money_flow())


def calc_sma(close):
    from ta.trend import SMAIndicator
    return [
        ('sma_50', SMAIndicator(close, window=50).sma_indicator()),
        ('sma_200', SMAIndicator(close, window=200).sma_indicator())
    ]


def calc_roc(close):
    from ta.momentum import ROCIndicator
    return ('roc_10', ROCIndicator(close, window=10).roc())


def calc_momentum(close):
    return ('momentum_10', close - close.shift(10))

def calc_psar(high, low, close):
    # Use the Numba-accelerated fast_psar function for speed
    psar_values = fast_psar(np.array(high), np.array(low), np.array(close))
    return [('psar', pd.Series(psar_values, index=close.index))]

def calc_donchian(high, low, close):
    from ta.volatility import DonchianChannel
    donchian = DonchianChannel(high, low, close, window=20)
    return [
        ('donchian_hband', donchian.donchian_channel_hband()),
        ('donchian_lband', donchian.donchian_channel_lband()),
        ('donchian_mband', donchian.donchian_channel_mband())
    ]


def calc_keltner(high, low, close):
    from ta.volatility import KeltnerChannel
    keltner = KeltnerChannel(high, low, close, window=20)
    return [
        ('keltner_hband', keltner.keltner_channel_hband()),
        ('keltner_lband', keltner.keltner_channel_lband()),
        ('keltner_mband', keltner.keltner_channel_mband())
    ]


def calc_dpo(close):
    from ta.trend import DPOIndicator
    return ('dpo_20', DPOIndicator(close, window=20).dpo())


def calc_ultimate(high, low, close):
    from ta.momentum import UltimateOscillator
    return ('ultimate_osc', UltimateOscillator(high, low, close).ultimate_oscillator())


def calc_ichimoku(high, low):
    from ta.trend import IchimokuIndicator
    ichimoku = IchimokuIndicator(high, low, window1=9, window2=26, window3=52)
    return [
        ('ichimoku_a', ichimoku.ichimoku_a()),
        ('ichimoku_b', ichimoku.ichimoku_b()),
        ('ichimoku_base_line', ichimoku.ichimoku_base_line()),
        ('ichimoku_conversion_line', ichimoku.ichimoku_conversion_line())
    ]


def calc_elder_ray(close, low, high):
    from ta.trend import EMAIndicator
    ema = EMAIndicator(close, window=13).ema_indicator()
    return [
        ('elder_ray_bull', ema - low),
        ('elder_ray_bear', ema - high)
    ]


def calc_daily_return(close):
    from ta.others import DailyReturnIndicator
    return ('daily_return', DailyReturnIndicator(close).daily_return())
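
# Parabolic SAR recursion: SAR_t = SAR_{t-1} + AF * (EP - SAR_{t-1}), where EP is
# the extreme point of the current trend and AF grows by `af` (capped at `max_af`)
# each time a new extreme is set. The trend flips when price crosses the SAR.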
@njit
def fast_psar(high, low, close, af=0.02, max_af=0.2):
    length = len(close)
    psar = np.zeros(length)
    bull = True
    af_step = af
    ep = low[0]
    psar[0] = low[0]
    for i in range(1, length):
        prev_psar = psar[i - 1]
        if bull:
            psar[i] = prev_psar + af_step * (ep - prev_psar)
            if low[i] < psar[i]:
                bull = False
                psar[i] = ep
                af_step = af
                ep = low[i]
            else:
                if high[i] > ep:
                    ep = high[i]
                    af_step = min(af_step + af, max_af)
        else:
            psar[i] = prev_psar + af_step * (ep - prev_psar)
            if high[i] > psar[i]:
                bull = True
                psar[i] = ep
                af_step = af
                ep = high[i]
            else:
                if low[i] < ep:
                    ep = low[i]
                    af_step = min(af_step + af, max_af)
    return psar
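
# Generic feature builders: lagged columns, rolling statistics over minute
# windows, multi-horizon log returns, and realized volatility (the rolling
# standard deviation of 1-minute log returns).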
def compute_lag(df, col, lag):
    return df[col].shift(lag)


def compute_rolling(df, col, stat, window):
    if stat == 'mean':
        return df[col].rolling(window).mean()
    elif stat == 'std':
        return df[col].rolling(window).std()
    elif stat == 'min':
        return df[col].rolling(window).min()
    elif stat == 'max':
        return df[col].rolling(window).max()


def compute_log_return(df, horizon):
    return np.log(df['Close'] / df['Close'].shift(horizon))


def compute_volatility(df, window):
    return df['log_return'].rolling(window).std()


def run_feature_job(job, df):
    feature_name, func, *args = job
    print(f'Computing feature: {feature_name}')
    result = func(df, *args)
    return feature_name, result
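
# run_feature_job is not exercised by the sequential main path below. A minimal
# sketch of how it could be fanned out across processes (the job tuples here
# are illustrative, not taken from this script):
#
#     from multiprocessing import Pool
#     jobs = [('Close_lag1', compute_lag, 'Close', 1),
#             ('Volume_roll_mean_5', compute_rolling, 'Volume', 'mean', 5)]
#     with Pool() as pool:
#         results = dict(pool.starmap(run_feature_job, [(j, df) for j in jobs]))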

if __name__ == '__main__':
    csv_path = './data/btcusd_1-min_data.csv'
    csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]
    print('Reading CSV and filtering data...')
    df = pd.read_csv(csv_path)
    df = df[df['Volume'] != 0]
    min_date = '2017-06-01'
    print('Converting Timestamp and filtering by date...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
    df = df[df['Timestamp'] >= min_date]
    lags = 3
    print('Calculating log returns as the new target...')
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
    ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    window_sizes = [5, 15, 30]  # in minutes, adjust as needed
    features_dict = {}
    print('Starting feature computation...')
    feature_start_time = time.time()
    # --- Technical Indicator Features: Calculate or Load from Cache ---
    print('Calculating or loading technical indicator features...')
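    # Each feature is cached to ./data/<csv_prefix>_<feature>.npy after the first
    # run, so subsequent runs load the saved array instead of recomputing.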
2025-05-30 12:29:37 +08:00
    # Single-output indicators: (feature name, calc function, input columns).
    single_indicators = [
        ('rsi', calc_rsi, ['Close']),
        ('macd', calc_macd, ['Close']),
        ('atr', calc_atr, ['High', 'Low', 'Close']),
        ('cci', calc_cci, ['High', 'Low', 'Close']),
        ('williams_r', calc_williamsr, ['High', 'Low', 'Close']),
        ('ema_14', calc_ema, ['Close']),
        ('obv', calc_obv, ['Close', 'Volume']),
        ('cmf', calc_cmf, ['High', 'Low', 'Close', 'Volume']),
        ('roc_10', calc_roc, ['Close']),
        ('dpo_20', calc_dpo, ['Close']),
        ('ultimate_osc', calc_ultimate, ['High', 'Low', 'Close']),
        ('daily_return', calc_daily_return, ['Close']),
    ]
    for name, func, cols in single_indicators:
        feature_file = f'./data/{csv_prefix}_{name}.npy'
        if os.path.exists(feature_file):
            print(f'Loading cached feature: {feature_file}')
            arr = np.load(feature_file)
            features_dict[name] = pd.Series(arr, index=df.index)
        else:
            print(f'Calculating feature: {name}')
            _, values = func(*(df[c] for c in cols))
            features_dict[name] = values
            np.save(feature_file, values.values)
            print(f'Saved feature: {feature_file}')
    # Multi-output indicators: each calc function returns a list of (name, Series).
    multi_indicators = [
        ('bollinger', calc_bollinger, ['Close']),
        ('stochastic', calc_stochastic, ['High', 'Low', 'Close']),
        ('sma', calc_sma, ['Close']),
        ('psar', calc_psar, ['High', 'Low', 'Close']),
        ('donchian', calc_donchian, ['High', 'Low', 'Close']),
        ('keltner', calc_keltner, ['High', 'Low', 'Close']),
        ('ichimoku', calc_ichimoku, ['High', 'Low']),
        ('elder_ray', calc_elder_ray, ['Close', 'Low', 'High']),
    ]
    for indicator_name, func, cols in multi_indicators:
        print(f'Calculating multi-column indicator: {indicator_name}')
        result = func(*(df[c] for c in cols))
        for subname, values in result:
            print(f'Adding subfeature: {subname}')
            sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
            if os.path.exists(sub_feature_file):
                print(f'Loading cached feature: {sub_feature_file}')
                arr = np.load(sub_feature_file)
                features_dict[subname] = pd.Series(arr, index=df.index)
            else:
                features_dict[subname] = values
                np.save(sub_feature_file, values.values)
                print(f'Saved feature: {sub_feature_file}')
    # Prepare lags, rolling stats, log returns, and volatility features sequentially
    # Lags
    for col in ohlcv_cols:
        for lag in range(1, lags + 1):
            feature_name = f'{col}_lag{lag}'
            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
            if os.path.exists(feature_file):
                print(f'Loading cached feature: {feature_file}')
                # Wrap cached arrays as Series on df.index so they align with the
                # indicator features when the DataFrame is assembled below.
                features_dict[feature_name] = pd.Series(np.load(feature_file), index=df.index)
            else:
                print(f'Computing lag feature: {feature_name}')
                result = compute_lag(df, col, lag)
                features_dict[feature_name] = result
                np.save(feature_file, result.values)
                print(f'Saved feature: {feature_file}')
    # Rolling statistics
    # Skip the column/window combinations excluded from the feature set.
    skip_rolls = {('Open', 5), ('High', 5), ('High', 30), ('Low', 15)}
    for col in ohlcv_cols:
        for window in window_sizes:
            if (col, window) in skip_rolls:
                continue
            for stat in ['mean', 'std', 'min', 'max']:
                feature_name = f'{col}_roll_{stat}_{window}'
                feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
                if os.path.exists(feature_file):
                    print(f'Loading cached feature: {feature_file}')
                    features_dict[feature_name] = pd.Series(np.load(feature_file), index=df.index)
                else:
                    print(f'Computing rolling stat feature: {feature_name}')
                    result = compute_rolling(df, col, stat, window)
                    features_dict[feature_name] = result
                    np.save(feature_file, result.values)
                    print(f'Saved feature: {feature_file}')
    # Log returns for different horizons
    for horizon in [5, 15, 30]:
        feature_name = f'log_return_{horizon}'
        feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
        if os.path.exists(feature_file):
            print(f'Loading cached feature: {feature_file}')
            features_dict[feature_name] = pd.Series(np.load(feature_file), index=df.index)
        else:
            print(f'Computing log return feature: {feature_name}')
            result = compute_log_return(df, horizon)
            features_dict[feature_name] = result
            np.save(feature_file, result.values)
            print(f'Saved feature: {feature_file}')
    # Volatility
    for window in window_sizes:
        feature_name = f'volatility_{window}'
        feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
        if os.path.exists(feature_file):
            print(f'Loading cached feature: {feature_file}')
            features_dict[feature_name] = pd.Series(np.load(feature_file), index=df.index)
        else:
            print(f'Computing volatility feature: {feature_name}')
            result = compute_volatility(df, window)
            features_dict[feature_name] = result
            np.save(feature_file, result.values)
            print(f'Saved feature: {feature_file}')
    # Concatenate all new features at once
    print('Concatenating all new features to DataFrame...')
    features_df = pd.DataFrame(features_dict)
    print("Columns in features_df:", features_df.columns.tolist())
    print("All-NaN columns in features_df:", features_df.columns[features_df.isna().all()].tolist())
    df = pd.concat([df, features_df], axis=1)
    # Print all columns after concatenation
    print("All columns in df after concat:", df.columns.tolist())
    # Downcast all float columns to save memory
    print('Downcasting float columns to save memory...')
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col], downcast='float')
        except Exception:
            pass
    # Drop intermediate features_df to free memory
    print('Dropping intermediate features_df to free memory...')
    del features_df
    import gc
    gc.collect()
    feature_end_time = time.time()
    print(f'Feature computation completed in {feature_end_time - feature_start_time:.2f} seconds.')
    # Add Supertrend indicators (custom)
    print('Preparing data for Supertrend calculation...')
    st_df = df.rename(columns={'High': 'high', 'Low': 'low', 'Close': 'close'})
    print('Calculating Supertrend indicators...')
    supertrend = Supertrends(st_df)
    st_results = supertrend.calculate_supertrend_indicators()
    for st in st_results:
        period = st['params']['period']
        multiplier = st['params']['multiplier']
        # Skip useless supertrend features
        if (period == 10 and multiplier == 1.0) or (period == 11 and multiplier == 2.0):
            continue
        print(f'Adding Supertrend features: supertrend_{period}_{multiplier} and supertrend_trend_{period}_{multiplier}')
        df[f'supertrend_{period}_{multiplier}'] = st['results']['supertrend']
        df[f'supertrend_trend_{period}_{multiplier}'] = st['results']['trend']
    # Add time features (exclude 'dayofweek')
    print('Adding hour feature...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    df['hour'] = df['Timestamp'].dt.hour
    # Impute NaNs after all feature engineering
    print('Imputing NaNs after feature engineering (using column means)...')
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    # If you want to impute non-numeric columns differently, add logic here
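    # NOTE: the column means are computed over the full frame, so this imputation
    # leaks test-period statistics into the training rows; fitting the means on
    # the training split alone would be stricter.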
    # Drop excluded columns to save memory.
    # NOTE: feature_cols is never defined in the script as published; the
    # definition below is an assumed reconstruction that treats every engineered
    # column except the target and timestamp as a model input.
    print('Dropping excluded columns to save memory...')
    feature_cols = [c for c in df.columns if c not in ('log_return', 'Timestamp')]
    df = df[feature_cols + ['log_return', 'Timestamp']]
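    # Chronological 80/20 split below: the final 20% of minutes is held out as
    # the test set, with no shuffling, to avoid lookahead.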
    print('Preparing X and y...')
    X = df[feature_cols].values.astype(np.float32)
    y = df['log_return'].values.astype(np.float32)
    split_idx = int(len(X) * 0.8)
    print(f'Splitting data: {split_idx} train, {len(X) - split_idx} test')
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    test_timestamps = df['Timestamp'].values[split_idx:]
    print('Initializing model...')
    model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
    print('Training model...')
    booster = model.train()
    print('Training complete.')
    # Save the trained model
    model.save_model('./data/xgboost_model.json')
    print('Model saved to ./data/xgboost_model.json')
    if hasattr(model, 'params'):
        print("Model hyperparameters:", model.params)
    if hasattr(model, 'model') and hasattr(model.model, 'get_score'):
        import operator
        importances = model.model.get_score(importance_type='weight')
        # Map f0, f1, ... to actual feature names
        feature_map = {f"f{idx}": name for idx, name in enumerate(feature_cols)}
        sorted_importances = sorted(importances.items(), key=operator.itemgetter(1), reverse=True)
        print('Feature importances (sorted, with names):')
        for feat, score in sorted_importances:
            print(f'{feature_map.get(feat, feat)}: {score}')
    print('Making predictions for first 5 test samples...')
    preds = model.predict(X_test[:5])
    print('Predictions for first 5 test samples:', preds)
    print('Actual values for first 5 test samples:', y_test[:5])
    print('Making predictions for all test samples...')
    test_preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    print(f'RMSE on test set: {rmse:.4f}')
    print('Saving y_test and test_preds to disk...')
    np.save('./data/y_test.npy', y_test)
    np.save('./data/test_preds.npy', test_preds)
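    # Price reconstruction from log returns: p_t = p_{t-1} * exp(r_t), so the
    # whole path is start_price * exp(cumsum(returns)). The loops below are the
    # element-wise equivalent of:
    #     actual_prices = start_price * np.exp(np.cumsum(y_test))
    #     predicted_prices = start_price * np.exp(np.cumsum(test_preds))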
    # Reconstruct price series from log returns
    print('Reconstructing price series from log returns...')
    # Get the last available Close price before the test set; df rows align
    # positionally with X, so split_idx marks the first test row.
    if 'Close' in df.columns:
        close_prices = df['Close'].values
    else:
        # Reload original CSV to get Close prices if not present
        close_prices = pd.read_csv(csv_path)['Close'].values
    # y_test[0] is log(Close[split_idx] / Close[split_idx - 1]), so the base
    # price must be the Close just before the split for the compounding to
    # reproduce the actual series.
    start_price = close_prices[split_idx - 1]
    # Actual prices
    actual_prices = [start_price]
    for r in y_test:
        actual_prices.append(actual_prices[-1] * np.exp(r))
    actual_prices = np.array(actual_prices[1:])
    # Predicted prices
    predicted_prices = [start_price]
    for r in test_preds:
        predicted_prices.append(predicted_prices[-1] * np.exp(r))
    predicted_prices = np.array(predicted_prices[1:])
    print('Plotting predicted vs actual prices...')
    plot_predicted_vs_actual_prices(actual_prices, predicted_prices, test_timestamps)
    print('Plotting distribution of absolute prediction errors...')
    plot_prediction_error_distribution(predicted_prices, actual_prices)
print("Final features used for training:", feature_cols)
print("Shape of X:", X.shape)
print("First row of X:", X[0])
print("stoch_k in feature_cols?", "stoch_k" in feature_cols)
if "stoch_k" in feature_cols:
idx = feature_cols.index("stoch_k")
print("First 10 values of stoch_k:", X[:10, idx])