# Cycles/xgboost/main.py
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from custom_xgboost import CustomXGBoostGPU
from sklearn.metrics import mean_squared_error
from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns, plot_predicted_vs_actual_prices
from cycles.supertrend import Supertrends
# ta indicators are imported locally inside each calc_* helper below
import time
from numba import njit
def run_indicator(func, *args):
    return func(*args)

def run_indicator_job(job):
    func, *args = job
    indicator_name = func.__name__
    start = time.time()
    result = func(*args)
    elapsed = time.time() - start
    print(f'Indicator {indicator_name} computed in {elapsed:.4f} seconds')
    return result
def calc_rsi(close):
from ta.momentum import RSIIndicator
return ('rsi', RSIIndicator(close, window=14).rsi())
def calc_macd(close):
from ta.trend import MACD
return ('macd', MACD(close).macd())
def calc_bollinger(close):
from ta.volatility import BollingerBands
bb = BollingerBands(close=close, window=20, window_dev=2)
return [
('bb_bbm', bb.bollinger_mavg()),
('bb_bbh', bb.bollinger_hband()),
('bb_bbl', bb.bollinger_lband()),
('bb_bb_width', bb.bollinger_hband() - bb.bollinger_lband())
]
def calc_stochastic(high, low, close):
from ta.momentum import StochasticOscillator
stoch = StochasticOscillator(high=high, low=low, close=close, window=14, smooth_window=3)
return [
('stoch_k', stoch.stoch()),
('stoch_d', stoch.stoch_signal())
]
def calc_atr(high, low, close):
from ta.volatility import AverageTrueRange
atr = AverageTrueRange(high=high, low=low, close=close, window=14)
return ('atr', atr.average_true_range())
def calc_cci(high, low, close):
from ta.trend import CCIIndicator
cci = CCIIndicator(high=high, low=low, close=close, window=20)
return ('cci', cci.cci())
def calc_williamsr(high, low, close):
from ta.momentum import WilliamsRIndicator
willr = WilliamsRIndicator(high=high, low=low, close=close, lbp=14)
return ('williams_r', willr.williams_r())
def calc_ema(close):
from ta.trend import EMAIndicator
ema = EMAIndicator(close=close, window=14)
return ('ema_14', ema.ema_indicator())
def calc_obv(close, volume):
from ta.volume import OnBalanceVolumeIndicator
obv = OnBalanceVolumeIndicator(close=close, volume=volume)
return ('obv', obv.on_balance_volume())
def calc_cmf(high, low, close, volume):
from ta.volume import ChaikinMoneyFlowIndicator
cmf = ChaikinMoneyFlowIndicator(high=high, low=low, close=close, volume=volume, window=20)
return ('cmf', cmf.chaikin_money_flow())
def calc_sma(close):
from ta.trend import SMAIndicator
return [
('sma_50', SMAIndicator(close, window=50).sma_indicator()),
('sma_200', SMAIndicator(close, window=200).sma_indicator())
]
def calc_roc(close):
from ta.momentum import ROCIndicator
return ('roc_10', ROCIndicator(close, window=10).roc())
def calc_momentum(close):
return ('momentum_10', close - close.shift(10))
def calc_psar(high, low, close):
# Use the Numba-accelerated fast_psar function for speed
psar_values = fast_psar(np.array(high), np.array(low), np.array(close))
return [('psar', pd.Series(psar_values, index=close.index))]
def calc_donchian(high, low, close):
from ta.volatility import DonchianChannel
donchian = DonchianChannel(high, low, close, window=20)
return [
('donchian_hband', donchian.donchian_channel_hband()),
('donchian_lband', donchian.donchian_channel_lband()),
('donchian_mband', donchian.donchian_channel_mband())
]
def calc_keltner(high, low, close):
from ta.volatility import KeltnerChannel
keltner = KeltnerChannel(high, low, close, window=20)
return [
('keltner_hband', keltner.keltner_channel_hband()),
('keltner_lband', keltner.keltner_channel_lband()),
('keltner_mband', keltner.keltner_channel_mband())
]
def calc_dpo(close):
from ta.trend import DPOIndicator
return ('dpo_20', DPOIndicator(close, window=20).dpo())
def calc_ultimate(high, low, close):
from ta.momentum import UltimateOscillator
return ('ultimate_osc', UltimateOscillator(high, low, close).ultimate_oscillator())
def calc_ichimoku(high, low):
from ta.trend import IchimokuIndicator
ichimoku = IchimokuIndicator(high, low, window1=9, window2=26, window3=52)
return [
('ichimoku_a', ichimoku.ichimoku_a()),
('ichimoku_b', ichimoku.ichimoku_b()),
('ichimoku_base_line', ichimoku.ichimoku_base_line()),
('ichimoku_conversion_line', ichimoku.ichimoku_conversion_line())
]
def calc_elder_ray(close, low, high):
    from ta.trend import EMAIndicator
    ema = EMAIndicator(close, window=13).ema_indicator()
    return [
        ('elder_ray_bull', high - ema),  # Bull Power = High - EMA(13)
        ('elder_ray_bear', low - ema)    # Bear Power = Low - EMA(13)
    ]
def calc_daily_return(close):
from ta.others import DailyReturnIndicator
return ('daily_return', DailyReturnIndicator(close).daily_return())
@njit
def fast_psar(high, low, close, af=0.02, max_af=0.2):
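    # Simplified Wilder Parabolic SAR: the SAR accelerates toward an extreme
    # point (EP) and the trend flips when price crosses the SAR. Note this
    # sketch omits the usual clamp of the SAR to the prior two bars' lows or
    # highs, so values can differ slightly from ta/TA-Lib implementations.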
length = len(close)
psar = np.zeros(length)
    bull = True
    af_step = af
    ep = high[0]  # extreme point: the highest high so far in an uptrend
    psar[0] = low[0]
for i in range(1, length):
prev_psar = psar[i-1]
if bull:
psar[i] = prev_psar + af_step * (ep - prev_psar)
if low[i] < psar[i]:
bull = False
psar[i] = ep
af_step = af
ep = low[i]
else:
if high[i] > ep:
ep = high[i]
af_step = min(af_step + af, max_af)
else:
psar[i] = prev_psar + af_step * (ep - prev_psar)
if high[i] > psar[i]:
bull = True
psar[i] = ep
af_step = af
ep = high[i]
else:
if low[i] < ep:
ep = low[i]
af_step = min(af_step + af, max_af)
return psar
def compute_lag(df, col, lag):
return df[col].shift(lag)
def compute_rolling(df, col, stat, window):
    if stat == 'mean':
        return df[col].rolling(window).mean()
    elif stat == 'std':
        return df[col].rolling(window).std()
    elif stat == 'min':
        return df[col].rolling(window).min()
    elif stat == 'max':
        return df[col].rolling(window).max()
    else:
        raise ValueError(f'Unknown rolling stat: {stat}')
def compute_log_return(df, horizon):
return np.log(df['Close'] / df['Close'].shift(horizon))
def compute_volatility(df, window):
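    # Realized-volatility proxy: rolling standard deviation of 1-min log returns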
return df['log_return'].rolling(window).std()
def run_feature_job(job, df):
feature_name, func, *args = job
print(f'Computing feature: {feature_name}')
result = func(df, *args)
return feature_name, result
if __name__ == '__main__':
csv_path = './data/btcusd_1-min_data.csv'
csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]
print('Reading CSV and filtering data...')
df = pd.read_csv(csv_path)
df = df[df['Volume'] != 0]
min_date = '2017-06-01'
print('Converting Timestamp and filtering by date...')
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
df = df[df['Timestamp'] >= min_date]
lags = 3
print('Calculating log returns as the new target...')
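    # log_return_t = ln(Close_t / Close_{t-1}). Log returns are additive across
    # time steps, which makes the multi-horizon targets below and the price
    # reconstruction at the end of the script straightforward.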
df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
window_sizes = [5, 15, 30] # in minutes, adjust as needed
features_dict = {}
print('Starting feature computation...')
feature_start_time = time.time()
    # --- Technical indicator features: calculate or load from cache ---
    print('Calculating or loading technical indicator features...')
    # Single-column indicators: (feature name, function, input columns)
    single_indicators = [
        ('rsi', calc_rsi, ('Close',)),
        ('macd', calc_macd, ('Close',)),
        ('atr', calc_atr, ('High', 'Low', 'Close')),
        ('cci', calc_cci, ('High', 'Low', 'Close')),
        ('williams_r', calc_williamsr, ('High', 'Low', 'Close')),
        ('ema_14', calc_ema, ('Close',)),
        ('obv', calc_obv, ('Close', 'Volume')),
        ('cmf', calc_cmf, ('High', 'Low', 'Close', 'Volume')),
        ('roc_10', calc_roc, ('Close',)),
        ('dpo_20', calc_dpo, ('Close',)),
        ('ultimate_osc', calc_ultimate, ('High', 'Low', 'Close')),
        ('daily_return', calc_daily_return, ('Close',)),
    ]
    for name, func, cols in single_indicators:
        feature_file = f'./data/{csv_prefix}_{name}.npy'
        if os.path.exists(feature_file):
            print(f'Loading cached feature: {feature_file}')
            arr = np.load(feature_file)
            features_dict[name] = pd.Series(arr, index=df.index)
        else:
            print(f'Calculating feature: {name}')
            _, values = func(*(df[c] for c in cols))
            features_dict[name] = values
            np.save(feature_file, values.values)
            print(f'Saved feature: {feature_file}')
    # Multi-column indicators: (indicator name, function, input columns).
    # Note: these are recomputed on every run even when their sub-features are
    # cached, because the sub-feature names are only known after computation.
    multi_indicators = [
        ('bollinger', calc_bollinger, ('Close',)),
        ('stochastic', calc_stochastic, ('High', 'Low', 'Close')),
        ('sma', calc_sma, ('Close',)),
        ('psar', calc_psar, ('High', 'Low', 'Close')),
        ('donchian', calc_donchian, ('High', 'Low', 'Close')),
        ('keltner', calc_keltner, ('High', 'Low', 'Close')),
        ('ichimoku', calc_ichimoku, ('High', 'Low')),
        ('elder_ray', calc_elder_ray, ('Close', 'Low', 'High')),
    ]
    for name, func, cols in multi_indicators:
        print(f'Calculating multi-column indicator: {name}')
        for subname, values in func(*(df[c] for c in cols)):
            print(f'Adding subfeature: {subname}')
            sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
            if os.path.exists(sub_feature_file):
                print(f'Loading cached feature: {sub_feature_file}')
                arr = np.load(sub_feature_file)
                features_dict[subname] = pd.Series(arr, index=df.index)
            else:
                features_dict[subname] = values
                np.save(sub_feature_file, values.values)
                print(f'Saved feature: {sub_feature_file}')
# Prepare jobs for lags, rolling stats, log returns, and volatility
feature_jobs = []
# Lags
for col in ohlcv_cols:
for lag in range(1, lags + 1):
feature_name = f'{col}_lag{lag}'
feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
            if os.path.exists(feature_file):
                print(f'Loading cached feature: {feature_file}')
                # Wrap in a Series on df.index so cached and freshly computed
                # features align identically in the DataFrame below
                features_dict[feature_name] = pd.Series(np.load(feature_file), index=df.index)
else:
print(f'Adding lag feature job: {feature_name}')
feature_jobs.append((feature_name, compute_lag, col, lag))
# Rolling statistics
for col in ohlcv_cols:
for window in window_sizes:
            # Skip column/window pairs previously found unhelpful
            if (col, window) in [('Open', 5), ('High', 5), ('High', 30), ('Low', 15)]:
                continue
for stat in ['mean', 'std', 'min', 'max']:
feature_name = f'{col}_roll_{stat}_{window}'
feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
                if os.path.exists(feature_file):
                    print(f'Loading cached feature: {feature_file}')
                    features_dict[feature_name] = pd.Series(np.load(feature_file), index=df.index)
else:
print(f'Adding rolling stat feature job: {feature_name}')
feature_jobs.append((feature_name, compute_rolling, col, stat, window))
# Log returns for different horizons
for horizon in [5, 15, 30]:
feature_name = f'log_return_{horizon}'
feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
        if os.path.exists(feature_file):
            print(f'Loading cached feature: {feature_file}')
            features_dict[feature_name] = pd.Series(np.load(feature_file), index=df.index)
else:
print(f'Adding log return feature job: {feature_name}')
feature_jobs.append((feature_name, compute_log_return, horizon))
# Volatility
for window in window_sizes:
feature_name = f'volatility_{window}'
feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
        if os.path.exists(feature_file):
            print(f'Loading cached feature: {feature_file}')
            features_dict[feature_name] = pd.Series(np.load(feature_file), index=df.index)
else:
print(f'Adding volatility feature job: {feature_name}')
feature_jobs.append((feature_name, compute_volatility, window))
# Sequential computation for all non-cached features
if feature_jobs:
print(f'Computing {len(feature_jobs)} features sequentially...')
for job in feature_jobs:
print(f'Computing feature job: {job[0]}')
feature_name, result = run_feature_job(job, df)
features_dict[feature_name] = result
feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
np.save(feature_file, result.values)
print(f'Saved computed feature: {feature_file}')
print('All features computed.')
else:
print('All features loaded from cache.')
# Concatenate all new features at once
print('Concatenating all new features to DataFrame...')
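    # Every value in features_dict is a Series on df.index, so the concat
    # below lines each feature up with its original row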
features_df = pd.DataFrame(features_dict)
print("Columns in features_df:", features_df.columns.tolist())
print("All-NaN columns in features_df:", features_df.columns[features_df.isna().all()].tolist())
df = pd.concat([df, features_df], axis=1)
# Print all columns after concatenation
print("All columns in df after concat:", df.columns.tolist())
# Downcast all float columns to save memory
print('Downcasting float columns to save memory...')
    for col in df.columns:
        if col == 'Timestamp':
            continue  # keep the datetime column as-is
        try:
            df[col] = pd.to_numeric(df[col], downcast='float')
        except (ValueError, TypeError):
            pass  # skip columns that cannot be downcast
# Drop intermediate features_df to free memory
print('Dropping intermediate features_df to free memory...')
del features_df
import gc
gc.collect()
feature_end_time = time.time()
print(f'Feature computation completed in {feature_end_time - feature_start_time:.2f} seconds.')
# Add Supertrend indicators (custom)
print('Preparing data for Supertrend calculation...')
st_df = df.rename(columns={'High': 'high', 'Low': 'low', 'Close': 'close'})
print('Calculating Supertrend indicators...')
supertrend = Supertrends(st_df)
st_results = supertrend.calculate_supertrend_indicators()
for idx, st in enumerate(st_results):
period = st['params']['period']
multiplier = st['params']['multiplier']
        # Skip Supertrend parameter pairs previously found uninformative
        if (period == 10 and multiplier == 1.0) or (period == 11 and multiplier == 2.0):
            continue
print(f'Adding Supertrend features: supertrend_{period}_{multiplier} and supertrend_trend_{period}_{multiplier}')
df[f'supertrend_{period}_{multiplier}'] = st['results']['supertrend']
df[f'supertrend_trend_{period}_{multiplier}'] = st['results']['trend']
    # Add time features ('dayofweek' deliberately excluded)
    print('Adding hour feature...')
    df['hour'] = df['Timestamp'].dt.hour  # Timestamp is already datetime (converted above)
# Drop NaNs after all feature engineering
print('Dropping NaNs after feature engineering...')
df = df.dropna().reset_index(drop=True)
# Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
print('Selecting feature columns...')
exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
feature_cols = [col for col in df.columns if col not in exclude_cols]
# Print the features used for training
print("Features used for training:", feature_cols)
    # Drop excluded columns to save memory, but keep Close for the price
    # reconstruction at the end of the script
    print('Dropping excluded columns to save memory...')
    df = df[feature_cols + ['log_return', 'Timestamp', 'Close']]
print('Preparing X and y...')
X = df[feature_cols].values.astype(np.float32)
y = df['log_return'].values.astype(np.float32)
split_idx = int(len(X) * 0.8)
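    # Chronological 80/20 split (no shuffling): a random split on time-series
    # data would leak future information into the training set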
print(f'Splitting data: {split_idx} train, {len(X) - split_idx} test')
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]
test_timestamps = df['Timestamp'].values[split_idx:]
print('Initializing model...')
model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
print('Training model...')
booster = model.train()
print('Training complete.')
# Save the trained model
model.save_model('./data/xgboost_model.json')
print('Model saved to ./data/xgboost_model.json')
if hasattr(model, 'params'):
print("Model hyperparameters:", model.params)
if hasattr(model, 'model') and hasattr(model.model, 'get_score'):
import operator
importances = model.model.get_score(importance_type='weight')
# Map f0, f1, ... to actual feature names
feature_map = {f"f{idx}": name for idx, name in enumerate(feature_cols)}
sorted_importances = sorted(importances.items(), key=operator.itemgetter(1), reverse=True)
print('Feature importances (sorted, with names):')
for feat, score in sorted_importances:
print(f'{feature_map.get(feat, feat)}: {score}')
print('Making predictions for first 5 test samples...')
preds = model.predict(X_test[:5])
print('Predictions for first 5 test samples:', preds)
print('Actual values for first 5 test samples:', y_test[:5])
print('Making predictions for all test samples...')
test_preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, test_preds))
print(f'RMSE on test set: {rmse:.4f}')
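    # RMSE is in log-return units; for 1-min data these are tiny, so a naive
    # zero-return baseline (always predict 0) is a useful point of comparison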
print('Saving y_test and test_preds to disk...')
np.save('./data/y_test.npy', y_test)
np.save('./data/test_preds.npy', test_preds)
    # Reconstruct price series from log returns
    print('Reconstructing price series from log returns...')
    # df has been filtered and reset, so its rows line up with split_idx;
    # reloading the raw (unfiltered) CSV here would misalign the index.
    # Close was kept in df above for exactly this purpose.
    close_prices = df['Close'].values
    # Base price: the last Close *before* the test set, since the first test
    # log return is ln(Close[split_idx] / Close[split_idx - 1])
    start_price = close_prices[split_idx - 1]
# Actual prices
actual_prices = [start_price]
for r in y_test:
actual_prices.append(actual_prices[-1] * np.exp(r))
actual_prices = np.array(actual_prices[1:])
# Predicted prices
predicted_prices = [start_price]
for r in test_preds:
predicted_prices.append(predicted_prices[-1] * np.exp(r))
predicted_prices = np.array(predicted_prices[1:])
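    # Equivalent vectorized form (same math, no Python loop):
    #   actual_prices    = start_price * np.exp(np.cumsum(y_test))
    #   predicted_prices = start_price * np.exp(np.cumsum(test_preds))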
print('Plotting predicted vs actual prices...')
plot_predicted_vs_actual_prices(actual_prices, predicted_prices, test_timestamps)
print("Final features used for training:", feature_cols)
print("Shape of X:", X.shape)
print("First row of X:", X[0])
print("stoch_k in feature_cols?", "stoch_k" in feature_cols)
if "stoch_k" in feature_cols:
idx = feature_cols.index("stoch_k")
print("First 10 values of stoch_k:", X[:10, idx])