model updated

Simon Moisy 2025-05-30 12:29:37 +08:00
parent 2dba88b620
commit 81e4b640a7
3 changed files with 412 additions and 75 deletions

View File

@@ -31,3 +31,9 @@ class CustomXGBoostGPU:
             raise ValueError('Model not trained yet.')
         dmatrix = xgb.DMatrix(X.astype(np.float32))
         return self.model.predict(dmatrix)
+
+    def save_model(self, file_path):
+        """Save the trained XGBoost model to the specified file path."""
+        if self.model is None:
+            raise ValueError('Model not trained yet.')
+        self.model.save_model(file_path)

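Note: with save_model in place, a trained booster can be reloaded later without retraining. A minimal sketch — xgb.Booster/load_model is standard XGBoost API; the load path mirrors the save path used by the training script below, and the feature matrix here is a placeholder:

import numpy as np
import xgboost as xgb

booster = xgb.Booster()
booster.load_model('./data/xgboost_model.json')  # file written by save_model()
X_new = np.random.rand(5, 3).astype(np.float32)  # placeholder: must match the training feature count
preds = booster.predict(xgb.DMatrix(X_new))      # same DMatrix wrapping as CustomXGBoostGPU.predict()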
View File

@@ -6,7 +6,7 @@ import numpy as np
 from sklearn.model_selection import train_test_split
 from custom_xgboost import CustomXGBoostGPU
 from sklearn.metrics import mean_squared_error
-from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns
+from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns, plot_predicted_vs_actual_prices
 import ta
 from cycles.supertrend import Supertrends
 from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator
@@ -14,7 +14,6 @@ from ta.momentum import ROCIndicator, KAMAIndicator, UltimateOscillator, Stochas
 from ta.volatility import KeltnerChannel, DonchianChannel
 from ta.others import DailyReturnIndicator
 import time
-import concurrent.futures
 from numba import njit

 def run_indicator(func, *args):
@@ -101,13 +100,9 @@ def calc_momentum(close):
     return ('momentum_10', close - close.shift(10))

 def calc_psar(high, low, close):
-    from ta.trend import PSARIndicator
-    psar = PSARIndicator(high, low, close)
-    return [
-        ('psar', psar.psar()),
-        ('psar_up', psar.psar_up()),
-        ('psar_down', psar.psar_down())
-    ]
+    # Use the Numba-accelerated fast_psar function for speed
+    psar_values = fast_psar(np.array(high), np.array(low), np.array(close))
+    return [('psar', pd.Series(psar_values, index=close.index))]
 def calc_donchian(high, low, close):
     from ta.volatility import DonchianChannel
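Note: fast_psar itself is not part of this diff. For orientation, here is a minimal Numba sketch of a Parabolic SAR with the conventional 0.02/0.2 acceleration schedule — the name, signature, and defaults are assumptions about the repository's function, and this simplified version omits the textbook clamp of the SAR against the prior two candles' extremes:

import numpy as np
from numba import njit

@njit(cache=True)
def fast_psar(high, low, close, af_step=0.02, af_max=0.2):
    # Simplified Wilder Parabolic SAR; assumes an initial uptrend.
    n = len(close)
    psar = np.empty(n)
    psar[0] = low[0]
    uptrend = True
    ep = high[0]      # extreme point of the current trend
    af = af_step      # acceleration factor
    for i in range(1, n):
        psar[i] = psar[i - 1] + af * (ep - psar[i - 1])
        if uptrend:
            if low[i] < psar[i]:       # price pierced the SAR: reverse down
                uptrend = False
                psar[i] = ep
                ep = low[i]
                af = af_step
            elif high[i] > ep:         # new high extends the uptrend
                ep = high[i]
                af = min(af + af_step, af_max)
        else:
            if high[i] > psar[i]:      # price pierced the SAR: reverse up
                uptrend = True
                psar[i] = ep
                ep = high[i]
                af = af_step
            elif low[i] < ep:          # new low extends the downtrend
                ep = low[i]
                af = min(af + af_step, af_max)
    return psar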
@@ -220,17 +215,18 @@ if __name__ == '__main__':
     csv_path = './data/btcusd_1-min_data.csv'
     csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]
+    print('Reading CSV and filtering data...')
     df = pd.read_csv(csv_path)
     df = df[df['Volume'] != 0]
     min_date = '2017-06-01'
+    print('Converting Timestamp and filtering by date...')
     df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
     df = df[df['Timestamp'] >= min_date]
     lags = 3
     print('Calculating log returns as the new target...')
     df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
     ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
@@ -243,55 +239,282 @@ if __name__ == '__main__':
     # --- Technical Indicator Features: Calculate or Load from Cache ---
     print('Calculating or loading technical indicator features...')
-    indicator_jobs = [
-        ('rsi', calc_rsi, [df['Close']]),
-        ('macd', calc_macd, [df['Close']]),
-        ('atr', calc_atr, [df['High'], df['Low'], df['Close']]),
-        ('cci', calc_cci, [df['High'], df['Low'], df['Close']]),
-        ('williams_r', calc_williamsr, [df['High'], df['Low'], df['Close']]),
-        ('ema_14', calc_ema, [df['Close']]),
-        ('obv', calc_obv, [df['Close'], df['Volume']]),
-        ('cmf', calc_cmf, [df['High'], df['Low'], df['Close'], df['Volume']]),
-        ('roc_10', calc_roc, [df['Close']]),
-        ('dpo_20', calc_dpo, [df['Close']]),
-        ('ultimate_osc', calc_ultimate, [df['High'], df['Low'], df['Close']]),
-        ('daily_return', calc_daily_return, [df['Close']]),
-    ]
+    # RSI
+    feature_file = f'./data/{csv_prefix}_rsi.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['rsi'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: rsi')
+        _, values = calc_rsi(df['Close'])
+        features_dict['rsi'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # MACD
+    feature_file = f'./data/{csv_prefix}_macd.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['macd'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: macd')
+        _, values = calc_macd(df['Close'])
+        features_dict['macd'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # ATR
+    feature_file = f'./data/{csv_prefix}_atr.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['atr'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: atr')
+        _, values = calc_atr(df['High'], df['Low'], df['Close'])
+        features_dict['atr'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # CCI
+    feature_file = f'./data/{csv_prefix}_cci.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['cci'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: cci')
+        _, values = calc_cci(df['High'], df['Low'], df['Close'])
+        features_dict['cci'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # Williams %R
+    feature_file = f'./data/{csv_prefix}_williams_r.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['williams_r'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: williams_r')
+        _, values = calc_williamsr(df['High'], df['Low'], df['Close'])
+        features_dict['williams_r'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # EMA 14
+    feature_file = f'./data/{csv_prefix}_ema_14.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['ema_14'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: ema_14')
+        _, values = calc_ema(df['Close'])
+        features_dict['ema_14'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # OBV
+    feature_file = f'./data/{csv_prefix}_obv.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['obv'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: obv')
+        _, values = calc_obv(df['Close'], df['Volume'])
+        features_dict['obv'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # CMF
+    feature_file = f'./data/{csv_prefix}_cmf.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['cmf'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: cmf')
+        _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
+        features_dict['cmf'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # ROC 10
+    feature_file = f'./data/{csv_prefix}_roc_10.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['roc_10'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: roc_10')
+        _, values = calc_roc(df['Close'])
+        features_dict['roc_10'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # DPO 20
+    feature_file = f'./data/{csv_prefix}_dpo_20.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['dpo_20'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: dpo_20')
+        _, values = calc_dpo(df['Close'])
+        features_dict['dpo_20'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # Ultimate Oscillator
+    feature_file = f'./data/{csv_prefix}_ultimate_osc.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['ultimate_osc'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: ultimate_osc')
+        _, values = calc_ultimate(df['High'], df['Low'], df['Close'])
+        features_dict['ultimate_osc'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # Daily Return
+    feature_file = f'./data/{csv_prefix}_daily_return.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['daily_return'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: daily_return')
+        _, values = calc_daily_return(df['Close'])
+        features_dict['daily_return'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')

     # Multi-column indicators
-    multi_indicator_jobs = [
-        ('bollinger', calc_bollinger, [df['Close']]),
-        ('stochastic', calc_stochastic, [df['High'], df['Low'], df['Close']]),
-        ('sma', calc_sma, [df['Close']]),
-        ('psar', calc_psar, [df['High'], df['Low'], df['Close']]),
-        ('donchian', calc_donchian, [df['High'], df['Low'], df['Close']]),
-        ('keltner', calc_keltner, [df['High'], df['Low'], df['Close']]),
-        ('ichimoku', calc_ichimoku, [df['High'], df['Low']]),
-        ('elder_ray', calc_elder_ray, [df['Close'], df['Low'], df['High']]),
-    ]
-    for feature_name, func, args in indicator_jobs:
-        feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
-        if os.path.exists(feature_file):
-            print(f'Loading cached feature: {feature_file}')
-            features_dict[feature_name] = np.load(feature_file)
-        else:
-            result = func(*args)
-            if isinstance(result, tuple):
-                _, values = result
-                features_dict[feature_name] = values
-                np.save(feature_file, values.values)
-            else:
-                raise ValueError(f"Unexpected result for {feature_name}")
-    for feature_name, func, args in multi_indicator_jobs:
-        # These return a list of (name, values)
-        result = func(*args)
-        for subname, values in result:
-            sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
-            if os.path.exists(sub_feature_file):
-                print(f'Loading cached feature: {sub_feature_file}')
-                features_dict[subname] = np.load(sub_feature_file)
-            else:
-                features_dict[subname] = values
-                np.save(sub_feature_file, values.values)
+    # Bollinger Bands
+    print('Calculating multi-column indicator: bollinger')
+    result = calc_bollinger(df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Stochastic Oscillator
+    print('Calculating multi-column indicator: stochastic')
+    result = calc_stochastic(df['High'], df['Low'], df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # SMA
+    print('Calculating multi-column indicator: sma')
+    result = calc_sma(df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # PSAR
+    print('Calculating multi-column indicator: psar')
+    result = calc_psar(df['High'], df['Low'], df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Donchian Channel
+    print('Calculating multi-column indicator: donchian')
+    result = calc_donchian(df['High'], df['Low'], df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Keltner Channel
+    print('Calculating multi-column indicator: keltner')
+    result = calc_keltner(df['High'], df['Low'], df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Ichimoku
+    print('Calculating multi-column indicator: ichimoku')
+    result = calc_ichimoku(df['High'], df['Low'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Elder Ray
+    print('Calculating multi-column indicator: elder_ray')
+    result = calc_elder_ray(df['Close'], df['Low'], df['High'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')

     # Prepare jobs for lags, rolling stats, log returns, and volatility
     feature_jobs = []
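Review note: the twelve single-column blocks above repeat one cache-or-compute pattern verbatim. A hypothetical helper (illustrative only; this function is not in the commit) would keep the behaviour identical:

import os
import numpy as np
import pandas as pd

def cache_or_compute(name, func, args, csv_prefix, features_dict, df):
    # Same pattern as the unrolled blocks: load the .npy cache if present,
    # otherwise compute the indicator, store it, and persist it to disk.
    feature_file = f'./data/{csv_prefix}_{name}.npy'
    if os.path.exists(feature_file):
        print(f'Loading cached feature: {feature_file}')
        features_dict[name] = pd.Series(np.load(feature_file), index=df.index)
    else:
        print(f'Calculating feature: {name}')
        _, values = func(*args)
        features_dict[name] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

Usage would then be, e.g., cache_or_compute('rsi', calc_rsi, [df['Close']], csv_prefix, features_dict, df).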
@@ -301,9 +524,10 @@ if __name__ == '__main__':
             feature_name = f'{col}_lag{lag}'
             feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
             if os.path.exists(feature_file):
-                print(f'Loading cached feature: {feature_file}')
+                print(f'C Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
+                print(f'Adding lag feature job: {feature_name}')
                 feature_jobs.append((feature_name, compute_lag, col, lag))
     # Rolling statistics
     for col in ohlcv_cols:
@@ -320,48 +544,56 @@ if __name__ == '__main__':
                 feature_name = f'{col}_roll_{stat}_{window}'
                 feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
                 if os.path.exists(feature_file):
-                    print(f'Loading cached feature: {feature_file}')
+                    print(f'D Loading cached feature: {feature_file}')
                     features_dict[feature_name] = np.load(feature_file)
                 else:
+                    print(f'Adding rolling stat feature job: {feature_name}')
                     feature_jobs.append((feature_name, compute_rolling, col, stat, window))
     # Log returns for different horizons
     for horizon in [5, 15, 30]:
         feature_name = f'log_return_{horizon}'
         feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
         if os.path.exists(feature_file):
-            print(f'Loading cached feature: {feature_file}')
+            print(f'E Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
+            print(f'Adding log return feature job: {feature_name}')
             feature_jobs.append((feature_name, compute_log_return, horizon))
     # Volatility
     for window in window_sizes:
         feature_name = f'volatility_{window}'
         feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
         if os.path.exists(feature_file):
-            print(f'Loading cached feature: {feature_file}')
+            print(f'F Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
+            print(f'Adding volatility feature job: {feature_name}')
            feature_jobs.append((feature_name, compute_volatility, window))
-    # Parallel computation for all non-cached features
+    # Sequential computation for all non-cached features
     if feature_jobs:
-        print(f'Computing {len(feature_jobs)} features in parallel...')
-        with concurrent.futures.ProcessPoolExecutor() as executor:
-            futures = [executor.submit(run_feature_job, job, df) for job in feature_jobs]
-            for future in concurrent.futures.as_completed(futures):
-                feature_name, result = future.result()
-                features_dict[feature_name] = result
-                feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
-                np.save(feature_file, result.values)
-        print('All parallel features computed.')
+        print(f'Computing {len(feature_jobs)} features sequentially...')
+        for job in feature_jobs:
+            print(f'Computing feature job: {job[0]}')
+            feature_name, result = run_feature_job(job, df)
+            features_dict[feature_name] = result
+            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
+            np.save(feature_file, result.values)
+            print(f'Saved computed feature: {feature_file}')
+        print('All features computed.')
     else:
         print('All features loaded from cache.')

     # Concatenate all new features at once
     print('Concatenating all new features to DataFrame...')
     features_df = pd.DataFrame(features_dict)
+    print("Columns in features_df:", features_df.columns.tolist())
+    print("All-NaN columns in features_df:", features_df.columns[features_df.isna().all()].tolist())
     df = pd.concat([df, features_df], axis=1)
+    # Print all columns after concatenation
+    print("All columns in df after concat:", df.columns.tolist())

     # Downcast all float columns to save memory
     print('Downcasting float columns to save memory...')
     for col in df.columns:
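Note: run_feature_job is defined elsewhere in this file and is not shown in the diff. Judging from the job tuples built above, it plausibly unpacks (feature_name, compute_func, *args) and returns (feature_name, series) — a hypothetical sketch of that contract:

def run_feature_job(job, df):
    # Assumed dispatcher: each compute_* presumably takes the DataFrame
    # plus its own arguments and returns the feature as a pandas Series.
    feature_name, func, *args = job
    return feature_name, func(df, *args)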
@@ -371,6 +603,7 @@ if __name__ == '__main__':
             pass
     # Drop intermediate features_df to free memory
+    print('Dropping intermediate features_df to free memory...')
     del features_df
     import gc
     gc.collect()
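Aside: the body of the downcast loop sits outside this hunk. The usual pandas idiom for it — an assumption about the unshown code, not a quote from it — is:

import pandas as pd

for col in df.columns:
    try:
        # Downcast float64 columns to float32 where possible
        df[col] = pd.to_numeric(df[col], downcast='float')
    except Exception:
        pass  # skip non-numeric columns such as Timestamp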
@@ -408,6 +641,10 @@ if __name__ == '__main__':
     print('Selecting feature columns...')
     exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
     feature_cols = [col for col in df.columns if col not in exclude_cols]
+
+    # Print the features used for training
+    print("Features used for training:", feature_cols)
+
     # Drop excluded columns to save memory
     print('Dropping excluded columns to save memory...')
     df = df[feature_cols + ['log_return', 'Timestamp']]
@@ -417,6 +654,7 @@ if __name__ == '__main__':
     y = df['log_return'].values.astype(np.float32)

     split_idx = int(len(X) * 0.8)
+    print(f'Splitting data: {split_idx} train, {len(X) - split_idx} test')
     X_train, X_test = X[:split_idx], X[split_idx:]
     y_train, y_test = y[:split_idx], y[split_idx:]
     test_timestamps = df['Timestamp'].values[split_idx:]
@@ -428,7 +666,11 @@ if __name__ == '__main__':
     booster = model.train()
     print('Training complete.')

+    # Save the trained model
+    model.save_model('./data/xgboost_model.json')
+    print('Model saved to ./data/xgboost_model.json')
+
     if hasattr(model, 'params'):
         print("Model hyperparameters:", model.params)
     if hasattr(model, 'model') and hasattr(model.model, 'get_score'):
@@ -441,18 +683,49 @@ if __name__ == '__main__':
         for feat, score in sorted_importances:
             print(f'{feature_map.get(feat, feat)}: {score}')

+    print('Making predictions for first 5 test samples...')
     preds = model.predict(X_test[:5])
     print('Predictions for first 5 test samples:', preds)
     print('Actual values for first 5 test samples:', y_test[:5])

+    print('Making predictions for all test samples...')
     test_preds = model.predict(X_test)
     rmse = np.sqrt(mean_squared_error(y_test, test_preds))
     print(f'RMSE on test set: {rmse:.4f}')

+    print('Saving y_test and test_preds to disk...')
     np.save('./data/y_test.npy', y_test)
     np.save('./data/test_preds.npy', test_preds)

-    # display_actual_vs_predicted(y_test, test_preds, test_timestamps)
-    # plot_target_distribution(y_train, y_test)
-    plot_predicted_vs_actual_log_returns(y_test, test_preds, test_timestamps)
+    # Reconstruct price series from log returns
+    print('Reconstructing price series from log returns...')
+    # Get the last available Close price before the test set
+    # The DataFrame df has been reset, so use split_idx to get the right row
+    if 'Close' in df.columns:
+        close_prices = df['Close'].values
+    else:
+        # Reload original CSV to get Close prices if not present
+        close_prices = pd.read_csv(csv_path)['Close'].values
+    start_price = close_prices[split_idx]  # This is the price at the split point
+    # Actual prices
+    actual_prices = [start_price]
+    for r in y_test:
+        actual_prices.append(actual_prices[-1] * np.exp(r))
+    actual_prices = np.array(actual_prices[1:])
+    # Predicted prices
+    predicted_prices = [start_price]
+    for r in test_preds:
+        predicted_prices.append(predicted_prices[-1] * np.exp(r))
+    predicted_prices = np.array(predicted_prices[1:])
+
+    print('Plotting predicted vs actual prices...')
+    plot_predicted_vs_actual_prices(actual_prices, predicted_prices, test_timestamps)
+
+    print("Final features used for training:", feature_cols)
+    print("Shape of X:", X.shape)
+    print("First row of X:", X[0])
+    print("stoch_k in feature_cols?", "stoch_k" in feature_cols)
+    if "stoch_k" in feature_cols:
+        idx = feature_cols.index("stoch_k")
+        print("First 10 values of stoch_k:", X[:10, idx])

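Review note: the two accumulation loops above compute a running product, price[i] = start_price * exp(r_0 + ... + r_i). numpy expresses each in one line (equivalent sketch, reusing the diff's variable names):

import numpy as np

# dtype=np.float64 matches the loop version, which accumulates in Python floats
actual_prices = start_price * np.exp(np.cumsum(y_test, dtype=np.float64))
predicted_prices = start_price * np.exp(np.cumsum(test_preds, dtype=np.float64))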
View File

@@ -109,3 +109,61 @@ def plot_predicted_vs_actual_log_returns(y_test, test_preds, timestamps=None, n_
     )
     fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
     pyo.plot(fig_scatter, filename='log_return_scatter_plot.html')
+
+def plot_predicted_vs_actual_prices(actual_prices, predicted_prices, timestamps=None, n_plot=200):
+    import plotly.offline as pyo
+    import plotly.graph_objs as go
+    n_plot = min(n_plot, len(actual_prices))
+    actual = actual_prices[:n_plot]
+    predicted = predicted_prices[:n_plot]
+    if timestamps is not None:
+        x_axis = timestamps[:n_plot]
+        x_label = 'Timestamp'
+    else:
+        x_axis = list(range(n_plot))
+        x_label = 'Index'
+    # Line plot: Actual vs Predicted over time
+    trace_actual = go.Scatter(x=x_axis, y=actual, mode='lines', name='Actual Price')
+    trace_predicted = go.Scatter(x=x_axis, y=predicted, mode='lines', name='Predicted Price')
+    data_line = [trace_actual, trace_predicted]
+    layout_line = go.Layout(
+        title='Actual vs. Predicted BTC Prices (Test Set)',
+        xaxis={'title': x_label},
+        yaxis={'title': 'BTC Price'},
+        legend={'x': 0, 'y': 1},
+        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
+        hovermode='closest'
+    )
+    fig_line = go.Figure(data=data_line, layout=layout_line)
+    pyo.plot(fig_line, filename='price_line_plot.html')
+    # Scatter plot: Predicted vs Actual
+    trace_scatter = go.Scatter(
+        x=actual,
+        y=predicted,
+        mode='markers',
+        name='Predicted vs Actual',
+        opacity=0.5
+    )
+    # Diagonal reference line
+    min_val = min(np.min(actual), np.min(predicted))
+    max_val = max(np.max(actual), np.max(predicted))
+    trace_diag = go.Scatter(
+        x=[min_val, max_val],
+        y=[min_val, max_val],
+        mode='lines',
+        name='Ideal',
+        line=dict(dash='dash', color='red')
+    )
+    data_scatter = [trace_scatter, trace_diag]
+    layout_scatter = go.Layout(
+        title='Predicted vs Actual Prices (Scatter)',
+        xaxis={'title': 'Actual Price'},
+        yaxis={'title': 'Predicted Price'},
+        showlegend=True,
+        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
+        hovermode='closest'
+    )
+    fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
+    pyo.plot(fig_scatter, filename='price_scatter_plot.html')