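"""Feature engineering and GPU XGBoost training on 1-minute BTC/USD data.

Loads minute-level OHLCV candles, engineers lagged, rolling, and
technical-indicator features, and trains a CustomXGBoostGPU regressor to
predict one-minute log returns.
"""
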
import sys
import os

# Make the project root importable so sibling modules resolve
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import pandas as pd
import numpy as np

from custom_xgboost import CustomXGBoostGPU
from sklearn.metrics import mean_squared_error
from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns

import ta
from cycles.supertrend import Supertrends
from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator
from ta.momentum import ROCIndicator, UltimateOscillator
from ta.volatility import KeltnerChannel, DonchianChannel
from ta.others import DailyReturnIndicator

if __name__ == '__main__':
    csv_path = './data/btcusd_1-min_data.csv'
    df = pd.read_csv(csv_path)

    # Drop minutes with no trades; zero-volume bars carry no price information
    df = df[df['Volume'] != 0]

    # Keep only data from June 2017 onwards
    min_date = '2017-06-01'
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
    df = df[df['Timestamp'] >= min_date]

    lags = 3
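
    # Log returns r_t = ln(Close_t / Close_{t-1}) are used as the target:
    # unlike raw prices they are roughly stationary and additive across time.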
    print('Calculating log returns as the new target...')
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))

    ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    window_sizes = [5, 15, 30]  # in minutes, adjust as needed
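
    # Collect new features in a dictionary and concatenate once at the end;
    # inserting hundreds of columns one at a time fragments the DataFrame and
    # triggers pandas PerformanceWarnings.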
    features_dict = {}

    # Lagged OHLCV values for the previous 1..lags minutes
    for col in ohlcv_cols:
        for lag in range(1, lags + 1):
            print(f'Calculating lag feature: {col}_lag{lag}')
            features_dict[f'{col}_lag{lag}'] = df[col].shift(lag)

    # Rolling statistics (mean/std/min/max) per column and window.
    # These (column, window) combinations only produced low-importance features
    # (e.g. Open_roll_min_5, High_roll_max_30, Low_roll_max_15), so skip them.
    skip_combos = {('Open', 5), ('High', 5), ('High', 30), ('Low', 15)}
    for col in ohlcv_cols:
        for window in window_sizes:
            if (col, window) in skip_combos:
                continue
            print(f'Calculating rolling mean: {col}_roll_mean_{window}')
            features_dict[f'{col}_roll_mean_{window}'] = df[col].rolling(window).mean()
            print(f'Calculating rolling std: {col}_roll_std_{window}')
            features_dict[f'{col}_roll_std_{window}'] = df[col].rolling(window).std()
            print(f'Calculating rolling min: {col}_roll_min_{window}')
            features_dict[f'{col}_roll_min_{window}'] = df[col].rolling(window).min()
            print(f'Calculating rolling max: {col}_roll_max_{window}')
            features_dict[f'{col}_roll_max_{window}'] = df[col].rolling(window).max()

    # Log returns over longer horizons
    for horizon in window_sizes:
        print(f'Calculating log return for horizon {horizon}...')
        features_dict[f'log_return_{horizon}'] = np.log(df['Close'] / df['Close'].shift(horizon))

    # Realized volatility: rolling standard deviation of 1-minute log returns
    for window in window_sizes:
        print(f'Calculating volatility for window {window}...')
        features_dict[f'volatility_{window}'] = df['log_return'].rolling(window).std()
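
    # Each ta indicator below returns a pandas Series aligned on df's index,
    # so the results can be stored directly in features_dict.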
    # Technical indicators (except Supertrend, which is added separately below)
    print('Calculating RSI...')
    features_dict['rsi'] = ta.momentum.RSIIndicator(df['Close'], window=14).rsi()

    print('Calculating MACD...')
    features_dict['macd'] = ta.trend.MACD(df['Close']).macd()

    print('Calculating Bollinger Bands...')
    bb = ta.volatility.BollingerBands(close=df['Close'], window=20, window_dev=2)
    features_dict['bb_bbm'] = bb.bollinger_mavg()
    features_dict['bb_bbh'] = bb.bollinger_hband()
    features_dict['bb_bbl'] = bb.bollinger_lband()
    features_dict['bb_bb_width'] = features_dict['bb_bbh'] - features_dict['bb_bbl']

    print('Calculating Stochastic Oscillator...')
    stoch = ta.momentum.StochasticOscillator(high=df['High'], low=df['Low'], close=df['Close'], window=14, smooth_window=3)
    features_dict['stoch_k'] = stoch.stoch()
    features_dict['stoch_d'] = stoch.stoch_signal()

    print('Calculating Average True Range (ATR)...')
    atr = ta.volatility.AverageTrueRange(high=df['High'], low=df['Low'], close=df['Close'], window=14)
    features_dict['atr'] = atr.average_true_range()

    print('Calculating Commodity Channel Index (CCI)...')
    cci = ta.trend.CCIIndicator(high=df['High'], low=df['Low'], close=df['Close'], window=20)
    features_dict['cci'] = cci.cci()

    print('Calculating Williams %R...')
    willr = ta.momentum.WilliamsRIndicator(high=df['High'], low=df['Low'], close=df['Close'], lbp=14)
    features_dict['williams_r'] = willr.williams_r()

    print('Calculating Exponential Moving Average (EMA)...')
    ema = ta.trend.EMAIndicator(close=df['Close'], window=14)
    features_dict['ema_14'] = ema.ema_indicator()

    print('Calculating On-Balance Volume (OBV)...')
    obv = ta.volume.OnBalanceVolumeIndicator(close=df['Close'], volume=df['Volume'])
    features_dict['obv'] = obv.on_balance_volume()

    print('Calculating Chaikin Money Flow (CMF)...')
    cmf = ta.volume.ChaikinMoneyFlowIndicator(high=df['High'], low=df['Low'], close=df['Close'], volume=df['Volume'], window=20)
    features_dict['cmf'] = cmf.chaikin_money_flow()

    # Additional TA indicators
    # Simple moving averages
    print('Calculating SMA 50 and 200...')
    features_dict['sma_50'] = SMAIndicator(df['Close'], window=50).sma_indicator()
    features_dict['sma_200'] = SMAIndicator(df['Close'], window=200).sma_indicator()

    # Rate of Change
    print('Calculating ROC 10...')
    features_dict['roc_10'] = ROCIndicator(df['Close'], window=10).roc()

    # Momentum: ta has no MomentumIndicator class, so compute the classic
    # 10-period momentum (price difference) directly
    print('Calculating Momentum 10...')
    features_dict['momentum_10'] = df['Close'].diff(10)

    # Parabolic SAR
    print('Calculating Parabolic SAR...')
    psar = PSARIndicator(df['High'], df['Low'], df['Close'])
    features_dict['psar'] = psar.psar()
    # psar_up/psar_down are NaN whenever the trend points the other way; left
    # as-is they would make the later dropna() discard every row, so fill with 0
    features_dict['psar_up'] = psar.psar_up().fillna(0)
    features_dict['psar_down'] = psar.psar_down().fillna(0)

    # Donchian Channel
    print('Calculating Donchian Channel 20...')
    donchian = DonchianChannel(df['High'], df['Low'], df['Close'], window=20)
    features_dict['donchian_hband'] = donchian.donchian_channel_hband()
    features_dict['donchian_lband'] = donchian.donchian_channel_lband()
    features_dict['donchian_mband'] = donchian.donchian_channel_mband()

    # Keltner Channel
    print('Calculating Keltner Channel 20...')
    keltner = KeltnerChannel(df['High'], df['Low'], df['Close'], window=20)
    features_dict['keltner_hband'] = keltner.keltner_channel_hband()
    features_dict['keltner_lband'] = keltner.keltner_channel_lband()
    features_dict['keltner_mband'] = keltner.keltner_channel_mband()

    # Detrended Price Oscillator
    print('Calculating DPO 20...')
    features_dict['dpo_20'] = DPOIndicator(df['Close'], window=20).dpo()

    # Ultimate Oscillator
    print('Calculating Ultimate Oscillator...')
    features_dict['ultimate_osc'] = UltimateOscillator(df['High'], df['Low'], df['Close']).ultimate_oscillator()

    # Ichimoku
    print('Calculating Ichimoku...')
    ichimoku = IchimokuIndicator(df['High'], df['Low'], window1=9, window2=26, window3=52)
    features_dict['ichimoku_a'] = ichimoku.ichimoku_a()
    features_dict['ichimoku_b'] = ichimoku.ichimoku_b()
    features_dict['ichimoku_base_line'] = ichimoku.ichimoku_base_line()
    features_dict['ichimoku_conversion_line'] = ichimoku.ichimoku_conversion_line()

    # Elder Ray Index: Bull Power = High - EMA(13), Bear Power = Low - EMA(13)
    print('Calculating Elder Ray Index...')
    ema_13 = ta.trend.EMAIndicator(df['Close'], window=13).ema_indicator()
    features_dict['elder_ray_bull'] = df['High'] - ema_13
    features_dict['elder_ray_bear'] = df['Low'] - ema_13

    # Daily return of the close price
    print('Calculating Daily Return...')
    features_dict['daily_return'] = DailyReturnIndicator(df['Close']).daily_return()

    # Concatenate all new features at once
    print('Concatenating all new features to DataFrame...')
    features_df = pd.DataFrame(features_dict)
    df = pd.concat([df, features_df], axis=1)

    # Add Supertrend indicators (custom)
    print('Preparing data for Supertrend calculation...')
    st_df = df.rename(columns={'High': 'high', 'Low': 'low', 'Close': 'close'})
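    # The rename above matches the lowercase 'high'/'low'/'close' column names
    # the project-local Supertrends helper presumably expects.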

    print('Calculating Supertrend indicators...')
    supertrend = Supertrends(st_df)
    st_results = supertrend.calculate_supertrend_indicators()
    for st in st_results:
        period = st['params']['period']
        multiplier = st['params']['multiplier']
        # Skip parameter combinations that added no predictive value
        if (period == 10 and multiplier == 1.0) or (period == 11 and multiplier == 2.0):
            continue
        print(f'Adding Supertrend features: supertrend_{period}_{multiplier} and supertrend_trend_{period}_{multiplier}')
        df[f'supertrend_{period}_{multiplier}'] = st['results']['supertrend']
        df[f'supertrend_trend_{period}_{multiplier}'] = st['results']['trend']

    # Add time features (exclude 'dayofweek')
    print('Adding hour feature...')
    df['hour'] = df['Timestamp'].dt.hour

    # Drop rows with NaNs left by lags, rolling windows, and indicator warm-ups
    print('Dropping NaNs after feature engineering...')
    df = df.dropna().reset_index(drop=True)

    # Exclude 'Timestamp', 'Close', 'log_return', and the multi-horizon log
    # returns (candidate future targets) from the feature set
    print('Selecting feature columns...')
    exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
    feature_cols = [col for col in df.columns if col not in exclude_cols]

    print('Preparing X and y...')
    # float32 halves memory versus float64 and matches xgboost's internal storage
    X = df[feature_cols].values.astype(np.float32)
    y = df['log_return'].values.astype(np.float32)
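
    # Chronological 80/20 split; shuffling is avoided so the test set is
    # strictly later in time than the training set (no look-ahead leakage).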
    split_idx = int(len(X) * 0.8)
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    test_timestamps = df['Timestamp'].values[split_idx:]
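
    # CustomXGBoostGPU is the project-local model wrapper (it appears to wrap
    # an xgboost Booster, given the get_score() introspection below); it holds
    # the train/test split and exposes train() and predict().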
    print('Initializing model...')
    model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)

    print('Training model...')
    booster = model.train()

    print('Training complete.')

    if hasattr(model, 'params'):
        print('Model hyperparameters:', model.params)
    if hasattr(model, 'model') and hasattr(model.model, 'get_score'):
        import operator
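
        # get_score(importance_type='weight') counts how many times each
        # feature is used as a split across all trees (xgboost Booster API).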
        importances = model.model.get_score(importance_type='weight')
        # Map xgboost's generated f0, f1, ... names back to real feature names
        feature_map = {f'f{idx}': name for idx, name in enumerate(feature_cols)}
        sorted_importances = sorted(importances.items(), key=operator.itemgetter(1), reverse=True)
        print('Feature importances (sorted, with names):')
        for feat, score in sorted_importances:
            print(f'{feature_map.get(feat, feat)}: {score}')

    preds = model.predict(X_test[:5])
    print('Predictions for first 5 test samples:', preds)
    print('Actual values for first 5 test samples:', y_test[:5])

    test_preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    print(f'RMSE on test set: {rmse:.4f}')
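
    # Context: always predicting a zero return gives an RMSE of
    # sqrt(mean(y_test ** 2)), roughly the std of y_test since minute returns
    # average near zero; the model should beat that baseline to be useful.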

    # Persist test targets and predictions for later analysis
    np.save('./data/y_test.npy', y_test)
    np.save('./data/test_preds.npy', test_preds)

    # display_actual_vs_predicted(y_test, test_preds, test_timestamps)
    # plot_target_distribution(y_train, y_test)

    plot_predicted_vs_actual_log_returns(y_test, test_preds, test_timestamps)