XGBoost first iteration

Simon Moisy 2025-05-29 18:28:53 +08:00
parent 1284549106
commit de67b27e37
3 changed files with 400 additions and 0 deletions

xgboost/custom_xgboost.py (new file, 33 additions)

@@ -0,0 +1,33 @@
import xgboost as xgb
import numpy as np
class CustomXGBoostGPU:
def __init__(self, X_train, X_test, y_train, y_test):
self.X_train = X_train.astype(np.float32)
self.X_test = X_test.astype(np.float32)
self.y_train = y_train.astype(np.float32)
self.y_test = y_test.astype(np.float32)
self.model = None
self.params = None # Will be set during training
def train(self, **xgb_params):
params = {
'tree_method': 'hist',
'device': 'cuda',
'objective': 'reg:squarederror',
'eval_metric': 'rmse',
'verbosity': 1,
}
params.update(xgb_params)
self.params = params # Store params for later access
dtrain = xgb.DMatrix(self.X_train, label=self.y_train)
dtest = xgb.DMatrix(self.X_test, label=self.y_test)
evals = [(dtrain, 'train'), (dtest, 'eval')]
self.model = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10)
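# With early_stopping_rounds set, the returned Booster exposes best_iteration and
# best_score for the 'eval' set; these can be logged or used to cap prediction rounds.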
return self.model
def predict(self, X):
if self.model is None:
raise ValueError('Model not trained yet.')
dmatrix = xgb.DMatrix(X.astype(np.float32))
return self.model.predict(dmatrix)
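
A minimal usage sketch for the class above, assuming a CUDA-capable GPU (the class hard-codes device='cuda') and purely synthetic arrays in place of the real features built in main.py; the keyword arguments passed to train() are illustrative XGBoost parameters, not values used by this commit:

import numpy as np
from custom_xgboost import CustomXGBoostGPU

# Hypothetical toy data; the real pipeline builds its features in main.py.
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 8)).astype(np.float32)
y = rng.normal(size=1000).astype(np.float32)
split = int(len(X) * 0.8)

model = CustomXGBoostGPU(X[:split], X[split:], y[:split], y[split:])
# Extra keyword arguments are merged into the default parameter dict before training.
booster = model.train(max_depth=6, learning_rate=0.05, subsample=0.8)
print(model.predict(X[split:])[:5])
print(model.params)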

xgboost/main.py (new file, 256 additions)

@@ -0,0 +1,256 @@
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from custom_xgboost import CustomXGBoostGPU
from sklearn.metrics import mean_squared_error
from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns
import ta
from cycles.supertrend import Supertrends
from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator
from ta.momentum import ROCIndicator, KAMAIndicator, UltimateOscillator, StochasticOscillator, WilliamsRIndicator
from ta.volatility import KeltnerChannel, DonchianChannel
from ta.others import DailyReturnIndicator
if __name__ == '__main__':
csv_path = './data/btcusd_1-min_data.csv'
df = pd.read_csv(csv_path)
df = df[df['Volume'] != 0]
min_date = '2017-06-01'
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
df = df[df['Timestamp'] >= min_date]
lags = 3
print('Calculating log returns as the new target...')
# Calculate log returns as the new target
df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
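# Log return at time t is ln(Close_t / Close_{t-1}). Unlike simple returns, log returns
# are additive over time, so a k-minute log return is the sum of k one-minute log returns.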
ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
window_sizes = [5, 15, 30] # in minutes, adjust as needed
# Collect new features in a dictionary
features_dict = {}
# Lags
for col in ohlcv_cols:
for lag in range(1, lags + 1):
print(f'Calculating lag feature: {col}_lag{lag}')
features_dict[f'{col}_lag{lag}'] = df[col].shift(lag)
# Rolling statistics
for col in ohlcv_cols:
for window in window_sizes:
# Skip (column, window) combinations judged not useful. Note that each continue drops
# all four rolling stats (mean/std/min/max) for that combination, not only the ones listed.
if (col == 'Open' and window == 5):
continue # Open_roll_min_5
if (col == 'High' and window == 5):
continue # High_roll_mean_5, High_roll_min_5, High_roll_max_5
if (col == 'High' and window == 30):
continue # High_roll_max_30
if (col == 'Low' and window == 15):
continue # Low_roll_max_15
print(f'Calculating rolling mean: {col}_roll_mean_{window}')
features_dict[f'{col}_roll_mean_{window}'] = df[col].rolling(window).mean()
print(f'Calculating rolling std: {col}_roll_std_{window}')
features_dict[f'{col}_roll_std_{window}'] = df[col].rolling(window).std()
print(f'Calculating rolling min: {col}_roll_min_{window}')
features_dict[f'{col}_roll_min_{window}'] = df[col].rolling(window).min()
print(f'Calculating rolling max: {col}_roll_max_{window}')
features_dict[f'{col}_roll_max_{window}'] = df[col].rolling(window).max()
# Log returns for different horizons
for horizon in [5, 15, 30]:
print(f'Calculating log return for horizon {horizon}...')
features_dict[f'log_return_{horizon}'] = np.log(df['Close'] / df['Close'].shift(horizon))
# Volatility
for window in window_sizes:
print(f'Calculating volatility for window {window}...')
features_dict[f'volatility_{window}'] = df['log_return'].rolling(window).std()
# Technical indicators (except Supertrend)
print('Calculating RSI...')
features_dict['rsi'] = ta.momentum.RSIIndicator(df['Close'], window=14).rsi()
print('Calculating MACD...')
features_dict['macd'] = ta.trend.MACD(df['Close']).macd()
print('Calculating Bollinger Bands...')
bb = ta.volatility.BollingerBands(close=df['Close'], window=20, window_dev=2)
features_dict['bb_bbm'] = bb.bollinger_mavg()
features_dict['bb_bbh'] = bb.bollinger_hband()
features_dict['bb_bbl'] = bb.bollinger_lband()
features_dict['bb_bb_width'] = features_dict['bb_bbh'] - features_dict['bb_bbl']
print('Calculating Stochastic Oscillator...')
stoch = ta.momentum.StochasticOscillator(high=df['High'], low=df['Low'], close=df['Close'], window=14, smooth_window=3)
features_dict['stoch_k'] = stoch.stoch()
features_dict['stoch_d'] = stoch.stoch_signal()
print('Calculating Average True Range (ATR)...')
atr = ta.volatility.AverageTrueRange(high=df['High'], low=df['Low'], close=df['Close'], window=14)
features_dict['atr'] = atr.average_true_range()
print('Calculating Commodity Channel Index (CCI)...')
cci = ta.trend.CCIIndicator(high=df['High'], low=df['Low'], close=df['Close'], window=20)
features_dict['cci'] = cci.cci()
print('Calculating Williams %R...')
willr = ta.momentum.WilliamsRIndicator(high=df['High'], low=df['Low'], close=df['Close'], lbp=14)
features_dict['williams_r'] = willr.williams_r()
print('Calculating Exponential Moving Average (EMA)...')
ema = ta.trend.EMAIndicator(close=df['Close'], window=14)
features_dict['ema_14'] = ema.ema_indicator()
print('Calculating On-Balance Volume (OBV)...')
obv = ta.volume.OnBalanceVolumeIndicator(close=df['Close'], volume=df['Volume'])
features_dict['obv'] = obv.on_balance_volume()
print('Calculating Chaikin Money Flow (CMF)...')
cmf = ta.volume.ChaikinMoneyFlowIndicator(high=df['High'], low=df['Low'], close=df['Close'], volume=df['Volume'], window=20)
features_dict['cmf'] = cmf.chaikin_money_flow()
# Additional TA indicators
# SMA
print('Calculating SMA 50 and 200...')
features_dict['sma_50'] = SMAIndicator(df['Close'], window=50).sma_indicator()
features_dict['sma_200'] = SMAIndicator(df['Close'], window=200).sma_indicator()
# Rate of Change
print('Calculating ROC 10...')
features_dict['roc_10'] = ROCIndicator(df['Close'], window=10).roc()
# Momentum (the ta library has no MomentumIndicator class, so compute the 10-period price change directly)
print('Calculating Momentum 10...')
features_dict['momentum_10'] = df['Close'].diff(10)
# Parabolic SAR
print('Calculating Parabolic SAR...')
psar = PSARIndicator(df['High'], df['Low'], df['Close'])
features_dict['psar'] = psar.psar()
features_dict['psar_up'] = psar.psar_up()
features_dict['psar_down'] = psar.psar_down()
# Donchian Channel
print('Calculating Donchian Channel 20...')
donchian = DonchianChannel(df['High'], df['Low'], df['Close'], window=20)
features_dict['donchian_hband'] = donchian.donchian_channel_hband()
features_dict['donchian_lband'] = donchian.donchian_channel_lband()
features_dict['donchian_mband'] = donchian.donchian_channel_mband()
# Keltner Channel
print('Calculating Keltner Channel 20...')
keltner = KeltnerChannel(df['High'], df['Low'], df['Close'], window=20)
features_dict['keltner_hband'] = keltner.keltner_channel_hband()
features_dict['keltner_lband'] = keltner.keltner_channel_lband()
features_dict['keltner_mband'] = keltner.keltner_channel_mband()
# Detrended Price Oscillator
print('Calculating DPO 20...')
features_dict['dpo_20'] = DPOIndicator(df['Close'], window=20).dpo()
# Ultimate Oscillator
print('Calculating Ultimate Oscillator...')
features_dict['ultimate_osc'] = UltimateOscillator(df['High'], df['Low'], df['Close']).ultimate_oscillator()
# Ichimoku
print('Calculating Ichimoku...')
ichimoku = IchimokuIndicator(df['High'], df['Low'], window1=9, window2=26, window3=52)
features_dict['ichimoku_a'] = ichimoku.ichimoku_a()
features_dict['ichimoku_b'] = ichimoku.ichimoku_b()
features_dict['ichimoku_base_line'] = ichimoku.ichimoku_base_line()
features_dict['ichimoku_conversion_line'] = ichimoku.ichimoku_conversion_line()
# Elder Ray Index (Bull Power, Bear Power)
print('Calculating Elder Ray Index...')
ema_13 = ta.trend.EMAIndicator(df['Close'], window=13).ema_indicator()
features_dict['elder_ray_bull'] = df['High'] - ema_13 # Bull Power = High - EMA(13)
features_dict['elder_ray_bear'] = df['Low'] - ema_13 # Bear Power = Low - EMA(13)
# Daily Return
print('Calculating Daily Return...')
features_dict['daily_return'] = DailyReturnIndicator(df['Close']).daily_return()
# Concatenate all new features at once
print('Concatenating all new features to DataFrame...')
features_df = pd.DataFrame(features_dict)
df = pd.concat([df, features_df], axis=1)
# Add Supertrend indicators (custom)
print('Preparing data for Supertrend calculation...')
st_df = df.rename(columns={'High': 'high', 'Low': 'low', 'Close': 'close'})
print('Calculating Supertrend indicators...')
supertrend = Supertrends(st_df)
st_results = supertrend.calculate_supertrend_indicators()
for idx, st in enumerate(st_results):
period = st['params']['period']
multiplier = st['params']['multiplier']
# Skip useless supertrend features
if (period == 10 and multiplier == 1.0) or (period == 11 and multiplier == 2.0):
continue
print(f'Adding Supertrend features: supertrend_{period}_{multiplier} and supertrend_trend_{period}_{multiplier}')
df[f'supertrend_{period}_{multiplier}'] = st['results']['supertrend']
df[f'supertrend_trend_{period}_{multiplier}'] = st['results']['trend']
# Add time features (exclude 'dayofweek')
print('Adding hour feature...')
df['hour'] = df['Timestamp'].dt.hour
# Drop NaNs after all feature engineering
print('Dropping NaNs after feature engineering...')
df = df.dropna().reset_index(drop=True)
# Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
print('Selecting feature columns...')
exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
feature_cols = [col for col in df.columns if col not in exclude_cols]
print('Preparing X and y...')
X = df[feature_cols].values.astype(np.float32)
y = df['log_return'].values.astype(np.float32)
split_idx = int(len(X) * 0.8)
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]
test_timestamps = df['Timestamp'].values[split_idx:]
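# The 80/20 split is positional rather than shuffled; assuming the rows are in chronological
# order, the test period strictly follows the training period, avoiding look-ahead leakage.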
print('Initializing model...')
model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
print('Training model...')
booster = model.train()
print('Training complete.')
if hasattr(model, 'params'):
print("Model hyperparameters:", model.params)
if hasattr(model, 'model') and hasattr(model.model, 'get_score'):
import operator
importances = model.model.get_score(importance_type='weight')
# Map f0, f1, ... to actual feature names
feature_map = {f"f{idx}": name for idx, name in enumerate(feature_cols)}
sorted_importances = sorted(importances.items(), key=operator.itemgetter(1), reverse=True)
print('Feature importances (sorted, with names):')
for feat, score in sorted_importances:
print(f'{feature_map.get(feat, feat)}: {score}')
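# get_score also accepts importance_type='gain', 'cover', 'total_gain' or 'total_cover',
# which often rank features differently from the raw split counts used by 'weight'.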
preds = model.predict(X_test[:5])
print('Predictions for first 5 test samples:', preds)
print('Actual values for first 5 test samples:', y_test[:5])
test_preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, test_preds))
print(f'RMSE on test set: {rmse:.4f}')
np.save('./data/y_test.npy', y_test)
np.save('./data/test_preds.npy', test_preds)
# display_actual_vs_predicted(y_test, test_preds, test_timestamps)
# plot_target_distribution(y_train, y_test)
plot_predicted_vs_actual_log_returns(y_test, test_preds, test_timestamps)
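
Because y_test and test_preds are written to ./data, the log-return plots can be regenerated later without retraining. A minimal sketch under that assumption (timestamps are optional, so the helper falls back to an index-based x-axis):

import numpy as np
from plot_results import plot_predicted_vs_actual_log_returns

# Reload the arrays saved by main.py and rebuild the line and scatter plots offline.
y_test = np.load('./data/y_test.npy')
test_preds = np.load('./data/test_preds.npy')
plot_predicted_vs_actual_log_returns(y_test, test_preds)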

xgboost/plot_results.py (new file, 111 additions)

@@ -0,0 +1,111 @@
import numpy as np
import dash
from dash import dcc, html
import plotly.graph_objs as go
import threading
def display_actual_vs_predicted(y_test, test_preds, timestamps, n_plot=200):
import plotly.offline as pyo
n_plot = min(n_plot, len(y_test))
plot_indices = timestamps[:n_plot]
actual = y_test[:n_plot]
predicted = test_preds[:n_plot]
trace_actual = go.Scatter(x=plot_indices, y=actual, mode='lines', name='Actual')
trace_predicted = go.Scatter(x=plot_indices, y=predicted, mode='lines', name='Predicted')
data = [trace_actual, trace_predicted]
layout = go.Layout(
title='Actual vs. Predicted BTC Close Prices (Test Set)',
xaxis={'title': 'Timestamp'},
yaxis={'title': 'BTC Close Price'},
legend={'x': 0, 'y': 1},
margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
hovermode='closest'
)
fig = go.Figure(data=data, layout=layout)
pyo.plot(fig)
def plot_target_distribution(y_train, y_test):
import plotly.offline as pyo
trace_train = go.Histogram(
x=y_train,
nbinsx=100,
opacity=0.5,
name='Train',
marker=dict(color='blue')
)
trace_test = go.Histogram(
x=y_test,
nbinsx=100,
opacity=0.5,
name='Test',
marker=dict(color='orange')
)
data = [trace_train, trace_test]
layout = go.Layout(
title='Distribution of Target Variable (Log Return)',
xaxis=dict(title='Log Return'),
yaxis=dict(title='Frequency'),
barmode='overlay'
)
fig = go.Figure(data=data, layout=layout)
pyo.plot(fig)
def plot_predicted_vs_actual_log_returns(y_test, test_preds, timestamps=None, n_plot=200):
import plotly.offline as pyo
import plotly.graph_objs as go
n_plot = min(n_plot, len(y_test))
actual = y_test[:n_plot]
predicted = test_preds[:n_plot]
if timestamps is not None:
x_axis = timestamps[:n_plot]
x_label = 'Timestamp'
else:
x_axis = list(range(n_plot))
x_label = 'Index'
# Line plot: Actual vs Predicted over time
trace_actual = go.Scatter(x=x_axis, y=actual, mode='lines', name='Actual')
trace_predicted = go.Scatter(x=x_axis, y=predicted, mode='lines', name='Predicted')
data_line = [trace_actual, trace_predicted]
layout_line = go.Layout(
title='Actual vs. Predicted Log Returns (Test Set)',
xaxis={'title': x_label},
yaxis={'title': 'Log Return'},
legend={'x': 0, 'y': 1},
margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
hovermode='closest'
)
fig_line = go.Figure(data=data_line, layout=layout_line)
pyo.plot(fig_line, filename='log_return_line_plot.html')
# Scatter plot: Predicted vs Actual
trace_scatter = go.Scatter(
x=actual,
y=predicted,
mode='markers',
name='Predicted vs Actual',
opacity=0.5
)
# Diagonal reference line
min_val = min(np.min(actual), np.min(predicted))
max_val = max(np.max(actual), np.max(predicted))
trace_diag = go.Scatter(
x=[min_val, max_val],
y=[min_val, max_val],
mode='lines',
name='Ideal',
line=dict(dash='dash', color='red')
)
data_scatter = [trace_scatter, trace_diag]
layout_scatter = go.Layout(
title='Predicted vs Actual Log Returns (Scatter)',
xaxis={'title': 'Actual Log Return'},
yaxis={'title': 'Predicted Log Return'},
showlegend=True,
margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
hovermode='closest'
)
fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
pyo.plot(fig_scatter, filename='log_return_scatter_plot.html')
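
A quick way to exercise plot_target_distribution on its own; the Gaussian samples below are purely hypothetical stand-ins for the train/test log-return arrays:

import numpy as np
from plot_results import plot_target_distribution

# Hypothetical log-return-like samples, only meant to drive the overlaid histograms.
rng = np.random.default_rng(1)
y_train = rng.normal(0.0, 0.001, size=50_000)
y_test = rng.normal(0.0, 0.0015, size=10_000)
plot_target_distribution(y_train, y_test)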