From de67b27e37f68cf612031e8c665c2dd11985d220 Mon Sep 17 00:00:00 2001 From: Simon Moisy Date: Thu, 29 May 2025 18:28:53 +0800 Subject: [PATCH] XGBoost first iteration --- xgboost/custom_xgboost.py | 33 +++++ xgboost/main.py | 256 ++++++++++++++++++++++++++++++++++++++ xgboost/plot_results.py | 111 +++++++++++++++++ 3 files changed, 400 insertions(+) create mode 100644 xgboost/custom_xgboost.py create mode 100644 xgboost/main.py create mode 100644 xgboost/plot_results.py diff --git a/xgboost/custom_xgboost.py b/xgboost/custom_xgboost.py new file mode 100644 index 0000000..697dc25 --- /dev/null +++ b/xgboost/custom_xgboost.py @@ -0,0 +1,33 @@ +import xgboost as xgb +import numpy as np + +class CustomXGBoostGPU: + def __init__(self, X_train, X_test, y_train, y_test): + self.X_train = X_train.astype(np.float32) + self.X_test = X_test.astype(np.float32) + self.y_train = y_train.astype(np.float32) + self.y_test = y_test.astype(np.float32) + self.model = None + self.params = None # Will be set during training + + def train(self, **xgb_params): + params = { + 'tree_method': 'hist', + 'device': 'cuda', + 'objective': 'reg:squarederror', + 'eval_metric': 'rmse', + 'verbosity': 1, + } + params.update(xgb_params) + self.params = params # Store params for later access + dtrain = xgb.DMatrix(self.X_train, label=self.y_train) + dtest = xgb.DMatrix(self.X_test, label=self.y_test) + evals = [(dtrain, 'train'), (dtest, 'eval')] + self.model = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10) + return self.model + + def predict(self, X): + if self.model is None: + raise ValueError('Model not trained yet.') + dmatrix = xgb.DMatrix(X.astype(np.float32)) + return self.model.predict(dmatrix) diff --git a/xgboost/main.py b/xgboost/main.py new file mode 100644 index 0000000..8d6f7d1 --- /dev/null +++ b/xgboost/main.py @@ -0,0 +1,256 @@ +import sys +import os +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import pandas as pd +import numpy as np +from sklearn.model_selection import train_test_split +from custom_xgboost import CustomXGBoostGPU +from sklearn.metrics import mean_squared_error +from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns +import ta +from cycles.supertrend import Supertrends +from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator +from ta.momentum import ROCIndicator, KAMAIndicator, UltimateOscillatorIndicator, StochasticOscillator, WilliamsRIndicator +from ta.volatility import KeltnerChannel, DonchianChannel +from ta.others import DailyReturnIndicator + +if __name__ == '__main__': + csv_path = './data/btcusd_1-min_data.csv' + df = pd.read_csv(csv_path) + df = df[df['Volume'] != 0] + + min_date = '2017-06-01' + df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s') + df = df[df['Timestamp'] >= min_date] + + lags = 3 + + print('Calculating log returns as the new target...') + # Calculate log returns as the new target + df['log_return'] = np.log(df['Close'] / df['Close'].shift(1)) + + ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume'] + window_sizes = [5, 15, 30] # in minutes, adjust as needed + + # Collect new features in a dictionary + features_dict = {} + + # Lags + for col in ohlcv_cols: + for lag in range(1, lags + 1): + print(f'Calculating lag feature: {col}_lag{lag}') + features_dict[f'{col}_lag{lag}'] = df[col].shift(lag) + + # Rolling statistics + for col in ohlcv_cols: + for window in window_sizes: + # Skip useless features + if (col == 'Open' and window == 5): + continue # Open_roll_min_5 + if (col == 'High' and window == 5): + continue # High_roll_mean_5, High_roll_min_5, High_roll_max_5 + if (col == 'High' and window == 30): + continue # High_roll_max_30 + if (col == 'Low' and window == 15): + continue # Low_roll_max_15 + print(f'Calculating rolling mean: {col}_roll_mean_{window}') + features_dict[f'{col}_roll_mean_{window}'] = df[col].rolling(window).mean() + print(f'Calculating rolling std: {col}_roll_std_{window}') + features_dict[f'{col}_roll_std_{window}'] = df[col].rolling(window).std() + print(f'Calculating rolling min: {col}_roll_min_{window}') + features_dict[f'{col}_roll_min_{window}'] = df[col].rolling(window).min() + print(f'Calculating rolling max: {col}_roll_max_{window}') + features_dict[f'{col}_roll_max_{window}'] = df[col].rolling(window).max() + + # Log returns for different horizons + for horizon in [5, 15, 30]: + print(f'Calculating log return for horizon {horizon}...') + features_dict[f'log_return_{horizon}'] = np.log(df['Close'] / df['Close'].shift(horizon)) + + # Volatility + for window in window_sizes: + print(f'Calculating volatility for window {window}...') + features_dict[f'volatility_{window}'] = df['log_return'].rolling(window).std() + + # Technical indicators (except Supertrend) + print('Calculating RSI...') + features_dict['rsi'] = ta.momentum.RSIIndicator(df['Close'], window=14).rsi() + + print('Calculating MACD...') + features_dict['macd'] = ta.trend.MACD(df['Close']).macd() + + print('Calculating Bollinger Bands...') + bb = ta.volatility.BollingerBands(close=df['Close'], window=20, window_dev=2) + features_dict['bb_bbm'] = bb.bollinger_mavg() + features_dict['bb_bbh'] = bb.bollinger_hband() + features_dict['bb_bbl'] = bb.bollinger_lband() + features_dict['bb_bb_width'] = features_dict['bb_bbh'] - features_dict['bb_bbl'] + + print('Calculating Stochastic Oscillator...') + stoch = ta.momentum.StochasticOscillator(high=df['High'], low=df['Low'], close=df['Close'], window=14, smooth_window=3) + features_dict['stoch_k'] = stoch.stoch() + features_dict['stoch_d'] = stoch.stoch_signal() + + print('Calculating Average True Range (ATR)...') + atr = ta.volatility.AverageTrueRange(high=df['High'], low=df['Low'], close=df['Close'], window=14) + features_dict['atr'] = atr.average_true_range() + + print('Calculating Commodity Channel Index (CCI)...') + cci = ta.trend.CCIIndicator(high=df['High'], low=df['Low'], close=df['Close'], window=20) + features_dict['cci'] = cci.cci() + + print('Calculating Williams %R...') + willr = ta.momentum.WilliamsRIndicator(high=df['High'], low=df['Low'], close=df['Close'], lbp=14) + features_dict['williams_r'] = willr.williams_r() + + print('Calculating Exponential Moving Average (EMA)...') + ema = ta.trend.EMAIndicator(close=df['Close'], window=14) + features_dict['ema_14'] = ema.ema_indicator() + + print('Calculating On-Balance Volume (OBV)...') + obv = ta.volume.OnBalanceVolumeIndicator(close=df['Close'], volume=df['Volume']) + features_dict['obv'] = obv.on_balance_volume() + + print('Calculating Chaikin Money Flow (CMF)...') + cmf = ta.volume.ChaikinMoneyFlowIndicator(high=df['High'], low=df['Low'], close=df['Close'], volume=df['Volume'], window=20) + features_dict['cmf'] = cmf.chaikin_money_flow() + + # Additional TA indicators + # SMA + print('Calculating SMA 50 and 200...') + features_dict['sma_50'] = SMAIndicator(df['Close'], window=50).sma_indicator() + features_dict['sma_200'] = SMAIndicator(df['Close'], window=200).sma_indicator() + + # Rate of Change + print('Calculating ROC 10...') + features_dict['roc_10'] = ROCIndicator(df['Close'], window=10).roc() + + # Momentum + print('Calculating Momentum 10...') + features_dict['momentum_10'] = ta.momentum.MomentumIndicator(df['Close'], window=10).momentum() + + # Parabolic SAR + print('Calculating Parabolic SAR...') + psar = PSARIndicator(df['High'], df['Low'], df['Close']) + features_dict['psar'] = psar.psar() + features_dict['psar_up'] = psar.psar_up() + features_dict['psar_down'] = psar.psar_down() + + # Donchian Channel + print('Calculating Donchian Channel 20...') + donchian = DonchianChannel(df['High'], df['Low'], df['Close'], window=20) + features_dict['donchian_hband'] = donchian.donchian_channel_hband() + features_dict['donchian_lband'] = donchian.donchian_channel_lband() + features_dict['donchian_mband'] = donchian.donchian_channel_mband() + + # Keltner Channel + print('Calculating Keltner Channel 20...') + keltner = KeltnerChannel(df['High'], df['Low'], df['Close'], window=20) + features_dict['keltner_hband'] = keltner.keltner_channel_hband() + features_dict['keltner_lband'] = keltner.keltner_channel_lband() + features_dict['keltner_mband'] = keltner.keltner_channel_mband() + + # Detrended Price Oscillator + print('Calculating DPO 20...') + features_dict['dpo_20'] = DPOIndicator(df['Close'], window=20).dpo() + + # Ultimate Oscillator + print('Calculating Ultimate Oscillator...') + features_dict['ultimate_osc'] = UltimateOscillatorIndicator(df['High'], df['Low'], df['Close']).ultimate_oscillator() + + # Ichimoku + print('Calculating Ichimoku...') + ichimoku = IchimokuIndicator(df['High'], df['Low'], window1=9, window2=26, window3=52) + features_dict['ichimoku_a'] = ichimoku.ichimoku_a() + features_dict['ichimoku_b'] = ichimoku.ichimoku_b() + features_dict['ichimoku_base_line'] = ichimoku.ichimoku_base_line() + features_dict['ichimoku_conversion_line'] = ichimoku.ichimoku_conversion_line() + + # Elder Ray Index (Bull Power, Bear Power) + print('Calculating Elder Ray Index...') + features_dict['elder_ray_bull'] = ta.trend.EMAIndicator(df['Close'], window=13).ema_indicator() - df['Low'] + features_dict['elder_ray_bear'] = ta.trend.EMAIndicator(df['Close'], window=13).ema_indicator() - df['High'] + + # Pivot Points (Daily) + print('Calculating Daily Pivot Points...') + features_dict['daily_return'] = DailyReturnIndicator(df['Close']).daily_return() + + # Concatenate all new features at once + print('Concatenating all new features to DataFrame...') + features_df = pd.DataFrame(features_dict) + df = pd.concat([df, features_df], axis=1) + + # Add Supertrend indicators (custom) + print('Preparing data for Supertrend calculation...') + st_df = df.rename(columns={'High': 'high', 'Low': 'low', 'Close': 'close'}) + + print('Calculating Supertrend indicators...') + supertrend = Supertrends(st_df) + st_results = supertrend.calculate_supertrend_indicators() + for idx, st in enumerate(st_results): + period = st['params']['period'] + multiplier = st['params']['multiplier'] + # Skip useless supertrend features + if (period == 10 and multiplier == 1.0) or (period == 11 and multiplier == 2.0): + continue + print(f'Adding Supertrend features: supertrend_{period}_{multiplier} and supertrend_trend_{period}_{multiplier}') + df[f'supertrend_{period}_{multiplier}'] = st['results']['supertrend'] + df[f'supertrend_trend_{period}_{multiplier}'] = st['results']['trend'] + + # Add time features (exclude 'dayofweek') + print('Adding hour feature...') + df['hour'] = df['Timestamp'].dt.hour + + # Drop NaNs after all feature engineering + print('Dropping NaNs after feature engineering...') + df = df.dropna().reset_index(drop=True) + + # Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features + print('Selecting feature columns...') + exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30'] + feature_cols = [col for col in df.columns if col not in exclude_cols] + + print('Preparing X and y...') + X = df[feature_cols].values.astype(np.float32) + y = df['log_return'].values.astype(np.float32) + + split_idx = int(len(X) * 0.8) + X_train, X_test = X[:split_idx], X[split_idx:] + y_train, y_test = y[:split_idx], y[split_idx:] + test_timestamps = df['Timestamp'].values[split_idx:] + + print('Initializing model...') + model = CustomXGBoostGPU(X_train, X_test, y_train, y_test) + + print('Training model...') + booster = model.train() + + print('Training complete.') + + if hasattr(model, 'params'): + print("Model hyperparameters:", model.params) + if hasattr(model, 'model') and hasattr(model.model, 'get_score'): + import operator + importances = model.model.get_score(importance_type='weight') + # Map f0, f1, ... to actual feature names + feature_map = {f"f{idx}": name for idx, name in enumerate(feature_cols)} + sorted_importances = sorted(importances.items(), key=operator.itemgetter(1), reverse=True) + print('Feature importances (sorted, with names):') + for feat, score in sorted_importances: + print(f'{feature_map.get(feat, feat)}: {score}') + + preds = model.predict(X_test[:5]) + print('Predictions for first 5 test samples:', preds) + print('Actual values for first 5 test samples:', y_test[:5]) + + test_preds = model.predict(X_test) + rmse = np.sqrt(mean_squared_error(y_test, test_preds)) + print(f'RMSE on test set: {rmse:.4f}') + + np.save('./data/y_test.npy', y_test) + np.save('./data/test_preds.npy', test_preds) + + # display_actual_vs_predicted(y_test, test_preds, test_timestamps) + # plot_target_distribution(y_train, y_test) + + plot_predicted_vs_actual_log_returns(y_test, test_preds, test_timestamps) diff --git a/xgboost/plot_results.py b/xgboost/plot_results.py new file mode 100644 index 0000000..4c9de7a --- /dev/null +++ b/xgboost/plot_results.py @@ -0,0 +1,111 @@ +import numpy as np +import dash +from dash import dcc, html +import plotly.graph_objs as go +import threading + + +def display_actual_vs_predicted(y_test, test_preds, timestamps, n_plot=200): + import plotly.offline as pyo + n_plot = min(n_plot, len(y_test)) + plot_indices = timestamps[:n_plot] + actual = y_test[:n_plot] + predicted = test_preds[:n_plot] + + trace_actual = go.Scatter(x=plot_indices, y=actual, mode='lines', name='Actual') + trace_predicted = go.Scatter(x=plot_indices, y=predicted, mode='lines', name='Predicted') + data = [trace_actual, trace_predicted] + layout = go.Layout( + title='Actual vs. Predicted BTC Close Prices (Test Set)', + xaxis={'title': 'Timestamp'}, + yaxis={'title': 'BTC Close Price'}, + legend={'x': 0, 'y': 1}, + margin={'l': 40, 'b': 40, 't': 40, 'r': 10}, + hovermode='closest' + ) + fig = go.Figure(data=data, layout=layout) + pyo.plot(fig) + +def plot_target_distribution(y_train, y_test): + import plotly.offline as pyo + trace_train = go.Histogram( + x=y_train, + nbinsx=100, + opacity=0.5, + name='Train', + marker=dict(color='blue') + ) + trace_test = go.Histogram( + x=y_test, + nbinsx=100, + opacity=0.5, + name='Test', + marker=dict(color='orange') + ) + data = [trace_train, trace_test] + layout = go.Layout( + title='Distribution of Target Variable (Close Price)', + xaxis=dict(title='BTC Close Price'), + yaxis=dict(title='Frequency'), + barmode='overlay' + ) + fig = go.Figure(data=data, layout=layout) + pyo.plot(fig) + +def plot_predicted_vs_actual_log_returns(y_test, test_preds, timestamps=None, n_plot=200): + import plotly.offline as pyo + import plotly.graph_objs as go + n_plot = min(n_plot, len(y_test)) + actual = y_test[:n_plot] + predicted = test_preds[:n_plot] + if timestamps is not None: + x_axis = timestamps[:n_plot] + x_label = 'Timestamp' + else: + x_axis = list(range(n_plot)) + x_label = 'Index' + + # Line plot: Actual vs Predicted over time + trace_actual = go.Scatter(x=x_axis, y=actual, mode='lines', name='Actual') + trace_predicted = go.Scatter(x=x_axis, y=predicted, mode='lines', name='Predicted') + data_line = [trace_actual, trace_predicted] + layout_line = go.Layout( + title='Actual vs. Predicted Log Returns (Test Set)', + xaxis={'title': x_label}, + yaxis={'title': 'Log Return'}, + legend={'x': 0, 'y': 1}, + margin={'l': 40, 'b': 40, 't': 40, 'r': 10}, + hovermode='closest' + ) + fig_line = go.Figure(data=data_line, layout=layout_line) + pyo.plot(fig_line, filename='log_return_line_plot.html') + + # Scatter plot: Predicted vs Actual + trace_scatter = go.Scatter( + x=actual, + y=predicted, + mode='markers', + name='Predicted vs Actual', + opacity=0.5 + ) + # Diagonal reference line + min_val = min(np.min(actual), np.min(predicted)) + max_val = max(np.max(actual), np.max(predicted)) + trace_diag = go.Scatter( + x=[min_val, max_val], + y=[min_val, max_val], + mode='lines', + name='Ideal', + line=dict(dash='dash', color='red') + ) + data_scatter = [trace_scatter, trace_diag] + layout_scatter = go.Layout( + title='Predicted vs Actual Log Returns (Scatter)', + xaxis={'title': 'Actual Log Return'}, + yaxis={'title': 'Predicted Log Return'}, + showlegend=True, + margin={'l': 40, 'b': 40, 't': 40, 'r': 10}, + hovermode='closest' + ) + fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter) + pyo.plot(fig_scatter, filename='log_return_scatter_plot.html')