From de67b27e37f68cf612031e8c665c2dd11985d220 Mon Sep 17 00:00:00 2001
From: Simon Moisy <simon.moisy@tutanota.com>
Date: Thu, 29 May 2025 18:28:53 +0800
Subject: [PATCH] XGBoost first iteration

---
 xgboost/custom_xgboost.py |  33 +++++
 xgboost/main.py           | 256 ++++++++++++++++++++++++++++++++++++++
 xgboost/plot_results.py   | 111 +++++++++++++++++
 3 files changed, 400 insertions(+)
 create mode 100644 xgboost/custom_xgboost.py
 create mode 100644 xgboost/main.py
 create mode 100644 xgboost/plot_results.py

diff --git a/xgboost/custom_xgboost.py b/xgboost/custom_xgboost.py
new file mode 100644
index 0000000..697dc25
--- /dev/null
+++ b/xgboost/custom_xgboost.py
@@ -0,0 +1,33 @@
+import xgboost as xgb
+import numpy as np
+
+class CustomXGBoostGPU:
+    def __init__(self, X_train, X_test, y_train, y_test):
+        self.X_train = X_train.astype(np.float32)
+        self.X_test = X_test.astype(np.float32)
+        self.y_train = y_train.astype(np.float32)
+        self.y_test = y_test.astype(np.float32)
+        self.model = None
+        self.params = None  # Will be set during training
+
+    def train(self, **xgb_params):
+        params = {
+            'tree_method': 'hist',
+            'device': 'cuda',
+            'objective': 'reg:squarederror',
+            'eval_metric': 'rmse',
+            'verbosity': 1,
+        }
+        params.update(xgb_params)
+        self.params = params  # Store params for later access
+        dtrain = xgb.DMatrix(self.X_train, label=self.y_train)
+        dtest = xgb.DMatrix(self.X_test, label=self.y_test)
+        evals = [(dtrain, 'train'), (dtest, 'eval')]
+        self.model = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10)
+        return self.model
+
+    def predict(self, X):
+        if self.model is None:
+            raise ValueError('Model not trained yet.')
+        dmatrix = xgb.DMatrix(X.astype(np.float32))
+        return self.model.predict(dmatrix)
diff --git a/xgboost/main.py b/xgboost/main.py
new file mode 100644
index 0000000..8d6f7d1
--- /dev/null
+++ b/xgboost/main.py
@@ -0,0 +1,256 @@
+import sys
+import os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from custom_xgboost import CustomXGBoostGPU
+from sklearn.metrics import mean_squared_error
+from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns
+import ta
+from cycles.supertrend import Supertrends
+from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator
+from ta.momentum import ROCIndicator, KAMAIndicator, UltimateOscillatorIndicator, StochasticOscillator, WilliamsRIndicator
+from ta.volatility import KeltnerChannel, DonchianChannel
+from ta.others import DailyReturnIndicator
+
+if __name__ == '__main__':
+    csv_path = './data/btcusd_1-min_data.csv'
+    df = pd.read_csv(csv_path)
+    df = df[df['Volume'] != 0]
+
+    min_date = '2017-06-01'
+    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
+    df = df[df['Timestamp'] >= min_date]
+
+    lags = 3
+
+    print('Calculating log returns as the new target...')
+    # Calculate log returns as the new target
+    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
+
+    ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
+    window_sizes = [5, 15, 30]  # in minutes, adjust as needed
+
+    # Collect new features in a dictionary
+    features_dict = {}
+
+    # Lags
+    for col in ohlcv_cols:
+        for lag in range(1, lags + 1):
+            print(f'Calculating lag feature: {col}_lag{lag}')
+            features_dict[f'{col}_lag{lag}'] = df[col].shift(lag)
+
+    # Rolling statistics
+    for col in ohlcv_cols:
+        for window in window_sizes:
+            # Skip useless features
+            if (col == 'Open' and window == 5):
+                continue  # Open_roll_min_5
+            if (col == 'High' and window == 5):
+                continue  # High_roll_mean_5, High_roll_min_5, High_roll_max_5
+            if (col == 'High' and window == 30):
+                continue  # High_roll_max_30
+            if (col == 'Low' and window == 15):
+                continue  # Low_roll_max_15
+            print(f'Calculating rolling mean: {col}_roll_mean_{window}')
+            features_dict[f'{col}_roll_mean_{window}'] = df[col].rolling(window).mean()
+            print(f'Calculating rolling std: {col}_roll_std_{window}')
+            features_dict[f'{col}_roll_std_{window}'] = df[col].rolling(window).std()
+            print(f'Calculating rolling min: {col}_roll_min_{window}')
+            features_dict[f'{col}_roll_min_{window}'] = df[col].rolling(window).min()
+            print(f'Calculating rolling max: {col}_roll_max_{window}')
+            features_dict[f'{col}_roll_max_{window}'] = df[col].rolling(window).max()
+
+    # Log returns for different horizons
+    for horizon in [5, 15, 30]:
+        print(f'Calculating log return for horizon {horizon}...')
+        features_dict[f'log_return_{horizon}'] = np.log(df['Close'] / df['Close'].shift(horizon))
+
+    # Volatility
+    for window in window_sizes:
+        print(f'Calculating volatility for window {window}...')
+        features_dict[f'volatility_{window}'] = df['log_return'].rolling(window).std()
+
+    # Technical indicators (except Supertrend)
+    print('Calculating RSI...')
+    features_dict['rsi'] = ta.momentum.RSIIndicator(df['Close'], window=14).rsi()
+
+    print('Calculating MACD...')
+    features_dict['macd'] = ta.trend.MACD(df['Close']).macd()
+
+    print('Calculating Bollinger Bands...')
+    bb = ta.volatility.BollingerBands(close=df['Close'], window=20, window_dev=2)
+    features_dict['bb_bbm'] = bb.bollinger_mavg()
+    features_dict['bb_bbh'] = bb.bollinger_hband()
+    features_dict['bb_bbl'] = bb.bollinger_lband()
+    features_dict['bb_bb_width'] = features_dict['bb_bbh'] - features_dict['bb_bbl']
+
+    print('Calculating Stochastic Oscillator...')
+    stoch = ta.momentum.StochasticOscillator(high=df['High'], low=df['Low'], close=df['Close'], window=14, smooth_window=3)
+    features_dict['stoch_k'] = stoch.stoch()
+    features_dict['stoch_d'] = stoch.stoch_signal()
+
+    print('Calculating Average True Range (ATR)...')
+    atr = ta.volatility.AverageTrueRange(high=df['High'], low=df['Low'], close=df['Close'], window=14)
+    features_dict['atr'] = atr.average_true_range()
+
+    print('Calculating Commodity Channel Index (CCI)...')
+    cci = ta.trend.CCIIndicator(high=df['High'], low=df['Low'], close=df['Close'], window=20)
+    features_dict['cci'] = cci.cci()
+
+    print('Calculating Williams %R...')
+    willr = ta.momentum.WilliamsRIndicator(high=df['High'], low=df['Low'], close=df['Close'], lbp=14)
+    features_dict['williams_r'] = willr.williams_r()
+
+    print('Calculating Exponential Moving Average (EMA)...')
+    ema = ta.trend.EMAIndicator(close=df['Close'], window=14)
+    features_dict['ema_14'] = ema.ema_indicator()
+
+    print('Calculating On-Balance Volume (OBV)...')
+    obv = ta.volume.OnBalanceVolumeIndicator(close=df['Close'], volume=df['Volume'])
+    features_dict['obv'] = obv.on_balance_volume()
+
+    print('Calculating Chaikin Money Flow (CMF)...')
+    cmf = ta.volume.ChaikinMoneyFlowIndicator(high=df['High'], low=df['Low'], close=df['Close'], volume=df['Volume'], window=20)
+    features_dict['cmf'] = cmf.chaikin_money_flow()
+
+    # Additional TA indicators
+    # SMA
+    print('Calculating SMA 50 and 200...')
+    features_dict['sma_50'] = SMAIndicator(df['Close'], window=50).sma_indicator()
+    features_dict['sma_200'] = SMAIndicator(df['Close'], window=200).sma_indicator()
+
+    # Rate of Change
+    print('Calculating ROC 10...')
+    features_dict['roc_10'] = ROCIndicator(df['Close'], window=10).roc()
+
+    # Momentum
+    print('Calculating Momentum 10...')
+    features_dict['momentum_10'] = ta.momentum.MomentumIndicator(df['Close'], window=10).momentum()
+
+    # Parabolic SAR
+    print('Calculating Parabolic SAR...')
+    psar = PSARIndicator(df['High'], df['Low'], df['Close'])
+    features_dict['psar'] = psar.psar()
+    features_dict['psar_up'] = psar.psar_up()
+    features_dict['psar_down'] = psar.psar_down()
+
+    # Donchian Channel
+    print('Calculating Donchian Channel 20...')
+    donchian = DonchianChannel(df['High'], df['Low'], df['Close'], window=20)
+    features_dict['donchian_hband'] = donchian.donchian_channel_hband()
+    features_dict['donchian_lband'] = donchian.donchian_channel_lband()
+    features_dict['donchian_mband'] = donchian.donchian_channel_mband()
+
+    # Keltner Channel
+    print('Calculating Keltner Channel 20...')
+    keltner = KeltnerChannel(df['High'], df['Low'], df['Close'], window=20)
+    features_dict['keltner_hband'] = keltner.keltner_channel_hband()
+    features_dict['keltner_lband'] = keltner.keltner_channel_lband()
+    features_dict['keltner_mband'] = keltner.keltner_channel_mband()
+
+    # Detrended Price Oscillator
+    print('Calculating DPO 20...')
+    features_dict['dpo_20'] = DPOIndicator(df['Close'], window=20).dpo()
+
+    # Ultimate Oscillator
+    print('Calculating Ultimate Oscillator...')
+    features_dict['ultimate_osc'] = UltimateOscillatorIndicator(df['High'], df['Low'], df['Close']).ultimate_oscillator()
+
+    # Ichimoku
+    print('Calculating Ichimoku...')
+    ichimoku = IchimokuIndicator(df['High'], df['Low'], window1=9, window2=26, window3=52)
+    features_dict['ichimoku_a'] = ichimoku.ichimoku_a()
+    features_dict['ichimoku_b'] = ichimoku.ichimoku_b()
+    features_dict['ichimoku_base_line'] = ichimoku.ichimoku_base_line()
+    features_dict['ichimoku_conversion_line'] = ichimoku.ichimoku_conversion_line()
+
+    # Elder Ray Index (Bull Power, Bear Power)
+    print('Calculating Elder Ray Index...')
+    features_dict['elder_ray_bull'] = ta.trend.EMAIndicator(df['Close'], window=13).ema_indicator() - df['Low']
+    features_dict['elder_ray_bear'] = ta.trend.EMAIndicator(df['Close'], window=13).ema_indicator() - df['High']
+
+    # Pivot Points (Daily)
+    print('Calculating Daily Pivot Points...')
+    features_dict['daily_return'] = DailyReturnIndicator(df['Close']).daily_return()
+
+    # Concatenate all new features at once
+    print('Concatenating all new features to DataFrame...')
+    features_df = pd.DataFrame(features_dict)
+    df = pd.concat([df, features_df], axis=1)
+
+    # Add Supertrend indicators (custom)
+    print('Preparing data for Supertrend calculation...')
+    st_df = df.rename(columns={'High': 'high', 'Low': 'low', 'Close': 'close'})
+    
+    print('Calculating Supertrend indicators...')
+    supertrend = Supertrends(st_df)
+    st_results = supertrend.calculate_supertrend_indicators()
+    for idx, st in enumerate(st_results):
+        period = st['params']['period']
+        multiplier = st['params']['multiplier']
+        # Skip useless supertrend features
+        if (period == 10 and multiplier == 1.0) or (period == 11 and multiplier == 2.0):
+            continue
+        print(f'Adding Supertrend features: supertrend_{period}_{multiplier} and supertrend_trend_{period}_{multiplier}')
+        df[f'supertrend_{period}_{multiplier}'] = st['results']['supertrend']
+        df[f'supertrend_trend_{period}_{multiplier}'] = st['results']['trend']
+
+    # Add time features (exclude 'dayofweek')
+    print('Adding hour feature...')
+    df['hour'] = df['Timestamp'].dt.hour
+
+    # Drop NaNs after all feature engineering
+    print('Dropping NaNs after feature engineering...')
+    df = df.dropna().reset_index(drop=True)
+
+    # Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
+    print('Selecting feature columns...')
+    exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
+    feature_cols = [col for col in df.columns if col not in exclude_cols]
+
+    print('Preparing X and y...')
+    X = df[feature_cols].values.astype(np.float32)
+    y = df['log_return'].values.astype(np.float32)
+  
+    split_idx = int(len(X) * 0.8)
+    X_train, X_test = X[:split_idx], X[split_idx:]
+    y_train, y_test = y[:split_idx], y[split_idx:]
+    test_timestamps = df['Timestamp'].values[split_idx:]
+
+    print('Initializing model...')
+    model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
+ 
+    print('Training model...')
+    booster = model.train()
+ 
+    print('Training complete.')
+ 
+    if hasattr(model, 'params'):
+        print("Model hyperparameters:", model.params)
+    if hasattr(model, 'model') and hasattr(model.model, 'get_score'):
+        import operator
+        importances = model.model.get_score(importance_type='weight')
+        # Map f0, f1, ... to actual feature names
+        feature_map = {f"f{idx}": name for idx, name in enumerate(feature_cols)}
+        sorted_importances = sorted(importances.items(), key=operator.itemgetter(1), reverse=True)
+        print('Feature importances (sorted, with names):')
+        for feat, score in sorted_importances:
+            print(f'{feature_map.get(feat, feat)}: {score}')
+
+    preds = model.predict(X_test[:5])
+    print('Predictions for first 5 test samples:', preds)
+    print('Actual values for first 5 test samples:', y_test[:5])
+
+    test_preds = model.predict(X_test)
+    rmse = np.sqrt(mean_squared_error(y_test, test_preds))
+    print(f'RMSE on test set: {rmse:.4f}')
+
+    np.save('./data/y_test.npy', y_test)
+    np.save('./data/test_preds.npy', test_preds)
+
+    # display_actual_vs_predicted(y_test, test_preds, test_timestamps)
+    # plot_target_distribution(y_train, y_test)
+
+    plot_predicted_vs_actual_log_returns(y_test, test_preds, test_timestamps)
diff --git a/xgboost/plot_results.py b/xgboost/plot_results.py
new file mode 100644
index 0000000..4c9de7a
--- /dev/null
+++ b/xgboost/plot_results.py
@@ -0,0 +1,111 @@
+import numpy as np
+import dash
+from dash import dcc, html
+import plotly.graph_objs as go
+import threading
+
+
+def display_actual_vs_predicted(y_test, test_preds, timestamps, n_plot=200):
+    import plotly.offline as pyo
+    n_plot = min(n_plot, len(y_test))
+    plot_indices = timestamps[:n_plot]
+    actual = y_test[:n_plot]
+    predicted = test_preds[:n_plot]
+
+    trace_actual = go.Scatter(x=plot_indices, y=actual, mode='lines', name='Actual')
+    trace_predicted = go.Scatter(x=plot_indices, y=predicted, mode='lines', name='Predicted')
+    data = [trace_actual, trace_predicted]
+    layout = go.Layout(
+        title='Actual vs. Predicted BTC Close Prices (Test Set)',
+        xaxis={'title': 'Timestamp'},
+        yaxis={'title': 'BTC Close Price'},
+        legend={'x': 0, 'y': 1},
+        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
+        hovermode='closest'
+    )
+    fig = go.Figure(data=data, layout=layout)
+    pyo.plot(fig)
+
+def plot_target_distribution(y_train, y_test):
+    import plotly.offline as pyo
+    trace_train = go.Histogram(
+        x=y_train,
+        nbinsx=100,
+        opacity=0.5,
+        name='Train',
+        marker=dict(color='blue')
+    )
+    trace_test = go.Histogram(
+        x=y_test,
+        nbinsx=100,
+        opacity=0.5,
+        name='Test',
+        marker=dict(color='orange')
+    )
+    data = [trace_train, trace_test]
+    layout = go.Layout(
+        title='Distribution of Target Variable (Close Price)',
+        xaxis=dict(title='BTC Close Price'),
+        yaxis=dict(title='Frequency'),
+        barmode='overlay'
+    )
+    fig = go.Figure(data=data, layout=layout)
+    pyo.plot(fig)
+
+def plot_predicted_vs_actual_log_returns(y_test, test_preds, timestamps=None, n_plot=200):
+    import plotly.offline as pyo
+    import plotly.graph_objs as go
+    n_plot = min(n_plot, len(y_test))
+    actual = y_test[:n_plot]
+    predicted = test_preds[:n_plot]
+    if timestamps is not None:
+        x_axis = timestamps[:n_plot]
+        x_label = 'Timestamp'
+    else:
+        x_axis = list(range(n_plot))
+        x_label = 'Index'
+
+    # Line plot: Actual vs Predicted over time
+    trace_actual = go.Scatter(x=x_axis, y=actual, mode='lines', name='Actual')
+    trace_predicted = go.Scatter(x=x_axis, y=predicted, mode='lines', name='Predicted')
+    data_line = [trace_actual, trace_predicted]
+    layout_line = go.Layout(
+        title='Actual vs. Predicted Log Returns (Test Set)',
+        xaxis={'title': x_label},
+        yaxis={'title': 'Log Return'},
+        legend={'x': 0, 'y': 1},
+        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
+        hovermode='closest'
+    )
+    fig_line = go.Figure(data=data_line, layout=layout_line)
+    pyo.plot(fig_line, filename='log_return_line_plot.html')
+
+    # Scatter plot: Predicted vs Actual
+    trace_scatter = go.Scatter(
+        x=actual,
+        y=predicted,
+        mode='markers',
+        name='Predicted vs Actual',
+        opacity=0.5
+    )
+    # Diagonal reference line
+    min_val = min(np.min(actual), np.min(predicted))
+    max_val = max(np.max(actual), np.max(predicted))
+    trace_diag = go.Scatter(
+        x=[min_val, max_val],
+        y=[min_val, max_val],
+        mode='lines',
+        name='Ideal',
+        line=dict(dash='dash', color='red')
+    )
+    data_scatter = [trace_scatter, trace_diag]
+    layout_scatter = go.Layout(
+        title='Predicted vs Actual Log Returns (Scatter)',
+        xaxis={'title': 'Actual Log Return'},
+        yaxis={'title': 'Predicted Log Return'},
+        showlegend=True,
+        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
+        hovermode='closest'
+    )
+    fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
+    pyo.plot(fig_scatter, filename='log_return_scatter_plot.html')