gitignore updated, model file

model updated
Added mode indicators, still WIP
2025-05-30 12:31:20 +08:00 · 2025-05-30 12:29:37 +08:00 · 2025-05-29 12:45:45 +08:00 · 2025-05-29 18:28:53 +08:00 · 2025-05-29 11:04:03 +08:00 · 2025-05-28 02:50:40 +08:00
8 changed files with 1086 additions and 249 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,11 +1,12 @@
 # ---> Python
-*.json
+/credentials/*.json
 *.csv
 *.png
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class
 /data/*.npy
 # C extensions
 *.so
--- a/cycles/backtest.py
+++ b/cycles/backtest.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
 import time
 from cycles.supertrend import Supertrends
 from cycles.market_fees import MarketFees
@@ -27,6 +28,9 @@ class Backtest:
        trends_arr = np.stack(trends, axis=1)
        meta_trend = np.where((trends_arr[:,0] == trends_arr[:,1]) & (trends_arr[:,1] == trends_arr[:,2]), 
                                trends_arr[:,0], 0)
        # Shift meta_trend by one to avoid lookahead bias
        meta_trend_signal = np.roll(meta_trend, 1)
        meta_trend_signal[0] = 0  # or np.nan, but 0 means 'no signal' for first bar
        position = 0  # 0 = no position, 1 = long
        entry_price = 0
@@ -39,14 +43,22 @@ class Backtest:
        entry_time = None
        current_trade_min1_start_idx = None
-        min1_df['timestamp'] = pd.to_datetime(min1_df.index)
+        min1_df.index = pd.to_datetime(min1_df.index)
        min1_timestamps = min1_df.index.values
        last_print_time = time.time()
        for i in range(1, len(_df)):
            current_time = time.time()
            if current_time - last_print_time >= 5:
                progress = (i / len(_df)) * 100
                print(f"\rProgress: {progress:.1f}%", end="", flush=True)
                last_print_time = current_time
            price_open = _df['open'].iloc[i]
            price_close = _df['close'].iloc[i]
            date = _df['timestamp'].iloc[i]
-            prev_mt = meta_trend[i-1]
+            prev_mt = meta_trend_signal[i-1]
-            curr_mt = meta_trend[i]
+            curr_mt = meta_trend_signal[i]
            # Check stop loss if in position
            if position == 1:
@@ -87,6 +99,8 @@ class Backtest:
            drawdown = (max_balance - balance) / max_balance
            drawdowns.append(drawdown)
        print("\rProgress: 100%\r\n", end="", flush=True)
        # If still in position at end, sell at last close
        if position == 1:
            exit_result = Backtest.handle_exit(coin, _df['close'].iloc[-1], entry_price, entry_time, _df['timestamp'].iloc[-1])
--- a/cycles/supertrend.py
+++ b/cycles/supertrend.py
@@ -1,70 +1,30 @@
 import pandas as pd
 import numpy as np
 import logging
 from scipy.signal import find_peaks
 from matplotlib.patches import Rectangle
 from scipy import stats
 import concurrent.futures
 from functools import partial
 from functools import lru_cache
 import matplotlib.pyplot as plt
 # Color configuration
 # Plot colors
 DARK_BG_COLOR = '#181C27'
 LEGEND_BG_COLOR = '#333333'
 TITLE_COLOR = 'white'
 AXIS_LABEL_COLOR = 'white'
 # Candlestick colors
 CANDLE_UP_COLOR = '#089981'  # Green
 CANDLE_DOWN_COLOR = '#F23645'  # Red
 # Marker colors
 MIN_COLOR = 'red'
 MAX_COLOR = 'green'
 # Line style colors
 MIN_LINE_STYLE = 'g--'  # Green dashed
 MAX_LINE_STYLE = 'r--'  # Red dashed
 SMA7_LINE_STYLE = 'y-'  # Yellow solid
 SMA15_LINE_STYLE = 'm-'  # Magenta solid
 # SuperTrend colors
 ST_COLOR_UP = 'g-'
 ST_COLOR_DOWN = 'r-'
 # Cache the calculation results by function parameters
@lru_cache(maxsize=32)
 def cached_supertrend_calculation(period, multiplier, data_tuple):
    # Convert tuple back to numpy arrays
    high = np.array(data_tuple[0])
    low = np.array(data_tuple[1])
    close = np.array(data_tuple[2])
    # Calculate TR and ATR using vectorized operations
    tr = np.zeros_like(close)
    tr[0] = high[0] - low[0]
    hc_range = np.abs(high[1:] - close[:-1])
    lc_range = np.abs(low[1:] - close[:-1])
    hl_range = high[1:] - low[1:]
    tr[1:] = np.maximum.reduce([hl_range, hc_range, lc_range])
    # Use numpy's exponential moving average
    atr = np.zeros_like(tr)
    atr[0] = tr[0]
    multiplier_ema = 2.0 / (period + 1)
    for i in range(1, len(tr)):
        atr[i] = (tr[i] * multiplier_ema) + (atr[i-1] * (1 - multiplier_ema))
    # Calculate bands
    upper_band = np.zeros_like(close)
    lower_band = np.zeros_like(close)
    for i in range(len(close)):
        hl_avg = (high[i] + low[i]) / 2
        upper_band[i] = hl_avg + (multiplier * atr[i])
        lower_band[i] = hl_avg - (multiplier * atr[i])
    final_upper = np.zeros_like(close)
    final_lower = np.zeros_like(close)
    supertrend = np.zeros_like(close)
@@ -106,76 +66,18 @@ def cached_supertrend_calculation(period, multiplier, data_tuple):
    }
 def calculate_supertrend_external(data, period, multiplier):
    # Convert DataFrame columns to hashable tuples
    high_tuple = tuple(data['high'])
    low_tuple = tuple(data['low'])
    close_tuple = tuple(data['close'])
    # Call the cached function
    return cached_supertrend_calculation(period, multiplier, (high_tuple, low_tuple, close_tuple))
 class Supertrends:
    def __init__(self, data, verbose=False, display=False):
        """
        Initialize the TrendDetectorSimple class.
        Parameters:
        - data: pandas DataFrame containing price data
        - verbose: boolean, whether to display detailed logging information
        - display: boolean, whether to enable display/plotting features
        """
        self.data = data
        self.verbose = verbose
        self.display = display
        # Only define display-related variables if display is True
        if self.display:
            # Plot style configuration
            self.plot_style = 'dark_background' 
            self.bg_color = DARK_BG_COLOR
            self.plot_size = (12, 8)
            # Candlestick configuration
            self.candle_width = 0.6
            self.candle_up_color = CANDLE_UP_COLOR
            self.candle_down_color = CANDLE_DOWN_COLOR
            self.candle_alpha = 0.8
            self.wick_width = 1
            # Marker configuration
            self.min_marker = '^'
            self.min_color = MIN_COLOR
            self.min_size = 100
            self.max_marker = 'v'
            self.max_color = MAX_COLOR
            self.max_size = 100
            self.marker_zorder = 100
            # Line configuration
            self.line_width = 1
            self.min_line_style = MIN_LINE_STYLE
            self.max_line_style = MAX_LINE_STYLE
            self.sma7_line_style = SMA7_LINE_STYLE
            self.sma15_line_style = SMA15_LINE_STYLE
            # Text configuration
            self.title_size = 14
            self.title_color = TITLE_COLOR
            self.axis_label_size = 12
            self.axis_label_color = AXIS_LABEL_COLOR
            # Legend configuration
            self.legend_loc = 'best'
            self.legend_bg_color = LEGEND_BG_COLOR
        # Configure logging
        logging.basicConfig(level=logging.INFO if verbose else logging.WARNING,
                           format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger('TrendDetectorSimple')
        # Convert data to pandas DataFrame if it's not already
        if not isinstance(self.data, pd.DataFrame):
            if isinstance(self.data, list):
                self.data = pd.DataFrame({'close': self.data})
@@ -183,154 +85,101 @@ class Supertrends:
                raise ValueError("Data must be a pandas DataFrame or a list")
    def calculate_tr(self):
        df = self.data.copy()
        high = df['high'].values
        low = df['low'].values
        close = df['close'].values
        tr = np.zeros_like(close)
        tr[0] = high[0] - low[0]
        for i in range(1, len(close)):
            hl_range = high[i] - low[i]
            hc_range = abs(high[i] - close[i-1])
            lc_range = abs(low[i] - close[i-1])
            tr[i] = max(hl_range, hc_range, lc_range)
        return tr
    def calculate_atr(self, period=14):
        tr = self.calculate_tr()
        atr = np.zeros_like(tr)
        atr[0] = tr[0]
        multiplier = 2.0 / (period + 1)
        for i in range(1, len(tr)):
            atr[i] = (tr[i] * multiplier) + (atr[i-1] * (1 - multiplier))
        return atr
    def calculate_supertrend(self, period=10, multiplier=3.0):
        """
-        Calculate True Range (TR) for the price data.
+        Calculate SuperTrend indicator for the price data.
-        
+        SuperTrend is a trend-following indicator that uses ATR to determine the trend direction.
-        True Range is the greatest of:
+        Parameters:
-        1. Current high - current low
+        - period: int, the period for the ATR calculation (default: 10)
-        2. |Current high - previous close|
+        - multiplier: float, the multiplier for the ATR (default: 3.0)
        3. |Current low - previous close|
        Returns:
-        - Numpy array of TR values
+        - Dictionary containing SuperTrend values, trend direction, and upper/lower bands
        """
        df = self.data.copy()
        high = df['high'].values
        low = df['low'].values
        close = df['close'].values
-        
+        atr = self.calculate_atr(period)
-        tr = np.zeros_like(close)
+        upper_band = np.zeros_like(close)
-        tr[0] = high[0] - low[0]  # First TR is just the first day's range
+        lower_band = np.zeros_like(close)
-        
+        for i in range(len(close)):
            hl_avg = (high[i] + low[i]) / 2
            upper_band[i] = hl_avg + (multiplier * atr[i])
            lower_band[i] = hl_avg - (multiplier * atr[i])
        final_upper = np.zeros_like(close)
        final_lower = np.zeros_like(close)
        supertrend = np.zeros_like(close)
        trend = np.zeros_like(close)
        final_upper[0] = upper_band[0]
        final_lower[0] = lower_band[0]
        if close[0] <= upper_band[0]:
            supertrend[0] = upper_band[0]
            trend[0] = -1
        else:
            supertrend[0] = lower_band[0]
            trend[0] = 1
        for i in range(1, len(close)):
-            # Current high - current low
+            if (upper_band[i] < final_upper[i-1]) or (close[i-1] > final_upper[i-1]):
-            hl_range = high[i] - low[i]
+                final_upper[i] = upper_band[i]
-            # |Current high - previous close|
+            else:
-            hc_range = abs(high[i] - close[i-1])
+                final_upper[i] = final_upper[i-1]
-            # |Current low - previous close|
+            if (lower_band[i] > final_lower[i-1]) or (close[i-1] < final_lower[i-1]):
-            lc_range = abs(low[i] - close[i-1])
+                final_lower[i] = lower_band[i]
-            
+            else:
-            # TR is the maximum of these three values
+                final_lower[i] = final_lower[i-1]
-            tr[i] = max(hl_range, hc_range, lc_range)
+            if supertrend[i-1] == final_upper[i-1] and close[i] <= final_upper[i]:
-            
+                supertrend[i] = final_upper[i]
-        return tr
+                trend[i] = -1
-    
+            elif supertrend[i-1] == final_upper[i-1] and close[i] > final_upper[i]:
-    def calculate_atr(self, period=14):
+                supertrend[i] = final_lower[i]
-        """
+                trend[i] = 1
-        Calculate Average True Range (ATR) for the price data.
+            elif supertrend[i-1] == final_lower[i-1] and close[i] >= final_lower[i]:
-        
+                supertrend[i] = final_lower[i]
-        ATR is the exponential moving average of the True Range over a specified period.
+                trend[i] = 1
-        
+            elif supertrend[i-1] == final_lower[i-1] and close[i] < final_lower[i]:
-        Parameters:
+                supertrend[i] = final_upper[i]
-        - period: int, the period for the ATR calculation (default: 14)
+                trend[i] = -1
-        
+        supertrend_results = {
-        Returns:
+            'supertrend': supertrend,
-        - Numpy array of ATR values
+            'trend': trend,
-        """
+            'upper_band': final_upper,
-        
+            'lower_band': final_lower
-        tr = self.calculate_tr()
+        }
-        atr = np.zeros_like(tr)
+        return supertrend_results
        # First ATR value is just the first TR
        atr[0] = tr[0]
        # Calculate exponential moving average (EMA) of TR
        multiplier = 2.0 / (period + 1)
        for i in range(1, len(tr)):
            atr[i] = (tr[i] * multiplier) + (atr[i-1] * (1 - multiplier))
        return atr
    def detect_trends(self):
        """
        Detect trends by identifying local minima and maxima in the price data
        using scipy.signal.find_peaks.
        Parameters:
        - prominence: float, required prominence of peaks (relative to the price range)
        - width: int, required width of peaks in data points
        Returns:
        - DataFrame with columns for timestamps, prices, and trend indicators
        - Dictionary containing analysis results including linear regression, SMAs, and SuperTrend indicators
        """
        df = self.data
        # close_prices = df['close'].values
        # max_peaks, _ = find_peaks(close_prices)
        # min_peaks, _ = find_peaks(-close_prices)
        # df['is_min'] = False
        # df['is_max'] = False
        # for peak in max_peaks:
        #     df.at[peak, 'is_max'] = True
        # for peak in min_peaks:
        #     df.at[peak, 'is_min'] = True
        # result = df[['timestamp', 'close', 'is_min', 'is_max']].copy()
        # Perform linear regression on min_peaks and max_peaks
        # min_prices = df['close'].iloc[min_peaks].values
        # max_prices = df['close'].iloc[max_peaks].values
        # Linear regression for min peaks if we have at least 2 points
        # min_slope, min_intercept, min_r_value, _, _ = stats.linregress(min_peaks, min_prices)
        # Linear regression for max peaks if we have at least 2 points
        # max_slope, max_intercept, max_r_value, _, _ = stats.linregress(max_peaks, max_prices)
        # Calculate Simple Moving Averages (SMA) for 7 and 15 periods        
        # sma_7 = pd.Series(close_prices).rolling(window=7, min_periods=1).mean().values
        # sma_15 = pd.Series(close_prices).rolling(window=15, min_periods=1).mean().values
        analysis_results = {}
        # analysis_results['linear_regression'] = {
        #     'min': {
        #         'slope': min_slope,
        #         'intercept': min_intercept,
        #         'r_squared': min_r_value ** 2
        #     },
        #     'max': {
        #         'slope': max_slope,
        #         'intercept': max_intercept,
        #         'r_squared': max_r_value ** 2
        #     }
        # }
        # analysis_results['sma'] = {
        #     '7': sma_7,
        #     '15': sma_15
        # }
        # Calculate SuperTrend indicators
        supertrend_results_list = self._calculate_supertrend_indicators()
        analysis_results['supertrend'] = supertrend_results_list
        return analysis_results
    def calculate_supertrend_indicators(self):
        """
        Calculate SuperTrend indicators with different parameter sets in parallel.
        Returns:
        - list, the SuperTrend results
        """
        supertrend_params = [
-            {"period": 12, "multiplier": 3.0, "color_up": ST_COLOR_UP, "color_down": ST_COLOR_DOWN},
+            {"period": 12, "multiplier": 3.0},
-            {"period": 10, "multiplier": 1.0, "color_up": ST_COLOR_UP, "color_down": ST_COLOR_DOWN},
+            {"period": 10, "multiplier": 1.0},
-            {"period": 11, "multiplier": 2.0, "color_up": ST_COLOR_UP, "color_down": ST_COLOR_DOWN}
+            {"period": 11, "multiplier": 2.0}
        ]
        data = self.data.copy()
        # For just 3 calculations, direct calculation might be faster than process pool
        results = []
        for p in supertrend_params:
-            result = calculate_supertrend_external(data, p["period"], p["multiplier"])
+            result = self.calculate_supertrend(period=p["period"], multiplier=p["multiplier"])
-            results.append(result)
+            results.append({
        supertrend_results_list = []
        for params, result in zip(supertrend_params, results):
            supertrend_results_list.append({
                "results": result,
-                "params": params
+                "params": p
            })
-        return supertrend_results_list
+        return results
--- a/data/xgboost_model.json
+++ b/data/xgboost_model.json
--- a/main.py
+++ b/main.py
@@ -6,7 +6,6 @@ import os
 import datetime
 import argparse
 import json
 import ast
 from cycles.utils.storage import Storage
 from cycles.utils.system import SystemUtils
@@ -48,6 +47,7 @@ def process_timeframe_data(min1_df, df, stop_loss_pcts, rule_name, initial_usd,
        cumulative_profit = 0
        max_drawdown = 0
        peak = 0
        for trade in trades:
            cumulative_profit += trade['profit_pct']
            if cumulative_profit > peak:
@@ -55,10 +55,14 @@ def process_timeframe_data(min1_df, df, stop_loss_pcts, rule_name, initial_usd,
            drawdown = peak - cumulative_profit
            if drawdown > max_drawdown:
                max_drawdown = drawdown
        final_usd = initial_usd
        for trade in trades:
            final_usd *= (1 + trade['profit_pct'])
-        total_fees_usd = sum(trade.get('fee_usd', 0.0) for trade in trades)
+
        total_fees_usd = sum(trade['fee_usd'] for trade in trades)
        row = {
            "timeframe": rule_name,
            "stop_loss_pct": stop_loss_pct,
@@ -75,6 +79,7 @@ def process_timeframe_data(min1_df, df, stop_loss_pcts, rule_name, initial_usd,
            "total_fees_usd": total_fees_usd,
        }
        results_rows.append(row)
        for trade in trades:
            trade_rows.append({
                "timeframe": rule_name,
@@ -87,7 +92,9 @@ def process_timeframe_data(min1_df, df, stop_loss_pcts, rule_name, initial_usd,
                "type": trade.get("type"),
                "fee_usd": trade.get("fee_usd"),
            })
        logging.info(f"Timeframe: {rule_name}, Stop Loss: {stop_loss_pct}, Trades: {n_trades}")
        if debug:
            for trade in trades:
                if trade['type'] == 'STOP':
@@ -95,13 +102,16 @@ def process_timeframe_data(min1_df, df, stop_loss_pcts, rule_name, initial_usd,
            for trade in trades:
                if trade['profit_pct'] < -0.09:  # or whatever is close to -0.10
                    print("Large loss trade:", trade)
    return results_rows, trade_rows
 def process(timeframe_info, debug=False):
-    """Process a single (timeframe, stop_loss_pct) combination (no monthly split)"""
+    from cycles.utils.storage import Storage  # import inside function for safety
    storage = Storage(logging=None)  # or pass a logger if you want, but None is safest for multiprocessing
    rule, data_1min, stop_loss_pct, initial_usd = timeframe_info
-    if rule == "1T":
+    if rule == "1T" or rule == "1min":
        df = data_1min.copy()
    else:
        df = data_1min.resample(rule).agg({
@@ -112,7 +122,33 @@ def process(timeframe_info, debug=False):
            'volume': 'sum'
        }).dropna()
    df = df.reset_index()
    results_rows, all_trade_rows = process_timeframe_data(data_1min, df, [stop_loss_pct], rule, initial_usd, debug=debug)
    if all_trade_rows:
        trades_fieldnames = ["entry_time", "exit_time", "entry_price", "exit_price", "profit_pct", "type", "fee_usd"]
        # Prepare header
        summary_fields = ["timeframe", "stop_loss_pct", "n_trades", "n_stop_loss", "win_rate", "max_drawdown", "avg_trade", "profit_ratio", "final_usd"]
        summary_row = results_rows[0]
        header_line = "\t".join(summary_fields) + "\n"
        value_line = "\t".join(str(summary_row.get(f, "")) for f in summary_fields) + "\n"
        # File name
        tf = summary_row["timeframe"]
        sl = summary_row["stop_loss_pct"]
        sl_percent = int(round(sl * 100))
        trades_filename = os.path.join(storage.results_dir, f"trades_{tf}_ST{sl_percent}pct.csv")
        # Write header
        with open(trades_filename, "w") as f:
            f.write(header_line)
            f.write(value_line)
        # Now write trades (append mode, skip header)
        with open(trades_filename, "a", newline="") as f:
            import csv
            writer = csv.DictWriter(f, fieldnames=trades_fieldnames)
            writer.writeheader()
            for trade in all_trade_rows:
                writer.writerow({k: trade.get(k, "") for k in trades_fieldnames})
    return results_rows, all_trade_rows
 def aggregate_results(all_rows):
@@ -126,7 +162,6 @@ def aggregate_results(all_rows):
    summary_rows = []
    for (rule, stop_loss_pct), rows in grouped.items():
        n_months = len(rows)
        total_trades = sum(r['n_trades'] for r in rows)
        total_stop_loss = sum(r['n_stop_loss'] for r in rows)
        avg_win_rate = np.mean([r['win_rate'] for r in rows])
@@ -163,7 +198,7 @@ def get_nearest_price(df, target_date):
        return nearest_time, price
 if __name__ == "__main__":
-    debug = True
+    debug = False
    parser = argparse.ArgumentParser(description="Run backtest with config file.")
    parser.add_argument("config", type=str, nargs="?", help="Path to config JSON file.")
@@ -171,11 +206,11 @@ if __name__ == "__main__":
    # Default values (from config.json)
    default_config = {
-        "start_date": "2024-05-15",
+        "start_date": "2025-05-01",
        "stop_date": datetime.datetime.today().strftime('%Y-%m-%d'),
        "initial_usd": 10000,
-        "timeframes": ["1D"],
+        "timeframes": ["1D", "6h", "3h", "1h", "30m", "15m", "5m", "1m"],
-        "stop_loss_pcts": [0.01, 0.02, 0.03],
+        "stop_loss_pcts": [0.01, 0.02, 0.03, 0.05],
    }
    if args.config:
@@ -238,6 +273,7 @@ if __name__ == "__main__":
    if debug:
        all_results_rows = []
        all_trade_rows = []
        for task in tasks:
            results, trades = process(task, debug)
            if results or trades:
@@ -263,7 +299,4 @@ if __name__ == "__main__":
    ]
    storage.write_backtest_results(backtest_filename, backtest_fieldnames, all_results_rows, metadata_lines)
    trades_fieldnames = ["entry_time", "exit_time", "entry_price", "exit_price", "profit_pct", "type", "fee_usd"]
    storage.write_trades(all_trade_rows, trades_fieldnames)
--- a/xgboost/custom_xgboost.py
+++ b/xgboost/custom_xgboost.py
@@ -0,0 +1,39 @@
 import xgboost as xgb
 import numpy as np
 class CustomXGBoostGPU:
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train.astype(np.float32)
        self.X_test = X_test.astype(np.float32)
        self.y_train = y_train.astype(np.float32)
        self.y_test = y_test.astype(np.float32)
        self.model = None
        self.params = None  # Will be set during training
    def train(self, **xgb_params):
        params = {
            'tree_method': 'hist',
            'device': 'cuda',
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'verbosity': 1,
        }
        params.update(xgb_params)
        self.params = params  # Store params for later access
        dtrain = xgb.DMatrix(self.X_train, label=self.y_train)
        dtest = xgb.DMatrix(self.X_test, label=self.y_test)
        evals = [(dtrain, 'train'), (dtest, 'eval')]
        self.model = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10)
        return self.model
    def predict(self, X):
        if self.model is None:
            raise ValueError('Model not trained yet.')
        dmatrix = xgb.DMatrix(X.astype(np.float32))
        return self.model.predict(dmatrix)
    def save_model(self, file_path):
        """Save the trained XGBoost model to the specified file path."""
        if self.model is None:
            raise ValueError('Model not trained yet.')
        self.model.save_model(file_path)
--- a/xgboost/main.py
+++ b/xgboost/main.py
@@ -0,0 +1,731 @@
 import sys
 import os
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 import pandas as pd
 import numpy as np
 from sklearn.model_selection import train_test_split
 from custom_xgboost import CustomXGBoostGPU
 from sklearn.metrics import mean_squared_error
 from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns, plot_predicted_vs_actual_prices
 import ta
 from cycles.supertrend import Supertrends
 from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator
 from ta.momentum import ROCIndicator, KAMAIndicator, UltimateOscillator, StochasticOscillator, WilliamsRIndicator
 from ta.volatility import KeltnerChannel, DonchianChannel
 from ta.others import DailyReturnIndicator
 import time
 from numba import njit
 def run_indicator(func, *args):
    return func(*args)
 def run_indicator_job(job):
    import time
    func, *args = job
    indicator_name = func.__name__
    start = time.time()
    result = func(*args)
    elapsed = time.time() - start
    print(f'Indicator {indicator_name} computed in {elapsed:.4f} seconds')
    return result
 def calc_rsi(close):
    from ta.momentum import RSIIndicator
    return ('rsi', RSIIndicator(close, window=14).rsi())
 def calc_macd(close):
    from ta.trend import MACD
    return ('macd', MACD(close).macd())
 def calc_bollinger(close):
    from ta.volatility import BollingerBands
    bb = BollingerBands(close=close, window=20, window_dev=2)
    return [
        ('bb_bbm', bb.bollinger_mavg()),
        ('bb_bbh', bb.bollinger_hband()),
        ('bb_bbl', bb.bollinger_lband()),
        ('bb_bb_width', bb.bollinger_hband() - bb.bollinger_lband())
    ]
 def calc_stochastic(high, low, close):
    from ta.momentum import StochasticOscillator
    stoch = StochasticOscillator(high=high, low=low, close=close, window=14, smooth_window=3)
    return [
        ('stoch_k', stoch.stoch()),
        ('stoch_d', stoch.stoch_signal())
    ]
 def calc_atr(high, low, close):
    from ta.volatility import AverageTrueRange
    atr = AverageTrueRange(high=high, low=low, close=close, window=14)
    return ('atr', atr.average_true_range())
 def calc_cci(high, low, close):
    from ta.trend import CCIIndicator
    cci = CCIIndicator(high=high, low=low, close=close, window=20)
    return ('cci', cci.cci())
 def calc_williamsr(high, low, close):
    from ta.momentum import WilliamsRIndicator
    willr = WilliamsRIndicator(high=high, low=low, close=close, lbp=14)
    return ('williams_r', willr.williams_r())
 def calc_ema(close):
    from ta.trend import EMAIndicator
    ema = EMAIndicator(close=close, window=14)
    return ('ema_14', ema.ema_indicator())
 def calc_obv(close, volume):
    from ta.volume import OnBalanceVolumeIndicator
    obv = OnBalanceVolumeIndicator(close=close, volume=volume)
    return ('obv', obv.on_balance_volume())
 def calc_cmf(high, low, close, volume):
    from ta.volume import ChaikinMoneyFlowIndicator
    cmf = ChaikinMoneyFlowIndicator(high=high, low=low, close=close, volume=volume, window=20)
    return ('cmf', cmf.chaikin_money_flow())
 def calc_sma(close):
    from ta.trend import SMAIndicator
    return [
        ('sma_50', SMAIndicator(close, window=50).sma_indicator()),
        ('sma_200', SMAIndicator(close, window=200).sma_indicator())
    ]
 def calc_roc(close):
    from ta.momentum import ROCIndicator
    return ('roc_10', ROCIndicator(close, window=10).roc())
 def calc_momentum(close):
    return ('momentum_10', close - close.shift(10))
 def calc_psar(high, low, close):
    # Use the Numba-accelerated fast_psar function for speed
    psar_values = fast_psar(np.array(high), np.array(low), np.array(close))
    return [('psar', pd.Series(psar_values, index=close.index))]
 def calc_donchian(high, low, close):
    from ta.volatility import DonchianChannel
    donchian = DonchianChannel(high, low, close, window=20)
    return [
        ('donchian_hband', donchian.donchian_channel_hband()),
        ('donchian_lband', donchian.donchian_channel_lband()),
        ('donchian_mband', donchian.donchian_channel_mband())
    ]
 def calc_keltner(high, low, close):
    from ta.volatility import KeltnerChannel
    keltner = KeltnerChannel(high, low, close, window=20)
    return [
        ('keltner_hband', keltner.keltner_channel_hband()),
        ('keltner_lband', keltner.keltner_channel_lband()),
        ('keltner_mband', keltner.keltner_channel_mband())
    ]
 def calc_dpo(close):
    from ta.trend import DPOIndicator
    return ('dpo_20', DPOIndicator(close, window=20).dpo())
 def calc_ultimate(high, low, close):
    from ta.momentum import UltimateOscillator
    return ('ultimate_osc', UltimateOscillator(high, low, close).ultimate_oscillator())
 def calc_ichimoku(high, low):
    from ta.trend import IchimokuIndicator
    ichimoku = IchimokuIndicator(high, low, window1=9, window2=26, window3=52)
    return [
        ('ichimoku_a', ichimoku.ichimoku_a()),
        ('ichimoku_b', ichimoku.ichimoku_b()),
        ('ichimoku_base_line', ichimoku.ichimoku_base_line()),
        ('ichimoku_conversion_line', ichimoku.ichimoku_conversion_line())
    ]
 def calc_elder_ray(close, low, high):
    from ta.trend import EMAIndicator
    ema = EMAIndicator(close, window=13).ema_indicator()
    return [
        ('elder_ray_bull', ema - low),
        ('elder_ray_bear', ema - high)
    ]
 def calc_daily_return(close):
    from ta.others import DailyReturnIndicator
    return ('daily_return', DailyReturnIndicator(close).daily_return())
@njit
 def fast_psar(high, low, close, af=0.02, max_af=0.2):
    length = len(close)
    psar = np.zeros(length)
    bull = True
    af_step = af
    ep = low[0]
    psar[0] = low[0]
    for i in range(1, length):
        prev_psar = psar[i-1]
        if bull:
            psar[i] = prev_psar + af_step * (ep - prev_psar)
            if low[i] < psar[i]:
                bull = False
                psar[i] = ep
                af_step = af
                ep = low[i]
            else:
                if high[i] > ep:
                    ep = high[i]
                    af_step = min(af_step + af, max_af)
        else:
            psar[i] = prev_psar + af_step * (ep - prev_psar)
            if high[i] > psar[i]:
                bull = True
                psar[i] = ep
                af_step = af
                ep = high[i]
            else:
                if low[i] < ep:
                    ep = low[i]
                    af_step = min(af_step + af, max_af)
    return psar
 def compute_lag(df, col, lag):
    return df[col].shift(lag)
 def compute_rolling(df, col, stat, window):
    if stat == 'mean':
        return df[col].rolling(window).mean()
    elif stat == 'std':
        return df[col].rolling(window).std()
    elif stat == 'min':
        return df[col].rolling(window).min()
    elif stat == 'max':
        return df[col].rolling(window).max()
 def compute_log_return(df, horizon):
    return np.log(df['Close'] / df['Close'].shift(horizon))
 def compute_volatility(df, window):
    return df['log_return'].rolling(window).std()
 def run_feature_job(job, df):
    feature_name, func, *args = job
    print(f'Computing feature: {feature_name}')
    result = func(df, *args)
    return feature_name, result
 if __name__ == '__main__':
    csv_path = './data/btcusd_1-min_data.csv'
    csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]
    print('Reading CSV and filtering data...')
    df = pd.read_csv(csv_path)
    df = df[df['Volume'] != 0]
    min_date = '2017-06-01'
    print('Converting Timestamp and filtering by date...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
    df = df[df['Timestamp'] >= min_date]
    lags = 3
    print('Calculating log returns as the new target...')
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
    ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    window_sizes = [5, 15, 30]  # in minutes, adjust as needed
    features_dict = {}
    print('Starting feature computation...')
    feature_start_time = time.time()
    # --- Technical Indicator Features: Calculate or Load from Cache ---
    print('Calculating or loading technical indicator features...')
    # RSI
    feature_file = f'./data/{csv_prefix}_rsi.npy'
    if os.path.exists(feature_file):
        print(f'A Loading cached feature: {feature_file}')
        arr = np.load(feature_file)
        features_dict['rsi'] = pd.Series(arr, index=df.index)
    else:
        print('Calculating feature: rsi')
        _, values = calc_rsi(df['Close'])
        features_dict['rsi'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')
    # MACD
    feature_file = f'./data/{csv_prefix}_macd.npy'
    if os.path.exists(feature_file):
        print(f'A Loading cached feature: {feature_file}')
        arr = np.load(feature_file)
        features_dict['macd'] = pd.Series(arr, index=df.index)
    else:
        print('Calculating feature: macd')
        _, values = calc_macd(df['Close'])
        features_dict['macd'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')
    # ATR
    feature_file = f'./data/{csv_prefix}_atr.npy'
    if os.path.exists(feature_file):
        print(f'A Loading cached feature: {feature_file}')
        arr = np.load(feature_file)
        features_dict['atr'] = pd.Series(arr, index=df.index)
    else:
        print('Calculating feature: atr')
        _, values = calc_atr(df['High'], df['Low'], df['Close'])
        features_dict['atr'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')
    # CCI
    feature_file = f'./data/{csv_prefix}_cci.npy'
    if os.path.exists(feature_file):
        print(f'A Loading cached feature: {feature_file}')
        arr = np.load(feature_file)
        features_dict['cci'] = pd.Series(arr, index=df.index)
    else:
        print('Calculating feature: cci')
        _, values = calc_cci(df['High'], df['Low'], df['Close'])
        features_dict['cci'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')
    # Williams %R
    feature_file = f'./data/{csv_prefix}_williams_r.npy'
    if os.path.exists(feature_file):
        print(f'A Loading cached feature: {feature_file}')
        arr = np.load(feature_file)
        features_dict['williams_r'] = pd.Series(arr, index=df.index)
    else:
        print('Calculating feature: williams_r')
        _, values = calc_williamsr(df['High'], df['Low'], df['Close'])
        features_dict['williams_r'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')
    # EMA 14
    feature_file = f'./data/{csv_prefix}_ema_14.npy'
    if os.path.exists(feature_file):
        print(f'A Loading cached feature: {feature_file}')
        arr = np.load(feature_file)
        features_dict['ema_14'] = pd.Series(arr, index=df.index)
    else:
        print('Calculating feature: ema_14')
        _, values = calc_ema(df['Close'])
        features_dict['ema_14'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')
    # OBV
    feature_file = f'./data/{csv_prefix}_obv.npy'
    if os.path.exists(feature_file):
        print(f'A Loading cached feature: {feature_file}')
        arr = np.load(feature_file)
        features_dict['obv'] = pd.Series(arr, index=df.index)
    else:
        print('Calculating feature: obv')
        _, values = calc_obv(df['Close'], df['Volume'])
        features_dict['obv'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')
    # CMF
    feature_file = f'./data/{csv_prefix}_cmf.npy'
    if os.path.exists(feature_file):
        print(f'A Loading cached feature: {feature_file}')
        arr = np.load(feature_file)
        features_dict['cmf'] = pd.Series(arr, index=df.index)
    else:
        print('Calculating feature: cmf')
        _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
        features_dict['cmf'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')
    # ROC 10
    feature_file = f'./data/{csv_prefix}_roc_10.npy'
    if os.path.exists(feature_file):
        print(f'A Loading cached feature: {feature_file}')
        arr = np.load(feature_file)
        features_dict['roc_10'] = pd.Series(arr, index=df.index)
    else:
        print('Calculating feature: roc_10')
        _, values = calc_roc(df['Close'])
        features_dict['roc_10'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')
    # DPO 20
    feature_file = f'./data/{csv_prefix}_dpo_20.npy'
    if os.path.exists(feature_file):
        print(f'A Loading cached feature: {feature_file}')
        arr = np.load(feature_file)
        features_dict['dpo_20'] = pd.Series(arr, index=df.index)
    else:
        print('Calculating feature: dpo_20')
        _, values = calc_dpo(df['Close'])
        features_dict['dpo_20'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')
    # Ultimate Oscillator
    feature_file = f'./data/{csv_prefix}_ultimate_osc.npy'
    if os.path.exists(feature_file):
        print(f'A Loading cached feature: {feature_file}')
        arr = np.load(feature_file)
        features_dict['ultimate_osc'] = pd.Series(arr, index=df.index)
    else:
        print('Calculating feature: ultimate_osc')
        _, values = calc_ultimate(df['High'], df['Low'], df['Close'])
        features_dict['ultimate_osc'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')
    # Daily Return
    feature_file = f'./data/{csv_prefix}_daily_return.npy'
    if os.path.exists(feature_file):
        print(f'A Loading cached feature: {feature_file}')
        arr = np.load(feature_file)
        features_dict['daily_return'] = pd.Series(arr, index=df.index)
    else:
        print('Calculating feature: daily_return')
        _, values = calc_daily_return(df['Close'])
        features_dict['daily_return'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')
    # Multi-column indicators
    # Bollinger Bands
    print('Calculating multi-column indicator: bollinger')
    result = calc_bollinger(df['Close'])
    for subname, values in result:
        print(f"Adding subfeature: {subname}")
        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            print(f'B Loading cached feature: {sub_feature_file}')
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')
    # Stochastic Oscillator
    print('Calculating multi-column indicator: stochastic')
    result = calc_stochastic(df['High'], df['Low'], df['Close'])
    for subname, values in result:
        print(f"Adding subfeature: {subname}")
        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            print(f'B Loading cached feature: {sub_feature_file}')
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')
    # SMA
    print('Calculating multi-column indicator: sma')
    result = calc_sma(df['Close'])
    for subname, values in result:
        print(f"Adding subfeature: {subname}")
        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            print(f'B Loading cached feature: {sub_feature_file}')
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')
    # PSAR
    print('Calculating multi-column indicator: psar')
    result = calc_psar(df['High'], df['Low'], df['Close'])
    for subname, values in result:
        print(f"Adding subfeature: {subname}")
        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            print(f'B Loading cached feature: {sub_feature_file}')
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')
    # Donchian Channel
    print('Calculating multi-column indicator: donchian')
    result = calc_donchian(df['High'], df['Low'], df['Close'])
    for subname, values in result:
        print(f"Adding subfeature: {subname}")
        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            print(f'B Loading cached feature: {sub_feature_file}')
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')
    # Keltner Channel
    print('Calculating multi-column indicator: keltner')
    result = calc_keltner(df['High'], df['Low'], df['Close'])
    for subname, values in result:
        print(f"Adding subfeature: {subname}")
        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            print(f'B Loading cached feature: {sub_feature_file}')
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')
    # Ichimoku
    print('Calculating multi-column indicator: ichimoku')
    result = calc_ichimoku(df['High'], df['Low'])
    for subname, values in result:
        print(f"Adding subfeature: {subname}")
        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            print(f'B Loading cached feature: {sub_feature_file}')
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')
    # Elder Ray
    print('Calculating multi-column indicator: elder_ray')
    result = calc_elder_ray(df['Close'], df['Low'], df['High'])
    for subname, values in result:
        print(f"Adding subfeature: {subname}")
        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            print(f'B Loading cached feature: {sub_feature_file}')
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')
    # Prepare jobs for lags, rolling stats, log returns, and volatility
    feature_jobs = []
    # Lags
    for col in ohlcv_cols:
        for lag in range(1, lags + 1):
            feature_name = f'{col}_lag{lag}'
            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
            if os.path.exists(feature_file):
                print(f'C Loading cached feature: {feature_file}')
                features_dict[feature_name] = np.load(feature_file)
            else:
                print(f'Adding lag feature job: {feature_name}')
                feature_jobs.append((feature_name, compute_lag, col, lag))
    # Rolling statistics
    for col in ohlcv_cols:
        for window in window_sizes:
            if (col == 'Open' and window == 5):
                continue
            if (col == 'High' and window == 5):
                continue
            if (col == 'High' and window == 30):
                continue
            if (col == 'Low' and window == 15):
                continue
            for stat in ['mean', 'std', 'min', 'max']:
                feature_name = f'{col}_roll_{stat}_{window}'
                feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
                if os.path.exists(feature_file):
                    print(f'D Loading cached feature: {feature_file}')
                    features_dict[feature_name] = np.load(feature_file)
                else:
                    print(f'Adding rolling stat feature job: {feature_name}')
                    feature_jobs.append((feature_name, compute_rolling, col, stat, window))
    # Log returns for different horizons
    for horizon in [5, 15, 30]:
        feature_name = f'log_return_{horizon}'
        feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
        if os.path.exists(feature_file):
            print(f'E Loading cached feature: {feature_file}')
            features_dict[feature_name] = np.load(feature_file)
        else:
            print(f'Adding log return feature job: {feature_name}')
            feature_jobs.append((feature_name, compute_log_return, horizon))
    # Volatility
    for window in window_sizes:
        feature_name = f'volatility_{window}'
        feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
        if os.path.exists(feature_file):
            print(f'F Loading cached feature: {feature_file}')
            features_dict[feature_name] = np.load(feature_file)
        else:
            print(f'Adding volatility feature job: {feature_name}')
            feature_jobs.append((feature_name, compute_volatility, window))
    # Sequential computation for all non-cached features
    if feature_jobs:
        print(f'Computing {len(feature_jobs)} features sequentially...')
        for job in feature_jobs:
            print(f'Computing feature job: {job[0]}')
            feature_name, result = run_feature_job(job, df)
            features_dict[feature_name] = result
            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
            np.save(feature_file, result.values)
            print(f'Saved computed feature: {feature_file}')
        print('All features computed.')
    else:
        print('All features loaded from cache.')
    # Concatenate all new features at once
    print('Concatenating all new features to DataFrame...')
    features_df = pd.DataFrame(features_dict)
    print("Columns in features_df:", features_df.columns.tolist())
    print("All-NaN columns in features_df:", features_df.columns[features_df.isna().all()].tolist())
    df = pd.concat([df, features_df], axis=1)
    # Print all columns after concatenation
    print("All columns in df after concat:", df.columns.tolist())
    # Downcast all float columns to save memory
    print('Downcasting float columns to save memory...')
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col], downcast='float')
        except Exception:
            pass
    # Drop intermediate features_df to free memory
    print('Dropping intermediate features_df to free memory...')
    del features_df
    import gc
    gc.collect()
    feature_end_time = time.time()
    print(f'Feature computation completed in {feature_end_time - feature_start_time:.2f} seconds.')
    # Add Supertrend indicators (custom)
    print('Preparing data for Supertrend calculation...')
    st_df = df.rename(columns={'High': 'high', 'Low': 'low', 'Close': 'close'})
    print('Calculating Supertrend indicators...')
    supertrend = Supertrends(st_df)
    st_results = supertrend.calculate_supertrend_indicators()
    for idx, st in enumerate(st_results):
        period = st['params']['period']
        multiplier = st['params']['multiplier']
        # Skip useless supertrend features
        if (period == 10 and multiplier == 1.0) or (period == 11 and multiplier == 2.0):
            continue
        print(f'Adding Supertrend features: supertrend_{period}_{multiplier} and supertrend_trend_{period}_{multiplier}')
        df[f'supertrend_{period}_{multiplier}'] = st['results']['supertrend']
        df[f'supertrend_trend_{period}_{multiplier}'] = st['results']['trend']
    # Add time features (exclude 'dayofweek')
    print('Adding hour feature...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    df['hour'] = df['Timestamp'].dt.hour
    # Drop NaNs after all feature engineering
    print('Dropping NaNs after feature engineering...')
    df = df.dropna().reset_index(drop=True)
    # Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
    print('Selecting feature columns...')
    exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    # Print the features used for training
    print("Features used for training:", feature_cols)
    # Drop excluded columns to save memory
    print('Dropping excluded columns to save memory...')
    df = df[feature_cols + ['log_return', 'Timestamp']]
    print('Preparing X and y...')
    X = df[feature_cols].values.astype(np.float32)
    y = df['log_return'].values.astype(np.float32)
    split_idx = int(len(X) * 0.8)
    print(f'Splitting data: {split_idx} train, {len(X) - split_idx} test')
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    test_timestamps = df['Timestamp'].values[split_idx:]
    print('Initializing model...')
    model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
    print('Training model...')
    booster = model.train()
    print('Training complete.')
    # Save the trained model
    model.save_model('./data/xgboost_model.json')
    print('Model saved to ./data/xgboost_model.json')
    if hasattr(model, 'params'):
        print("Model hyperparameters:", model.params)
    if hasattr(model, 'model') and hasattr(model.model, 'get_score'):
        import operator
        importances = model.model.get_score(importance_type='weight')
        # Map f0, f1, ... to actual feature names
        feature_map = {f"f{idx}": name for idx, name in enumerate(feature_cols)}
        sorted_importances = sorted(importances.items(), key=operator.itemgetter(1), reverse=True)
        print('Feature importances (sorted, with names):')
        for feat, score in sorted_importances:
            print(f'{feature_map.get(feat, feat)}: {score}')
    print('Making predictions for first 5 test samples...')
    preds = model.predict(X_test[:5])
    print('Predictions for first 5 test samples:', preds)
    print('Actual values for first 5 test samples:', y_test[:5])
    print('Making predictions for all test samples...')
    test_preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    print(f'RMSE on test set: {rmse:.4f}')
    print('Saving y_test and test_preds to disk...')
    np.save('./data/y_test.npy', y_test)
    np.save('./data/test_preds.npy', test_preds)
    # Reconstruct price series from log returns
    print('Reconstructing price series from log returns...')
    # Get the last available Close price before the test set
    # The DataFrame df has been reset, so use split_idx to get the right row
    if 'Close' in df.columns:
        close_prices = df['Close'].values
    else:
        # Reload original CSV to get Close prices if not present
        close_prices = pd.read_csv(csv_path)['Close'].values
    start_price = close_prices[split_idx]  # This is the price at the split point
    # Actual prices
    actual_prices = [start_price]
    for r in y_test:
        actual_prices.append(actual_prices[-1] * np.exp(r))
    actual_prices = np.array(actual_prices[1:])
    # Predicted prices
    predicted_prices = [start_price]
    for r in test_preds:
        predicted_prices.append(predicted_prices[-1] * np.exp(r))
    predicted_prices = np.array(predicted_prices[1:])
    print('Plotting predicted vs actual prices...')
    plot_predicted_vs_actual_prices(actual_prices, predicted_prices, test_timestamps)
    print("Final features used for training:", feature_cols)
    print("Shape of X:", X.shape)
    print("First row of X:", X[0])
    print("stoch_k in feature_cols?", "stoch_k" in feature_cols)
    if "stoch_k" in feature_cols:
        idx = feature_cols.index("stoch_k")
        print("First 10 values of stoch_k:", X[:10, idx])
--- a/xgboost/plot_results.py
+++ b/xgboost/plot_results.py
@@ -0,0 +1,169 @@
 import numpy as np
 import dash
 from dash import dcc, html
 import plotly.graph_objs as go
 import threading
 def display_actual_vs_predicted(y_test, test_preds, timestamps, n_plot=200):
    import plotly.offline as pyo
    n_plot = min(n_plot, len(y_test))
    plot_indices = timestamps[:n_plot]
    actual = y_test[:n_plot]
    predicted = test_preds[:n_plot]
    trace_actual = go.Scatter(x=plot_indices, y=actual, mode='lines', name='Actual')
    trace_predicted = go.Scatter(x=plot_indices, y=predicted, mode='lines', name='Predicted')
    data = [trace_actual, trace_predicted]
    layout = go.Layout(
        title='Actual vs. Predicted BTC Close Prices (Test Set)',
        xaxis={'title': 'Timestamp'},
        yaxis={'title': 'BTC Close Price'},
        legend={'x': 0, 'y': 1},
        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
        hovermode='closest'
    )
    fig = go.Figure(data=data, layout=layout)
    pyo.plot(fig)
 def plot_target_distribution(y_train, y_test):
    import plotly.offline as pyo
    trace_train = go.Histogram(
        x=y_train,
        nbinsx=100,
        opacity=0.5,
        name='Train',
        marker=dict(color='blue')
    )
    trace_test = go.Histogram(
        x=y_test,
        nbinsx=100,
        opacity=0.5,
        name='Test',
        marker=dict(color='orange')
    )
    data = [trace_train, trace_test]
    layout = go.Layout(
        title='Distribution of Target Variable (Close Price)',
        xaxis=dict(title='BTC Close Price'),
        yaxis=dict(title='Frequency'),
        barmode='overlay'
    )
    fig = go.Figure(data=data, layout=layout)
    pyo.plot(fig)
 def plot_predicted_vs_actual_log_returns(y_test, test_preds, timestamps=None, n_plot=200):
    import plotly.offline as pyo
    import plotly.graph_objs as go
    n_plot = min(n_plot, len(y_test))
    actual = y_test[:n_plot]
    predicted = test_preds[:n_plot]
    if timestamps is not None:
        x_axis = timestamps[:n_plot]
        x_label = 'Timestamp'
    else:
        x_axis = list(range(n_plot))
        x_label = 'Index'
    # Line plot: Actual vs Predicted over time
    trace_actual = go.Scatter(x=x_axis, y=actual, mode='lines', name='Actual')
    trace_predicted = go.Scatter(x=x_axis, y=predicted, mode='lines', name='Predicted')
    data_line = [trace_actual, trace_predicted]
    layout_line = go.Layout(
        title='Actual vs. Predicted Log Returns (Test Set)',
        xaxis={'title': x_label},
        yaxis={'title': 'Log Return'},
        legend={'x': 0, 'y': 1},
        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
        hovermode='closest'
    )
    fig_line = go.Figure(data=data_line, layout=layout_line)
    pyo.plot(fig_line, filename='log_return_line_plot.html')
    # Scatter plot: Predicted vs Actual
    trace_scatter = go.Scatter(
        x=actual,
        y=predicted,
        mode='markers',
        name='Predicted vs Actual',
        opacity=0.5
    )
    # Diagonal reference line
    min_val = min(np.min(actual), np.min(predicted))
    max_val = max(np.max(actual), np.max(predicted))
    trace_diag = go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode='lines',
        name='Ideal',
        line=dict(dash='dash', color='red')
    )
    data_scatter = [trace_scatter, trace_diag]
    layout_scatter = go.Layout(
        title='Predicted vs Actual Log Returns (Scatter)',
        xaxis={'title': 'Actual Log Return'},
        yaxis={'title': 'Predicted Log Return'},
        showlegend=True,
        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
        hovermode='closest'
    )
    fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
    pyo.plot(fig_scatter, filename='log_return_scatter_plot.html')
 def plot_predicted_vs_actual_prices(actual_prices, predicted_prices, timestamps=None, n_plot=200):
    import plotly.offline as pyo
    import plotly.graph_objs as go
    n_plot = min(n_plot, len(actual_prices))
    actual = actual_prices[:n_plot]
    predicted = predicted_prices[:n_plot]
    if timestamps is not None:
        x_axis = timestamps[:n_plot]
        x_label = 'Timestamp'
    else:
        x_axis = list(range(n_plot))
        x_label = 'Index'
    # Line plot: Actual vs Predicted over time
    trace_actual = go.Scatter(x=x_axis, y=actual, mode='lines', name='Actual Price')
    trace_predicted = go.Scatter(x=x_axis, y=predicted, mode='lines', name='Predicted Price')
    data_line = [trace_actual, trace_predicted]
    layout_line = go.Layout(
        title='Actual vs. Predicted BTC Prices (Test Set)',
        xaxis={'title': x_label},
        yaxis={'title': 'BTC Price'},
        legend={'x': 0, 'y': 1},
        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
        hovermode='closest'
    )
    fig_line = go.Figure(data=data_line, layout=layout_line)
    pyo.plot(fig_line, filename='price_line_plot.html')
    # Scatter plot: Predicted vs Actual
    trace_scatter = go.Scatter(
        x=actual,
        y=predicted,
        mode='markers',
        name='Predicted vs Actual',
        opacity=0.5
    )
    # Diagonal reference line
    min_val = min(np.min(actual), np.min(predicted))
    max_val = max(np.max(actual), np.max(predicted))
    trace_diag = go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode='lines',
        name='Ideal',
        line=dict(dash='dash', color='red')
    )
    data_scatter = [trace_scatter, trace_diag]
    layout_scatter = go.Layout(
        title='Predicted vs Actual Prices (Scatter)',
        xaxis={'title': 'Actual Price'},
        yaxis={'title': 'Predicted Price'},
        showlegend=True,
        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
        hovermode='closest'
    )
    fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
    pyo.plot(fig_scatter, filename='price_scatter_plot.html')
Author	SHA1	Message	Date
Simon Moisy	a22914731f	gitignore updated, model file	2025-05-30 12:31:20 +08:00
Simon Moisy	81e4b640a7	model updated	2025-05-30 12:29:37 +08:00
Simon Moisy	2dba88b620	Added mode indicators, still WIP	2025-05-29 12:45:45 +08:00
Simon Moisy	de67b27e37	XGBoost first iteration	2025-05-29 18:28:53 +08:00
Simon Moisy	1284549106	progress print	2025-05-29 11:04:03 +08:00
Simon Moisy	5f03524d6a	never fallback to default values for fee_usd	2025-05-28 02:50:40 +08:00
Simon Moisy	74c8048ed5	shifted one day back on the metatrend to avoid lookahead bias, reverted metatrend calculus to use no cpu optimization for readability	2025-05-27 17:49:55 +08:00