import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import pandas as pd
import numpy as np
from custom_xgboost import CustomXGBoostGPU
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from plot_results import plot_prediction_error_distribution, plot_direction_transition_heatmap
import time
from numba import njit
import csv
from feature_engineering import feature_engineering
from sklearn.feature_selection import VarianceThreshold

charts_dir = 'charts'
if not os.path.exists(charts_dir):
    os.makedirs(charts_dir)
def run_indicator(func, *args):
    return func(*args)


def run_indicator_job(job):
    func, *args = job
    indicator_name = func.__name__
    start = time.time()
    result = func(*args)
    elapsed = time.time() - start
    print(f'Indicator {indicator_name} computed in {elapsed:.4f} seconds')
    return result
if __name__ == '__main__':
    IMPUTE_NANS = True  # Set to True to impute NaNs, False to drop rows with NaNs
    csv_path = '../data/btcusd_1-min_data.csv'
    csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]
    print('Reading CSV and filtering data...')
    df = pd.read_csv(csv_path)
    df = df[df['Volume'] != 0]
    min_date = '2017-06-01'
    print('Converting Timestamp and filtering by date...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
    df = df[df['Timestamp'] >= min_date]
    lags = 3
    print('Calculating log returns as the new target...')
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
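    # Note: the log return r_t = ln(Close_t / Close_{t-1}) is used as the prediction target.
    # It is invertible (Close_t = Close_{t-1} * exp(r_t)), which is how the predicted price
    # series is reconstructed from predicted returns later in this script.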
    ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    window_sizes = [5, 15, 30]  # in minutes, adjust as needed
    features_dict = {}
    print('Starting feature computation...')
    feature_start_time = time.time()
    features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes)
    print('Concatenating all new features to DataFrame...')
    features_df = pd.DataFrame(features_dict)
    df = pd.concat([df, features_df], axis=1)
# feature_cols_for_variance = [col for col in features_df.columns if features_df[col].dtype in [np.float32, np.float64, float, int, np.int32, np.int64]]
# if feature_cols_for_variance:
# selector = VarianceThreshold(threshold=1e-5)
# filtered_features = selector.fit_transform(features_df[feature_cols_for_variance])
# kept_mask = selector.get_support()
# kept_feature_names = [col for col, keep in zip(feature_cols_for_variance, kept_mask) if keep]
# print(f"Features removed by low variance: {[col for col, keep in zip(feature_cols_for_variance, kept_mask) if not keep]}")
# # Only keep the selected features in features_df and df
# features_df = features_df[kept_feature_names]
# for col in feature_cols_for_variance:
# if col not in kept_feature_names:
# df.drop(col, axis=1, inplace=True)
# else:
# print("No numeric features found for variance thresholding.")
# Remove highly correlated features (keep only one from each correlated group)
# corr_matrix = features_df.corr().abs()
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# if to_drop:
# print(f"Features removed due to high correlation: {to_drop}")
# features_df = features_df.drop(columns=to_drop)
# df = df.drop(columns=to_drop)
# else:
# print("No highly correlated features found for removal.")
    print('Downcasting float columns to save memory...')
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col], downcast='float')
        except Exception:
            pass
    # Add time features (exclude 'dayofweek')
    print('Adding hour feature...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    df['hour'] = df['Timestamp'].dt.hour
    # Handle NaNs after all feature engineering
    if IMPUTE_NANS:
        print('Imputing NaNs after feature engineering (using mean imputation)...')
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            df[col] = df[col].fillna(df[col].mean())
        # If you want to impute non-numeric columns differently, add logic here
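        # A possible sketch for non-numeric columns (not enabled here; forward-fill with a
        # mode-based fallback is only one reasonable choice):
        # non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
        # for col in non_numeric_cols:
        #     df[col] = df[col].ffill()
        #     if df[col].isna().any():
        #         df[col] = df[col].fillna(df[col].mode().iloc[0])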
    else:
        print('Dropping NaNs after feature engineering...')
        df = df.dropna().reset_index(drop=True)
# Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
    print('Selecting feature columns...')
    exclude_cols = ['Timestamp', 'Close', 'log_return']
    exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30']
    exclude_cols += ['bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
                     'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband',
                     'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
                     'Open_lag1', 'Open_lag2', 'Open_lag3', 'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2',
                     'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15',
                     'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15',
                     'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30',
                     'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
                     'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30',
                     'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15',
                     'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0']
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    print('Features used for training:', feature_cols)
# from xgboost import XGBRegressor
# from sklearn.model_selection import GridSearchCV
# # Prepare data for grid search
# X = df[feature_cols].values.astype(np.float32)
# y = df["log_return"].values.astype(np.float32)
# split_idx = int(len(X) * 0.8)
# X_train, X_test = X[:split_idx], X[split_idx:]
# y_train, y_test = y[:split_idx], y[split_idx:]
# # Define parameter grid
# param_grid = {
# 'learning_rate': [0.01, 0.05, 0.1],
# 'max_depth': [3, 5, 7],
# 'n_estimators': [100, 200],
# 'subsample': [0.8, 1.0],
# 'colsample_bytree': [0.8, 1.0],
# }
# print('Starting grid search for XGBoost hyperparameters...')
# xgb_model = XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', eval_metric='mae', verbosity=0)
# grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# print('Best parameters found:', grid_search.best_params_)
# # Use best estimator for predictions
# best_model = grid_search.best_estimator_
# test_preds = best_model.predict(X_test)
# rmse = np.sqrt(mean_squared_error(y_test, test_preds))
# # Reconstruct price series from log returns
# if 'Close' in df.columns:
# close_prices = df['Close'].values
# else:
# close_prices = pd.read_csv(csv_path)['Close'].values
# start_price = close_prices[split_idx]
# actual_prices = [start_price]
# for r_ in y_test:
# actual_prices.append(actual_prices[-1] * np.exp(r_))
# actual_prices = np.array(actual_prices[1:])
# predicted_prices = [start_price]
# for r_ in test_preds:
# predicted_prices.append(predicted_prices[-1] * np.exp(r_))
# predicted_prices = np.array(predicted_prices[1:])
# mae = mean_absolute_error(actual_prices, predicted_prices)
# r2 = r2_score(actual_prices, predicted_prices)
# direction_actual = np.sign(np.diff(actual_prices))
# direction_pred = np.sign(np.diff(predicted_prices))
# directional_accuracy = (direction_actual == direction_pred).mean()
# mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100
# print(f'Grid search results: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}, MAPE={mape:.2f}%, DirAcc={directional_accuracy*100:.2f}%')
# plot_prefix = f'all_features_gridsearch'
# plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
# sys.exit(0)
# Prepare CSV for results
    results_csv = '../data/cumulative_feature_results.csv'
    if not os.path.exists(results_csv):
        with open(results_csv, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['feature', 'rmse', 'mape', 'r2', 'directional_accuracy', 'feature_importance'])
    try:
        X = df[feature_cols].values.astype(np.float32)
        y = df['log_return'].values.astype(np.float32)
        split_idx = int(len(X) * 0.8)
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
        test_timestamps = df['Timestamp'].values[split_idx:]
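        # NOTE: the split is chronological (first 80% of rows for training, last 20% for
        # testing), which keeps the evaluation free of look-ahead on this time series.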
        model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
        booster = model.train(eval_metric='rmse')
# colsample_bytree=1.0,
# learning_rate=0.05,
# max_depth=7,
# n_estimators=200,
# subsample=0.8
# )
        model.save_model('../data/xgboost_model_all_features.json')
        test_preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, test_preds))
        # Reconstruct price series from log returns
        if 'Close' in df.columns:
            close_prices = df['Close'].values
        else:
            close_prices = pd.read_csv(csv_path)['Close'].values
        start_price = close_prices[split_idx]
        actual_prices = [start_price]
        for r_ in y_test:
            actual_prices.append(actual_prices[-1] * np.exp(r_))
        actual_prices = np.array(actual_prices[1:])
        predicted_prices = [start_price]
        for r_ in test_preds:
            predicted_prices.append(predicted_prices[-1] * np.exp(r_))
        predicted_prices = np.array(predicted_prices[1:])
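        # An equivalent vectorized reconstruction (same result as the loops above, up to
        # floating-point accumulation) would be:
        #   actual_prices = start_price * np.exp(np.cumsum(y_test))
        #   predicted_prices = start_price * np.exp(np.cumsum(test_preds))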
# mae = mean_absolute_error(actual_prices, predicted_prices)
        r2 = r2_score(actual_prices, predicted_prices)
        direction_actual = np.sign(np.diff(actual_prices))
        direction_pred = np.sign(np.diff(predicted_prices))
        directional_accuracy = (direction_actual == direction_pred).mean()
        mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100
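        # directional_accuracy is the fraction of test steps where the predicted price move
        # (up/down/flat) matches the actual move; mape is reported in percent.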
        # Save results to CSV for all features used in this run
        feature_importance_dict = model.get_feature_importance(feature_cols)
        with open(results_csv, 'a', newline='') as f:
            writer = csv.writer(f)
            for feature in feature_cols:
                importance = feature_importance_dict.get(feature, 0.0)
                fi_str = format(importance, '.6f')
                row = [feature]
                for val in [rmse, mape, r2, directional_accuracy]:
                    if isinstance(val, float):
                        row.append(format(val, '.10f'))
                    else:
                        row.append(val)
                row.append(fi_str)
                writer.writerow(row)
        print('Feature importances and results saved for all features used in this run.')
# Plotting for this run
# plot_prefix = f'cumulative_{n}_features'
# plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
# plot_direction_transition_heatmap(actual_prices, predicted_prices, prefix=plot_prefix)
    except Exception as e:
        print(f'Cumulative feature run failed: {e}')
    print(f'All cumulative feature runs completed. Results saved to {results_csv}')
    plot_prefix = 'all_features'
    plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
    sys.exit(0)