# OHLCVPredictor/main.py

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import pandas as pd
import numpy as np
from custom_xgboost import CustomXGBoostGPU
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from plot_results import plot_prediction_error_distribution, plot_direction_transition_heatmap
import time
from numba import njit
import csv
from feature_engineering import feature_engineering
from sklearn.feature_selection import VarianceThreshold
charts_dir = 'charts'
if not os.path.exists(charts_dir):
    os.makedirs(charts_dir)
def run_indicator(func, *args):
    return func(*args)

def run_indicator_job(job):
    func, *args = job
    indicator_name = func.__name__
    start = time.time()
    result = func(*args)
    elapsed = time.time() - start
    print(f'Indicator {indicator_name} computed in {elapsed:.4f} seconds')
    return result
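
# Note: run_indicator and run_indicator_job are thin wrappers around a single
# indicator computation (the latter also reports per-indicator wall-clock time).
# They are kept at module level, presumably so they can be handed to a
# process/thread pool when features are computed in parallel.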
if __name__ == '__main__':
    IMPUTE_NANS = True  # Set to True to impute NaNs, False to drop rows with NaNs
    csv_path = '../data/btcusd_1-min_data.csv'
    csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]
    print('Reading CSV and filtering data...')
    df = pd.read_csv(csv_path)
    df = df[df['Volume'] != 0]
    min_date = '2017-06-01'
    print('Converting Timestamp and filtering by date...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
    df = df[df['Timestamp'] >= min_date]
    lags = 3
    print('Calculating log returns as the new target...')
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
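    # The prediction target is the one-step log return, log(Close_t / Close_{t-1}),
    # rather than the raw price; returns are roughly comparable in scale across the
    # whole history, which makes them easier to model than absolute prices.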
    ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    window_sizes = [5, 15, 30]  # in minutes, adjust as needed
    print('Starting feature computation...')
    feature_start_time = time.time()
    features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes)
    print(f'Feature computation finished in {time.time() - feature_start_time:.2f} seconds')
    print('Concatenating all new features to DataFrame...')
    features_df = pd.DataFrame(features_dict)
    df = pd.concat([df, features_df], axis=1)
    # feature_cols_for_variance = [col for col in features_df.columns if features_df[col].dtype in [np.float32, np.float64, float, int, np.int32, np.int64]]
    # if feature_cols_for_variance:
    #     selector = VarianceThreshold(threshold=1e-5)
    #     filtered_features = selector.fit_transform(features_df[feature_cols_for_variance])
    #     kept_mask = selector.get_support()
    #     kept_feature_names = [col for col, keep in zip(feature_cols_for_variance, kept_mask) if keep]
    #     print(f"Features removed by low variance: {[col for col, keep in zip(feature_cols_for_variance, kept_mask) if not keep]}")
    #     # Only keep the selected features in features_df and df
    #     features_df = features_df[kept_feature_names]
    #     for col in feature_cols_for_variance:
    #         if col not in kept_feature_names:
    #             df.drop(col, axis=1, inplace=True)
    # else:
    #     print("No numeric features found for variance thresholding.")
    # Remove highly correlated features (keep only one from each correlated group)
    # corr_matrix = features_df.corr().abs()
    # upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    # to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    # if to_drop:
    #     print(f"Features removed due to high correlation: {to_drop}")
    #     features_df = features_df.drop(columns=to_drop)
    #     df = df.drop(columns=to_drop)
    # else:
    #     print("No highly correlated features found for removal.")
    print('Downcasting float columns to save memory...')
    for col in df.columns:
        # Skip the Timestamp column: pd.to_numeric would turn datetimes into epoch
        # nanoseconds, and downcasting those to float32 loses most of the precision.
        if col == 'Timestamp':
            continue
        try:
            df[col] = pd.to_numeric(df[col], downcast='float')
        except Exception:
            pass
    # Add time features (exclude 'dayofweek')
    print('Adding hour feature...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    df['hour'] = df['Timestamp'].dt.hour
    # Handle NaNs after all feature engineering
    if IMPUTE_NANS:
        print('Imputing NaNs after feature engineering (using mean imputation)...')
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            df[col] = df[col].fillna(df[col].mean())
        # If you want to impute non-numeric columns differently, add logic here
    else:
        print('Dropping NaNs after feature engineering...')
        df = df.dropna().reset_index(drop=True)
    # Exclude 'Timestamp', 'Close', 'log_return' (the target), and any future target columns from features
    print('Selecting feature columns...')
    exclude_cols = ['Timestamp', 'Close', 'log_return']
    exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30']
    exclude_cols += ['bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
                     'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband',
                     'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
                     'Open_lag1', 'Open_lag2', 'Open_lag3', 'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2',
                     'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15',
                     'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15',
                     'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30',
                     'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
                     'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30',
                     'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15',
                     'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0']
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    print('Features used for training:', feature_cols)
    # from xgboost import XGBRegressor
    # from sklearn.model_selection import GridSearchCV
    # # Prepare data for grid search
    # X = df[feature_cols].values.astype(np.float32)
    # y = df["log_return"].values.astype(np.float32)
    # split_idx = int(len(X) * 0.8)
    # X_train, X_test = X[:split_idx], X[split_idx:]
    # y_train, y_test = y[:split_idx], y[split_idx:]
    # # Define parameter grid
    # param_grid = {
    #     'learning_rate': [0.01, 0.05, 0.1],
    #     'max_depth': [3, 5, 7],
    #     'n_estimators': [100, 200],
    #     'subsample': [0.8, 1.0],
    #     'colsample_bytree': [0.8, 1.0],
    # }
    # print('Starting grid search for XGBoost hyperparameters...')
    # xgb_model = XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', eval_metric='mae', verbosity=0)
    # grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)
    # grid_search.fit(X_train, y_train)
    # print('Best parameters found:', grid_search.best_params_)
    # # Use best estimator for predictions
    # best_model = grid_search.best_estimator_
    # test_preds = best_model.predict(X_test)
    # rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    # # Reconstruct price series from log returns
    # if 'Close' in df.columns:
    #     close_prices = df['Close'].values
    # else:
    #     close_prices = pd.read_csv(csv_path)['Close'].values
    # start_price = close_prices[split_idx]
    # actual_prices = [start_price]
    # for r_ in y_test:
    #     actual_prices.append(actual_prices[-1] * np.exp(r_))
    # actual_prices = np.array(actual_prices[1:])
    # predicted_prices = [start_price]
    # for r_ in test_preds:
    #     predicted_prices.append(predicted_prices[-1] * np.exp(r_))
    # predicted_prices = np.array(predicted_prices[1:])
    # mae = mean_absolute_error(actual_prices, predicted_prices)
    # r2 = r2_score(actual_prices, predicted_prices)
    # direction_actual = np.sign(np.diff(actual_prices))
    # direction_pred = np.sign(np.diff(predicted_prices))
    # directional_accuracy = (direction_actual == direction_pred).mean()
    # mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100
    # print(f'Grid search results: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}, MAPE={mape:.2f}%, DirAcc={directional_accuracy*100:.2f}%')
    # plot_prefix = 'all_features_gridsearch'
    # plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
    # sys.exit(0)
    # Prepare CSV for results
    results_csv = '../data/cumulative_feature_results.csv'
    if not os.path.exists(results_csv):
        with open(results_csv, 'w', newline='') as f:
            writer = csv.writer(f)
            # Header matches the per-feature rows written below: one row per feature,
            # with the run-level metrics repeated and that feature's importance last.
            writer.writerow(['feature', 'rmse', 'mae', 'r2', 'mape', 'directional_accuracy', 'feature_importance'])
    try:
        X = df[feature_cols].values.astype(np.float32)
        y = df["log_return"].values.astype(np.float32)
        split_idx = int(len(X) * 0.8)
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
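        # Chronological 80/20 split: the first 80% of the time-ordered rows are used
        # for training and the remaining 20% for testing, so the model is never
        # evaluated on data that precedes its training window.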
        test_timestamps = df['Timestamp'].values[split_idx:]
        model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
        # Hyperparameter overrides apparently passed to train() in an earlier version,
        # kept here for reference:
        # colsample_bytree=1.0, learning_rate=0.05, max_depth=7, n_estimators=200, subsample=0.8
        booster = model.train(eval_metric='rmse')
        model.save_model('../data/xgboost_model_all_features.json')
        test_preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, test_preds))
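        # Note: RMSE is computed in log-return space (the model's native target); the
        # metrics below (MAE, R2, MAPE, directional accuracy) are computed on the
        # reconstructed price series, so the two scales are not directly comparable.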
        # Reconstruct price series from log returns
        if 'Close' in df.columns:
            close_prices = df['Close'].values
        else:
            close_prices = pd.read_csv(csv_path)['Close'].values
        # Start from the last close of the training split: y_test[0] is the return from
        # row split_idx - 1 to row split_idx, so compounding from close_prices[split_idx - 1]
        # makes actual_prices line up with the test-period closes.
        start_price = close_prices[split_idx - 1]
        actual_prices = [start_price]
        for r_ in y_test:
            actual_prices.append(actual_prices[-1] * np.exp(r_))
        actual_prices = np.array(actual_prices[1:])
        predicted_prices = [start_price]
        for r_ in test_preds:
            predicted_prices.append(predicted_prices[-1] * np.exp(r_))
        predicted_prices = np.array(predicted_prices[1:])
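        # Equivalent vectorized form of the two reconstruction loops above:
        #   actual_prices    = start_price * np.exp(np.cumsum(y_test))
        #   predicted_prices = start_price * np.exp(np.cumsum(test_preds))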
        mae = mean_absolute_error(actual_prices, predicted_prices)
        r2 = r2_score(actual_prices, predicted_prices)
        direction_actual = np.sign(np.diff(actual_prices))
        direction_pred = np.sign(np.diff(predicted_prices))
        directional_accuracy = (direction_actual == direction_pred).mean()
        mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100
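        # Directional accuracy is the fraction of consecutive steps where the predicted
        # price change has the same sign as the actual change; MAPE is in percent.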
        # Save results to CSV for all features used in this run
        feature_importance_dict = model.get_feature_importance(feature_cols)
        with open(results_csv, 'a', newline='') as f:
            writer = csv.writer(f)
            for feature in feature_cols:
                importance = feature_importance_dict.get(feature, 0.0)
                fi_str = format(importance, '.6f')
                row = [feature]
                # Keep the metric order in sync with the CSV header written above.
                for val in [rmse, mae, r2, mape, directional_accuracy]:
                    if isinstance(val, float):
                        row.append(format(val, '.10f'))
                    else:
                        row.append(val)
                row.append(fi_str)
                writer.writerow(row)
        print('Feature importances and results saved for all features used in this run.')
        # Plotting for this run
        # plot_prefix = f'cumulative_{n}_features'
        # plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
        # plot_direction_transition_heatmap(actual_prices, predicted_prices, prefix=plot_prefix)
    except Exception as e:
        print(f'All-features run failed: {e}')
    print(f'Run completed. Results saved to {results_csv}')
    plot_prefix = 'all_features'
    plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
    sys.exit(0)