import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import pandas as pd
import numpy as np
from custom_xgboost import CustomXGBoostGPU
from evaluation import walk_forward_cv
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from plot_results import plot_prediction_error_distribution, plot_direction_transition_heatmap
import time
from numba import njit
import csv
from feature_engineering import feature_engineering
from sklearn.feature_selection import VarianceThreshold

charts_dir = 'charts'
if not os.path.exists(charts_dir):
    os.makedirs(charts_dir)


def run_indicator(func, *args):
    return func(*args)


def run_indicator_job(job):
    func, *args = job
    indicator_name = func.__name__
    start = time.time()
    result = func(*args)
    elapsed = time.time() - start
    print(f'Indicator {indicator_name} computed in {elapsed:.4f} seconds')
    return result


if __name__ == '__main__':
    IMPUTE_NANS = True  # Set to True to impute NaNs, False to drop rows with NaNs

    csv_path = '../data/btcusd_1-min_data.csv'
    csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]

    print('Reading CSV and filtering data...')
    df = pd.read_csv(csv_path)
    df = df[df['Volume'] != 0]

    min_date = '2017-06-01'
    print('Converting Timestamp and filtering by date...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
    df = df[df['Timestamp'] >= min_date]

    lags = 3

    print('Calculating log returns as the new target...')
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))

    ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    window_sizes = [5, 15, 30]  # in minutes, adjust as needed

    features_dict = {}
    print('Starting feature computation...')
    feature_start_time = time.time()
    features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes)

    print('Concatenating all new features to DataFrame...')
    features_df = pd.DataFrame(features_dict)
    df = pd.concat([df, features_df], axis=1)

    # feature_cols_for_variance = [col for col in features_df.columns if features_df[col].dtype in [np.float32, np.float64, float, int, np.int32, np.int64]]
    # if feature_cols_for_variance:
    #     selector = VarianceThreshold(threshold=1e-5)
    #     filtered_features = selector.fit_transform(features_df[feature_cols_for_variance])
    #     kept_mask = selector.get_support()
    #     kept_feature_names = [col for col, keep in zip(feature_cols_for_variance, kept_mask) if keep]
    #     print(f"Features removed by low variance: {[col for col, keep in zip(feature_cols_for_variance, kept_mask) if not keep]}")
    #     # Only keep the selected features in features_df and df
    #     features_df = features_df[kept_feature_names]
    #     for col in feature_cols_for_variance:
    #         if col not in kept_feature_names:
    #             df.drop(col, axis=1, inplace=True)
    # else:
    #     print("No numeric features found for variance thresholding.")

    # Remove highly correlated features (keep only one from each correlated group)
    # corr_matrix = features_df.corr().abs()
    # upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    # to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    # if to_drop:
    #     print(f"Features removed due to high correlation: {to_drop}")
    #     features_df = features_df.drop(columns=to_drop)
    #     df = df.drop(columns=to_drop)
    # else:
    #     print("No highly correlated features found for removal.")

    print('Downcasting float columns to save memory...')
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col], downcast='float')
        except Exception:
            pass
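    # pd.to_numeric(..., downcast='float') stores eligible float columns as float32,
    # roughly halving their memory footprint; columns that cannot be converted raise
    # inside the try block and are left unchanged.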
    # Add time features (exclude 'dayofweek')
    print('Adding hour feature...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    df['hour'] = df['Timestamp'].dt.hour

    # Handle NaNs after all feature engineering
    if IMPUTE_NANS:
        print('Imputing NaNs after feature engineering (using mean imputation)...')
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            df[col] = df[col].fillna(df[col].mean())
        # If you want to impute non-numeric columns differently, add logic here
    else:
        print('Dropping NaNs after feature engineering...')
        df = df.dropna().reset_index(drop=True)

    # Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
    print('Selecting feature columns...')
    exclude_cols = ['Timestamp', 'Close', 'log_return']  # 'log_return' is the target and must not be used as a feature
    exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30']
    exclude_cols += [
        'bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
        'donchian_hband', 'donchian_lband', 'donchian_mband',
        'keltner_hband', 'keltner_lband', 'keltner_mband',
        'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
        'Open_lag1', 'Open_lag2', 'Open_lag3',
        'High_lag1', 'High_lag2', 'High_lag3',
        'Low_lag1', 'Low_lag2', 'Low_lag3',
        'Close_lag1', 'Close_lag2', 'Close_lag3',
        'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15', 'Open_roll_max_15',
        'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30',
        'High_roll_mean_15', 'High_roll_std_15', 'High_roll_min_15', 'High_roll_max_15',
        'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5',
        'Low_roll_mean_30', 'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30',
        'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
        'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15',
        'Close_roll_mean_30', 'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30',
        'Volume_roll_max_5', 'Volume_roll_max_15', 'Volume_roll_max_30',
        'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0',
    ]
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    print('Features used for training:', feature_cols)
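    # Note: most of the excluded columns above are raw price levels or scale-dependent
    # rolling statistics (OHLC lags, rolling price means/extremes, band and channel lines);
    # presumably they are excluded because the target is a log return rather than a price level.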
    # from xgboost import XGBRegressor
    # from sklearn.model_selection import GridSearchCV
    # # Prepare data for grid search
    # X = df[feature_cols].values.astype(np.float32)
    # y = df["log_return"].values.astype(np.float32)
    # split_idx = int(len(X) * 0.8)
    # X_train, X_test = X[:split_idx], X[split_idx:]
    # y_train, y_test = y[:split_idx], y[split_idx:]
    # # Define parameter grid
    # param_grid = {
    #     'learning_rate': [0.01, 0.05, 0.1],
    #     'max_depth': [3, 5, 7],
    #     'n_estimators': [100, 200],
    #     'subsample': [0.8, 1.0],
    #     'colsample_bytree': [0.8, 1.0],
    # }
    # print('Starting grid search for XGBoost hyperparameters...')
    # xgb_model = XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', eval_metric='mae', verbosity=0)
    # grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)
    # grid_search.fit(X_train, y_train)
    # print('Best parameters found:', grid_search.best_params_)
    # # Use best estimator for predictions
    # best_model = grid_search.best_estimator_
    # test_preds = best_model.predict(X_test)
    # rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    # # Reconstruct price series from log returns
    # if 'Close' in df.columns:
    #     close_prices = df['Close'].values
    # else:
    #     close_prices = pd.read_csv(csv_path)['Close'].values
    # start_price = close_prices[split_idx]
    # actual_prices = [start_price]
    # for r_ in y_test:
    #     actual_prices.append(actual_prices[-1] * np.exp(r_))
    # actual_prices = np.array(actual_prices[1:])
    # predicted_prices = [start_price]
    # for r_ in test_preds:
    #     predicted_prices.append(predicted_prices[-1] * np.exp(r_))
    # predicted_prices = np.array(predicted_prices[1:])
    # mae = mean_absolute_error(actual_prices, predicted_prices)
    # r2 = r2_score(actual_prices, predicted_prices)
    # direction_actual = np.sign(np.diff(actual_prices))
    # direction_pred = np.sign(np.diff(predicted_prices))
    # directional_accuracy = (direction_actual == direction_pred).mean()
    # mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100
    # print(f'Grid search results: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}, MAPE={mape:.2f}%, DirAcc={directional_accuracy*100:.2f}%')
    # plot_prefix = f'all_features_gridsearch'
    # plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
    # sys.exit(0)

    # Prepare CSV for results
    results_csv = '../data/cumulative_feature_results.csv'
    if not os.path.exists(results_csv):
        with open(results_csv, 'w', newline='') as f:
            writer = csv.writer(f)
            # Header aligned with what we actually write: feature, rmse, mape, r2, directional_accuracy, feature_importance
            writer.writerow(['feature', 'rmse', 'mape', 'r2', 'directional_accuracy', 'feature_importance'])

    try:
        X = df[feature_cols].values.astype(np.float32)
        y = df["log_return"].values.astype(np.float32)
        split_idx = int(len(X) * 0.8)
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
        test_timestamps = df['Timestamp'].values[split_idx:]

        # Optional: walk-forward CV to obtain averaged importance for pruning
        DO_WALK_FORWARD_CV = True
        AUTO_PRUNE = True
        TOP_K = 150  # keep top-K features by averaged importance
        feature_importance_avg = None
        if DO_WALK_FORWARD_CV:
            print('Running walk-forward CV for metrics and averaged feature importance...')
            metrics_avg, importance_avg = walk_forward_cv(X, y, feature_cols, n_splits=5)
            print(f"CV metrics: RMSE={metrics_avg['rmse']:.6f}, MAPE={metrics_avg['mape']:.4f}%, R2={metrics_avg['r2']:.6f}, DirAcc={metrics_avg['dir_acc']:.4f}")
            feature_importance_avg = importance_avg

        # Auto-prune low-importance and known low-value features
        prune_set = set()
        # Known low-value from analysis
        KNOWN_LOW = [
            'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0',
            'supertrend_trend_12_3.0', 'supertrend_trend_10_1.0', 'supertrend_trend_11_2.0',
        ]
        # Prefer cyclical hour; drop raw hour
        KNOWN_LOW += ['hour']
        if feature_importance_avg is not None:
            # Keep top-K by averaged importance
            sorted_feats = sorted(feature_importance_avg.items(), key=lambda kv: kv[1], reverse=True)
            keep_names = set(name for name, _ in sorted_feats[:TOP_K])
            for name in feature_cols:
                if name not in keep_names:
                    prune_set.add(name)
        for name in KNOWN_LOW:
            if name in feature_cols:
                prune_set.add(name)
        # Drop alternative vol estimators if Parkinson present at same window
        for w in [5, 15, 30]:
            park = f'park_vol_{w}'
            gk = f'gk_vol_{w}'
            rs = f'rs_vol_{w}'
            yz = f'yz_vol_{w}'
            if park in feature_cols:
                for alt in [gk, rs, yz]:
                    if alt in feature_cols:
                        prune_set.add(alt)

        # Apply pruning to training set
        if AUTO_PRUNE and prune_set:
            print(f'Pruning {len(prune_set)} features before training...')
            kept_feature_cols = [c for c in feature_cols if c not in prune_set]
        else:
            kept_feature_cols = feature_cols

        # Build train/test matrices from the pruned feature set so that training and
        # prediction below use exactly the same columns.
        X_kept = df[kept_feature_cols].values.astype(np.float32)
        X_kept_train, X_kept_test = X_kept[:split_idx], X_kept[split_idx:]

        model = CustomXGBoostGPU(X_kept_train, X_kept_test, y_train, y_test)
        booster = model.train(eval_metric='rmse')
        # Hyperparameters kept commented out for reference:
        # colsample_bytree=1.0,
        # learning_rate=0.05,
        # max_depth=7,
        # n_estimators=200,
        # subsample=0.8
        # )

        model.save_model('../data/xgboost_model_all_features.json')
        test_preds = model.predict(X_kept_test)  # predict with the same pruned feature matrix used for training
        rmse = np.sqrt(mean_squared_error(y_test, test_preds))

        # Reconstruct price series from log returns
        # (equivalent to start_price * np.exp(np.cumsum(returns)))
        if 'Close' in df.columns:
            close_prices = df['Close'].values
        else:
            close_prices = pd.read_csv(csv_path)['Close'].values
        start_price = close_prices[split_idx]
        actual_prices = [start_price]
        for r_ in y_test:
            actual_prices.append(actual_prices[-1] * np.exp(r_))
        actual_prices = np.array(actual_prices[1:])
        predicted_prices = [start_price]
        for r_ in test_preds:
            predicted_prices.append(predicted_prices[-1] * np.exp(r_))
        predicted_prices = np.array(predicted_prices[1:])

        # mae = mean_absolute_error(actual_prices, predicted_prices)
        r2 = r2_score(actual_prices, predicted_prices)
        direction_actual = np.sign(np.diff(actual_prices))
        direction_pred = np.sign(np.diff(predicted_prices))
        directional_accuracy = (direction_actual == direction_pred).mean()
        mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100

        # Save results to CSV for all features used in this run
        feature_importance_dict = model.get_feature_importance(kept_feature_cols)
        with open(results_csv, 'a', newline='') as f:
            writer = csv.writer(f)
            for feature in kept_feature_cols:
                importance = feature_importance_dict.get(feature, 0.0)
                fi_str = format(importance, '.6f')
                row = [feature]
                for val in [rmse, mape, r2, directional_accuracy]:
                    if isinstance(val, float):
                        row.append(format(val, '.10f'))
                    else:
                        row.append(val)
                row.append(fi_str)
                writer.writerow(row)
        print('Feature importances and results saved for all features used in this run.')

        # Plotting for this run
        # plot_prefix = f'cumulative_{n}_features'
        # plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
        # plot_direction_transition_heatmap(actual_prices, predicted_prices, prefix=plot_prefix)
    except Exception as e:
        print(f'Cumulative feature run failed: {e}')

    print(f'All cumulative feature runs completed. Results saved to {results_csv}')
    plot_prefix = 'all_features'
    plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
    sys.exit(0)
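    # Follow-up sketch: the trained model is persisted above via
    # model.save_model('../data/xgboost_model_all_features.json'). Assuming CustomXGBoostGPU
    # wraps a native xgboost booster and writes xgboost's JSON format (not verified here),
    # it could be reloaded for offline inference roughly like this:
    #
    #     import xgboost as xgb
    #     booster = xgb.Booster()
    #     booster.load_model('../data/xgboost_model_all_features.json')
    #     preds = booster.predict(xgb.DMatrix(X_kept_test, feature_names=kept_feature_cols))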