import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import pandas as pd
import numpy as np
from custom_xgboost import CustomXGBoostGPU
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from plot_results import plot_prediction_error_distribution, plot_direction_transition_heatmap
import time
from numba import njit
import csv
from feature_engineering import feature_engineering
from sklearn.feature_selection import VarianceThreshold

charts_dir = 'charts'
if not os.path.exists(charts_dir):
    os.makedirs(charts_dir)
def run_indicator(func, *args):
    return func(*args)


def run_indicator_job(job):
    func, *args = job
    indicator_name = func.__name__
    start = time.time()
    result = func(*args)
    elapsed = time.time() - start
    print(f'Indicator {indicator_name} computed in {elapsed:.4f} seconds')
    return result
if __name__ == '__main__':
    IMPUTE_NANS = True  # Set to True to impute NaNs, False to drop rows with NaNs
    csv_path = '../data/btcusd_1-min_data.csv'
    csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]
    print('Reading CSV and filtering data...')
    df = pd.read_csv(csv_path)
    df = df[df['Volume'] != 0]
    min_date = '2017-06-01'
    print('Converting Timestamp and filtering by date...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
    df = df[df['Timestamp'] >= min_date]
    lags = 3
    print('Calculating log returns as the new target...')
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
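    # Note: the log return r_t = ln(Close_t / Close_{t-1}) is used as the prediction target.
    # It is invertible (Close_t = Close_{t-1} * exp(r_t)), which is how the predicted price
    # series is reconstructed from predicted returns later in this script.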
    ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    window_sizes = [5, 15, 30]  # in minutes, adjust as needed
    features_dict = {}
    print('Starting feature computation...')
    feature_start_time = time.time()
    features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes)
    print('Concatenating all new features to DataFrame...')
    features_df = pd.DataFrame(features_dict)
    df = pd.concat([df, features_df], axis=1)
# feature_cols_for_variance = [col for col in features_df.columns if features_df[col].dtype in [np.float32, np.float64, float, int, np.int32, np.int64]]
# if feature_cols_for_variance:
# selector = VarianceThreshold(threshold=1e-5)
# filtered_features = selector.fit_transform(features_df[feature_cols_for_variance])
# kept_mask = selector.get_support()
# kept_feature_names = [col for col, keep in zip(feature_cols_for_variance, kept_mask) if keep]
# print(f"Features removed by low variance: {[col for col, keep in zip(feature_cols_for_variance, kept_mask) if not keep]}")
# # Only keep the selected features in features_df and df
# features_df = features_df[kept_feature_names]
# for col in feature_cols_for_variance:
# if col not in kept_feature_names:
# df.drop(col, axis=1, inplace=True)
# else:
# print("No numeric features found for variance thresholding.")
# Remove highly correlated features (keep only one from each correlated group)
# corr_matrix = features_df.corr().abs()
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
# to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# if to_drop:
# print(f"Features removed due to high correlation: {to_drop}")
# features_df = features_df.drop(columns=to_drop)
# df = df.drop(columns=to_drop)
# else:
# print("No highly correlated features found for removal.")
    print('Downcasting float columns to save memory...')
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col], downcast='float')
        except Exception:
            pass
    # Add time features (exclude 'dayofweek')
    print('Adding hour feature...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    df['hour'] = df['Timestamp'].dt.hour
    # Handle NaNs after all feature engineering
    if IMPUTE_NANS:
        print('Imputing NaNs after feature engineering (using mean imputation)...')
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            df[col] = df[col].fillna(df[col].mean())
        # If you want to impute non-numeric columns differently, add logic here
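        # A possible sketch for non-numeric columns (not enabled here; forward-fill with a
        # mode-based fallback is only one reasonable choice):
        # non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
        # for col in non_numeric_cols:
        #     df[col] = df[col].ffill()
        #     if df[col].isna().any():
        #         df[col] = df[col].fillna(df[col].mode().iloc[0])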
    else:
        print('Dropping NaNs after feature engineering...')
        df = df.dropna().reset_index(drop=True)
# Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
    print('Selecting feature columns...')
    exclude_cols = ['Timestamp', 'Close', 'log_return']
    exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30']
    exclude_cols += ['bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
                     'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband',
                     'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
                     'Open_lag1', 'Open_lag2', 'Open_lag3', 'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2',
                     'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15',
                     'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15',
                     'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30',
                     'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
                     'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30',
                     'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15',
                     'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0']
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    print('Features used for training:', feature_cols)
# from xgboost import XGBRegressor
# from sklearn.model_selection import GridSearchCV
# # Prepare data for grid search
# X = df[feature_cols].values.astype(np.float32)
# y = df["log_return"].values.astype(np.float32)
# split_idx = int(len(X) * 0.8)
# X_train, X_test = X[:split_idx], X[split_idx:]
# y_train, y_test = y[:split_idx], y[split_idx:]
# # Define parameter grid
# param_grid = {
# 'learning_rate': [0.01, 0.05, 0.1],
# 'max_depth': [3, 5, 7],
# 'n_estimators': [100, 200],
# 'subsample': [0.8, 1.0],
# 'colsample_bytree': [0.8, 1.0],
# }
# print('Starting grid search for XGBoost hyperparameters...')
# xgb_model = XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', eval_metric='mae', verbosity=0)
# grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# print('Best parameters found:', grid_search.best_params_)
# # Use best estimator for predictions
# best_model = grid_search.best_estimator_
# test_preds = best_model.predict(X_test)
# rmse = np.sqrt(mean_squared_error(y_test, test_preds))
# # Reconstruct price series from log returns
# if 'Close' in df.columns:
# close_prices = df['Close'].values
# else:
# close_prices = pd.read_csv(csv_path)['Close'].values
# start_price = close_prices[split_idx]
# actual_prices = [start_price]
# for r_ in y_test:
# actual_prices.append(actual_prices[-1] * np.exp(r_))
# actual_prices = np.array(actual_prices[1:])
# predicted_prices = [start_price]
# for r_ in test_preds:
# predicted_prices.append(predicted_prices[-1] * np.exp(r_))
# predicted_prices = np.array(predicted_prices[1:])
# mae = mean_absolute_error(actual_prices, predicted_prices)
# r2 = r2_score(actual_prices, predicted_prices)
# direction_actual = np.sign(np.diff(actual_prices))
# direction_pred = np.sign(np.diff(predicted_prices))
# directional_accuracy = (direction_actual == direction_pred).mean()
# mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100
# print(f'Grid search results: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}, MAPE={mape:.2f}%, DirAcc={directional_accuracy*100:.2f}%')
# plot_prefix = f'all_features_gridsearch'
# plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
# sys.exit(0)
# Prepare CSV for results
    results_csv = '../data/cumulative_feature_results.csv'
    if not os.path.exists(results_csv):
        with open(results_csv, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['feature', 'rmse', 'mape', 'r2', 'directional_accuracy', 'feature_importance'])
    try:
        X = df[feature_cols].values.astype(np.float32)
        y = df['log_return'].values.astype(np.float32)
        split_idx = int(len(X) * 0.8)
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
        test_timestamps = df['Timestamp'].values[split_idx:]
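        # NOTE: the split is chronological (first 80% of rows for training, last 20% for
        # testing), which keeps the evaluation free of look-ahead on this time series.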
        model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
        booster = model.train(eval_metric='rmse')
# colsample_bytree=1.0,
# learning_rate=0.05,
# max_depth=7,
# n_estimators=200,
# subsample=0.8
# )
        model.save_model('../data/xgboost_model_all_features.json')
        test_preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, test_preds))
        # Reconstruct price series from log returns
        if 'Close' in df.columns:
            close_prices = df['Close'].values
        else:
            close_prices = pd.read_csv(csv_path)['Close'].values
        start_price = close_prices[split_idx]
        actual_prices = [start_price]
        for r_ in y_test:
            actual_prices.append(actual_prices[-1] * np.exp(r_))
        actual_prices = np.array(actual_prices[1:])
        predicted_prices = [start_price]
        for r_ in test_preds:
            predicted_prices.append(predicted_prices[-1] * np.exp(r_))
        predicted_prices = np.array(predicted_prices[1:])
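        # An equivalent vectorized reconstruction (same result as the loops above, up to
        # floating-point accumulation) would be:
        #   actual_prices = start_price * np.exp(np.cumsum(y_test))
        #   predicted_prices = start_price * np.exp(np.cumsum(test_preds))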
# mae = mean_absolute_error(actual_prices, predicted_prices)
        r2 = r2_score(actual_prices, predicted_prices)
        direction_actual = np.sign(np.diff(actual_prices))
        direction_pred = np.sign(np.diff(predicted_prices))
        directional_accuracy = (direction_actual == direction_pred).mean()
        mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100
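        # directional_accuracy is the fraction of test steps where the predicted price move
        # (up/down/flat) matches the actual move; mape is reported in percent.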
        # Save results to CSV for all features used in this run
        feature_importance_dict = model.get_feature_importance(feature_cols)
        with open(results_csv, 'a', newline='') as f:
            writer = csv.writer(f)
            for feature in feature_cols:
                importance = feature_importance_dict.get(feature, 0.0)
                fi_str = format(importance, '.6f')
                row = [feature]
                for val in [rmse, mape, r2, directional_accuracy]:
                    if isinstance(val, float):
                        row.append(format(val, '.10f'))
                    else:
                        row.append(val)
                row.append(fi_str)
                writer.writerow(row)
        print('Feature importances and results saved for all features used in this run.')
# Plotting for this run
# plot_prefix = f'cumulative_{n}_features'
# plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
# plot_direction_transition_heatmap(actual_prices, predicted_prices, prefix=plot_prefix)
    except Exception as e:
        print(f'Cumulative feature run failed: {e}')
    print(f'All cumulative feature runs completed. Results saved to {results_csv}')
    plot_prefix = 'all_features'
    plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
    sys.exit(0)