import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

import pandas as pd
import numpy as np
from custom_xgboost import CustomXGBoostGPU
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from plot_results import plot_prediction_error_distribution, plot_direction_transition_heatmap
import time
from numba import njit
import csv
import pandas_ta as ta
from feature_engineering import feature_engineering
from sklearn.feature_selection import VarianceThreshold

charts_dir = 'charts'
if not os.path.exists(charts_dir):
    os.makedirs(charts_dir)


def run_indicator(func, *args):
    return func(*args)


def run_indicator_job(job):
    """Unpack a (func, *args) job, run it, and report how long the indicator took."""
    func, *args = job
    indicator_name = func.__name__
    start = time.time()
    result = func(*args)
    elapsed = time.time() - start
    print(f'Indicator {indicator_name} computed in {elapsed:.4f} seconds')
    return result


if __name__ == '__main__':
    IMPUTE_NANS = True  # True: impute NaNs; False: drop rows that contain NaNs
    csv_path = '../data/btcusd_1-min_data.csv'
    csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]

    print('Reading CSV and filtering data...')
    df = pd.read_csv(csv_path)
    df = df[df['Volume'] != 0]  # drop minutes with no trading activity

    min_date = '2017-06-01'
    print('Converting Timestamp and filtering by date...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
    df = df[df['Timestamp'] >= min_date]
    # Row filtering above leaves gaps in the index; reset it so the later
    # positional concat with the engineered features stays row-aligned
    df = df.reset_index(drop=True)

    lags = 3

    print('Calculating log returns as the new target...')
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
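
    # The target r_t = ln(Close_t / Close_{t-1}) is a one-minute log return, so a
    # price path can be rebuilt from any anchor price P_0 as
    # P_t = P_0 * exp(r_1 + ... + r_t); the evaluation below relies on this identity
    # when it reconstructs test-period prices from y_test and from the predictions.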

    ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    window_sizes = [5, 15, 30]  # rolling windows in minutes, adjust as needed

    print('Starting feature computation...')
    feature_start_time = time.time()
    features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes)
    print(f'Feature computation took {time.time() - feature_start_time:.2f} seconds')

    print('Concatenating all new features to DataFrame...')
    features_df = pd.DataFrame(features_dict)
    df = pd.concat([df, features_df], axis=1)

    # feature_cols_for_variance = [col for col in features_df.columns if features_df[col].dtype in [np.float32, np.float64, float, int, np.int32, np.int64]]
    # if feature_cols_for_variance:
    #     selector = VarianceThreshold(threshold=1e-5)
    #     filtered_features = selector.fit_transform(features_df[feature_cols_for_variance])
    #     kept_mask = selector.get_support()
    #     kept_feature_names = [col for col, keep in zip(feature_cols_for_variance, kept_mask) if keep]
    #     print(f"Features removed by low variance: {[col for col, keep in zip(feature_cols_for_variance, kept_mask) if not keep]}")
    #     # Only keep the selected features in features_df and df
    #     features_df = features_df[kept_feature_names]
    #     for col in feature_cols_for_variance:
    #         if col not in kept_feature_names:
    #             df.drop(col, axis=1, inplace=True)
    # else:
    #     print("No numeric features found for variance thresholding.")

    # Remove highly correlated features (keep only one from each correlated group)
    # corr_matrix = features_df.corr().abs()
    # upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    # to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    # if to_drop:
    #     print(f"Features removed due to high correlation: {to_drop}")
    #     features_df = features_df.drop(columns=to_drop)
    #     df = df.drop(columns=to_drop)
    # else:
    #     print("No highly correlated features found for removal.")

    print('Downcasting float columns to save memory...')
    for col in df.columns:
        try:
            df[col] = pd.to_numeric(df[col], downcast='float')
        except Exception:
            pass  # non-numeric columns (e.g. Timestamp) are left untouched

    # Add time features (exclude 'dayofweek')
    print('Adding hour feature...')
    df['hour'] = df['Timestamp'].dt.hour  # Timestamp is already datetime from the earlier conversion

    # Handle NaNs after all feature engineering
    if IMPUTE_NANS:
        print('Imputing NaNs after feature engineering (using mean imputation)...')
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            df[col] = df[col].fillna(df[col].mean())
        # If you want to impute non-numeric columns differently, add logic here
    else:
        print('Dropping NaNs after feature engineering...')
        df = df.dropna().reset_index(drop=True)
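
    # Note: the mean imputation above uses full-sample statistics, so information from
    # the test period leaks into training rows. A leak-free sketch, assuming the same
    # 80/20 chronological split used below, would compute the means on the train slice only:
    #   split = int(len(df) * 0.8)
    #   train_means = df.iloc[:split][numeric_cols].mean()
    #   df[numeric_cols] = df[numeric_cols].fillna(train_means)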

    # Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
    print('Selecting feature columns...')
    exclude_cols = ['Timestamp', 'Close', 'log_return']  # the target itself must not be a feature
    exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30']
    exclude_cols += ['bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
                     'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband',
                     'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
                     'Open_lag1', 'Open_lag2', 'Open_lag3', 'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2',
                     'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15',
                     'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15',
                     'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30',
                     'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
                     'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30',
                     'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15',
                     'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0']

    feature_cols = [col for col in df.columns if col not in exclude_cols]
    print('Features used for training:', feature_cols)
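
    # Presumably these columns are excluded because they are absolute price levels
    # (lagged prices, rolling stats, band/trend indicator lines) whose scale tracks
    # Close directly, which would let the model key on price level rather than on
    # return dynamics.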

    # from xgboost import XGBRegressor
    # from sklearn.model_selection import GridSearchCV

    # # Prepare data for grid search
    # X = df[feature_cols].values.astype(np.float32)
    # y = df["log_return"].values.astype(np.float32)
    # split_idx = int(len(X) * 0.8)
    # X_train, X_test = X[:split_idx], X[split_idx:]
    # y_train, y_test = y[:split_idx], y[split_idx:]

    # # Define parameter grid
    # param_grid = {
    #     'learning_rate': [0.01, 0.05, 0.1],
    #     'max_depth': [3, 5, 7],
    #     'n_estimators': [100, 200],
    #     'subsample': [0.8, 1.0],
    #     'colsample_bytree': [0.8, 1.0],
    # }

    # print('Starting grid search for XGBoost hyperparameters...')
    # xgb_model = XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', eval_metric='mae', verbosity=0)
    # grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)
    # grid_search.fit(X_train, y_train)
    # print('Best parameters found:', grid_search.best_params_)

    # # Use best estimator for predictions
    # best_model = grid_search.best_estimator_
    # test_preds = best_model.predict(X_test)
    # rmse = np.sqrt(mean_squared_error(y_test, test_preds))

    # # Reconstruct price series from log returns
    # if 'Close' in df.columns:
    #     close_prices = df['Close'].values
    # else:
    #     close_prices = pd.read_csv(csv_path)['Close'].values
    # start_price = close_prices[split_idx]
    # actual_prices = [start_price]
    # for r_ in y_test:
    #     actual_prices.append(actual_prices[-1] * np.exp(r_))
    # actual_prices = np.array(actual_prices[1:])
    # predicted_prices = [start_price]
    # for r_ in test_preds:
    #     predicted_prices.append(predicted_prices[-1] * np.exp(r_))
    # predicted_prices = np.array(predicted_prices[1:])

    # mae = mean_absolute_error(actual_prices, predicted_prices)
    # r2 = r2_score(actual_prices, predicted_prices)
    # direction_actual = np.sign(np.diff(actual_prices))
    # direction_pred = np.sign(np.diff(predicted_prices))
    # directional_accuracy = (direction_actual == direction_pred).mean()
    # mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100

    # print(f'Grid search results: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}, MAPE={mape:.2f}%, DirAcc={directional_accuracy*100:.2f}%')

    # plot_prefix = 'all_features_gridsearch'
    # plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)

    # sys.exit(0)

    # Prepare CSV for results
    results_csv = '../data/cumulative_feature_results.csv'
    if not os.path.exists(results_csv):
        with open(results_csv, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['num_features', 'added feature', 'rmse', 'mae', 'r2', 'mape', 'directional_accuracy', 'feature_importance'])

    try:
        X = df[feature_cols].values.astype(np.float32)
        y = df["log_return"].values.astype(np.float32)
        split_idx = int(len(X) * 0.8)  # chronological 80/20 split; time series must not be shuffled
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
        test_timestamps = df['Timestamp'].values[split_idx:]

        model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
        booster = model.train(eval_metric='rmse')
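
        # CustomXGBoostGPU is the project-local wrapper imported above; from its use in
        # this script it is assumed to expose train(), predict(), save_model(), and
        # get_feature_importance() around a GPU-trained xgboost booster.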

        # Hyperparameters left over from an earlier run, kept for reference:
        #   colsample_bytree=1.0, learning_rate=0.05, max_depth=7,
        #   n_estimators=200, subsample=0.8
        model.save_model('../data/xgboost_model_all_features.json')

        test_preds = model.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, test_preds))

        # Reconstruct price series from log returns
        if 'Close' in df.columns:
            close_prices = df['Close'].values
        else:
            close_prices = pd.read_csv(csv_path)['Close'].values
        # y_test[0] is ln(Close[split_idx] / Close[split_idx - 1]), so the
        # reconstruction is anchored one step before the split
        start_price = close_prices[split_idx - 1]
        actual_prices = [start_price]
        for r_ in y_test:
            actual_prices.append(actual_prices[-1] * np.exp(r_))
        actual_prices = np.array(actual_prices[1:])
        predicted_prices = [start_price]
        for r_ in test_preds:
            predicted_prices.append(predicted_prices[-1] * np.exp(r_))
        predicted_prices = np.array(predicted_prices[1:])
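
        # Equivalent vectorized reconstruction (cumulative sum in log space):
        #   actual_prices = start_price * np.exp(np.cumsum(y_test))
        #   predicted_prices = start_price * np.exp(np.cumsum(test_preds))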

        mae = mean_absolute_error(actual_prices, predicted_prices)
        r2 = r2_score(actual_prices, predicted_prices)
        direction_actual = np.sign(np.diff(actual_prices))
        direction_pred = np.sign(np.diff(predicted_prices))
        directional_accuracy = (direction_actual == direction_pred).mean()
        mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100
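
        # Directional accuracy is the share of test steps whose predicted price move
        # has the same sign as the actual move; MAPE is expressed in percent.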

        # Save results to CSV for all features used in this run
        feature_importance_dict = model.get_feature_importance(feature_cols)
        with open(results_csv, 'a', newline='') as f:
            writer = csv.writer(f)
            for feature in feature_cols:
                importance = feature_importance_dict.get(feature, 0.0)
                fi_str = format(importance, '.6f')
                # One row per feature, in header order:
                # num_features, added feature, rmse, mae, r2, mape, directional_accuracy, feature_importance
                row = [len(feature_cols), feature]
                for val in [rmse, mae, r2, mape, directional_accuracy]:
                    if isinstance(val, float):
                        row.append(format(val, '.10f'))
                    else:
                        row.append(val)
                row.append(fi_str)
                writer.writerow(row)
        print('Feature importances and results saved for all features used in this run.')

        # Plotting for this run
        # plot_prefix = f'cumulative_{n}_features'
        # plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
        # plot_direction_transition_heatmap(actual_prices, predicted_prices, prefix=plot_prefix)
    except Exception as e:
        print(f'Cumulative feature run failed: {e}')
    print(f'All cumulative feature runs completed. Results saved to {results_csv}')

    plot_prefix = 'all_features'
    # predicted_prices/actual_prices only exist if the run above succeeded
    if 'predicted_prices' in globals():
        plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)

    sys.exit(0)