# OHLCVPredictor/main.py

import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
import pandas as pd
import numpy as np
from custom_xgboost import CustomXGBoostGPU
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from plot_results import plot_prediction_error_distribution, plot_direction_transition_heatmap
import time
from numba import njit
import csv
from feature_engineering import feature_engineering
from sklearn.feature_selection import VarianceThreshold
charts_dir = 'charts'
if not os.path.exists(charts_dir):
    os.makedirs(charts_dir)
def run_indicator(func, *args):
    return func(*args)

def run_indicator_job(job):
    func, *args = job
    indicator_name = func.__name__
    start = time.time()
    result = func(*args)
    elapsed = time.time() - start
    print(f'Indicator {indicator_name} computed in {elapsed:.4f} seconds')
    return result
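
# Note: run_indicator and run_indicator_job are thin wrappers around a single
# indicator computation (the latter also reports per-indicator wall-clock time).
# They are kept at module level, presumably so they can be handed to a
# process/thread pool when features are computed in parallel.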
if __name__ == '__main__':
    IMPUTE_NANS = True  # Set to True to impute NaNs, False to drop rows with NaNs
    csv_path = '../data/btcusd_1-min_data.csv'
    csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]
    print('Reading CSV and filtering data...')
    df = pd.read_csv(csv_path)
    df = df[df['Volume'] != 0]
    min_date = '2017-06-01'
    print('Converting Timestamp and filtering by date...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
    df = df[df['Timestamp'] >= min_date]
    lags = 3
    print('Calculating log returns as the new target...')
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
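    # The prediction target is the one-step log return, log(Close_t / Close_{t-1}),
    # rather than the raw price; returns are roughly comparable in scale across the
    # whole history, which makes them easier to model than absolute prices.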
    ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    window_sizes = [5, 15, 30]  # in minutes, adjust as needed
    print('Starting feature computation...')
    feature_start_time = time.time()
    features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes)
    print(f'Feature computation finished in {time.time() - feature_start_time:.2f} seconds')
    print('Concatenating all new features to DataFrame...')
    features_df = pd.DataFrame(features_dict)
    df = pd.concat([df, features_df], axis=1)
    # feature_cols_for_variance = [col for col in features_df.columns if features_df[col].dtype in [np.float32, np.float64, float, int, np.int32, np.int64]]
    # if feature_cols_for_variance:
    #     selector = VarianceThreshold(threshold=1e-5)
    #     filtered_features = selector.fit_transform(features_df[feature_cols_for_variance])
    #     kept_mask = selector.get_support()
    #     kept_feature_names = [col for col, keep in zip(feature_cols_for_variance, kept_mask) if keep]
    #     print(f"Features removed by low variance: {[col for col, keep in zip(feature_cols_for_variance, kept_mask) if not keep]}")
    #     # Only keep the selected features in features_df and df
    #     features_df = features_df[kept_feature_names]
    #     for col in feature_cols_for_variance:
    #         if col not in kept_feature_names:
    #             df.drop(col, axis=1, inplace=True)
    # else:
    #     print("No numeric features found for variance thresholding.")
    # Remove highly correlated features (keep only one from each correlated group)
    # corr_matrix = features_df.corr().abs()
    # upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    # to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    # if to_drop:
    #     print(f"Features removed due to high correlation: {to_drop}")
    #     features_df = features_df.drop(columns=to_drop)
    #     df = df.drop(columns=to_drop)
    # else:
    #     print("No highly correlated features found for removal.")
    print('Downcasting float columns to save memory...')
    for col in df.columns:
        # Skip the Timestamp column: pd.to_numeric would turn datetimes into epoch
        # nanoseconds, and downcasting those to float32 loses most of the precision.
        if col == 'Timestamp':
            continue
        try:
            df[col] = pd.to_numeric(df[col], downcast='float')
        except Exception:
            pass
    # Add time features (exclude 'dayofweek')
    print('Adding hour feature...')
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    df['hour'] = df['Timestamp'].dt.hour
    # Handle NaNs after all feature engineering
    if IMPUTE_NANS:
        print('Imputing NaNs after feature engineering (using mean imputation)...')
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            df[col] = df[col].fillna(df[col].mean())
        # If you want to impute non-numeric columns differently, add logic here
    else:
        print('Dropping NaNs after feature engineering...')
        df = df.dropna().reset_index(drop=True)
    # Exclude 'Timestamp', 'Close', 'log_return' (the target), and any future target columns from features
    print('Selecting feature columns...')
    exclude_cols = ['Timestamp', 'Close', 'log_return']
    exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30']
    exclude_cols += ['bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
                     'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband',
                     'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
                     'Open_lag1', 'Open_lag2', 'Open_lag3', 'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2',
                     'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15',
                     'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15',
                     'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30',
                     'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
                     'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30',
                     'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15',
                     'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0']
    feature_cols = [col for col in df.columns if col not in exclude_cols]
    print('Features used for training:', feature_cols)
    # from xgboost import XGBRegressor
    # from sklearn.model_selection import GridSearchCV
    # # Prepare data for grid search
    # X = df[feature_cols].values.astype(np.float32)
    # y = df["log_return"].values.astype(np.float32)
    # split_idx = int(len(X) * 0.8)
    # X_train, X_test = X[:split_idx], X[split_idx:]
    # y_train, y_test = y[:split_idx], y[split_idx:]
    # # Define parameter grid
    # param_grid = {
    #     'learning_rate': [0.01, 0.05, 0.1],
    #     'max_depth': [3, 5, 7],
    #     'n_estimators': [100, 200],
    #     'subsample': [0.8, 1.0],
    #     'colsample_bytree': [0.8, 1.0],
    # }
    # print('Starting grid search for XGBoost hyperparameters...')
    # xgb_model = XGBRegressor(objective='reg:squarederror', tree_method='hist', device='cuda', eval_metric='mae', verbosity=0)
    # grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)
    # grid_search.fit(X_train, y_train)
    # print('Best parameters found:', grid_search.best_params_)
    # # Use best estimator for predictions
    # best_model = grid_search.best_estimator_
    # test_preds = best_model.predict(X_test)
    # rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    # # Reconstruct price series from log returns
    # if 'Close' in df.columns:
    #     close_prices = df['Close'].values
    # else:
    #     close_prices = pd.read_csv(csv_path)['Close'].values
    # start_price = close_prices[split_idx]
    # actual_prices = [start_price]
    # for r_ in y_test:
    #     actual_prices.append(actual_prices[-1] * np.exp(r_))
    # actual_prices = np.array(actual_prices[1:])
    # predicted_prices = [start_price]
    # for r_ in test_preds:
    #     predicted_prices.append(predicted_prices[-1] * np.exp(r_))
    # predicted_prices = np.array(predicted_prices[1:])
    # mae = mean_absolute_error(actual_prices, predicted_prices)
    # r2 = r2_score(actual_prices, predicted_prices)
    # direction_actual = np.sign(np.diff(actual_prices))
    # direction_pred = np.sign(np.diff(predicted_prices))
    # directional_accuracy = (direction_actual == direction_pred).mean()
    # mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100
    # print(f'Grid search results: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}, MAPE={mape:.2f}%, DirAcc={directional_accuracy*100:.2f}%')
    # plot_prefix = 'all_features_gridsearch'
    # plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
    # sys.exit(0)
    # Prepare CSV for results
    results_csv = '../data/cumulative_feature_results.csv'
    if not os.path.exists(results_csv):
        with open(results_csv, 'w', newline='') as f:
            writer = csv.writer(f)
            # Header matches the per-feature rows written below: one row per feature,
            # with the run-level metrics repeated and that feature's importance last.
            writer.writerow(['feature', 'rmse', 'mae', 'r2', 'mape', 'directional_accuracy', 'feature_importance'])
    try:
        X = df[feature_cols].values.astype(np.float32)
        y = df["log_return"].values.astype(np.float32)
        split_idx = int(len(X) * 0.8)
        X_train, X_test = X[:split_idx], X[split_idx:]
        y_train, y_test = y[:split_idx], y[split_idx:]
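        # Chronological 80/20 split: the first 80% of the time-ordered rows are used
        # for training and the remaining 20% for testing, so the model is never
        # evaluated on data that precedes its training window.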
        test_timestamps = df['Timestamp'].values[split_idx:]
        model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
        # Hyperparameter overrides apparently passed to train() in an earlier version,
        # kept here for reference:
        # colsample_bytree=1.0, learning_rate=0.05, max_depth=7, n_estimators=200, subsample=0.8
        booster = model.train(eval_metric='rmse')
        model.save_model('../data/xgboost_model_all_features.json')
        test_preds = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, test_preds))
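        # Note: RMSE is computed in log-return space (the model's native target); the
        # metrics below (MAE, R2, MAPE, directional accuracy) are computed on the
        # reconstructed price series, so the two scales are not directly comparable.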
        # Reconstruct price series from log returns
        if 'Close' in df.columns:
            close_prices = df['Close'].values
        else:
            close_prices = pd.read_csv(csv_path)['Close'].values
        # Start from the last close of the training split: y_test[0] is the return from
        # row split_idx - 1 to row split_idx, so compounding from close_prices[split_idx - 1]
        # makes actual_prices line up with the test-period closes.
        start_price = close_prices[split_idx - 1]
        actual_prices = [start_price]
        for r_ in y_test:
            actual_prices.append(actual_prices[-1] * np.exp(r_))
        actual_prices = np.array(actual_prices[1:])
        predicted_prices = [start_price]
        for r_ in test_preds:
            predicted_prices.append(predicted_prices[-1] * np.exp(r_))
        predicted_prices = np.array(predicted_prices[1:])
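        # Equivalent vectorized form of the two reconstruction loops above:
        #   actual_prices    = start_price * np.exp(np.cumsum(y_test))
        #   predicted_prices = start_price * np.exp(np.cumsum(test_preds))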
        mae = mean_absolute_error(actual_prices, predicted_prices)
        r2 = r2_score(actual_prices, predicted_prices)
        direction_actual = np.sign(np.diff(actual_prices))
        direction_pred = np.sign(np.diff(predicted_prices))
        directional_accuracy = (direction_actual == direction_pred).mean()
        mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100
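        # Directional accuracy is the fraction of consecutive steps where the predicted
        # price change has the same sign as the actual change; MAPE is in percent.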
        # Save results to CSV for all features used in this run
        feature_importance_dict = model.get_feature_importance(feature_cols)
        with open(results_csv, 'a', newline='') as f:
            writer = csv.writer(f)
            for feature in feature_cols:
                importance = feature_importance_dict.get(feature, 0.0)
                fi_str = format(importance, '.6f')
                row = [feature]
                # Keep the metric order in sync with the CSV header written above.
                for val in [rmse, mae, r2, mape, directional_accuracy]:
                    if isinstance(val, float):
                        row.append(format(val, '.10f'))
                    else:
                        row.append(val)
                row.append(fi_str)
                writer.writerow(row)
        print('Feature importances and results saved for all features used in this run.')
        # Plotting for this run
        # plot_prefix = f'cumulative_{n}_features'
        # plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
        # plot_direction_transition_heatmap(actual_prices, predicted_prices, prefix=plot_prefix)
    except Exception as e:
        print(f'All-features run failed: {e}')
    print(f'Run completed. Results saved to {results_csv}')
    plot_prefix = 'all_features'
    plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
    sys.exit(0)