model updated

parent 2dba88b620
commit 81e4b640a7
custom_xgboost.py
@@ -31,3 +31,9 @@ class CustomXGBoostGPU:
             raise ValueError('Model not trained yet.')
         dmatrix = xgb.DMatrix(X.astype(np.float32))
         return self.model.predict(dmatrix)
+
+    def save_model(self, file_path):
+        """Save the trained XGBoost model to the specified file path."""
+        if self.model is None:
+            raise ValueError('Model not trained yet.')
+        self.model.save_model(file_path)
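Note: the new save_model writes the booster with XGBoost's native serializer, so a trained model can later be reloaded without retraining. A minimal sketch of the reload path using the standard xgboost API (this counterpart is not part of the commit):

import numpy as np
import xgboost as xgb

def load_booster(file_path):
    # Rebuild a Booster from the file written by CustomXGBoostGPU.save_model().
    booster = xgb.Booster()
    booster.load_model(file_path)
    return booster

# Hypothetical usage:
# booster = load_booster('./data/xgboost_model.json')
# preds = booster.predict(xgb.DMatrix(X_test.astype(np.float32)))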
xgboost/main.py (423 changed lines)
@@ -6,7 +6,7 @@ import numpy as np
 from sklearn.model_selection import train_test_split
 from custom_xgboost import CustomXGBoostGPU
 from sklearn.metrics import mean_squared_error
-from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns
+from plot_results import display_actual_vs_predicted, plot_target_distribution, plot_predicted_vs_actual_log_returns, plot_predicted_vs_actual_prices
 import ta
 from cycles.supertrend import Supertrends
 from ta.trend import SMAIndicator, DPOIndicator, IchimokuIndicator, PSARIndicator
@@ -14,7 +14,6 @@ from ta.momentum import ROCIndicator, KAMAIndicator, UltimateOscillator, Stochas
 from ta.volatility import KeltnerChannel, DonchianChannel
 from ta.others import DailyReturnIndicator
 import time
-import concurrent.futures
 from numba import njit
 
 def run_indicator(func, *args):
@@ -101,13 +100,9 @@ def calc_momentum(close):
     return ('momentum_10', close - close.shift(10))
 
 def calc_psar(high, low, close):
-    from ta.trend import PSARIndicator
-    psar = PSARIndicator(high, low, close)
-    return [
-        ('psar', psar.psar()),
-        ('psar_up', psar.psar_up()),
-        ('psar_down', psar.psar_down())
-    ]
+    # Use the Numba-accelerated fast_psar function for speed
+    psar_values = fast_psar(np.array(high), np.array(low), np.array(close))
+    return [('psar', pd.Series(psar_values, index=close.index))]
 
 def calc_donchian(high, low, close):
     from ta.volatility import DonchianChannel
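Note: the rewritten calc_psar calls fast_psar, which is not shown in this hunk. A rough sketch of what a Numba-jitted Parabolic SAR of this shape could look like (an assumed implementation with the conventional 0.02/0.2 acceleration settings, not the repository's actual fast_psar):

from numba import njit
import numpy as np

@njit
def fast_psar(high, low, close, af_step=0.02, af_max=0.2):
    # Simplified Parabolic SAR: one pass over the series, flipping trend
    # when price crosses the SAR, accelerating toward the extreme point.
    n = len(close)
    psar = np.empty(n)
    psar[0] = close[0]
    uptrend = True
    af = af_step
    ep = high[0]  # extreme point of the current trend
    for i in range(1, n):
        psar[i] = psar[i - 1] + af * (ep - psar[i - 1])
        if uptrend:
            if low[i] < psar[i]:       # reversal to downtrend
                uptrend = False
                psar[i] = ep
                ep = low[i]
                af = af_step
            elif high[i] > ep:         # new high: accelerate
                ep = high[i]
                af = min(af + af_step, af_max)
        else:
            if high[i] > psar[i]:      # reversal to uptrend
                uptrend = True
                psar[i] = ep
                ep = high[i]
                af = af_step
            elif low[i] < ep:          # new low: accelerate
                ep = low[i]
                af = min(af + af_step, af_max)
    return psar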
@@ -220,17 +215,18 @@ if __name__ == '__main__':
     csv_path = './data/btcusd_1-min_data.csv'
     csv_prefix = os.path.splitext(os.path.basename(csv_path))[0]
 
+    print('Reading CSV and filtering data...')
     df = pd.read_csv(csv_path)
     df = df[df['Volume'] != 0]
 
     min_date = '2017-06-01'
+    print('Converting Timestamp and filtering by date...')
     df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
     df = df[df['Timestamp'] >= min_date]
 
     lags = 3
 
     print('Calculating log returns as the new target...')
 
     df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
 
     ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
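Note: the training target is the one-step log return, log(Close_t / Close_{t-1}). Log returns are additive over time, so the same column can equivalently be computed as a diff of log prices, and a k-step return is a rolling sum of one-step returns; a small illustrative equivalence check (not code from the commit):

import numpy as np
import pandas as pd

close = pd.Series([100.0, 101.0, 99.5, 102.0])

log_return = np.log(close / close.shift(1))   # as in main.py
log_return_alt = np.log(close).diff()         # equivalent formulation
assert np.allclose(log_return.dropna(), log_return_alt.dropna())

# A 2-step log return is the sum of two consecutive 1-step returns.
log_return_2 = np.log(close / close.shift(2))
assert np.allclose(log_return_2.dropna(), log_return.rolling(2).sum().dropna())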
@@ -243,55 +239,282 @@ if __name__ == '__main__':
 
     # --- Technical Indicator Features: Calculate or Load from Cache ---
     print('Calculating or loading technical indicator features...')
-    indicator_jobs = [
-        ('rsi', calc_rsi, [df['Close']]),
-        ('macd', calc_macd, [df['Close']]),
-        ('atr', calc_atr, [df['High'], df['Low'], df['Close']]),
-        ('cci', calc_cci, [df['High'], df['Low'], df['Close']]),
-        ('williams_r', calc_williamsr, [df['High'], df['Low'], df['Close']]),
-        ('ema_14', calc_ema, [df['Close']]),
-        ('obv', calc_obv, [df['Close'], df['Volume']]),
-        ('cmf', calc_cmf, [df['High'], df['Low'], df['Close'], df['Volume']]),
-        ('roc_10', calc_roc, [df['Close']]),
-        ('dpo_20', calc_dpo, [df['Close']]),
-        ('ultimate_osc', calc_ultimate, [df['High'], df['Low'], df['Close']]),
-        ('daily_return', calc_daily_return, [df['Close']]),
-    ]
+    # RSI
+    feature_file = f'./data/{csv_prefix}_rsi.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['rsi'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: rsi')
+        _, values = calc_rsi(df['Close'])
+        features_dict['rsi'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # MACD
+    feature_file = f'./data/{csv_prefix}_macd.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['macd'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: macd')
+        _, values = calc_macd(df['Close'])
+        features_dict['macd'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # ATR
+    feature_file = f'./data/{csv_prefix}_atr.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['atr'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: atr')
+        _, values = calc_atr(df['High'], df['Low'], df['Close'])
+        features_dict['atr'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # CCI
+    feature_file = f'./data/{csv_prefix}_cci.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['cci'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: cci')
+        _, values = calc_cci(df['High'], df['Low'], df['Close'])
+        features_dict['cci'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # Williams %R
+    feature_file = f'./data/{csv_prefix}_williams_r.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['williams_r'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: williams_r')
+        _, values = calc_williamsr(df['High'], df['Low'], df['Close'])
+        features_dict['williams_r'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # EMA 14
+    feature_file = f'./data/{csv_prefix}_ema_14.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['ema_14'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: ema_14')
+        _, values = calc_ema(df['Close'])
+        features_dict['ema_14'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # OBV
+    feature_file = f'./data/{csv_prefix}_obv.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['obv'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: obv')
+        _, values = calc_obv(df['Close'], df['Volume'])
+        features_dict['obv'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # CMF
+    feature_file = f'./data/{csv_prefix}_cmf.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['cmf'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: cmf')
+        _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
+        features_dict['cmf'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # ROC 10
+    feature_file = f'./data/{csv_prefix}_roc_10.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['roc_10'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: roc_10')
+        _, values = calc_roc(df['Close'])
+        features_dict['roc_10'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # DPO 20
+    feature_file = f'./data/{csv_prefix}_dpo_20.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['dpo_20'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: dpo_20')
+        _, values = calc_dpo(df['Close'])
+        features_dict['dpo_20'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # Ultimate Oscillator
+    feature_file = f'./data/{csv_prefix}_ultimate_osc.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['ultimate_osc'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: ultimate_osc')
+        _, values = calc_ultimate(df['High'], df['Low'], df['Close'])
+        features_dict['ultimate_osc'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
+    # Daily Return
+    feature_file = f'./data/{csv_prefix}_daily_return.npy'
+    if os.path.exists(feature_file):
+        print(f'A Loading cached feature: {feature_file}')
+        arr = np.load(feature_file)
+        features_dict['daily_return'] = pd.Series(arr, index=df.index)
+    else:
+        print('Calculating feature: daily_return')
+        _, values = calc_daily_return(df['Close'])
+        features_dict['daily_return'] = values
+        np.save(feature_file, values.values)
+        print(f'Saved feature: {feature_file}')
+
     # Multi-column indicators
-    multi_indicator_jobs = [
-        ('bollinger', calc_bollinger, [df['Close']]),
-        ('stochastic', calc_stochastic, [df['High'], df['Low'], df['Close']]),
-        ('sma', calc_sma, [df['Close']]),
-        ('psar', calc_psar, [df['High'], df['Low'], df['Close']]),
-        ('donchian', calc_donchian, [df['High'], df['Low'], df['Close']]),
-        ('keltner', calc_keltner, [df['High'], df['Low'], df['Close']]),
-        ('ichimoku', calc_ichimoku, [df['High'], df['Low']]),
-        ('elder_ray', calc_elder_ray, [df['Close'], df['Low'], df['High']]),
-    ]
-    for feature_name, func, args in indicator_jobs:
-        feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
-        if os.path.exists(feature_file):
-            print(f'Loading cached feature: {feature_file}')
-            features_dict[feature_name] = np.load(feature_file)
-        else:
-            result = func(*args)
-            if isinstance(result, tuple):
-                _, values = result
-                features_dict[feature_name] = values
-                np.save(feature_file, values.values)
-            else:
-                raise ValueError(f"Unexpected result for {feature_name}")
-    for feature_name, func, args in multi_indicator_jobs:
-        # These return a list of (name, values)
-        result = func(*args)
-        for subname, values in result:
-            sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
-            if os.path.exists(sub_feature_file):
-                print(f'Loading cached feature: {sub_feature_file}')
-                features_dict[subname] = np.load(sub_feature_file)
-            else:
-                features_dict[subname] = values
-                np.save(sub_feature_file, values.values)
+    # Bollinger Bands
+    print('Calculating multi-column indicator: bollinger')
+    result = calc_bollinger(df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Stochastic Oscillator
+    print('Calculating multi-column indicator: stochastic')
+    result = calc_stochastic(df['High'], df['Low'], df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # SMA
+    print('Calculating multi-column indicator: sma')
+    result = calc_sma(df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # PSAR
+    print('Calculating multi-column indicator: psar')
+    result = calc_psar(df['High'], df['Low'], df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Donchian Channel
+    print('Calculating multi-column indicator: donchian')
+    result = calc_donchian(df['High'], df['Low'], df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Keltner Channel
+    print('Calculating multi-column indicator: keltner')
+    result = calc_keltner(df['High'], df['Low'], df['Close'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Ichimoku
+    print('Calculating multi-column indicator: ichimoku')
+    result = calc_ichimoku(df['High'], df['Low'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
+
+    # Elder Ray
+    print('Calculating multi-column indicator: elder_ray')
+    result = calc_elder_ray(df['Close'], df['Low'], df['High'])
+    for subname, values in result:
+        print(f"Adding subfeature: {subname}")
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        if os.path.exists(sub_feature_file):
+            print(f'B Loading cached feature: {sub_feature_file}')
+            arr = np.load(sub_feature_file)
+            features_dict[subname] = pd.Series(arr, index=df.index)
+        else:
+            features_dict[subname] = values
+            np.save(sub_feature_file, values.values)
+            print(f'Saved feature: {sub_feature_file}')
 
     # Prepare jobs for lags, rolling stats, log returns, and volatility
     feature_jobs = []
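Note: every block in the hunk above repeats the same load-or-compute-then-save pattern around a .npy cache file. A helper of the following shape could express that pattern once; this is a possible refactor sketch, not code from the commit (the cache_dir/csv_prefix naming simply mirrors the diff):

import os
import numpy as np
import pandas as pd

def cached_feature(name, compute, index, csv_prefix, cache_dir='./data'):
    """Load a feature from its .npy cache, or compute, cache, and return it."""
    path = os.path.join(cache_dir, f'{csv_prefix}_{name}.npy')
    if os.path.exists(path):
        print(f'Loading cached feature: {path}')
        return pd.Series(np.load(path), index=index)
    print(f'Calculating feature: {name}')
    values = compute()                    # e.g. lambda: calc_rsi(df['Close'])[1]
    np.save(path, values.values)
    print(f'Saved feature: {path}')
    return values

# Hypothetical usage replacing one of the per-indicator blocks:
# features_dict['rsi'] = cached_feature('rsi', lambda: calc_rsi(df['Close'])[1],
#                                       df.index, csv_prefix)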
@@ -301,9 +524,10 @@ if __name__ == '__main__':
             feature_name = f'{col}_lag{lag}'
             feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
             if os.path.exists(feature_file):
-                print(f'Loading cached feature: {feature_file}')
+                print(f'C Loading cached feature: {feature_file}')
                 features_dict[feature_name] = np.load(feature_file)
             else:
+                print(f'Adding lag feature job: {feature_name}')
                 feature_jobs.append((feature_name, compute_lag, col, lag))
     # Rolling statistics
     for col in ohlcv_cols:
@@ -320,48 +544,56 @@ if __name__ == '__main__':
                 feature_name = f'{col}_roll_{stat}_{window}'
                 feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
                 if os.path.exists(feature_file):
-                    print(f'Loading cached feature: {feature_file}')
+                    print(f'D Loading cached feature: {feature_file}')
                     features_dict[feature_name] = np.load(feature_file)
                 else:
+                    print(f'Adding rolling stat feature job: {feature_name}')
                     feature_jobs.append((feature_name, compute_rolling, col, stat, window))
     # Log returns for different horizons
     for horizon in [5, 15, 30]:
         feature_name = f'log_return_{horizon}'
         feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
         if os.path.exists(feature_file):
-            print(f'Loading cached feature: {feature_file}')
+            print(f'E Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
+            print(f'Adding log return feature job: {feature_name}')
             feature_jobs.append((feature_name, compute_log_return, horizon))
     # Volatility
     for window in window_sizes:
         feature_name = f'volatility_{window}'
         feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
         if os.path.exists(feature_file):
-            print(f'Loading cached feature: {feature_file}')
+            print(f'F Loading cached feature: {feature_file}')
             features_dict[feature_name] = np.load(feature_file)
         else:
+            print(f'Adding volatility feature job: {feature_name}')
            feature_jobs.append((feature_name, compute_volatility, window))
 
-    # Parallel computation for all non-cached features
+    # Sequential computation for all non-cached features
     if feature_jobs:
-        print(f'Computing {len(feature_jobs)} features in parallel...')
-        with concurrent.futures.ProcessPoolExecutor() as executor:
-            futures = [executor.submit(run_feature_job, job, df) for job in feature_jobs]
-            for future in concurrent.futures.as_completed(futures):
-                feature_name, result = future.result()
-                features_dict[feature_name] = result
-                feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
-                np.save(feature_file, result.values)
-        print('All parallel features computed.')
+        print(f'Computing {len(feature_jobs)} features sequentially...')
+        for job in feature_jobs:
+            print(f'Computing feature job: {job[0]}')
+            feature_name, result = run_feature_job(job, df)
+            features_dict[feature_name] = result
+            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
+            np.save(feature_file, result.values)
+            print(f'Saved computed feature: {feature_file}')
+        print('All features computed.')
     else:
         print('All features loaded from cache.')
 
     # Concatenate all new features at once
     print('Concatenating all new features to DataFrame...')
     features_df = pd.DataFrame(features_dict)
+    print("Columns in features_df:", features_df.columns.tolist())
+    print("All-NaN columns in features_df:", features_df.columns[features_df.isna().all()].tolist())
     df = pd.concat([df, features_df], axis=1)
 
+    # Print all columns after concatenation
+    print("All columns in df after concat:", df.columns.tolist())
+
     # Downcast all float columns to save memory
     print('Downcasting float columns to save memory...')
     for col in df.columns:
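Note: run_feature_job is referenced above but defined elsewhere in main.py. Judging from the job tuples built earlier, such as (name, compute_lag, col, lag) and (name, compute_volatility, window), it presumably unpacks a job and applies the compute function to df. A guessed sketch, not the actual implementation:

def run_feature_job(job, df):
    # job = (feature_name, compute_func, *args); the compute_* functions are
    # assumed to take df first and return a pandas Series.
    feature_name, func, *args = job
    return feature_name, func(df, *args)

Running the jobs sequentially instead of through ProcessPoolExecutor also avoids pickling the full DataFrame into every worker process, which is likely the motivation for this change.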
@@ -371,6 +603,7 @@ if __name__ == '__main__':
             pass
 
     # Drop intermediate features_df to free memory
+    print('Dropping intermediate features_df to free memory...')
     del features_df
     import gc
     gc.collect()
@@ -408,6 +641,10 @@ if __name__ == '__main__':
     print('Selecting feature columns...')
     exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
     feature_cols = [col for col in df.columns if col not in exclude_cols]
+
+    # Print the features used for training
+    print("Features used for training:", feature_cols)
+
     # Drop excluded columns to save memory
     print('Dropping excluded columns to save memory...')
     df = df[feature_cols + ['log_return', 'Timestamp']]
|||||||
y = df['log_return'].values.astype(np.float32)
|
y = df['log_return'].values.astype(np.float32)
|
||||||
|
|
||||||
split_idx = int(len(X) * 0.8)
|
split_idx = int(len(X) * 0.8)
|
||||||
|
print(f'Splitting data: {split_idx} train, {len(X) - split_idx} test')
|
||||||
X_train, X_test = X[:split_idx], X[split_idx:]
|
X_train, X_test = X[:split_idx], X[split_idx:]
|
||||||
y_train, y_test = y[:split_idx], y[split_idx:]
|
y_train, y_test = y[:split_idx], y[split_idx:]
|
||||||
test_timestamps = df['Timestamp'].values[split_idx:]
|
test_timestamps = df['Timestamp'].values[split_idx:]
|
||||||
@ -428,7 +666,11 @@ if __name__ == '__main__':
|
|||||||
booster = model.train()
|
booster = model.train()
|
||||||
|
|
||||||
print('Training complete.')
|
print('Training complete.')
|
||||||
|
|
||||||
|
# Save the trained model
|
||||||
|
model.save_model('./data/xgboost_model.json')
|
||||||
|
print('Model saved to ./data/xgboost_model.json')
|
||||||
|
|
||||||
if hasattr(model, 'params'):
|
if hasattr(model, 'params'):
|
||||||
print("Model hyperparameters:", model.params)
|
print("Model hyperparameters:", model.params)
|
||||||
if hasattr(model, 'model') and hasattr(model.model, 'get_score'):
|
if hasattr(model, 'model') and hasattr(model.model, 'get_score'):
|
||||||
@@ -441,18 +683,49 @@ if __name__ == '__main__':
         for feat, score in sorted_importances:
             print(f'{feature_map.get(feat, feat)}: {score}')
 
+    print('Making predictions for first 5 test samples...')
     preds = model.predict(X_test[:5])
     print('Predictions for first 5 test samples:', preds)
     print('Actual values for first 5 test samples:', y_test[:5])
 
+    print('Making predictions for all test samples...')
     test_preds = model.predict(X_test)
     rmse = np.sqrt(mean_squared_error(y_test, test_preds))
     print(f'RMSE on test set: {rmse:.4f}')
 
+    print('Saving y_test and test_preds to disk...')
     np.save('./data/y_test.npy', y_test)
     np.save('./data/test_preds.npy', test_preds)
 
-    # display_actual_vs_predicted(y_test, test_preds, test_timestamps)
-    # plot_target_distribution(y_train, y_test)
-    plot_predicted_vs_actual_log_returns(y_test, test_preds, test_timestamps)
+    # Reconstruct price series from log returns
+    print('Reconstructing price series from log returns...')
+    # Get the last available Close price before the test set
+    # The DataFrame df has been reset, so use split_idx to get the right row
+    if 'Close' in df.columns:
+        close_prices = df['Close'].values
+    else:
+        # Reload original CSV to get Close prices if not present
+        close_prices = pd.read_csv(csv_path)['Close'].values
+    start_price = close_prices[split_idx]  # This is the price at the split point
+    # Actual prices
+    actual_prices = [start_price]
+    for r in y_test:
+        actual_prices.append(actual_prices[-1] * np.exp(r))
+    actual_prices = np.array(actual_prices[1:])
+    # Predicted prices
+    predicted_prices = [start_price]
+    for r in test_preds:
+        predicted_prices.append(predicted_prices[-1] * np.exp(r))
+    predicted_prices = np.array(predicted_prices[1:])
+
+    print('Plotting predicted vs actual prices...')
+    plot_predicted_vs_actual_prices(actual_prices, predicted_prices, test_timestamps)
+
+    print("Final features used for training:", feature_cols)
+
+    print("Shape of X:", X.shape)
+    print("First row of X:", X[0])
+    print("stoch_k in feature_cols?", "stoch_k" in feature_cols)
+    if "stoch_k" in feature_cols:
+        idx = feature_cols.index("stoch_k")
+        print("First 10 values of stoch_k:", X[:10, idx])
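Note: the loops above rebuild prices step by step as P_t = P_{t-1} * exp(r_t). Because log returns add, the same series can be produced in one vectorized step; a sketch equivalent to the loop version, assuming the same start_price and return arrays:

import numpy as np

def reconstruct_prices(start_price, log_returns):
    # price_t = start_price * exp(r_1 + r_2 + ... + r_t)
    return start_price * np.exp(np.cumsum(log_returns))

# Hypothetical usage mirroring the loop-based code:
# actual_prices = reconstruct_prices(start_price, y_test)
# predicted_prices = reconstruct_prices(start_price, test_preds)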
plot_results.py
@@ -109,3 +109,61 @@ def plot_predicted_vs_actual_log_returns(y_test, test_preds, timestamps=None, n_
     )
     fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
     pyo.plot(fig_scatter, filename='log_return_scatter_plot.html')
+
+def plot_predicted_vs_actual_prices(actual_prices, predicted_prices, timestamps=None, n_plot=200):
+    import plotly.offline as pyo
+    import plotly.graph_objs as go
+    n_plot = min(n_plot, len(actual_prices))
+    actual = actual_prices[:n_plot]
+    predicted = predicted_prices[:n_plot]
+    if timestamps is not None:
+        x_axis = timestamps[:n_plot]
+        x_label = 'Timestamp'
+    else:
+        x_axis = list(range(n_plot))
+        x_label = 'Index'
+
+    # Line plot: Actual vs Predicted over time
+    trace_actual = go.Scatter(x=x_axis, y=actual, mode='lines', name='Actual Price')
+    trace_predicted = go.Scatter(x=x_axis, y=predicted, mode='lines', name='Predicted Price')
+    data_line = [trace_actual, trace_predicted]
+    layout_line = go.Layout(
+        title='Actual vs. Predicted BTC Prices (Test Set)',
+        xaxis={'title': x_label},
+        yaxis={'title': 'BTC Price'},
+        legend={'x': 0, 'y': 1},
+        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
+        hovermode='closest'
+    )
+    fig_line = go.Figure(data=data_line, layout=layout_line)
+    pyo.plot(fig_line, filename='price_line_plot.html')
+
+    # Scatter plot: Predicted vs Actual
+    trace_scatter = go.Scatter(
+        x=actual,
+        y=predicted,
+        mode='markers',
+        name='Predicted vs Actual',
+        opacity=0.5
+    )
+    # Diagonal reference line
+    min_val = min(np.min(actual), np.min(predicted))
+    max_val = max(np.max(actual), np.max(predicted))
+    trace_diag = go.Scatter(
+        x=[min_val, max_val],
+        y=[min_val, max_val],
+        mode='lines',
+        name='Ideal',
+        line=dict(dash='dash', color='red')
+    )
+    data_scatter = [trace_scatter, trace_diag]
+    layout_scatter = go.Layout(
+        title='Predicted vs Actual Prices (Scatter)',
+        xaxis={'title': 'Actual Price'},
+        yaxis={'title': 'Predicted Price'},
+        showlegend=True,
+        margin={'l': 40, 'b': 40, 't': 40, 'r': 10},
+        hovermode='closest'
+    )
+    fig_scatter = go.Figure(data=data_scatter, layout=layout_scatter)
+    pyo.plot(fig_scatter, filename='price_scatter_plot.html')