diff --git a/.gitignore b/.gitignore index f102429..1bff80b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,181 +1,181 @@ -# ---> Python -/data/*.db -/credentials/*.json -*.csv -*.png -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class -/data/*.npy - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# UV -# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -#uv.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ - -An introduction to trading cycles.pdf -An introduction to trading cycles.txt -README.md -.vscode/launch.json -data/btcusd_1-day_data.csv -data/btcusd_1-min_data.csv +# ---> Python +/data/*.db +/credentials/*.json +*.csv +*.png +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class +/data/*.npy + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +#uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
+#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +An introduction to trading cycles.pdf +An introduction to trading cycles.txt +README.md +.vscode/launch.json +data/btcusd_1-day_data.csv +data/btcusd_1-min_data.csv diff --git a/xgboost/custom_xgboost.py b/xgboost/custom_xgboost.py index dd5a58a..1e9cf69 100644 --- a/xgboost/custom_xgboost.py +++ b/xgboost/custom_xgboost.py @@ -1,39 +1,39 @@ -import xgboost as xgb -import numpy as np - -class CustomXGBoostGPU: - def __init__(self, X_train, X_test, y_train, y_test): - self.X_train = X_train.astype(np.float32) - self.X_test = X_test.astype(np.float32) - self.y_train = y_train.astype(np.float32) - self.y_test = y_test.astype(np.float32) - self.model = None - self.params = None # Will be set during training - - def train(self, **xgb_params): - params = { - 'tree_method': 'hist', - 'device': 'cuda', - 'objective': 'reg:squarederror', - 'eval_metric': 'rmse', - 'verbosity': 1, - } - params.update(xgb_params) - self.params = params # Store params for later access - dtrain = xgb.DMatrix(self.X_train, label=self.y_train) - dtest = xgb.DMatrix(self.X_test, label=self.y_test) - evals = [(dtrain, 'train'), (dtest, 'eval')] - self.model = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10) - return self.model - - def predict(self, X): - if self.model is None: - raise ValueError('Model not trained yet.') - dmatrix = xgb.DMatrix(X.astype(np.float32)) - return self.model.predict(dmatrix) - - def save_model(self, file_path): - """Save the trained XGBoost model to the specified file path.""" - if self.model is None: - raise ValueError('Model not trained yet.') - self.model.save_model(file_path) +import xgboost as xgb +import numpy as np + +class CustomXGBoostGPU: + def __init__(self, X_train, X_test, y_train, y_test): + self.X_train = X_train.astype(np.float32) + self.X_test = X_test.astype(np.float32) + self.y_train = y_train.astype(np.float32) + self.y_test = y_test.astype(np.float32) + self.model = None + self.params = None # Will be set during training + + def train(self, **xgb_params): + params = { + 'tree_method': 'hist', + 'device': 'cuda', + 'objective': 'reg:squarederror', + 'eval_metric': 'rmse', + 'verbosity': 1, + } + params.update(xgb_params) + self.params = params # Store params for later access + dtrain = xgb.DMatrix(self.X_train, label=self.y_train) + dtest = xgb.DMatrix(self.X_test, 
label=self.y_test) + evals = [(dtrain, 'train'), (dtest, 'eval')] + self.model = xgb.train(params, dtrain, num_boost_round=100, evals=evals, early_stopping_rounds=10) + return self.model + + def predict(self, X): + if self.model is None: + raise ValueError('Model not trained yet.') + dmatrix = xgb.DMatrix(X.astype(np.float32)) + return self.model.predict(dmatrix) + + def save_model(self, file_path): + """Save the trained XGBoost model to the specified file path.""" + if self.model is None: + raise ValueError('Model not trained yet.') + self.model.save_model(file_path) diff --git a/xgboost/main.py b/xgboost/main.py index c502683..fbfea2b 100644 --- a/xgboost/main.py +++ b/xgboost/main.py @@ -1,806 +1,806 @@ -import sys -import os -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -import pandas as pd -import numpy as np -from custom_xgboost import CustomXGBoostGPU -from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score -from plot_results import plot_prediction_error_distribution, plot_direction_transition_heatmap -from cycles.supertrend import Supertrends -import time -from numba import njit -import itertools -import csv -import pandas_ta as ta - -def run_indicator(func, *args): - return func(*args) - -def run_indicator_job(job): - import time - func, *args = job - indicator_name = func.__name__ - start = time.time() - result = func(*args) - elapsed = time.time() - start - print(f'Indicator {indicator_name} computed in {elapsed:.4f} seconds') - return result - -def calc_rsi(close): - from ta.momentum import RSIIndicator - return ('rsi', RSIIndicator(close, window=14).rsi()) - -def calc_macd(close): - from ta.trend import MACD - return ('macd', MACD(close).macd()) - -def calc_bollinger(close): - from ta.volatility import BollingerBands - bb = BollingerBands(close=close, window=20, window_dev=2) - return [ - ('bb_bbm', bb.bollinger_mavg()), - ('bb_bbh', bb.bollinger_hband()), - ('bb_bbl', bb.bollinger_lband()), - ('bb_bb_width', bb.bollinger_hband() - bb.bollinger_lband()) - ] - -def calc_stochastic(high, low, close): - from ta.momentum import StochasticOscillator - stoch = StochasticOscillator(high=high, low=low, close=close, window=14, smooth_window=3) - return [ - ('stoch_k', stoch.stoch()), - ('stoch_d', stoch.stoch_signal()) - ] - -def calc_atr(high, low, close): - from ta.volatility import AverageTrueRange - atr = AverageTrueRange(high=high, low=low, close=close, window=14) - return ('atr', atr.average_true_range()) - -def calc_cci(high, low, close): - from ta.trend import CCIIndicator - cci = CCIIndicator(high=high, low=low, close=close, window=20) - return ('cci', cci.cci()) - -def calc_williamsr(high, low, close): - from ta.momentum import WilliamsRIndicator - willr = WilliamsRIndicator(high=high, low=low, close=close, lbp=14) - return ('williams_r', willr.williams_r()) - -def calc_ema(close): - from ta.trend import EMAIndicator - ema = EMAIndicator(close=close, window=14) - return ('ema_14', ema.ema_indicator()) - -def calc_obv(close, volume): - from ta.volume import OnBalanceVolumeIndicator - obv = OnBalanceVolumeIndicator(close=close, volume=volume) - return ('obv', obv.on_balance_volume()) - -def calc_cmf(high, low, close, volume): - from ta.volume import ChaikinMoneyFlowIndicator - cmf = ChaikinMoneyFlowIndicator(high=high, low=low, close=close, volume=volume, window=20) - return ('cmf', cmf.chaikin_money_flow()) - -def calc_sma(close): - from ta.trend import SMAIndicator - return [ - ('sma_50', SMAIndicator(close, 
window=50).sma_indicator()), - ('sma_200', SMAIndicator(close, window=200).sma_indicator()) - ] - -def calc_roc(close): - from ta.momentum import ROCIndicator - return ('roc_10', ROCIndicator(close, window=10).roc()) - -def calc_momentum(close): - return ('momentum_10', close - close.shift(10)) - -def calc_psar(high, low, close): - # Use the Numba-accelerated fast_psar function for speed - psar_values = fast_psar(np.array(high), np.array(low), np.array(close)) - return [('psar', pd.Series(psar_values, index=close.index))] - -def calc_donchian(high, low, close): - from ta.volatility import DonchianChannel - donchian = DonchianChannel(high, low, close, window=20) - return [ - ('donchian_hband', donchian.donchian_channel_hband()), - ('donchian_lband', donchian.donchian_channel_lband()), - ('donchian_mband', donchian.donchian_channel_mband()) - ] - -def calc_keltner(high, low, close): - from ta.volatility import KeltnerChannel - keltner = KeltnerChannel(high, low, close, window=20) - return [ - ('keltner_hband', keltner.keltner_channel_hband()), - ('keltner_lband', keltner.keltner_channel_lband()), - ('keltner_mband', keltner.keltner_channel_mband()) - ] - -def calc_dpo(close): - from ta.trend import DPOIndicator - return ('dpo_20', DPOIndicator(close, window=20).dpo()) - -def calc_ultimate(high, low, close): - from ta.momentum import UltimateOscillator - return ('ultimate_osc', UltimateOscillator(high, low, close).ultimate_oscillator()) - -def calc_ichimoku(high, low): - from ta.trend import IchimokuIndicator - ichimoku = IchimokuIndicator(high, low, window1=9, window2=26, window3=52) - return [ - ('ichimoku_a', ichimoku.ichimoku_a()), - ('ichimoku_b', ichimoku.ichimoku_b()), - ('ichimoku_base_line', ichimoku.ichimoku_base_line()), - ('ichimoku_conversion_line', ichimoku.ichimoku_conversion_line()) - ] - -def calc_elder_ray(close, low, high): - from ta.trend import EMAIndicator - ema = EMAIndicator(close, window=13).ema_indicator() - return [ - ('elder_ray_bull', ema - low), - ('elder_ray_bear', ema - high) - ] - -def calc_daily_return(close): - from ta.others import DailyReturnIndicator - return ('daily_return', DailyReturnIndicator(close).daily_return()) - -@njit -def fast_psar(high, low, close, af=0.02, max_af=0.2): - length = len(close) - psar = np.zeros(length) - bull = True - af_step = af - ep = low[0] - psar[0] = low[0] - for i in range(1, length): - prev_psar = psar[i-1] - if bull: - psar[i] = prev_psar + af_step * (ep - prev_psar) - if low[i] < psar[i]: - bull = False - psar[i] = ep - af_step = af - ep = low[i] - else: - if high[i] > ep: - ep = high[i] - af_step = min(af_step + af, max_af) - else: - psar[i] = prev_psar + af_step * (ep - prev_psar) - if high[i] > psar[i]: - bull = True - psar[i] = ep - af_step = af - ep = high[i] - else: - if low[i] < ep: - ep = low[i] - af_step = min(af_step + af, max_af) - return psar - -def compute_lag(df, col, lag): - return df[col].shift(lag) - -def compute_rolling(df, col, stat, window): - if stat == 'mean': - return df[col].rolling(window).mean() - elif stat == 'std': - return df[col].rolling(window).std() - elif stat == 'min': - return df[col].rolling(window).min() - elif stat == 'max': - return df[col].rolling(window).max() - -def compute_log_return(df, horizon): - return np.log(df['Close'] / df['Close'].shift(horizon)) - -def compute_volatility(df, window): - return df['log_return'].rolling(window).std() - -def run_feature_job(job, df): - feature_name, func, *args = job - print(f'Computing feature: {feature_name}') - result = func(df, *args) 
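# The compute_lag / compute_rolling / compute_log_return / compute_volatility
# helpers above all take the frame and return one derived Series. A minimal
# self-contained sketch of how they compose, on a synthetic frame
# (illustrative only; the real script reads ./data/btcusd_1-min_data.csv):
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
toy = pd.DataFrame({'Close': 100.0 * np.exp(np.cumsum(rng.normal(0.0, 0.001, 500)))})
toy['Close_lag1'] = toy['Close'].shift(1)                           # compute_lag
toy['Close_roll_mean_5'] = toy['Close'].rolling(5).mean()           # compute_rolling
toy['log_return'] = np.log(toy['Close'] / toy['Close'].shift(1))
toy['log_return_5'] = np.log(toy['Close'] / toy['Close'].shift(5))  # compute_log_return
toy['volatility_5'] = toy['log_return'].rolling(5).std()            # compute_volatility
print(toy.dropna().head())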
- return feature_name, result - -def calc_adx(high, low, close): - from ta.trend import ADXIndicator - adx = ADXIndicator(high=high, low=low, close=close, window=14) - return [ - ('adx', adx.adx()), - ('adx_pos', adx.adx_pos()), - ('adx_neg', adx.adx_neg()) - ] - -def calc_trix(close): - from ta.trend import TRIXIndicator - trix = TRIXIndicator(close=close, window=15) - return ('trix', trix.trix()) - -def calc_vortex(high, low, close): - from ta.trend import VortexIndicator - vortex = VortexIndicator(high=high, low=low, close=close, window=14) - return [ - ('vortex_pos', vortex.vortex_indicator_pos()), - ('vortex_neg', vortex.vortex_indicator_neg()) - ] - -def calc_kama(close): - import pandas_ta as ta - kama = ta.kama(close, length=10) - return ('kama', kama) - -def calc_force_index(close, volume): - from ta.volume import ForceIndexIndicator - fi = ForceIndexIndicator(close=close, volume=volume, window=13) - return ('force_index', fi.force_index()) - -def calc_eom(high, low, volume): - from ta.volume import EaseOfMovementIndicator - eom = EaseOfMovementIndicator(high=high, low=low, volume=volume, window=14) - return ('eom', eom.ease_of_movement()) - -def calc_mfi(high, low, close, volume): - from ta.volume import MFIIndicator - mfi = MFIIndicator(high=high, low=low, close=close, volume=volume, window=14) - return ('mfi', mfi.money_flow_index()) - -def calc_adi(high, low, close, volume): - from ta.volume import AccDistIndexIndicator - adi = AccDistIndexIndicator(high=high, low=low, close=close, volume=volume) - return ('adi', adi.acc_dist_index()) - -def calc_tema(close): - import pandas_ta as ta - tema = ta.tema(close, length=10) - return ('tema', tema) - -def calc_stochrsi(close): - from ta.momentum import StochRSIIndicator - stochrsi = StochRSIIndicator(close=close, window=14, smooth1=3, smooth2=3) - return [ - ('stochrsi', stochrsi.stochrsi()), - ('stochrsi_k', stochrsi.stochrsi_k()), - ('stochrsi_d', stochrsi.stochrsi_d()) - ] - -def calc_awesome_oscillator(high, low): - from ta.momentum import AwesomeOscillatorIndicator - ao = AwesomeOscillatorIndicator(high=high, low=low, window1=5, window2=34) - return ('awesome_osc', ao.awesome_oscillator()) - -if __name__ == '__main__': - IMPUTE_NANS = True # Set to True to impute NaNs, False to drop rows with NaNs - csv_path = './data/btcusd_1-min_data.csv' - csv_prefix = os.path.splitext(os.path.basename(csv_path))[0] - - print('Reading CSV and filtering data...') - df = pd.read_csv(csv_path) - df = df[df['Volume'] != 0] - - min_date = '2017-06-01' - print('Converting Timestamp and filtering by date...') - df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s') - df = df[df['Timestamp'] >= min_date] - - lags = 3 - - print('Calculating log returns as the new target...') - df['log_return'] = np.log(df['Close'] / df['Close'].shift(1)) - - ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume'] - window_sizes = [5, 15, 30] # in minutes, adjust as needed - - features_dict = {} - - print('Starting feature computation...') - feature_start_time = time.time() - - # --- Technical Indicator Features: Calculate or Load from Cache --- - print('Calculating or loading technical indicator features...') - # RSI - feature_file = f'./data/{csv_prefix}_rsi.npy' - if os.path.exists(feature_file): - print(f'A Loading cached feature: {feature_file}') - arr = np.load(feature_file) - features_dict['rsi'] = pd.Series(arr, index=df.index) - else: - print('Calculating feature: rsi') - _, values = calc_rsi(df['Close']) - features_dict['rsi'] = values - 
np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') - - # MACD - feature_file = f'./data/{csv_prefix}_macd.npy' - if os.path.exists(feature_file): - print(f'A Loading cached feature: {feature_file}') - arr = np.load(feature_file) - features_dict['macd'] = pd.Series(arr, index=df.index) - else: - print('Calculating feature: macd') - _, values = calc_macd(df['Close']) - features_dict['macd'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') - - # ATR - feature_file = f'./data/{csv_prefix}_atr.npy' - if os.path.exists(feature_file): - print(f'A Loading cached feature: {feature_file}') - arr = np.load(feature_file) - features_dict['atr'] = pd.Series(arr, index=df.index) - else: - print('Calculating feature: atr') - _, values = calc_atr(df['High'], df['Low'], df['Close']) - features_dict['atr'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') - - # CCI - feature_file = f'./data/{csv_prefix}_cci.npy' - if os.path.exists(feature_file): - print(f'A Loading cached feature: {feature_file}') - arr = np.load(feature_file) - features_dict['cci'] = pd.Series(arr, index=df.index) - else: - print('Calculating feature: cci') - _, values = calc_cci(df['High'], df['Low'], df['Close']) - features_dict['cci'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') - - # Williams %R - feature_file = f'./data/{csv_prefix}_williams_r.npy' - if os.path.exists(feature_file): - print(f'A Loading cached feature: {feature_file}') - arr = np.load(feature_file) - features_dict['williams_r'] = pd.Series(arr, index=df.index) - else: - print('Calculating feature: williams_r') - _, values = calc_williamsr(df['High'], df['Low'], df['Close']) - features_dict['williams_r'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') - - # EMA 14 - feature_file = f'./data/{csv_prefix}_ema_14.npy' - if os.path.exists(feature_file): - print(f'A Loading cached feature: {feature_file}') - arr = np.load(feature_file) - features_dict['ema_14'] = pd.Series(arr, index=df.index) - else: - print('Calculating feature: ema_14') - _, values = calc_ema(df['Close']) - features_dict['ema_14'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') - - # OBV - feature_file = f'./data/{csv_prefix}_obv.npy' - if os.path.exists(feature_file): - print(f'A Loading cached feature: {feature_file}') - arr = np.load(feature_file) - features_dict['obv'] = pd.Series(arr, index=df.index) - else: - print('Calculating feature: obv') - _, values = calc_obv(df['Close'], df['Volume']) - features_dict['obv'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') - - # CMF - feature_file = f'./data/{csv_prefix}_cmf.npy' - if os.path.exists(feature_file): - print(f'A Loading cached feature: {feature_file}') - arr = np.load(feature_file) - features_dict['cmf'] = pd.Series(arr, index=df.index) - else: - print('Calculating feature: cmf') - _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume']) - features_dict['cmf'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') - - # ROC 10 - feature_file = f'./data/{csv_prefix}_roc_10.npy' - if os.path.exists(feature_file): - print(f'A Loading cached feature: {feature_file}') - arr = np.load(feature_file) - features_dict['roc_10'] = pd.Series(arr, index=df.index) - else: - print('Calculating feature: roc_10') - _, values = 
calc_roc(df['Close']) - features_dict['roc_10'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') - - # DPO 20 - feature_file = f'./data/{csv_prefix}_dpo_20.npy' - if os.path.exists(feature_file): - print(f'A Loading cached feature: {feature_file}') - arr = np.load(feature_file) - features_dict['dpo_20'] = pd.Series(arr, index=df.index) - else: - print('Calculating feature: dpo_20') - _, values = calc_dpo(df['Close']) - features_dict['dpo_20'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') - - # Ultimate Oscillator - feature_file = f'./data/{csv_prefix}_ultimate_osc.npy' - if os.path.exists(feature_file): - print(f'A Loading cached feature: {feature_file}') - arr = np.load(feature_file) - features_dict['ultimate_osc'] = pd.Series(arr, index=df.index) - else: - print('Calculating feature: ultimate_osc') - _, values = calc_ultimate(df['High'], df['Low'], df['Close']) - features_dict['ultimate_osc'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') - - # Daily Return - feature_file = f'./data/{csv_prefix}_daily_return.npy' - if os.path.exists(feature_file): - print(f'A Loading cached feature: {feature_file}') - arr = np.load(feature_file) - features_dict['daily_return'] = pd.Series(arr, index=df.index) - else: - print('Calculating feature: daily_return') - _, values = calc_daily_return(df['Close']) - features_dict['daily_return'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') - - # Multi-column indicators - # Bollinger Bands - print('Calculating multi-column indicator: bollinger') - result = calc_bollinger(df['Close']) - for subname, values in result: - print(f"Adding subfeature: {subname}") - sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - print(f'B Loading cached feature: {sub_feature_file}') - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) - else: - features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') - - # Stochastic Oscillator - print('Calculating multi-column indicator: stochastic') - result = calc_stochastic(df['High'], df['Low'], df['Close']) - for subname, values in result: - print(f"Adding subfeature: {subname}") - sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - print(f'B Loading cached feature: {sub_feature_file}') - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) - else: - features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') - - # SMA - print('Calculating multi-column indicator: sma') - result = calc_sma(df['Close']) - for subname, values in result: - print(f"Adding subfeature: {subname}") - sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - print(f'B Loading cached feature: {sub_feature_file}') - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) - else: - features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') - - # PSAR - print('Calculating multi-column indicator: psar') - result = calc_psar(df['High'], df['Low'], df['Close']) - for subname, values in result: - print(f"Adding subfeature: {subname}") - sub_feature_file = 
f'./data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - print(f'B Loading cached feature: {sub_feature_file}') - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) - else: - features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') - - # Donchian Channel - print('Calculating multi-column indicator: donchian') - result = calc_donchian(df['High'], df['Low'], df['Close']) - for subname, values in result: - print(f"Adding subfeature: {subname}") - sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - print(f'B Loading cached feature: {sub_feature_file}') - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) - else: - features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') - - # Keltner Channel - print('Calculating multi-column indicator: keltner') - result = calc_keltner(df['High'], df['Low'], df['Close']) - for subname, values in result: - print(f"Adding subfeature: {subname}") - sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - print(f'B Loading cached feature: {sub_feature_file}') - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) - else: - features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') - - # Ichimoku - print('Calculating multi-column indicator: ichimoku') - result = calc_ichimoku(df['High'], df['Low']) - for subname, values in result: - print(f"Adding subfeature: {subname}") - sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - print(f'B Loading cached feature: {sub_feature_file}') - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) - else: - features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') - - # Elder Ray - print('Calculating multi-column indicator: elder_ray') - result = calc_elder_ray(df['Close'], df['Low'], df['High']) - for subname, values in result: - print(f"Adding subfeature: {subname}") - sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - print(f'B Loading cached feature: {sub_feature_file}') - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) - else: - features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') - - # Prepare lags, rolling stats, log returns, and volatility features sequentially - # Lags - for col in ohlcv_cols: - for lag in range(1, lags + 1): - feature_name = f'{col}_lag{lag}' - feature_file = f'./data/{csv_prefix}_{feature_name}.npy' - if os.path.exists(feature_file): - print(f'C Loading cached feature: {feature_file}') - features_dict[feature_name] = np.load(feature_file) - else: - print(f'Computing lag feature: {feature_name}') - result = compute_lag(df, col, lag) - features_dict[feature_name] = result - np.save(feature_file, result.values) - print(f'Saved feature: {feature_file}') - # Rolling statistics - for col in ohlcv_cols: - for window in window_sizes: - if (col == 'Open' and window == 5): - continue - if (col == 'High' and window == 5): - continue - if (col == 'High' and window == 30): - continue - if (col == 'Low' and window == 
15): - continue - for stat in ['mean', 'std', 'min', 'max']: - feature_name = f'{col}_roll_{stat}_{window}' - feature_file = f'./data/{csv_prefix}_{feature_name}.npy' - if os.path.exists(feature_file): - print(f'D Loading cached feature: {feature_file}') - features_dict[feature_name] = np.load(feature_file) - else: - print(f'Computing rolling stat feature: {feature_name}') - result = compute_rolling(df, col, stat, window) - features_dict[feature_name] = result - np.save(feature_file, result.values) - print(f'Saved feature: {feature_file}') - # Log returns for different horizons - for horizon in [5, 15, 30]: - feature_name = f'log_return_{horizon}' - feature_file = f'./data/{csv_prefix}_{feature_name}.npy' - if os.path.exists(feature_file): - print(f'E Loading cached feature: {feature_file}') - features_dict[feature_name] = np.load(feature_file) - else: - print(f'Computing log return feature: {feature_name}') - result = compute_log_return(df, horizon) - features_dict[feature_name] = result - np.save(feature_file, result.values) - print(f'Saved feature: {feature_file}') - # Volatility - for window in window_sizes: - feature_name = f'volatility_{window}' - feature_file = f'./data/{csv_prefix}_{feature_name}.npy' - if os.path.exists(feature_file): - print(f'F Loading cached feature: {feature_file}') - features_dict[feature_name] = np.load(feature_file) - else: - print(f'Computing volatility feature: {feature_name}') - result = compute_volatility(df, window) - features_dict[feature_name] = result - np.save(feature_file, result.values) - print(f'Saved feature: {feature_file}') - - # --- Additional Technical Indicator Features --- - # ADX - adx_names = ['adx', 'adx_pos', 'adx_neg'] - adx_files = [f'./data/{csv_prefix}_{name}.npy' for name in adx_names] - if all(os.path.exists(f) for f in adx_files): - print('G Loading cached features: ADX') - for name, f in zip(adx_names, adx_files): - arr = np.load(f) - features_dict[name] = pd.Series(arr, index=df.index) - else: - print('Calculating multi-column indicator: adx') - result = calc_adx(df['High'], df['Low'], df['Close']) - for subname, values in result: - sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' - features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') - - # Force Index - feature_file = f'./data/{csv_prefix}_force_index.npy' - if os.path.exists(feature_file): - print(f'K Loading cached feature: {feature_file}') - arr = np.load(feature_file) - features_dict['force_index'] = pd.Series(arr, index=df.index) - else: - print('Calculating feature: force_index') - _, values = calc_force_index(df['Close'], df['Volume']) - features_dict['force_index'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') - - # Supertrend indicators - for period, multiplier in [(12, 3.0), (10, 1.0), (11, 2.0)]: - st_name = f'supertrend_{period}_{multiplier}' - st_trend_name = f'supertrend_trend_{period}_{multiplier}' - st_file = f'./data/{csv_prefix}_{st_name}.npy' - st_trend_file = f'./data/{csv_prefix}_{st_trend_name}.npy' - if os.path.exists(st_file) and os.path.exists(st_trend_file): - print(f'L Loading cached features: {st_file}, {st_trend_file}') - features_dict[st_name] = pd.Series(np.load(st_file), index=df.index) - features_dict[st_trend_name] = pd.Series(np.load(st_trend_file), index=df.index) - else: - print(f'Calculating Supertrend indicator: {st_name}') - st = ta.supertrend(df['High'], df['Low'], df['Close'], length=period, multiplier=multiplier) - 
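# ta.supertrend returns a DataFrame whose column names embed the length and
# multiplier, which is why the f-string keys below must match exactly
# (multiplier=3.0 yields 'SUPERT_12_3.0', not 'SUPERT_12_3'). A small sketch
# on synthetic data; column naming can vary across pandas_ta versions, so
# printing st.columns once is a cheap sanity check:
import numpy as np
import pandas as pd
import pandas_ta as ta

rng = np.random.default_rng(1)
close = pd.Series(100.0 * np.exp(np.cumsum(rng.normal(0.0, 0.002, 300))))
high, low = close * 1.001, close * 0.999

period, multiplier = 12, 3.0
st = ta.supertrend(high, low, close, length=period, multiplier=multiplier)
print(st.columns.tolist())
# Typically: ['SUPERT_12_3.0', 'SUPERTd_12_3.0', 'SUPERTl_12_3.0', 'SUPERTs_12_3.0']
trend_line = st[f'SUPERT_{period}_{multiplier}']   # the supertrend line
direction = st[f'SUPERTd_{period}_{multiplier}']   # +1 uptrend / -1 downtrend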
features_dict[st_name] = st[f'SUPERT_{period}_{multiplier}'] - features_dict[st_trend_name] = st[f'SUPERTd_{period}_{multiplier}'] - np.save(st_file, features_dict[st_name].values) - np.save(st_trend_file, features_dict[st_trend_name].values) - print(f'Saved features: {st_file}, {st_trend_file}') - - # Concatenate all new features at once - print('Concatenating all new features to DataFrame...') - features_df = pd.DataFrame(features_dict) - print("Columns in features_df:", features_df.columns.tolist()) - print("All-NaN columns in features_df:", features_df.columns[features_df.isna().all()].tolist()) - df = pd.concat([df, features_df], axis=1) - - # Print all columns after concatenation - print("All columns in df after concat:", df.columns.tolist()) - - # Downcast all float columns to save memory - print('Downcasting float columns to save memory...') - for col in df.columns: - try: - df[col] = pd.to_numeric(df[col], downcast='float') - except Exception: - pass - - # Add time features (exclude 'dayofweek') - print('Adding hour feature...') - df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce') - df['hour'] = df['Timestamp'].dt.hour - - # Handle NaNs after all feature engineering - if IMPUTE_NANS: - print('Imputing NaNs after feature engineering (using mean imputation)...') - numeric_cols = df.select_dtypes(include=[np.number]).columns - for col in numeric_cols: - df[col] = df[col].fillna(df[col].mean()) - # If you want to impute non-numeric columns differently, add logic here - else: - print('Dropping NaNs after feature engineering...') - df = df.dropna().reset_index(drop=True) - - # Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features - print('Selecting feature columns...') - exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30'] - feature_cols = [col for col in df.columns if col not in exclude_cols] - print('Features used for training:', feature_cols) - - # Prepare CSV for results - results_csv = './data/leave_one_out_results.csv' - if not os.path.exists(results_csv): - with open(results_csv, 'w', newline='') as f: - writer = csv.writer(f) - writer.writerow(['left_out_feature', 'used_features', 'rmse', 'mae', 'r2', 'mape', 'directional_accuracy']) - - total_features = len(feature_cols) - never_leave_out = {'Open', 'High', 'Low', 'Close', 'Volume'} - for idx, left_out in enumerate(feature_cols): - if left_out in never_leave_out: - continue - used = [f for f in feature_cols if f != left_out] - print(f'\n=== Leave-one-out {idx+1}/{total_features}: left out {left_out} ===') - try: - # Prepare X and y for this combination - X = df[used].values.astype(np.float32) - y = df["log_return"].values.astype(np.float32) - split_idx = int(len(X) * 0.8) - X_train, X_test = X[:split_idx], X[split_idx:] - y_train, y_test = y[:split_idx], y[split_idx:] - test_timestamps = df['Timestamp'].values[split_idx:] - - model = CustomXGBoostGPU(X_train, X_test, y_train, y_test) - booster = model.train() - model.save_model(f'./data/xgboost_model_wo_{left_out}.json') - - test_preds = model.predict(X_test) - rmse = np.sqrt(mean_squared_error(y_test, test_preds)) - - # Reconstruct price series from log returns - if 'Close' in df.columns: - close_prices = df['Close'].values - else: - close_prices = pd.read_csv(csv_path)['Close'].values - start_price = close_prices[split_idx] - actual_prices = [start_price] - for r_ in y_test: - actual_prices.append(actual_prices[-1] * np.exp(r_)) - actual_prices = np.array(actual_prices[1:]) - 
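# (The same compounding loop runs again just below for the predicted series.)
# Both loops are equivalent to a one-line vectorized form, since
# p_k = p_0 * exp(r_1 + ... + r_k); a sketch under that identity:
import numpy as np

def reconstruct_prices(start_price, log_returns):
    # Cumulative sums of log returns rebuild the whole price path at once.
    return start_price * np.exp(np.cumsum(log_returns))

# e.g. actual_prices    = reconstruct_prices(start_price, y_test)
#      predicted_prices = reconstruct_prices(start_price, test_preds)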
predicted_prices = [start_price] - for r_ in test_preds: - predicted_prices.append(predicted_prices[-1] * np.exp(r_)) - predicted_prices = np.array(predicted_prices[1:]) - - mae = mean_absolute_error(actual_prices, predicted_prices) - r2 = r2_score(actual_prices, predicted_prices) - direction_actual = np.sign(np.diff(actual_prices)) - direction_pred = np.sign(np.diff(predicted_prices)) - directional_accuracy = (direction_actual == direction_pred).mean() - mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100 - - # Save results to CSV - with open(results_csv, 'a', newline='') as f: - writer = csv.writer(f) - writer.writerow([left_out, "|".join(used), rmse, mae, r2, mape, directional_accuracy]) - print(f'Left out {left_out}: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}, MAPE={mape:.2f}%, DirAcc={directional_accuracy*100:.2f}%') - - # Plotting for this run - plot_prefix = f'loo_{left_out}' - print('Plotting distribution of absolute prediction errors...') - plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix) - - print('Plotting directional accuracy...') - plot_direction_transition_heatmap(actual_prices, predicted_prices, prefix=plot_prefix) - except Exception as e: - print(f'Leave-one-out failed for {left_out}: {e}') - print(f'All leave-one-out runs completed. Results saved to {results_csv}') +import sys +import os +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import pandas as pd +import numpy as np +from custom_xgboost import CustomXGBoostGPU +from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score +from plot_results import plot_prediction_error_distribution, plot_direction_transition_heatmap +from cycles.supertrend import Supertrends +import time +from numba import njit +import itertools +import csv +import pandas_ta as ta + +def run_indicator(func, *args): + return func(*args) + +def run_indicator_job(job): + import time + func, *args = job + indicator_name = func.__name__ + start = time.time() + result = func(*args) + elapsed = time.time() - start + print(f'Indicator {indicator_name} computed in {elapsed:.4f} seconds') + return result + +def calc_rsi(close): + from ta.momentum import RSIIndicator + return ('rsi', RSIIndicator(close, window=14).rsi()) + +def calc_macd(close): + from ta.trend import MACD + return ('macd', MACD(close).macd()) + +def calc_bollinger(close): + from ta.volatility import BollingerBands + bb = BollingerBands(close=close, window=20, window_dev=2) + return [ + ('bb_bbm', bb.bollinger_mavg()), + ('bb_bbh', bb.bollinger_hband()), + ('bb_bbl', bb.bollinger_lband()), + ('bb_bb_width', bb.bollinger_hband() - bb.bollinger_lband()) + ] + +def calc_stochastic(high, low, close): + from ta.momentum import StochasticOscillator + stoch = StochasticOscillator(high=high, low=low, close=close, window=14, smooth_window=3) + return [ + ('stoch_k', stoch.stoch()), + ('stoch_d', stoch.stoch_signal()) + ] + +def calc_atr(high, low, close): + from ta.volatility import AverageTrueRange + atr = AverageTrueRange(high=high, low=low, close=close, window=14) + return ('atr', atr.average_true_range()) + +def calc_cci(high, low, close): + from ta.trend import CCIIndicator + cci = CCIIndicator(high=high, low=low, close=close, window=20) + return ('cci', cci.cci()) + +def calc_williamsr(high, low, close): + from ta.momentum import WilliamsRIndicator + willr = WilliamsRIndicator(high=high, low=low, close=close, lbp=14) + return ('williams_r', willr.williams_r()) + +def 
calc_ema(close): + from ta.trend import EMAIndicator + ema = EMAIndicator(close=close, window=14) + return ('ema_14', ema.ema_indicator()) + +def calc_obv(close, volume): + from ta.volume import OnBalanceVolumeIndicator + obv = OnBalanceVolumeIndicator(close=close, volume=volume) + return ('obv', obv.on_balance_volume()) + +def calc_cmf(high, low, close, volume): + from ta.volume import ChaikinMoneyFlowIndicator + cmf = ChaikinMoneyFlowIndicator(high=high, low=low, close=close, volume=volume, window=20) + return ('cmf', cmf.chaikin_money_flow()) + +def calc_sma(close): + from ta.trend import SMAIndicator + return [ + ('sma_50', SMAIndicator(close, window=50).sma_indicator()), + ('sma_200', SMAIndicator(close, window=200).sma_indicator()) + ] + +def calc_roc(close): + from ta.momentum import ROCIndicator + return ('roc_10', ROCIndicator(close, window=10).roc()) + +def calc_momentum(close): + return ('momentum_10', close - close.shift(10)) + +def calc_psar(high, low, close): + # Use the Numba-accelerated fast_psar function for speed + psar_values = fast_psar(np.array(high), np.array(low), np.array(close)) + return [('psar', pd.Series(psar_values, index=close.index))] + +def calc_donchian(high, low, close): + from ta.volatility import DonchianChannel + donchian = DonchianChannel(high, low, close, window=20) + return [ + ('donchian_hband', donchian.donchian_channel_hband()), + ('donchian_lband', donchian.donchian_channel_lband()), + ('donchian_mband', donchian.donchian_channel_mband()) + ] + +def calc_keltner(high, low, close): + from ta.volatility import KeltnerChannel + keltner = KeltnerChannel(high, low, close, window=20) + return [ + ('keltner_hband', keltner.keltner_channel_hband()), + ('keltner_lband', keltner.keltner_channel_lband()), + ('keltner_mband', keltner.keltner_channel_mband()) + ] + +def calc_dpo(close): + from ta.trend import DPOIndicator + return ('dpo_20', DPOIndicator(close, window=20).dpo()) + +def calc_ultimate(high, low, close): + from ta.momentum import UltimateOscillator + return ('ultimate_osc', UltimateOscillator(high, low, close).ultimate_oscillator()) + +def calc_ichimoku(high, low): + from ta.trend import IchimokuIndicator + ichimoku = IchimokuIndicator(high, low, window1=9, window2=26, window3=52) + return [ + ('ichimoku_a', ichimoku.ichimoku_a()), + ('ichimoku_b', ichimoku.ichimoku_b()), + ('ichimoku_base_line', ichimoku.ichimoku_base_line()), + ('ichimoku_conversion_line', ichimoku.ichimoku_conversion_line()) + ] + +def calc_elder_ray(close, low, high): + from ta.trend import EMAIndicator + ema = EMAIndicator(close, window=13).ema_indicator() + return [ + ('elder_ray_bull', ema - low), + ('elder_ray_bear', ema - high) + ] + +def calc_daily_return(close): + from ta.others import DailyReturnIndicator + return ('daily_return', DailyReturnIndicator(close).daily_return()) + +@njit +def fast_psar(high, low, close, af=0.02, max_af=0.2): + length = len(close) + psar = np.zeros(length) + bull = True + af_step = af + ep = low[0] + psar[0] = low[0] + for i in range(1, length): + prev_psar = psar[i-1] + if bull: + psar[i] = prev_psar + af_step * (ep - prev_psar) + if low[i] < psar[i]: + bull = False + psar[i] = ep + af_step = af + ep = low[i] + else: + if high[i] > ep: + ep = high[i] + af_step = min(af_step + af, max_af) + else: + psar[i] = prev_psar + af_step * (ep - prev_psar) + if high[i] > psar[i]: + bull = True + psar[i] = ep + af_step = af + ep = high[i] + else: + if low[i] < ep: + ep = low[i] + af_step = min(af_step + af, max_af) + return psar + +def 
compute_lag(df, col, lag): + return df[col].shift(lag) + +def compute_rolling(df, col, stat, window): + if stat == 'mean': + return df[col].rolling(window).mean() + elif stat == 'std': + return df[col].rolling(window).std() + elif stat == 'min': + return df[col].rolling(window).min() + elif stat == 'max': + return df[col].rolling(window).max() + +def compute_log_return(df, horizon): + return np.log(df['Close'] / df['Close'].shift(horizon)) + +def compute_volatility(df, window): + return df['log_return'].rolling(window).std() + +def run_feature_job(job, df): + feature_name, func, *args = job + print(f'Computing feature: {feature_name}') + result = func(df, *args) + return feature_name, result + +def calc_adx(high, low, close): + from ta.trend import ADXIndicator + adx = ADXIndicator(high=high, low=low, close=close, window=14) + return [ + ('adx', adx.adx()), + ('adx_pos', adx.adx_pos()), + ('adx_neg', adx.adx_neg()) + ] + +def calc_trix(close): + from ta.trend import TRIXIndicator + trix = TRIXIndicator(close=close, window=15) + return ('trix', trix.trix()) + +def calc_vortex(high, low, close): + from ta.trend import VortexIndicator + vortex = VortexIndicator(high=high, low=low, close=close, window=14) + return [ + ('vortex_pos', vortex.vortex_indicator_pos()), + ('vortex_neg', vortex.vortex_indicator_neg()) + ] + +def calc_kama(close): + import pandas_ta as ta + kama = ta.kama(close, length=10) + return ('kama', kama) + +def calc_force_index(close, volume): + from ta.volume import ForceIndexIndicator + fi = ForceIndexIndicator(close=close, volume=volume, window=13) + return ('force_index', fi.force_index()) + +def calc_eom(high, low, volume): + from ta.volume import EaseOfMovementIndicator + eom = EaseOfMovementIndicator(high=high, low=low, volume=volume, window=14) + return ('eom', eom.ease_of_movement()) + +def calc_mfi(high, low, close, volume): + from ta.volume import MFIIndicator + mfi = MFIIndicator(high=high, low=low, close=close, volume=volume, window=14) + return ('mfi', mfi.money_flow_index()) + +def calc_adi(high, low, close, volume): + from ta.volume import AccDistIndexIndicator + adi = AccDistIndexIndicator(high=high, low=low, close=close, volume=volume) + return ('adi', adi.acc_dist_index()) + +def calc_tema(close): + import pandas_ta as ta + tema = ta.tema(close, length=10) + return ('tema', tema) + +def calc_stochrsi(close): + from ta.momentum import StochRSIIndicator + stochrsi = StochRSIIndicator(close=close, window=14, smooth1=3, smooth2=3) + return [ + ('stochrsi', stochrsi.stochrsi()), + ('stochrsi_k', stochrsi.stochrsi_k()), + ('stochrsi_d', stochrsi.stochrsi_d()) + ] + +def calc_awesome_oscillator(high, low): + from ta.momentum import AwesomeOscillatorIndicator + ao = AwesomeOscillatorIndicator(high=high, low=low, window1=5, window2=34) + return ('awesome_osc', ao.awesome_oscillator()) + +if __name__ == '__main__': + IMPUTE_NANS = True # Set to True to impute NaNs, False to drop rows with NaNs + csv_path = './data/btcusd_1-min_data.csv' + csv_prefix = os.path.splitext(os.path.basename(csv_path))[0] + + print('Reading CSV and filtering data...') + df = pd.read_csv(csv_path) + df = df[df['Volume'] != 0] + + min_date = '2017-06-01' + print('Converting Timestamp and filtering by date...') + df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s') + df = df[df['Timestamp'] >= min_date] + + lags = 3 + + print('Calculating log returns as the new target...') + df['log_return'] = np.log(df['Close'] / df['Close'].shift(1)) + + ohlcv_cols = ['Open', 'High', 'Low', 'Close', 
'Volume'] + window_sizes = [5, 15, 30] # in minutes, adjust as needed + + features_dict = {} + + print('Starting feature computation...') + feature_start_time = time.time() + + # --- Technical Indicator Features: Calculate or Load from Cache --- + print('Calculating or loading technical indicator features...') + # RSI + feature_file = f'./data/{csv_prefix}_rsi.npy' + if os.path.exists(feature_file): + print(f'A Loading cached feature: {feature_file}') + arr = np.load(feature_file) + features_dict['rsi'] = pd.Series(arr, index=df.index) + else: + print('Calculating feature: rsi') + _, values = calc_rsi(df['Close']) + features_dict['rsi'] = values + np.save(feature_file, values.values) + print(f'Saved feature: {feature_file}') + + # MACD + feature_file = f'./data/{csv_prefix}_macd.npy' + if os.path.exists(feature_file): + print(f'A Loading cached feature: {feature_file}') + arr = np.load(feature_file) + features_dict['macd'] = pd.Series(arr, index=df.index) + else: + print('Calculating feature: macd') + _, values = calc_macd(df['Close']) + features_dict['macd'] = values + np.save(feature_file, values.values) + print(f'Saved feature: {feature_file}') + + # ATR + feature_file = f'./data/{csv_prefix}_atr.npy' + if os.path.exists(feature_file): + print(f'A Loading cached feature: {feature_file}') + arr = np.load(feature_file) + features_dict['atr'] = pd.Series(arr, index=df.index) + else: + print('Calculating feature: atr') + _, values = calc_atr(df['High'], df['Low'], df['Close']) + features_dict['atr'] = values + np.save(feature_file, values.values) + print(f'Saved feature: {feature_file}') + + # CCI + feature_file = f'./data/{csv_prefix}_cci.npy' + if os.path.exists(feature_file): + print(f'A Loading cached feature: {feature_file}') + arr = np.load(feature_file) + features_dict['cci'] = pd.Series(arr, index=df.index) + else: + print('Calculating feature: cci') + _, values = calc_cci(df['High'], df['Low'], df['Close']) + features_dict['cci'] = values + np.save(feature_file, values.values) + print(f'Saved feature: {feature_file}') + + # Williams %R + feature_file = f'./data/{csv_prefix}_williams_r.npy' + if os.path.exists(feature_file): + print(f'A Loading cached feature: {feature_file}') + arr = np.load(feature_file) + features_dict['williams_r'] = pd.Series(arr, index=df.index) + else: + print('Calculating feature: williams_r') + _, values = calc_williamsr(df['High'], df['Low'], df['Close']) + features_dict['williams_r'] = values + np.save(feature_file, values.values) + print(f'Saved feature: {feature_file}') + + # EMA 14 + feature_file = f'./data/{csv_prefix}_ema_14.npy' + if os.path.exists(feature_file): + print(f'A Loading cached feature: {feature_file}') + arr = np.load(feature_file) + features_dict['ema_14'] = pd.Series(arr, index=df.index) + else: + print('Calculating feature: ema_14') + _, values = calc_ema(df['Close']) + features_dict['ema_14'] = values + np.save(feature_file, values.values) + print(f'Saved feature: {feature_file}') + + # OBV + feature_file = f'./data/{csv_prefix}_obv.npy' + if os.path.exists(feature_file): + print(f'A Loading cached feature: {feature_file}') + arr = np.load(feature_file) + features_dict['obv'] = pd.Series(arr, index=df.index) + else: + print('Calculating feature: obv') + _, values = calc_obv(df['Close'], df['Volume']) + features_dict['obv'] = values + np.save(feature_file, values.values) + print(f'Saved feature: {feature_file}') + + # CMF + feature_file = f'./data/{csv_prefix}_cmf.npy' + if os.path.exists(feature_file): + print(f'A Loading cached 
feature: {feature_file}') + arr = np.load(feature_file) + features_dict['cmf'] = pd.Series(arr, index=df.index) + else: + print('Calculating feature: cmf') + _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume']) + features_dict['cmf'] = values + np.save(feature_file, values.values) + print(f'Saved feature: {feature_file}') + + # ROC 10 + feature_file = f'./data/{csv_prefix}_roc_10.npy' + if os.path.exists(feature_file): + print(f'A Loading cached feature: {feature_file}') + arr = np.load(feature_file) + features_dict['roc_10'] = pd.Series(arr, index=df.index) + else: + print('Calculating feature: roc_10') + _, values = calc_roc(df['Close']) + features_dict['roc_10'] = values + np.save(feature_file, values.values) + print(f'Saved feature: {feature_file}') + + # DPO 20 + feature_file = f'./data/{csv_prefix}_dpo_20.npy' + if os.path.exists(feature_file): + print(f'A Loading cached feature: {feature_file}') + arr = np.load(feature_file) + features_dict['dpo_20'] = pd.Series(arr, index=df.index) + else: + print('Calculating feature: dpo_20') + _, values = calc_dpo(df['Close']) + features_dict['dpo_20'] = values + np.save(feature_file, values.values) + print(f'Saved feature: {feature_file}') + + # Ultimate Oscillator + feature_file = f'./data/{csv_prefix}_ultimate_osc.npy' + if os.path.exists(feature_file): + print(f'A Loading cached feature: {feature_file}') + arr = np.load(feature_file) + features_dict['ultimate_osc'] = pd.Series(arr, index=df.index) + else: + print('Calculating feature: ultimate_osc') + _, values = calc_ultimate(df['High'], df['Low'], df['Close']) + features_dict['ultimate_osc'] = values + np.save(feature_file, values.values) + print(f'Saved feature: {feature_file}') + + # Daily Return + feature_file = f'./data/{csv_prefix}_daily_return.npy' + if os.path.exists(feature_file): + print(f'A Loading cached feature: {feature_file}') + arr = np.load(feature_file) + features_dict['daily_return'] = pd.Series(arr, index=df.index) + else: + print('Calculating feature: daily_return') + _, values = calc_daily_return(df['Close']) + features_dict['daily_return'] = values + np.save(feature_file, values.values) + print(f'Saved feature: {feature_file}') + + # Multi-column indicators + # Bollinger Bands + print('Calculating multi-column indicator: bollinger') + result = calc_bollinger(df['Close']) + for subname, values in result: + print(f"Adding subfeature: {subname}") + sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + print(f'B Loading cached feature: {sub_feature_file}') + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) + print(f'Saved feature: {sub_feature_file}') + + # Stochastic Oscillator + print('Calculating multi-column indicator: stochastic') + result = calc_stochastic(df['High'], df['Low'], df['Close']) + for subname, values in result: + print(f"Adding subfeature: {subname}") + sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + print(f'B Loading cached feature: {sub_feature_file}') + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) + print(f'Saved feature: {sub_feature_file}') + + # SMA + print('Calculating multi-column indicator: sma') + result = calc_sma(df['Close']) + for subname, values in result: + print(f"Adding 
subfeature: {subname}") + sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + print(f'B Loading cached feature: {sub_feature_file}') + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) + print(f'Saved feature: {sub_feature_file}') + + # PSAR + print('Calculating multi-column indicator: psar') + result = calc_psar(df['High'], df['Low'], df['Close']) + for subname, values in result: + print(f"Adding subfeature: {subname}") + sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + print(f'B Loading cached feature: {sub_feature_file}') + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) + print(f'Saved feature: {sub_feature_file}') + + # Donchian Channel + print('Calculating multi-column indicator: donchian') + result = calc_donchian(df['High'], df['Low'], df['Close']) + for subname, values in result: + print(f"Adding subfeature: {subname}") + sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + print(f'B Loading cached feature: {sub_feature_file}') + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) + print(f'Saved feature: {sub_feature_file}') + + # Keltner Channel + print('Calculating multi-column indicator: keltner') + result = calc_keltner(df['High'], df['Low'], df['Close']) + for subname, values in result: + print(f"Adding subfeature: {subname}") + sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + print(f'B Loading cached feature: {sub_feature_file}') + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) + print(f'Saved feature: {sub_feature_file}') + + # Ichimoku + print('Calculating multi-column indicator: ichimoku') + result = calc_ichimoku(df['High'], df['Low']) + for subname, values in result: + print(f"Adding subfeature: {subname}") + sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + print(f'B Loading cached feature: {sub_feature_file}') + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) + print(f'Saved feature: {sub_feature_file}') + + # Elder Ray + print('Calculating multi-column indicator: elder_ray') + result = calc_elder_ray(df['Close'], df['Low'], df['High']) + for subname, values in result: + print(f"Adding subfeature: {subname}") + sub_feature_file = f'./data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + print(f'B Loading cached feature: {sub_feature_file}') + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) + print(f'Saved feature: {sub_feature_file}') + + # Prepare lags, rolling stats, log returns, and volatility features sequentially + # Lags + for col in ohlcv_cols: + for lag in range(1, lags + 1): + feature_name = f'{col}_lag{lag}' + feature_file = f'./data/{csv_prefix}_{feature_name}.npy' + 
+
+# Prepare lags, rolling stats, log returns, and volatility features sequentially
+# Lags
+for col in ohlcv_cols:
+    for lag in range(1, lags + 1):
+        feature_name = f'{col}_lag{lag}'
+        feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
+        if os.path.exists(feature_file):
+            print(f'C Loading cached feature: {feature_file}')
+            # Wrap in a Series so cached and freshly computed features align identically
+            features_dict[feature_name] = pd.Series(np.load(feature_file), index=df.index)
+        else:
+            print(f'Computing lag feature: {feature_name}')
+            result = compute_lag(df, col, lag)
+            features_dict[feature_name] = result
+            np.save(feature_file, result.values)
+            print(f'Saved feature: {feature_file}')
+
+# Rolling statistics; a handful of (column, window) pairs are skipped deliberately
+skip_pairs = {('Open', 5), ('High', 5), ('High', 30), ('Low', 15)}
+for col in ohlcv_cols:
+    for window in window_sizes:
+        if (col, window) in skip_pairs:
+            continue
+        for stat in ['mean', 'std', 'min', 'max']:
+            feature_name = f'{col}_roll_{stat}_{window}'
+            feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
+            if os.path.exists(feature_file):
+                print(f'D Loading cached feature: {feature_file}')
+                features_dict[feature_name] = pd.Series(np.load(feature_file), index=df.index)
+            else:
+                print(f'Computing rolling stat feature: {feature_name}')
+                result = compute_rolling(df, col, stat, window)
+                features_dict[feature_name] = result
+                np.save(feature_file, result.values)
+                print(f'Saved feature: {feature_file}')
+
+# Log returns for different horizons (future targets, excluded from the feature set later)
+for horizon in [5, 15, 30]:
+    feature_name = f'log_return_{horizon}'
+    feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
+    if os.path.exists(feature_file):
+        print(f'E Loading cached feature: {feature_file}')
+        features_dict[feature_name] = pd.Series(np.load(feature_file), index=df.index)
+    else:
+        print(f'Computing log return feature: {feature_name}')
+        result = compute_log_return(df, horizon)
+        features_dict[feature_name] = result
+        np.save(feature_file, result.values)
+        print(f'Saved feature: {feature_file}')
+
+# Volatility
+for window in window_sizes:
+    feature_name = f'volatility_{window}'
+    feature_file = f'./data/{csv_prefix}_{feature_name}.npy'
+    if os.path.exists(feature_file):
+        print(f'F Loading cached feature: {feature_file}')
+        features_dict[feature_name] = pd.Series(np.load(feature_file), index=df.index)
+    else:
+        print(f'Computing volatility feature: {feature_name}')
+        result = compute_volatility(df, window)
+        features_dict[feature_name] = result
+        np.save(feature_file, result.values)
+        print(f'Saved feature: {feature_file}')
+
+# --- Additional Technical Indicator Features ---
+# ADX
+adx_names = ['adx', 'adx_pos', 'adx_neg']
+adx_files = [f'./data/{csv_prefix}_{name}.npy' for name in adx_names]
+if all(os.path.exists(f) for f in adx_files):
+    print('G Loading cached features: ADX')
+    for name, f in zip(adx_names, adx_files):
+        arr = np.load(f)
+        features_dict[name] = pd.Series(arr, index=df.index)
+else:
+    print('Calculating multi-column indicator: adx')
+    result = calc_adx(df['High'], df['Low'], df['Close'])
+    for subname, values in result:
+        sub_feature_file = f'./data/{csv_prefix}_{subname}.npy'
+        features_dict[subname] = values
+        np.save(sub_feature_file, values.values)
+        print(f'Saved feature: {sub_feature_file}')
+
+# Force Index
+feature_file = f'./data/{csv_prefix}_force_index.npy'
+if os.path.exists(feature_file):
+    print(f'K Loading cached feature: {feature_file}')
+    arr = np.load(feature_file)
+    features_dict['force_index'] = pd.Series(arr, index=df.index)
+else:
+    print('Calculating feature: force_index')
+    _, values = calc_force_index(df['Close'], df['Volume'])
+    features_dict['force_index'] = values
+    np.save(feature_file, values.values)
+    print(f'Saved feature: {feature_file}')
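+# For reference, the horizon/window helpers used above are assumed to follow
+# the standard definitions sketched below. This is a hedged sketch only; the
+# actual compute_log_return / compute_volatility implementations live elsewhere
+# in this repo and may differ:
+#
+#   def compute_log_return(df, horizon):
+#       # forward-looking target: r_t = ln(C_{t+horizon} / C_t); these columns
+#       # are excluded from the feature set later because they leak the future
+#       return np.log(df['Close'].shift(-horizon) / df['Close'])
+#
+#   def compute_volatility(df, window):
+#       # backward-looking feature: rolling std of 1-step log returns
+#       return np.log(df['Close'] / df['Close'].shift(1)).rolling(window).std()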
+
+# Supertrend indicators
+for period, multiplier in [(12, 3.0), (10, 1.0), (11, 2.0)]:
+    st_name = f'supertrend_{period}_{multiplier}'
+    st_trend_name = f'supertrend_trend_{period}_{multiplier}'
+    st_file = f'./data/{csv_prefix}_{st_name}.npy'
+    st_trend_file = f'./data/{csv_prefix}_{st_trend_name}.npy'
+    if os.path.exists(st_file) and os.path.exists(st_trend_file):
+        print(f'L Loading cached features: {st_file}, {st_trend_file}')
+        features_dict[st_name] = pd.Series(np.load(st_file), index=df.index)
+        features_dict[st_trend_name] = pd.Series(np.load(st_trend_file), index=df.index)
+    else:
+        print(f'Calculating Supertrend indicator: {st_name}')
+        st = ta.supertrend(df['High'], df['Low'], df['Close'], length=period, multiplier=multiplier)
+        features_dict[st_name] = st[f'SUPERT_{period}_{multiplier}']
+        features_dict[st_trend_name] = st[f'SUPERTd_{period}_{multiplier}']
+        np.save(st_file, features_dict[st_name].values)
+        np.save(st_trend_file, features_dict[st_trend_name].values)
+        print(f'Saved features: {st_file}, {st_trend_file}')
+
+# Concatenate all new features at once
+print('Concatenating all new features to DataFrame...')
+features_df = pd.DataFrame(features_dict)
+print("Columns in features_df:", features_df.columns.tolist())
+print("All-NaN columns in features_df:", features_df.columns[features_df.isna().all()].tolist())
+df = pd.concat([df, features_df], axis=1)
+
+# Print all columns after concatenation
+print("All columns in df after concat:", df.columns.tolist())
+
+# Downcast all float columns to save memory
+print('Downcasting float columns to save memory...')
+for col in df.columns:
+    try:
+        df[col] = pd.to_numeric(df[col], downcast='float')
+    except Exception:
+        pass
+
+# Add time features (exclude 'dayofweek')
+print('Adding hour feature...')
+df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
+df['hour'] = df['Timestamp'].dt.hour
+
+# Handle NaNs after all feature engineering
+if IMPUTE_NANS:
+    print('Imputing NaNs after feature engineering (using mean imputation)...')
+    numeric_cols = df.select_dtypes(include=[np.number]).columns
+    for col in numeric_cols:
+        df[col] = df[col].fillna(df[col].mean())
+    # If you want to impute non-numeric columns differently, add logic here
+else:
+    print('Dropping NaNs after feature engineering...')
+    df = df.dropna().reset_index(drop=True)
+
+# Exclude 'Timestamp', 'Close', 'log_return', and any future target columns from features
+print('Selecting feature columns...')
+exclude_cols = ['Timestamp', 'Close', 'log_return', 'log_return_5', 'log_return_15', 'log_return_30']
+feature_cols = [col for col in df.columns if col not in exclude_cols]
+print('Features used for training:', feature_cols)
+
+# Prepare CSV for results
+results_csv = './data/leave_one_out_results.csv'
+if not os.path.exists(results_csv):
+    with open(results_csv, 'w', newline='') as f:
+        writer = csv.writer(f)
+        writer.writerow(['left_out_feature', 'used_features', 'rmse', 'mae', 'r2', 'mape', 'directional_accuracy'])
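+# In the loop below, test prices are rebuilt from log returns one step at a
+# time via p_t = p_{t-1} * exp(r_t); e.g. p_0 = 100.0 and r = 0.01 gives
+# 100 * e**0.01 ≈ 101.005. A mathematically equivalent vectorized form, under
+# the same assumption that y holds one-step-ahead log returns, would be:
+#
+#   actual_prices = start_price * np.exp(np.cumsum(y_test))
+#   predicted_prices = start_price * np.exp(np.cumsum(test_preds))
+#
+# The explicit loop is kept below to match the original code.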
+total_features = len(feature_cols)
+never_leave_out = {'Open', 'High', 'Low', 'Close', 'Volume'}
+for idx, left_out in enumerate(feature_cols):
+    if left_out in never_leave_out:
+        continue
+    used = [f for f in feature_cols if f != left_out]
+    print(f'\n=== Leave-one-out {idx+1}/{total_features}: left out {left_out} ===')
+    try:
+        # Prepare X and y for this combination; the 80/20 split is
+        # chronological (no shuffling) to avoid lookahead bias
+        X = df[used].values.astype(np.float32)
+        y = df["log_return"].values.astype(np.float32)
+        split_idx = int(len(X) * 0.8)
+        X_train, X_test = X[:split_idx], X[split_idx:]
+        y_train, y_test = y[:split_idx], y[split_idx:]
+        test_timestamps = df['Timestamp'].values[split_idx:]
+
+        model = CustomXGBoostGPU(X_train, X_test, y_train, y_test)
+        booster = model.train()
+        model.save_model(f'./data/xgboost_model_wo_{left_out}.json')
+
+        test_preds = model.predict(X_test)
+        rmse = np.sqrt(mean_squared_error(y_test, test_preds))
+
+        # Reconstruct price series from log returns (assumes y_t is the
+        # one-step-ahead log return, so the walk starts at Close[split_idx])
+        if 'Close' in df.columns:
+            close_prices = df['Close'].values
+        else:
+            close_prices = pd.read_csv(csv_path)['Close'].values
+        start_price = close_prices[split_idx]
+        actual_prices = [start_price]
+        for r_ in y_test:
+            actual_prices.append(actual_prices[-1] * np.exp(r_))
+        actual_prices = np.array(actual_prices[1:])
+        predicted_prices = [start_price]
+        for r_ in test_preds:
+            predicted_prices.append(predicted_prices[-1] * np.exp(r_))
+        predicted_prices = np.array(predicted_prices[1:])
+
+        mae = mean_absolute_error(actual_prices, predicted_prices)
+        r2 = r2_score(actual_prices, predicted_prices)
+        direction_actual = np.sign(np.diff(actual_prices))
+        direction_pred = np.sign(np.diff(predicted_prices))
+        directional_accuracy = (direction_actual == direction_pred).mean()
+        mape = np.mean(np.abs((actual_prices - predicted_prices) / actual_prices)) * 100
+
+        # Save results to CSV (append mode, so completed runs survive restarts)
+        with open(results_csv, 'a', newline='') as f:
+            writer = csv.writer(f)
+            writer.writerow([left_out, "|".join(used), rmse, mae, r2, mape, directional_accuracy])
+        print(f'Left out {left_out}: RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}, MAPE={mape:.2f}%, DirAcc={directional_accuracy*100:.2f}%')
+
+        # Plotting for this run
+        plot_prefix = f'loo_{left_out}'
+        print('Plotting distribution of absolute prediction errors...')
+        plot_prediction_error_distribution(predicted_prices, actual_prices, prefix=plot_prefix)
+
+        print('Plotting directional accuracy...')
+        plot_direction_transition_heatmap(actual_prices, predicted_prices, prefix=plot_prefix)
+    except Exception as e:
+        print(f'Leave-one-out failed for {left_out}: {e}')
+print(f'All leave-one-out runs completed. Results saved to {results_csv}')
+sys.exit(0)
\ No newline at end of file