Enhance CustomXGBoostGPU for inference by making training data optional and adding model-loading functionality. Update feature engineering to make disk caching optional and improve data handling. Introduce a new inference example and README for easier reuse in other projects.
This commit is contained in:
parent b56d9ea3a1
commit a419764fff
INFERENCE_README.md (new file, 38 lines)
@@ -0,0 +1,38 @@
# OHLCV Predictor - Simple Inference

Refactored for easy reuse in other projects.

## Usage

```python
from predictor import OHLCVPredictor

predictor = OHLCVPredictor('model.json')
predictions = predictor.predict(your_ohlcv_dataframe)
```

## Files Needed

Copy these 5 files to your other project:

1. `predictor.py`
2. `custom_xgboost.py`
3. `feature_engineering.py`
4. `technical_indicator_functions.py`
5. `xgboost_model_all_features.json`

## Data Requirements

Your DataFrame needs these columns:

- `Open`, `High`, `Low`, `Close`, `Volume`, `Timestamp`

## Dependencies

```
xgboost >= 3.0.2
pandas >= 2.2.3
numpy >= 2.2.3
scikit-learn >= 1.6.1
ta >= 0.11.0
numba >= 0.61.2
```
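As a rough illustration of the data requirements above (values are invented; in practice pass at least a few hundred rows so the lags and 30-bar rolling windows in `feature_engineering.py` have data to work with), an input frame can be assembled like this:

```python
import pandas as pd

df = pd.DataFrame({
    'Timestamp': [1704067200, 1704067260, 1704067320],  # Unix seconds
    'Open':   [50000.0, 50010.0, 50005.0],
    'High':   [50020.0, 50030.0, 50015.0],
    'Low':    [49990.0, 50000.0, 49995.0],
    'Close':  [50010.0, 50005.0, 50012.0],
    'Volume': [120.5, 98.2, 143.7],
})
# predictions = OHLCVPredictor('model.json').predict(df)
```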
custom_xgboost.py
@@ -2,15 +2,34 @@ import xgboost as xgb
 import numpy as np
 
 class CustomXGBoostGPU:
-    def __init__(self, X_train, X_test, y_train, y_test):
-        self.X_train = X_train.astype(np.float32)
-        self.X_test = X_test.astype(np.float32)
-        self.y_train = y_train.astype(np.float32)
-        self.y_test = y_test.astype(np.float32)
+    def __init__(self, X_train=None, X_test=None, y_train=None, y_test=None):
+        # Make training data optional for inference-only usage
+        self.X_train = X_train.astype(np.float32) if X_train is not None else None
+        self.X_test = X_test.astype(np.float32) if X_test is not None else None
+        self.y_train = y_train.astype(np.float32) if y_train is not None else None
+        self.y_test = y_test.astype(np.float32) if y_test is not None else None
         self.model = None
         self.params = None  # Will be set during training
 
+    @classmethod
+    def load_model(cls, model_path):
+        """Load a pre-trained model from file for inference
+
+        Args:
+            model_path (str): Path to the saved XGBoost model file
+
+        Returns:
+            CustomXGBoostGPU: Instance with loaded model ready for inference
+        """
+        instance = cls()  # Create instance without training data
+        instance.model = xgb.Booster()
+        instance.model.load_model(model_path)
+        return instance
+
     def train(self, **xgb_params):
+        if self.X_train is None or self.y_train is None:
+            raise ValueError('Training data is required for training. Use load_model() for inference-only usage.')
+
         params = {
             'tree_method': 'hist',
             'device': 'cuda',
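A minimal sketch of the inference-only path this hunk enables. Assumptions: the model JSON exists at the path used elsewhere in this commit, and predictions go through the underlying `xgb.Booster`, since the class's own predict wrapper is not part of this hunk; the feature matrix below is a placeholder and would need the same columns the model was trained on.

```python
import numpy as np
import xgboost as xgb
from custom_xgboost import CustomXGBoostGPU

model = CustomXGBoostGPU.load_model('../data/xgboost_model_all_features.json')

# Placeholder features; in the real pipeline these come from feature_engineering().
X_new = np.zeros((4, 10), dtype=np.float32)
preds = model.model.predict(xgb.DMatrix(X_new))  # model.model is the loaded xgb.Booster
```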
feature_engineering.py
@@ -2,158 +2,202 @@ import os
 import numpy as np
 import pandas as pd
 import ta
-from technical_indicator_functions import *
+try:
+    from .technical_indicator_functions import *
+except ImportError:
+    from technical_indicator_functions import *
 
 def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
-    feature_file = f'../data/{csv_prefix}_rsi.npy'
+    """
+    Compute and/or load features for the given DataFrame.
+    If csv_prefix is provided, features are cached to disk; otherwise, features are only computed in memory.
+
+    Args:
+        df (pd.DataFrame): Input OHLCV data.
+        csv_prefix (str or None): Prefix for feature files (for caching). If None or '', disables caching.
+        ohlcv_cols (list): List of OHLCV column names.
+        lags (int): Number of lag features.
+        window_sizes (list): List of window sizes for rolling features.
+
+    Returns:
+        dict: Dictionary of computed features.
+    """
     features_dict = {}
+
+    # RSI
+    if csv_prefix:
+        feature_file = f'../data/{csv_prefix}_rsi.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['rsi'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: rsi')
             _, values = calc_rsi(df['Close'])
             features_dict['rsi'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_rsi(df['Close'])
+        features_dict['rsi'] = values
 
     # MACD
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_macd.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['macd'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: macd')
             _, values = calc_macd(df['Close'])
             features_dict['macd'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_macd(df['Close'])
+        features_dict['macd'] = values
 
     # ATR
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_atr.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['atr'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: atr')
             _, values = calc_atr(df['High'], df['Low'], df['Close'])
             features_dict['atr'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_atr(df['High'], df['Low'], df['Close'])
+        features_dict['atr'] = values
 
     # CCI
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_cci.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['cci'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: cci')
             _, values = calc_cci(df['High'], df['Low'], df['Close'])
             features_dict['cci'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_cci(df['High'], df['Low'], df['Close'])
+        features_dict['cci'] = values
 
     # Williams %R
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_williams_r.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['williams_r'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: williams_r')
             _, values = calc_williamsr(df['High'], df['Low'], df['Close'])
             features_dict['williams_r'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_williamsr(df['High'], df['Low'], df['Close'])
+        features_dict['williams_r'] = values
 
     # EMA 14
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_ema_14.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['ema_14'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: ema_14')
             _, values = calc_ema(df['Close'])
             features_dict['ema_14'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_ema(df['Close'])
+        features_dict['ema_14'] = values
 
     # OBV
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_obv.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['obv'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: obv')
             _, values = calc_obv(df['Close'], df['Volume'])
             features_dict['obv'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_obv(df['Close'], df['Volume'])
+        features_dict['obv'] = values
 
     # CMF
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_cmf.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['cmf'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: cmf')
             _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
             features_dict['cmf'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
+        features_dict['cmf'] = values
 
     # ROC 10
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_roc_10.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['roc_10'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: roc_10')
             _, values = calc_roc(df['Close'])
             features_dict['roc_10'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_roc(df['Close'])
+        features_dict['roc_10'] = values
 
     # DPO 20
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_dpo_20.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['dpo_20'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: dpo_20')
             _, values = calc_dpo(df['Close'])
             features_dict['dpo_20'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_dpo(df['Close'])
+        features_dict['dpo_20'] = values
 
     # Ultimate Oscillator
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_ultimate_osc.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['ultimate_osc'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: ultimate_osc')
             _, values = calc_ultimate(df['High'], df['Low'], df['Close'])
             features_dict['ultimate_osc'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_ultimate(df['High'], df['Low'], df['Close'])
+        features_dict['ultimate_osc'] = values
 
     # Daily Return
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_daily_return.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['daily_return'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: daily_return')
             _, values = calc_daily_return(df['Close'])
             features_dict['daily_return'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_daily_return(df['Close'])
+        features_dict['daily_return'] = values
 
     # Multi-column indicators
     # Bollinger Bands
     result = calc_bollinger(df['Close'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -161,11 +205,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # Stochastic Oscillator
     result = calc_stochastic(df['High'], df['Low'], df['Close'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -173,11 +219,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # SMA
     result = calc_sma(df['Close'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -185,11 +233,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # PSAR
     result = calc_psar(df['High'], df['Low'], df['Close'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -197,11 +247,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # Donchian Channel
     result = calc_donchian(df['High'], df['Low'], df['Close'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -209,11 +261,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # Keltner Channel
     result = calc_keltner(df['High'], df['Low'], df['Close'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -221,11 +275,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # Ichimoku
     result = calc_ichimoku(df['High'], df['Low'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -233,11 +289,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # Elder Ray
     result = calc_elder_ray(df['Close'], df['Low'], df['High'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -245,7 +303,8 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # Prepare lags, rolling stats, log returns, and volatility features sequentially
     # Lags
@@ -253,14 +312,17 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
         for lag in range(1, lags + 1):
             feature_name = f'{col}_lag{lag}'
             feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
+            if csv_prefix:
                 if os.path.exists(feature_file):
                     features_dict[feature_name] = np.load(feature_file)
                 else:
-                    print(f'Computing lag feature: {feature_name}')
                     result = compute_lag(df, col, lag)
                     features_dict[feature_name] = result
                     np.save(feature_file, result.values)
-                    print(f'Saved feature: {feature_file}')
+            else:
+                result = compute_lag(df, col, lag)
+                features_dict[feature_name] = result
 
     # Rolling statistics
     for col in ohlcv_cols:
         for window in window_sizes:
@@ -275,44 +337,52 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             for stat in ['mean', 'std', 'min', 'max']:
                 feature_name = f'{col}_roll_{stat}_{window}'
                 feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
+                if csv_prefix:
                     if os.path.exists(feature_file):
                         features_dict[feature_name] = np.load(feature_file)
                     else:
-                        print(f'Computing rolling stat feature: {feature_name}')
                         result = compute_rolling(df, col, stat, window)
                         features_dict[feature_name] = result
                         np.save(feature_file, result.values)
-                        print(f'Saved feature: {feature_file}')
+                else:
+                    result = compute_rolling(df, col, stat, window)
+                    features_dict[feature_name] = result
 
     # Log returns for different horizons
     for horizon in [5, 15, 30]:
         feature_name = f'log_return_{horizon}'
         feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
+        if csv_prefix:
             if os.path.exists(feature_file):
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Computing log return feature: {feature_name}')
                 result = compute_log_return(df, horizon)
                 features_dict[feature_name] = result
                 np.save(feature_file, result.values)
-                print(f'Saved feature: {feature_file}')
+        else:
+            result = compute_log_return(df, horizon)
+            features_dict[feature_name] = result
 
     # Volatility
     for window in window_sizes:
         feature_name = f'volatility_{window}'
         feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
+        if csv_prefix:
             if os.path.exists(feature_file):
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Computing volatility feature: {feature_name}')
                 result = compute_volatility(df, window)
                 features_dict[feature_name] = result
                 np.save(feature_file, result.values)
-                print(f'Saved feature: {feature_file}')
+        else:
+            result = compute_volatility(df, window)
+            features_dict[feature_name] = result
 
     # --- Additional Technical Indicator Features ---
     # ADX
     adx_names = ['adx', 'adx_pos', 'adx_neg']
     adx_files = [f'../data/{csv_prefix}_{name}.npy' for name in adx_names]
-    if all(os.path.exists(f) for f in adx_files):
+    if csv_prefix and all(os.path.exists(f) for f in adx_files):
         for name, f in zip(adx_names, adx_files):
             arr = np.load(f)
             features_dict[name] = pd.Series(arr, index=df.index)
@@ -321,20 +391,22 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
         for subname, values in result:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             features_dict[subname] = values
+            if csv_prefix:
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
 
     # Force Index
     feature_file = f'../data/{csv_prefix}_force_index.npy'
+    if csv_prefix:
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['force_index'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: force_index')
             _, values = calc_force_index(df['Close'], df['Volume'])
             features_dict['force_index'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_force_index(df['Close'], df['Volume'])
+        features_dict['force_index'] = values
 
     # Supertrend indicators (simplified implementation)
     for period, multiplier in [(12, 3.0), (10, 1.0), (11, 2.0)]:
@@ -342,26 +414,23 @@
         st_trend_name = f'supertrend_trend_{period}_{multiplier}'
         st_file = f'../data/{csv_prefix}_{st_name}.npy'
         st_trend_file = f'../data/{csv_prefix}_{st_trend_name}.npy'
-        if os.path.exists(st_file) and os.path.exists(st_trend_file):
+        if csv_prefix and os.path.exists(st_file) and os.path.exists(st_trend_file):
             features_dict[st_name] = pd.Series(np.load(st_file), index=df.index)
             features_dict[st_trend_name] = pd.Series(np.load(st_trend_file), index=df.index)
         else:
-            print(f'Calculating Supertrend indicator: {st_name}')
             # Simple supertrend alternative using ATR and moving averages
             from ta.volatility import AverageTrueRange
             atr = AverageTrueRange(df['High'], df['Low'], df['Close'], window=period).average_true_range()
             hl_avg = (df['High'] + df['Low']) / 2
             basic_ub = hl_avg + (multiplier * atr)
             basic_lb = hl_avg - (multiplier * atr)
 
             # Simplified supertrend calculation
             supertrend = hl_avg.copy()
             trend = pd.Series(1, index=df.index)  # 1 for uptrend, -1 for downtrend
 
             features_dict[st_name] = supertrend
             features_dict[st_trend_name] = trend
+            if csv_prefix:
                 np.save(st_file, features_dict[st_name].values)
                 np.save(st_trend_file, features_dict[st_trend_name].values)
-                print(f'Saved features: {st_file}, {st_trend_file}')
 
     return features_dict
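Every indicator block above now repeats the same load-or-compute-and-cache logic keyed on `csv_prefix`. A condensed sketch of that pattern; the helper name `cached_feature` is illustrative only and is not part of the repository:

```python
import os
import numpy as np
import pandas as pd

def cached_feature(name, csv_prefix, index, compute):
    """Return a feature Series, using ../data/{csv_prefix}_{name}.npy as a cache when csv_prefix is set."""
    if csv_prefix:
        path = f'../data/{csv_prefix}_{name}.npy'
        if os.path.exists(path):
            return pd.Series(np.load(path), index=index)
        values = compute()
        np.save(path, values.values)
        return values
    return compute()

# e.g. features_dict['rsi'] = cached_feature('rsi', csv_prefix, df.index, lambda: calc_rsi(df['Close'])[1])
```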
inference_example.py (new file, 299 lines)
@@ -0,0 +1,299 @@
"""
Complete example showing how to use the OHLCVPredictor for making predictions.
This example demonstrates:
1. Loading a trained model
2. Preparing sample OHLCV data
3. Making log return predictions
4. Making price predictions
5. Evaluating and displaying results
"""

import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from predictor import OHLCVPredictor

def create_sample_ohlcv_data(num_samples=200):
    """
    Create realistic sample OHLCV data for demonstration.
    In practice, replace this with your actual data loading.

    Returns:
        pd.DataFrame: DataFrame with OHLCV data
    """
    print("Creating sample OHLCV data for demonstration...")

    # Start with a base price and simulate realistic price movements
    np.random.seed(42)  # For reproducible results
    base_price = 50000.0  # Base Bitcoin price

    # Generate timestamps (1-minute intervals)
    start_time = datetime(2024, 1, 1)
    timestamps = [start_time + timedelta(minutes=i) for i in range(num_samples)]

    # Generate realistic price movements
    returns = np.random.normal(0, 0.001, num_samples)  # Small random returns
    prices = [base_price]

    for i in range(1, num_samples):
        # Add some trending behavior
        trend = 0.0001 * np.sin(i / 50.0)  # Gentle sinusoidal trend
        price_change = returns[i] + trend
        new_price = prices[-1] * (1 + price_change)
        prices.append(max(new_price, 1000))  # Minimum price floor

    # Generate OHLCV data
    data = []
    for i in range(num_samples):
        price = prices[i]

        # Generate realistic OHLC within a reasonable range
        volatility = abs(np.random.normal(0, 0.002))  # Random volatility
        high = price * (1 + volatility)
        low = price * (1 - volatility)

        # Ensure OHLC relationships are correct
        open_price = price * (1 + np.random.normal(0, 0.0005))
        close_price = price * (1 + np.random.normal(0, 0.0005))

        # Ensure high is highest and low is lowest
        high = max(high, open_price, close_price)
        low = min(low, open_price, close_price)

        # Generate volume (typically higher during price movements)
        base_volume = 100 + abs(np.random.normal(0, 50))
        volume_multiplier = 1 + abs(open_price - close_price) / close_price * 10
        volume = base_volume * volume_multiplier

        data.append({
            'Timestamp': timestamps[i],
            'Open': round(open_price, 2),
            'High': round(high, 2),
            'Low': round(low, 2),
            'Close': round(close_price, 2),
            'Volume': round(volume, 2)
        })

    df = pd.DataFrame(data)

    # Calculate log returns (required by feature engineering)
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))

    print(f"Generated {len(df)} samples of OHLCV data")
    print(f"Price range: ${df['Close'].min():.2f} - ${df['Close'].max():.2f}")
    return df

def load_real_data_example():
    """
    Example of how to load real OHLCV data.
    Replace this with your actual data loading logic.

    Returns:
        pd.DataFrame or None: Real OHLCV data if available
    """
    # Example paths where real data might be located
    possible_paths = [
        '../data/btcusd_1-min_data.csv',
        '../data/sample_data.csv',
        'data/crypto_data.csv'
    ]

    for path in possible_paths:
        if os.path.exists(path):
            print(f"Loading real data from {path}...")
            try:
                df = pd.read_csv(path)
                # Ensure required columns exist
                required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']
                if all(col in df.columns for col in required_cols):
                    # Filter out zero volume entries and calculate log returns
                    df = df[df['Volume'] != 0].reset_index(drop=True)
                    # Use only recent data and ensure proper data types
                    df = df.tail(500).reset_index(drop=True)  # Get more data for better feature engineering
                    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
                    print(f"Successfully loaded {len(df)} rows of real data")
                    return df.tail(200)  # Use last 200 for final processing
                else:
                    print(f"Missing required columns in {path}")
            except Exception as e:
                print(f"Error loading {path}: {e}")

    return None
def display_prediction_results(df, log_return_preds, predicted_prices=None, actual_prices=None):
    """
    Display prediction results in a readable format.

    Args:
        df: Original OHLCV DataFrame
        log_return_preds: Array of log return predictions
        predicted_prices: Array of predicted prices (optional)
        actual_prices: Array of actual prices (optional)
    """
    print("\n" + "="*60)
    print("PREDICTION RESULTS")
    print("="*60)

    # Convert timestamps back to readable format for display
    df_display = df.copy()
    df_display['Timestamp'] = pd.to_datetime(df_display['Timestamp'], unit='s')

    print(f"\nLog Return Predictions (first 10):")
    print("-" * 40)
    for i in range(min(10, len(log_return_preds))):
        timestamp = df_display.iloc[i]['Timestamp']
        close_price = df_display.iloc[i]['Close']
        log_ret = log_return_preds[i]
        direction = "UP" if log_ret > 0 else "DOWN"
        print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
              f"Close: ${close_price:8.2f} | "
              f"Log Return: {log_ret:8.6f} | "
              f"Direction: {direction}")

    if predicted_prices is not None and actual_prices is not None:
        print(f"\nPrice Predictions vs Actual (first 10):")
        print("-" * 50)
        for i in range(min(10, len(predicted_prices))):
            timestamp = df_display.iloc[i]['Timestamp']
            pred_price = predicted_prices[i]
            actual_price = actual_prices[i]
            error = abs(pred_price - actual_price)
            error_pct = (error / actual_price) * 100
            print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
                  f"Predicted: ${pred_price:8.2f} | "
                  f"Actual: ${actual_price:8.2f} | "
                  f"Error: {error_pct:5.2f}%")

    # Statistics
    print(f"\nPrediction Statistics:")
    print("-" * 30)
    print(f"Total predictions: {len(log_return_preds)}")
    print(f"Mean log return: {np.mean(log_return_preds):.6f}")
    print(f"Std log return: {np.std(log_return_preds):.6f}")
    print(f"Positive predictions: {np.sum(log_return_preds > 0)} ({np.mean(log_return_preds > 0)*100:.1f}%)")
    print(f"Negative predictions: {np.sum(log_return_preds < 0)} ({np.mean(log_return_preds < 0)*100:.1f}%)")

    if predicted_prices is not None and actual_prices is not None:
        mae = np.mean(np.abs(predicted_prices - actual_prices))
        mape = np.mean(np.abs((predicted_prices - actual_prices) / actual_prices)) * 100
        print(f"\nPrice Prediction Accuracy:")
        print(f"Mean Absolute Error: ${mae:.2f}")
        print(f"Mean Absolute Percentage Error: {mape:.2f}%")

def demonstrate_batch_prediction(predictor, df):
    """
    Demonstrate batch prediction on multiple data chunks.

    Args:
        predictor: OHLCVPredictor instance
        df: OHLCV DataFrame
    """
    print("\n" + "="*60)
    print("BATCH PREDICTION DEMONSTRATION")
    print("="*60)

    chunk_size = 50
    num_chunks = min(3, len(df) // chunk_size)

    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = start_idx + chunk_size
        chunk_df = df.iloc[start_idx:end_idx].copy()

        print(f"\nBatch {i+1}: Processing {len(chunk_df)} samples...")

        try:
            log_return_preds = predictor.predict(chunk_df, csv_prefix=f'batch_{i+1}')
            print(f"Successfully predicted {len(log_return_preds)} log returns")
            print(f"Batch {i+1} mean prediction: {np.mean(log_return_preds):.6f}")

        except Exception as e:
            print(f"Error in batch {i+1}: {e}")

def main():
    """
    Main function demonstrating complete OHLCVPredictor usage.
    """
    model_path = '../data/xgboost_model_all_features.json'

    # Check if model exists
    if not os.path.exists(model_path):
        print("Model not found. Run main.py first to train the model.")
        print(f"Expected model path: {model_path}")
        return

    try:
        # Load predictor
        print("Loading predictor...")
        predictor = OHLCVPredictor(model_path)
        print("Predictor loaded successfully!")

        # Try to load real data first, fall back to synthetic data
        df = load_real_data_example()
        if df is None:
            df = create_sample_ohlcv_data(200)

        print(f"\nDataFrame shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        print(f"Data range: {len(df)} samples")

        # Demonstrate log return predictions
        print("\n" + "="*60)
        print("LOG RETURN PREDICTIONS")
        print("="*60)

        log_return_preds = predictor.predict(df, csv_prefix='inference_demo')
        print(f"Generated {len(log_return_preds)} log return predictions")

        # Demonstrate price predictions
        print("\n" + "="*60)
        print("PRICE PREDICTIONS")
        print("="*60)

        predicted_prices, actual_prices = predictor.predict_prices(df, csv_prefix='price_demo')
        print(f"Generated {len(predicted_prices)} price predictions")

        # Display results
        display_prediction_results(df, log_return_preds, predicted_prices, actual_prices)

        # Demonstrate batch processing
        demonstrate_batch_prediction(predictor, df)

        print("\n" + "="*60)
        print("USAGE EXAMPLES FOR OTHER PROJECTS")
        print("="*60)
        print("""
# Basic usage:
from predictor import OHLCVPredictor

# Load your trained model
predictor = OHLCVPredictor('path/to/your/model.json')

# Prepare your OHLCV data (pandas DataFrame with columns):
# ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']

# Get log return predictions
log_returns = predictor.predict(your_dataframe)

# Get price predictions
predicted_prices, actual_prices = predictor.predict_prices(your_dataframe)

# Required files for deployment:
# - predictor.py
# - custom_xgboost.py
# - feature_engineering.py
# - technical_indicator_functions.py
# - your_trained_model.json
        """)

    except FileNotFoundError as e:
        print(f"File not found: {e}")
        print("Make sure the model file exists and the path is correct.")

    except Exception as e:
        print(f"Error during prediction: {e}")
        print("Check your data format and model compatibility.")

if __name__ == '__main__':
    main()
predictor.py (new file, 97 lines)
@@ -0,0 +1,97 @@
import pandas as pd
import numpy as np
import os

try:
    from .custom_xgboost import CustomXGBoostGPU
except ImportError:
    from custom_xgboost import CustomXGBoostGPU

try:
    from .feature_engineering import feature_engineering
except ImportError:
    from feature_engineering import feature_engineering

class OHLCVPredictor:
    def __init__(self, model_path):
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file not found: {model_path}")

        self.model = CustomXGBoostGPU.load_model(model_path)
        self.exclude_cols = self._get_excluded_features()

    def _get_excluded_features(self):
        """Get the list of features to exclude (copied from main.py)"""
        exclude_cols = ['Timestamp', 'Close']
        exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30']
        exclude_cols += ['bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
                         'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband',
                         'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
                         'Open_lag1', 'Open_lag2', 'Open_lag3', 'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2',
                         'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15',
                         'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15',
                         'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30',
                         'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
                         'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30',
                         'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15',
                         'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0']
        return exclude_cols

    def predict(self, df, csv_prefix=None):
        # Validate input DataFrame
        required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        # Make a copy and preprocess
        df = df.copy()
        df = df[df['Volume'] != 0].reset_index(drop=True)

        # Convert timestamps
        if df['Timestamp'].dtype == 'object':
            df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        else:
            df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')

        # Feature engineering
        ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
        features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, 3, [5, 15, 30])
        features_df = pd.DataFrame(features_dict)
        df = pd.concat([df, features_df], axis=1)

        # Downcast and add time features (exclude Timestamp to preserve datetime)
        for col in df.columns:
            if col != 'Timestamp':  # Don't convert Timestamp to numeric
                try:
                    df[col] = pd.to_numeric(df[col], downcast='float')
                except Exception:
                    pass

        df['hour'] = df['Timestamp'].dt.hour

        # Handle NaNs
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if df[col].isna().any():
                df[col] = df[col].fillna(df[col].mean())

        # Defragment DataFrame after all columns have been added
        df = df.copy()

        # Select features and predict
        feature_cols = [col for col in df.columns if col not in self.exclude_cols]
        X = df[feature_cols].values.astype(np.float32)
        return self.model.predict(X)

    def predict_prices(self, df, csv_prefix=None):
        log_return_preds = self.predict(df, csv_prefix)
        df_clean = df[df['Volume'] != 0].copy()
        close_prices = df_clean['Close'].values

        predicted_prices = [close_prices[0]]
        for i, log_ret in enumerate(log_return_preds[1:], 1):
            if i < len(close_prices):
                predicted_prices.append(predicted_prices[-1] * np.exp(log_ret))

        return np.array(predicted_prices), close_prices[:len(predicted_prices)]
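For reference, `predict_prices` above rebuilds a price path by compounding `exp(log_return)` onto the previous predicted price rather than onto each actual close. A toy illustration of that update rule (numbers invented, not model output):

```python
import numpy as np

close_0 = 50_000.0
log_returns = [0.001, -0.0005, 0.002]  # hypothetical predicted log returns

path = [close_0]
for r in log_returns:
    # Same update as predict_prices: compound on the previous *predicted* price
    path.append(path[-1] * np.exp(r))
```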
pyproject.toml
@@ -12,3 +12,11 @@ dependencies = [
     "ta>=0.11.0",
     "xgboost>=3.0.2",
 ]
+
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+include = ["ohlcvpredictor*"]
+exclude = ["charts*"]
|||||||
4
uv.lock
generated
4
uv.lock
generated
@ -1,5 +1,5 @@
|
|||||||
version = 1
|
version = 1
|
||||||
revision = 2
|
revision = 3
|
||||||
requires-python = ">=3.12"
|
requires-python = ">=3.12"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -309,7 +309,7 @@ wheels = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "ohlcvpredictor"
|
name = "ohlcvpredictor"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = { virtual = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "dash" },
|
{ name = "dash" },
|
||||||
{ name = "numba" },
|
{ name = "numba" },
|
||||||
|
|||||||