Enhance CustomXGBoostGPU for inference by making training data optional and adding model loading functionality. Update feature engineering to support caching and improve data handling. Introduce a new inference example and README for better usability in other projects.
parent b56d9ea3a1
commit a419764fff
38 INFERENCE_README.md Normal file

@@ -0,0 +1,38 @@
# OHLCV Predictor - Simple Inference

Refactored for easy reuse in other projects.

## Usage

```python
from predictor import OHLCVPredictor

predictor = OHLCVPredictor('model.json')
predictions = predictor.predict(your_ohlcv_dataframe)
```

## Files Needed

Copy these 5 files to your other project:

1. `predictor.py`
2. `custom_xgboost.py`
3. `feature_engineering.py`
4. `technical_indicator_functions.py`
5. `xgboost_model_all_features.json`

## Data Requirements

Your DataFrame needs these columns:
- `Open`, `High`, `Low`, `Close`, `Volume`, `Timestamp`
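
For illustration, a minimal DataFrame with the required columns might be built like this (the values are made up; per `predictor.py`, `Timestamp` can be Unix seconds or parseable datetime strings):

```python
import pandas as pd

# Tiny illustrative frame; real input would normally span hundreds of rows or more.
your_ohlcv_dataframe = pd.DataFrame({
    'Timestamp': [1704067200, 1704067260],  # Unix seconds (datetime strings also work)
    'Open':      [42000.0, 42010.0],
    'High':      [42050.0, 42060.0],
    'Low':       [41990.0, 42000.0],
    'Close':     [42010.0, 42055.0],
    'Volume':    [12.5, 9.8],
})
```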

## Dependencies

```
xgboost >= 3.0.2
pandas >= 2.2.3
numpy >= 2.2.3
scikit-learn >= 1.6.1
ta >= 0.11.0
numba >= 0.61.2
```
custom_xgboost.py

@@ -2,15 +2,34 @@ import xgboost as xgb
import numpy as np

class CustomXGBoostGPU:
    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train.astype(np.float32)
        self.X_test = X_test.astype(np.float32)
        self.y_train = y_train.astype(np.float32)
        self.y_test = y_test.astype(np.float32)
    def __init__(self, X_train=None, X_test=None, y_train=None, y_test=None):
        # Make training data optional for inference-only usage
        self.X_train = X_train.astype(np.float32) if X_train is not None else None
        self.X_test = X_test.astype(np.float32) if X_test is not None else None
        self.y_train = y_train.astype(np.float32) if y_train is not None else None
        self.y_test = y_test.astype(np.float32) if y_test is not None else None
        self.model = None
        self.params = None  # Will be set during training

    @classmethod
    def load_model(cls, model_path):
        """Load a pre-trained model from file for inference

        Args:
            model_path (str): Path to the saved XGBoost model file

        Returns:
            CustomXGBoostGPU: Instance with loaded model ready for inference
        """
        instance = cls()  # Create instance without training data
        instance.model = xgb.Booster()
        instance.model.load_model(model_path)
        return instance

    def train(self, **xgb_params):
        if self.X_train is None or self.y_train is None:
            raise ValueError('Training data is required for training. Use load_model() for inference-only usage.')

        params = {
            'tree_method': 'hist',
            'device': 'cuda',
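As a rough illustration of the new inference-only path (not part of the diff): `load_model()` builds an instance whose `model` attribute is a plain `xgb.Booster`, so standalone use might look like the sketch below. The feature matrix `X` is a placeholder; in the project it comes from `feature_engineering` via `predictor.py`.

```python
import numpy as np
import xgboost as xgb

from custom_xgboost import CustomXGBoostGPU

# Load a previously trained booster without supplying any training data.
clf = CustomXGBoostGPU.load_model('xgboost_model_all_features.json')

# Placeholder feature matrix; real inputs must use the same columns and order
# as at training time, cast to float32.
X = np.random.rand(10, 5).astype(np.float32)

# clf.model is an xgb.Booster, so prediction goes through a DMatrix.
preds = clf.model.predict(xgb.DMatrix(X))
print(preds.shape)
```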
feature_engineering.py

@@ -2,250 +2,309 @@ import os
import numpy as np
import pandas as pd
import ta
from technical_indicator_functions import *

try:
    from .technical_indicator_functions import *
except ImportError:
    from technical_indicator_functions import *

def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
    feature_file = f'../data/{csv_prefix}_rsi.npy'
    """
    Compute and/or load features for the given DataFrame.
    If csv_prefix is provided, features are cached to disk; otherwise, features are only computed in memory.

    Args:
        df (pd.DataFrame): Input OHLCV data.
        csv_prefix (str or None): Prefix for feature files (for caching). If None or '', disables caching.
        ohlcv_cols (list): List of OHLCV column names.
        lags (int): Number of lag features.
        window_sizes (list): List of window sizes for rolling features.

    Returns:
        dict: Dictionary of computed features.
    """
    features_dict = {}

    if os.path.exists(feature_file):
        arr = np.load(feature_file)
        features_dict['rsi'] = pd.Series(arr, index=df.index)
    # RSI
    if csv_prefix:
        feature_file = f'../data/{csv_prefix}_rsi.npy'
        if os.path.exists(feature_file):
            arr = np.load(feature_file)
            features_dict['rsi'] = pd.Series(arr, index=df.index)
        else:
            _, values = calc_rsi(df['Close'])
            features_dict['rsi'] = values
            np.save(feature_file, values.values)
    else:
        print('Calculating feature: rsi')
        _, values = calc_rsi(df['Close'])
        features_dict['rsi'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # MACD
    feature_file = f'../data/{csv_prefix}_macd.npy'
    if os.path.exists(feature_file):
        arr = np.load(feature_file)
        features_dict['macd'] = pd.Series(arr, index=df.index)
    if csv_prefix:
        feature_file = f'../data/{csv_prefix}_macd.npy'
        if os.path.exists(feature_file):
            arr = np.load(feature_file)
            features_dict['macd'] = pd.Series(arr, index=df.index)
        else:
            _, values = calc_macd(df['Close'])
            features_dict['macd'] = values
            np.save(feature_file, values.values)
    else:
        print('Calculating feature: macd')
        _, values = calc_macd(df['Close'])
        features_dict['macd'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # ATR
    feature_file = f'../data/{csv_prefix}_atr.npy'
    if os.path.exists(feature_file):
        arr = np.load(feature_file)
        features_dict['atr'] = pd.Series(arr, index=df.index)
    if csv_prefix:
        feature_file = f'../data/{csv_prefix}_atr.npy'
        if os.path.exists(feature_file):
            arr = np.load(feature_file)
            features_dict['atr'] = pd.Series(arr, index=df.index)
        else:
            _, values = calc_atr(df['High'], df['Low'], df['Close'])
            features_dict['atr'] = values
            np.save(feature_file, values.values)
    else:
        print('Calculating feature: atr')
        _, values = calc_atr(df['High'], df['Low'], df['Close'])
        features_dict['atr'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # CCI
    feature_file = f'../data/{csv_prefix}_cci.npy'
    if os.path.exists(feature_file):
        arr = np.load(feature_file)
        features_dict['cci'] = pd.Series(arr, index=df.index)
    if csv_prefix:
        feature_file = f'../data/{csv_prefix}_cci.npy'
        if os.path.exists(feature_file):
            arr = np.load(feature_file)
            features_dict['cci'] = pd.Series(arr, index=df.index)
        else:
            _, values = calc_cci(df['High'], df['Low'], df['Close'])
            features_dict['cci'] = values
            np.save(feature_file, values.values)
    else:
        print('Calculating feature: cci')
        _, values = calc_cci(df['High'], df['Low'], df['Close'])
        features_dict['cci'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # Williams %R
    feature_file = f'../data/{csv_prefix}_williams_r.npy'
    if os.path.exists(feature_file):
        arr = np.load(feature_file)
        features_dict['williams_r'] = pd.Series(arr, index=df.index)
    if csv_prefix:
        feature_file = f'../data/{csv_prefix}_williams_r.npy'
        if os.path.exists(feature_file):
            arr = np.load(feature_file)
            features_dict['williams_r'] = pd.Series(arr, index=df.index)
        else:
            _, values = calc_williamsr(df['High'], df['Low'], df['Close'])
            features_dict['williams_r'] = values
            np.save(feature_file, values.values)
    else:
        print('Calculating feature: williams_r')
        _, values = calc_williamsr(df['High'], df['Low'], df['Close'])
        features_dict['williams_r'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # EMA 14
    feature_file = f'../data/{csv_prefix}_ema_14.npy'
    if os.path.exists(feature_file):
        arr = np.load(feature_file)
        features_dict['ema_14'] = pd.Series(arr, index=df.index)
    if csv_prefix:
        feature_file = f'../data/{csv_prefix}_ema_14.npy'
        if os.path.exists(feature_file):
            arr = np.load(feature_file)
            features_dict['ema_14'] = pd.Series(arr, index=df.index)
        else:
            _, values = calc_ema(df['Close'])
            features_dict['ema_14'] = values
            np.save(feature_file, values.values)
    else:
        print('Calculating feature: ema_14')
        _, values = calc_ema(df['Close'])
        features_dict['ema_14'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # OBV
    feature_file = f'../data/{csv_prefix}_obv.npy'
    if os.path.exists(feature_file):
        arr = np.load(feature_file)
        features_dict['obv'] = pd.Series(arr, index=df.index)
    if csv_prefix:
        feature_file = f'../data/{csv_prefix}_obv.npy'
        if os.path.exists(feature_file):
            arr = np.load(feature_file)
            features_dict['obv'] = pd.Series(arr, index=df.index)
        else:
            _, values = calc_obv(df['Close'], df['Volume'])
            features_dict['obv'] = values
            np.save(feature_file, values.values)
    else:
        print('Calculating feature: obv')
        _, values = calc_obv(df['Close'], df['Volume'])
        features_dict['obv'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # CMF
    feature_file = f'../data/{csv_prefix}_cmf.npy'
    if os.path.exists(feature_file):
        arr = np.load(feature_file)
        features_dict['cmf'] = pd.Series(arr, index=df.index)
    if csv_prefix:
        feature_file = f'../data/{csv_prefix}_cmf.npy'
        if os.path.exists(feature_file):
            arr = np.load(feature_file)
            features_dict['cmf'] = pd.Series(arr, index=df.index)
        else:
            _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
            features_dict['cmf'] = values
            np.save(feature_file, values.values)
    else:
        print('Calculating feature: cmf')
        _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
        features_dict['cmf'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # ROC 10
    feature_file = f'../data/{csv_prefix}_roc_10.npy'
    if os.path.exists(feature_file):
        arr = np.load(feature_file)
        features_dict['roc_10'] = pd.Series(arr, index=df.index)
    if csv_prefix:
        feature_file = f'../data/{csv_prefix}_roc_10.npy'
        if os.path.exists(feature_file):
            arr = np.load(feature_file)
            features_dict['roc_10'] = pd.Series(arr, index=df.index)
        else:
            _, values = calc_roc(df['Close'])
            features_dict['roc_10'] = values
            np.save(feature_file, values.values)
    else:
        print('Calculating feature: roc_10')
        _, values = calc_roc(df['Close'])
        features_dict['roc_10'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # DPO 20
    feature_file = f'../data/{csv_prefix}_dpo_20.npy'
    if os.path.exists(feature_file):
        arr = np.load(feature_file)
        features_dict['dpo_20'] = pd.Series(arr, index=df.index)
    if csv_prefix:
        feature_file = f'../data/{csv_prefix}_dpo_20.npy'
        if os.path.exists(feature_file):
            arr = np.load(feature_file)
            features_dict['dpo_20'] = pd.Series(arr, index=df.index)
        else:
            _, values = calc_dpo(df['Close'])
            features_dict['dpo_20'] = values
            np.save(feature_file, values.values)
    else:
        print('Calculating feature: dpo_20')
        _, values = calc_dpo(df['Close'])
        features_dict['dpo_20'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # Ultimate Oscillator
    feature_file = f'../data/{csv_prefix}_ultimate_osc.npy'
    if os.path.exists(feature_file):
        arr = np.load(feature_file)
        features_dict['ultimate_osc'] = pd.Series(arr, index=df.index)
    if csv_prefix:
        feature_file = f'../data/{csv_prefix}_ultimate_osc.npy'
        if os.path.exists(feature_file):
            arr = np.load(feature_file)
            features_dict['ultimate_osc'] = pd.Series(arr, index=df.index)
        else:
            _, values = calc_ultimate(df['High'], df['Low'], df['Close'])
            features_dict['ultimate_osc'] = values
            np.save(feature_file, values.values)
    else:
        print('Calculating feature: ultimate_osc')
        _, values = calc_ultimate(df['High'], df['Low'], df['Close'])
        features_dict['ultimate_osc'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # Daily Return
    feature_file = f'../data/{csv_prefix}_daily_return.npy'
    if os.path.exists(feature_file):
        arr = np.load(feature_file)
        features_dict['daily_return'] = pd.Series(arr, index=df.index)
    if csv_prefix:
        feature_file = f'../data/{csv_prefix}_daily_return.npy'
        if os.path.exists(feature_file):
            arr = np.load(feature_file)
            features_dict['daily_return'] = pd.Series(arr, index=df.index)
        else:
            _, values = calc_daily_return(df['Close'])
            features_dict['daily_return'] = values
            np.save(feature_file, values.values)
    else:
        print('Calculating feature: daily_return')
        _, values = calc_daily_return(df['Close'])
        features_dict['daily_return'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # Multi-column indicators
    # Bollinger Bands
    result = calc_bollinger(df['Close'])
    for subname, values in result:
        sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        if csv_prefix:
            sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
            if os.path.exists(sub_feature_file):
                arr = np.load(sub_feature_file)
                features_dict[subname] = pd.Series(arr, index=df.index)
            else:
                features_dict[subname] = values
                np.save(sub_feature_file, values.values)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')

    # Stochastic Oscillator
    result = calc_stochastic(df['High'], df['Low'], df['Close'])
    for subname, values in result:
        sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        if csv_prefix:
            sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
            if os.path.exists(sub_feature_file):
                arr = np.load(sub_feature_file)
                features_dict[subname] = pd.Series(arr, index=df.index)
            else:
                features_dict[subname] = values
                np.save(sub_feature_file, values.values)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')

    # SMA
    result = calc_sma(df['Close'])
    for subname, values in result:
        sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        if csv_prefix:
            sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
            if os.path.exists(sub_feature_file):
                arr = np.load(sub_feature_file)
                features_dict[subname] = pd.Series(arr, index=df.index)
            else:
                features_dict[subname] = values
                np.save(sub_feature_file, values.values)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')

    # PSAR
    result = calc_psar(df['High'], df['Low'], df['Close'])
    for subname, values in result:
        sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        if csv_prefix:
            sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
            if os.path.exists(sub_feature_file):
                arr = np.load(sub_feature_file)
                features_dict[subname] = pd.Series(arr, index=df.index)
            else:
                features_dict[subname] = values
                np.save(sub_feature_file, values.values)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')

    # Donchian Channel
    result = calc_donchian(df['High'], df['Low'], df['Close'])
    for subname, values in result:
        sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        if csv_prefix:
            sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
            if os.path.exists(sub_feature_file):
                arr = np.load(sub_feature_file)
                features_dict[subname] = pd.Series(arr, index=df.index)
            else:
                features_dict[subname] = values
                np.save(sub_feature_file, values.values)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')

    # Keltner Channel
    result = calc_keltner(df['High'], df['Low'], df['Close'])
    for subname, values in result:
        sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        if csv_prefix:
            sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
            if os.path.exists(sub_feature_file):
                arr = np.load(sub_feature_file)
                features_dict[subname] = pd.Series(arr, index=df.index)
            else:
                features_dict[subname] = values
                np.save(sub_feature_file, values.values)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')

    # Ichimoku
    result = calc_ichimoku(df['High'], df['Low'])
    for subname, values in result:
        sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        if csv_prefix:
            sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
            if os.path.exists(sub_feature_file):
                arr = np.load(sub_feature_file)
                features_dict[subname] = pd.Series(arr, index=df.index)
            else:
                features_dict[subname] = values
                np.save(sub_feature_file, values.values)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')

    # Elder Ray
    result = calc_elder_ray(df['Close'], df['Low'], df['High'])
    for subname, values in result:
        sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
        if os.path.exists(sub_feature_file):
            arr = np.load(sub_feature_file)
            features_dict[subname] = pd.Series(arr, index=df.index)
        if csv_prefix:
            sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
            if os.path.exists(sub_feature_file):
                arr = np.load(sub_feature_file)
                features_dict[subname] = pd.Series(arr, index=df.index)
            else:
                features_dict[subname] = values
                np.save(sub_feature_file, values.values)
        else:
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')

    # Prepare lags, rolling stats, log returns, and volatility features sequentially
    # Lags
@@ -253,14 +312,17 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
        for lag in range(1, lags + 1):
            feature_name = f'{col}_lag{lag}'
            feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
            if os.path.exists(feature_file):
                features_dict[feature_name] = np.load(feature_file)
            if csv_prefix:
                if os.path.exists(feature_file):
                    features_dict[feature_name] = np.load(feature_file)
                else:
                    result = compute_lag(df, col, lag)
                    features_dict[feature_name] = result
                    np.save(feature_file, result.values)
            else:
                print(f'Computing lag feature: {feature_name}')
                result = compute_lag(df, col, lag)
                features_dict[feature_name] = result
                np.save(feature_file, result.values)
                print(f'Saved feature: {feature_file}')

    # Rolling statistics
    for col in ohlcv_cols:
        for window in window_sizes:
@@ -275,44 +337,52 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
            for stat in ['mean', 'std', 'min', 'max']:
                feature_name = f'{col}_roll_{stat}_{window}'
                feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
                if os.path.exists(feature_file):
                    features_dict[feature_name] = np.load(feature_file)
                if csv_prefix:
                    if os.path.exists(feature_file):
                        features_dict[feature_name] = np.load(feature_file)
                    else:
                        result = compute_rolling(df, col, stat, window)
                        features_dict[feature_name] = result
                        np.save(feature_file, result.values)
                else:
                    print(f'Computing rolling stat feature: {feature_name}')
                    result = compute_rolling(df, col, stat, window)
                    features_dict[feature_name] = result
                    np.save(feature_file, result.values)
                    print(f'Saved feature: {feature_file}')

    # Log returns for different horizons
    for horizon in [5, 15, 30]:
        feature_name = f'log_return_{horizon}'
        feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
        if os.path.exists(feature_file):
            features_dict[feature_name] = np.load(feature_file)
        if csv_prefix:
            if os.path.exists(feature_file):
                features_dict[feature_name] = np.load(feature_file)
            else:
                result = compute_log_return(df, horizon)
                features_dict[feature_name] = result
                np.save(feature_file, result.values)
        else:
            print(f'Computing log return feature: {feature_name}')
            result = compute_log_return(df, horizon)
            features_dict[feature_name] = result
            np.save(feature_file, result.values)
            print(f'Saved feature: {feature_file}')

    # Volatility
    for window in window_sizes:
        feature_name = f'volatility_{window}'
        feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
        if os.path.exists(feature_file):
            features_dict[feature_name] = np.load(feature_file)
        if csv_prefix:
            if os.path.exists(feature_file):
                features_dict[feature_name] = np.load(feature_file)
            else:
                result = compute_volatility(df, window)
                features_dict[feature_name] = result
                np.save(feature_file, result.values)
        else:
            print(f'Computing volatility feature: {feature_name}')
            result = compute_volatility(df, window)
            features_dict[feature_name] = result
            np.save(feature_file, result.values)
            print(f'Saved feature: {feature_file}')

    # --- Additional Technical Indicator Features ---
    # ADX
    adx_names = ['adx', 'adx_pos', 'adx_neg']
    adx_files = [f'../data/{csv_prefix}_{name}.npy' for name in adx_names]
    if all(os.path.exists(f) for f in adx_files):
    if csv_prefix and all(os.path.exists(f) for f in adx_files):
        for name, f in zip(adx_names, adx_files):
            arr = np.load(f)
            features_dict[name] = pd.Series(arr, index=df.index)
@@ -321,20 +391,22 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
        for subname, values in result:
            sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
            features_dict[subname] = values
            np.save(sub_feature_file, values.values)
            print(f'Saved feature: {sub_feature_file}')
            if csv_prefix:
                np.save(sub_feature_file, values.values)

    # Force Index
    feature_file = f'../data/{csv_prefix}_force_index.npy'
    if os.path.exists(feature_file):
        arr = np.load(feature_file)
        features_dict['force_index'] = pd.Series(arr, index=df.index)
    if csv_prefix:
        if os.path.exists(feature_file):
            arr = np.load(feature_file)
            features_dict['force_index'] = pd.Series(arr, index=df.index)
        else:
            _, values = calc_force_index(df['Close'], df['Volume'])
            features_dict['force_index'] = values
            np.save(feature_file, values.values)
    else:
        print('Calculating feature: force_index')
        _, values = calc_force_index(df['Close'], df['Volume'])
        features_dict['force_index'] = values
        np.save(feature_file, values.values)
        print(f'Saved feature: {feature_file}')

    # Supertrend indicators (simplified implementation)
    for period, multiplier in [(12, 3.0), (10, 1.0), (11, 2.0)]:
@@ -342,26 +414,23 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
        st_trend_name = f'supertrend_trend_{period}_{multiplier}'
        st_file = f'../data/{csv_prefix}_{st_name}.npy'
        st_trend_file = f'../data/{csv_prefix}_{st_trend_name}.npy'
        if os.path.exists(st_file) and os.path.exists(st_trend_file):
        if csv_prefix and os.path.exists(st_file) and os.path.exists(st_trend_file):
            features_dict[st_name] = pd.Series(np.load(st_file), index=df.index)
            features_dict[st_trend_name] = pd.Series(np.load(st_trend_file), index=df.index)
        else:
            print(f'Calculating Supertrend indicator: {st_name}')
            # Simple supertrend alternative using ATR and moving averages
            from ta.volatility import AverageTrueRange
            atr = AverageTrueRange(df['High'], df['Low'], df['Close'], window=period).average_true_range()
            hl_avg = (df['High'] + df['Low']) / 2
            basic_ub = hl_avg + (multiplier * atr)
            basic_lb = hl_avg - (multiplier * atr)

            # Simplified supertrend calculation
            supertrend = hl_avg.copy()
            trend = pd.Series(1, index=df.index)  # 1 for uptrend, -1 for downtrend

            features_dict[st_name] = supertrend
            features_dict[st_trend_name] = trend
            np.save(st_file, features_dict[st_name].values)
            np.save(st_trend_file, features_dict[st_trend_name].values)
            print(f'Saved features: {st_file}, {st_trend_file}')

            if csv_prefix:
                np.save(st_file, features_dict[st_name].values)
                np.save(st_trend_file, features_dict[st_trend_name].values)

    return features_dict
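
A rough sketch of how the two modes described in the new docstring might be called (the prefix string is illustrative; `df` is an OHLCV frame, and the lag/window values mirror what `predictor.py` passes):

```python
ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']

# Cached mode: features are written to / read from ../data/<prefix>_<feature>.npy
features = feature_engineering(df, 'btcusd', ohlcv_cols, 3, [5, 15, 30])

# In-memory mode: with no prefix, features are only computed in memory (per the docstring)
features = feature_engineering(df, None, ohlcv_cols, 3, [5, 15, 30])
```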
299 inference_example.py Normal file

@@ -0,0 +1,299 @@
"""
Complete example showing how to use the OHLCVPredictor for making predictions.
This example demonstrates:
1. Loading a trained model
2. Preparing sample OHLCV data
3. Making log return predictions
4. Making price predictions
5. Evaluating and displaying results
"""

import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from predictor import OHLCVPredictor

def create_sample_ohlcv_data(num_samples=200):
    """
    Create realistic sample OHLCV data for demonstration.
    In practice, replace this with your actual data loading.

    Returns:
        pd.DataFrame: DataFrame with OHLCV data
    """
    print("Creating sample OHLCV data for demonstration...")

    # Start with a base price and simulate realistic price movements
    np.random.seed(42)  # For reproducible results
    base_price = 50000.0  # Base Bitcoin price

    # Generate timestamps (1-minute intervals)
    start_time = datetime(2024, 1, 1)
    timestamps = [start_time + timedelta(minutes=i) for i in range(num_samples)]

    # Generate realistic price movements
    returns = np.random.normal(0, 0.001, num_samples)  # Small random returns
    prices = [base_price]

    for i in range(1, num_samples):
        # Add some trending behavior
        trend = 0.0001 * np.sin(i / 50.0)  # Gentle sinusoidal trend
        price_change = returns[i] + trend
        new_price = prices[-1] * (1 + price_change)
        prices.append(max(new_price, 1000))  # Minimum price floor

    # Generate OHLCV data
    data = []
    for i in range(num_samples):
        price = prices[i]

        # Generate realistic OHLC within a reasonable range
        volatility = abs(np.random.normal(0, 0.002))  # Random volatility
        high = price * (1 + volatility)
        low = price * (1 - volatility)

        # Ensure OHLC relationships are correct
        open_price = price * (1 + np.random.normal(0, 0.0005))
        close_price = price * (1 + np.random.normal(0, 0.0005))

        # Ensure high is highest and low is lowest
        high = max(high, open_price, close_price)
        low = min(low, open_price, close_price)

        # Generate volume (typically higher during price movements)
        base_volume = 100 + abs(np.random.normal(0, 50))
        volume_multiplier = 1 + abs(open_price - close_price) / close_price * 10
        volume = base_volume * volume_multiplier

        data.append({
            'Timestamp': timestamps[i],
            'Open': round(open_price, 2),
            'High': round(high, 2),
            'Low': round(low, 2),
            'Close': round(close_price, 2),
            'Volume': round(volume, 2)
        })

    df = pd.DataFrame(data)

    # Calculate log returns (required by feature engineering)
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))

    print(f"Generated {len(df)} samples of OHLCV data")
    print(f"Price range: ${df['Close'].min():.2f} - ${df['Close'].max():.2f}")
    return df

def load_real_data_example():
    """
    Example of how to load real OHLCV data.
    Replace this with your actual data loading logic.

    Returns:
        pd.DataFrame or None: Real OHLCV data if available
    """
    # Example paths where real data might be located
    possible_paths = [
        '../data/btcusd_1-min_data.csv',
        '../data/sample_data.csv',
        'data/crypto_data.csv'
    ]

    for path in possible_paths:
        if os.path.exists(path):
            print(f"Loading real data from {path}...")
            try:
                df = pd.read_csv(path)
                # Ensure required columns exist
                required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']
                if all(col in df.columns for col in required_cols):
                    # Filter out zero volume entries and calculate log returns
                    df = df[df['Volume'] != 0].reset_index(drop=True)
                    # Use only recent data and ensure proper data types
                    df = df.tail(500).reset_index(drop=True)  # Get more data for better feature engineering
                    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
                    print(f"Successfully loaded {len(df)} rows of real data")
                    return df.tail(200)  # Use last 200 for final processing
                else:
                    print(f"Missing required columns in {path}")
            except Exception as e:
                print(f"Error loading {path}: {e}")

    return None

def display_prediction_results(df, log_return_preds, predicted_prices=None, actual_prices=None):
    """
    Display prediction results in a readable format.

    Args:
        df: Original OHLCV DataFrame
        log_return_preds: Array of log return predictions
        predicted_prices: Array of predicted prices (optional)
        actual_prices: Array of actual prices (optional)
    """
    print("\n" + "="*60)
    print("PREDICTION RESULTS")
    print("="*60)

    # Convert timestamps back to readable format for display
    df_display = df.copy()
    df_display['Timestamp'] = pd.to_datetime(df_display['Timestamp'], unit='s')

    print(f"\nLog Return Predictions (first 10):")
    print("-" * 40)
    for i in range(min(10, len(log_return_preds))):
        timestamp = df_display.iloc[i]['Timestamp']
        close_price = df_display.iloc[i]['Close']
        log_ret = log_return_preds[i]
        direction = "UP" if log_ret > 0 else "DOWN"
        print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
              f"Close: ${close_price:8.2f} | "
              f"Log Return: {log_ret:8.6f} | "
              f"Direction: {direction}")

    if predicted_prices is not None and actual_prices is not None:
        print(f"\nPrice Predictions vs Actual (first 10):")
        print("-" * 50)
        for i in range(min(10, len(predicted_prices))):
            timestamp = df_display.iloc[i]['Timestamp']
            pred_price = predicted_prices[i]
            actual_price = actual_prices[i]
            error = abs(pred_price - actual_price)
            error_pct = (error / actual_price) * 100
            print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
                  f"Predicted: ${pred_price:8.2f} | "
                  f"Actual: ${actual_price:8.2f} | "
                  f"Error: {error_pct:5.2f}%")

    # Statistics
    print(f"\nPrediction Statistics:")
    print("-" * 30)
    print(f"Total predictions: {len(log_return_preds)}")
    print(f"Mean log return: {np.mean(log_return_preds):.6f}")
    print(f"Std log return: {np.std(log_return_preds):.6f}")
    print(f"Positive predictions: {np.sum(log_return_preds > 0)} ({np.mean(log_return_preds > 0)*100:.1f}%)")
    print(f"Negative predictions: {np.sum(log_return_preds < 0)} ({np.mean(log_return_preds < 0)*100:.1f}%)")

    if predicted_prices is not None and actual_prices is not None:
        mae = np.mean(np.abs(predicted_prices - actual_prices))
        mape = np.mean(np.abs((predicted_prices - actual_prices) / actual_prices)) * 100
        print(f"\nPrice Prediction Accuracy:")
        print(f"Mean Absolute Error: ${mae:.2f}")
        print(f"Mean Absolute Percentage Error: {mape:.2f}%")

def demonstrate_batch_prediction(predictor, df):
    """
    Demonstrate batch prediction on multiple data chunks.

    Args:
        predictor: OHLCVPredictor instance
        df: OHLCV DataFrame
    """
    print("\n" + "="*60)
    print("BATCH PREDICTION DEMONSTRATION")
    print("="*60)

    chunk_size = 50
    num_chunks = min(3, len(df) // chunk_size)

    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = start_idx + chunk_size
        chunk_df = df.iloc[start_idx:end_idx].copy()

        print(f"\nBatch {i+1}: Processing {len(chunk_df)} samples...")

        try:
            log_return_preds = predictor.predict(chunk_df, csv_prefix=f'batch_{i+1}')
            print(f"Successfully predicted {len(log_return_preds)} log returns")
            print(f"Batch {i+1} mean prediction: {np.mean(log_return_preds):.6f}")

        except Exception as e:
            print(f"Error in batch {i+1}: {e}")

def main():
    """
    Main function demonstrating complete OHLCVPredictor usage.
    """
    model_path = '../data/xgboost_model_all_features.json'

    # Check if model exists
    if not os.path.exists(model_path):
        print("Model not found. Run main.py first to train the model.")
        print(f"Expected model path: {model_path}")
        return

    try:
        # Load predictor
        print("Loading predictor...")
        predictor = OHLCVPredictor(model_path)
        print("Predictor loaded successfully!")

        # Try to load real data first, fall back to synthetic data
        df = load_real_data_example()
        if df is None:
            df = create_sample_ohlcv_data(200)

        print(f"\nDataFrame shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        print(f"Data range: {len(df)} samples")

        # Demonstrate log return predictions
        print("\n" + "="*60)
        print("LOG RETURN PREDICTIONS")
        print("="*60)

        log_return_preds = predictor.predict(df, csv_prefix='inference_demo')
        print(f"Generated {len(log_return_preds)} log return predictions")

        # Demonstrate price predictions
        print("\n" + "="*60)
        print("PRICE PREDICTIONS")
        print("="*60)

        predicted_prices, actual_prices = predictor.predict_prices(df, csv_prefix='price_demo')
        print(f"Generated {len(predicted_prices)} price predictions")

        # Display results
        display_prediction_results(df, log_return_preds, predicted_prices, actual_prices)

        # Demonstrate batch processing
        demonstrate_batch_prediction(predictor, df)

        print("\n" + "="*60)
        print("USAGE EXAMPLES FOR OTHER PROJECTS")
        print("="*60)
        print("""
# Basic usage:
from predictor import OHLCVPredictor

# Load your trained model
predictor = OHLCVPredictor('path/to/your/model.json')

# Prepare your OHLCV data (pandas DataFrame with columns):
# ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']

# Get log return predictions
log_returns = predictor.predict(your_dataframe)

# Get price predictions
predicted_prices, actual_prices = predictor.predict_prices(your_dataframe)

# Required files for deployment:
# - predictor.py
# - custom_xgboost.py
# - feature_engineering.py
# - technical_indicator_functions.py
# - your_trained_model.json
    """)

    except FileNotFoundError as e:
        print(f"File not found: {e}")
        print("Make sure the model file exists and the path is correct.")

    except Exception as e:
        print(f"Error during prediction: {e}")
        print("Check your data format and model compatibility.")

if __name__ == '__main__':
    main()
97 predictor.py Normal file

@@ -0,0 +1,97 @@
import pandas as pd
import numpy as np
import os

try:
    from .custom_xgboost import CustomXGBoostGPU
except ImportError:
    from custom_xgboost import CustomXGBoostGPU

try:
    from .feature_engineering import feature_engineering
except ImportError:
    from feature_engineering import feature_engineering

class OHLCVPredictor:
    def __init__(self, model_path):
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file not found: {model_path}")

        self.model = CustomXGBoostGPU.load_model(model_path)
        self.exclude_cols = self._get_excluded_features()

    def _get_excluded_features(self):
        """Get the list of features to exclude (copied from main.py)"""
        exclude_cols = ['Timestamp', 'Close']
        exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30']
        exclude_cols += ['bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
                         'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband',
                         'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
                         'Open_lag1', 'Open_lag2', 'Open_lag3', 'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2',
                         'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15',
                         'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15',
                         'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30',
                         'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
                         'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30',
                         'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15',
                         'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0']
        return exclude_cols

    def predict(self, df, csv_prefix=None):
        # Validate input DataFrame
        required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        # Make a copy and preprocess
        df = df.copy()
        df = df[df['Volume'] != 0].reset_index(drop=True)

        # Convert timestamps
        if df['Timestamp'].dtype == 'object':
            df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        else:
            df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')

        # Feature engineering
        ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
        features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, 3, [5, 15, 30])
        features_df = pd.DataFrame(features_dict)
        df = pd.concat([df, features_df], axis=1)

        # Downcast and add time features (exclude Timestamp to preserve datetime)
        for col in df.columns:
            if col != 'Timestamp':  # Don't convert Timestamp to numeric
                try:
                    df[col] = pd.to_numeric(df[col], downcast='float')
                except Exception:
                    pass

        df['hour'] = df['Timestamp'].dt.hour

        # Handle NaNs
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if df[col].isna().any():
                df[col] = df[col].fillna(df[col].mean())

        # Defragment DataFrame after all columns have been added
        df = df.copy()

        # Select features and predict
        feature_cols = [col for col in df.columns if col not in self.exclude_cols]
        X = df[feature_cols].values.astype(np.float32)
        return self.model.predict(X)

    def predict_prices(self, df, csv_prefix=None):
        log_return_preds = self.predict(df, csv_prefix)
        df_clean = df[df['Volume'] != 0].copy()
        close_prices = df_clean['Close'].values

        predicted_prices = [close_prices[0]]
        for i, log_ret in enumerate(log_return_preds[1:], 1):
            if i < len(close_prices):
                predicted_prices.append(predicted_prices[-1] * np.exp(log_ret))

        return np.array(predicted_prices), close_prices[:len(predicted_prices)]
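
For clarity, `predict_prices` compounds the predicted log returns starting from the first observed close, roughly `price_t = price_{t-1} * exp(r_t)`. A minimal standalone sketch of that idea, with made-up numbers:

```python
import numpy as np

log_return_preds = np.array([0.0010, -0.0005, 0.0020])  # made-up predictions
prices = [42000.0]  # seeded with the first observed close

for r in log_return_preds:
    prices.append(prices[-1] * np.exp(r))  # compound each predicted log return

print(prices)
```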
pyproject.toml

@@ -12,3 +12,11 @@ dependencies = [
    "ta>=0.11.0",
    "xgboost>=3.0.2",
]

[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
include = ["ohlcvpredictor*"]
exclude = ["charts*"]
4 uv.lock generated

@@ -1,5 +1,5 @@
version = 1
revision = 2
revision = 3
requires-python = ">=3.12"

[[package]]

@@ -309,7 +309,7 @@ wheels = [
[[package]]
name = "ohlcvpredictor"
version = "0.1.0"
source = { virtual = "." }
source = { editable = "." }
dependencies = [
    { name = "dash" },
    { name = "numba" },