Enhance CustomXGBoostGPU for inference by making training data optional and adding model loading functionality. Update feature engineering to support caching and improve data handling. Introduce a new inference example and README for better usability in other projects.

Simon Moisy 2025-08-08 09:38:18 +08:00
parent b56d9ea3a1
commit a419764fff
7 changed files with 707 additions and 177 deletions

INFERENCE_README.md (new file, +38)

@@ -0,0 +1,38 @@
# OHLCV Predictor - Simple Inference
Refactored for easy reuse in other projects.
## Usage
```python
from predictor import OHLCVPredictor
predictor = OHLCVPredictor('model.json')
predictions = predictor.predict(your_ohlcv_dataframe)
```
## Files Needed
Copy these 5 files to your other project:
1. `predictor.py`
2. `custom_xgboost.py`
3. `feature_engineering.py`
4. `technical_indicator_functions.py`
5. `xgboost_model_all_features.json`
## Data Requirements
Your DataFrame needs these columns:
- `Open`, `High`, `Low`, `Close`, `Volume`, `Timestamp`
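A minimal frame satisfying these requirements, as a sketch (values are illustrative; epoch-second integers or datetime strings both work, since the predictor converts timestamps itself):
```python
import pandas as pd

# Illustrative rows only; real input should be long enough for the
# lag and rolling-window features (lags up to 3, windows up to 30).
df = pd.DataFrame({
    'Timestamp': [1704067200, 1704067260],  # epoch seconds; strings also work
    'Open':   [42000.0, 42010.0],
    'High':   [42050.0, 42030.0],
    'Low':    [41980.0, 41990.0],
    'Close':  [42010.0, 42005.0],
    'Volume': [12.5, 9.8],
})
```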
## Dependencies
```
xgboost >= 3.0.2
pandas >= 2.2.3
numpy >= 2.2.3
scikit-learn >= 1.6.1
ta >= 0.11.0
numba >= 0.61.2
```

custom_xgboost.py

@@ -2,15 +2,34 @@ import xgboost as xgb
import numpy as np
class CustomXGBoostGPU:
def __init__(self, X_train, X_test, y_train, y_test):
self.X_train = X_train.astype(np.float32)
self.X_test = X_test.astype(np.float32)
self.y_train = y_train.astype(np.float32)
self.y_test = y_test.astype(np.float32)
def __init__(self, X_train=None, X_test=None, y_train=None, y_test=None):
# Make training data optional for inference-only usage
self.X_train = X_train.astype(np.float32) if X_train is not None else None
self.X_test = X_test.astype(np.float32) if X_test is not None else None
self.y_train = y_train.astype(np.float32) if y_train is not None else None
self.y_test = y_test.astype(np.float32) if y_test is not None else None
self.model = None
self.params = None # Will be set during training
@classmethod
def load_model(cls, model_path):
"""Load a pre-trained model from file for inference
Args:
model_path (str): Path to the saved XGBoost model file
Returns:
CustomXGBoostGPU: Instance with loaded model ready for inference
"""
instance = cls() # Create instance without training data
instance.model = xgb.Booster()
instance.model.load_model(model_path)
return instance
def train(self, **xgb_params):
if self.X_train is None or self.y_train is None:
raise ValueError('Training data is required for training. Use load_model() for inference-only usage.')
params = {
'tree_method': 'hist',
'device': 'cuda',
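For context, a hedged sketch of the two entry points this change separates. The data, shapes, and model path are placeholders, and `predict` is assumed to accept a raw float32 array, which is how `predictor.py` calls it:
```python
import numpy as np
from custom_xgboost import CustomXGBoostGPU

rng = np.random.default_rng(0)
X = rng.random((256, 8), dtype=np.float32)  # placeholder features
y = rng.random(256, dtype=np.float32)       # placeholder targets

# Training path: data is required, as before.
trainer = CustomXGBoostGPU(X[:200], X[200:], y[:200], y[200:])
trainer.train()

# Inference-only path added by this commit: no training arrays needed.
model = CustomXGBoostGPU.load_model('../data/xgboost_model_all_features.json')
preds = model.predict(X[200:])
```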

feature_engineering.py

@@ -2,250 +2,309 @@ import os
import numpy as np
import pandas as pd
import ta
from technical_indicator_functions import *
try:
from .technical_indicator_functions import *
except ImportError:
from technical_indicator_functions import *
def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
feature_file = f'../data/{csv_prefix}_rsi.npy'
"""
Compute and/or load features for the given DataFrame.
If csv_prefix is provided, features are cached to disk; otherwise, features are only computed in memory.
Args:
df (pd.DataFrame): Input OHLCV data.
csv_prefix (str or None): Prefix for feature files (for caching). If None or '', disables caching.
ohlcv_cols (list): List of OHLCV column names.
lags (int): Number of lag features.
window_sizes (list): List of window sizes for rolling features.
Returns:
dict: Dictionary of computed features.
"""
features_dict = {}
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['rsi'] = pd.Series(arr, index=df.index)
# RSI
if csv_prefix:
feature_file = f'../data/{csv_prefix}_rsi.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['rsi'] = pd.Series(arr, index=df.index)
else:
_, values = calc_rsi(df['Close'])
features_dict['rsi'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: rsi')
_, values = calc_rsi(df['Close'])
features_dict['rsi'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# MACD
feature_file = f'../data/{csv_prefix}_macd.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['macd'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_macd.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['macd'] = pd.Series(arr, index=df.index)
else:
_, values = calc_macd(df['Close'])
features_dict['macd'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: macd')
_, values = calc_macd(df['Close'])
features_dict['macd'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# ATR
feature_file = f'../data/{csv_prefix}_atr.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['atr'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_atr.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['atr'] = pd.Series(arr, index=df.index)
else:
_, values = calc_atr(df['High'], df['Low'], df['Close'])
features_dict['atr'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: atr')
_, values = calc_atr(df['High'], df['Low'], df['Close'])
features_dict['atr'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# CCI
feature_file = f'../data/{csv_prefix}_cci.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['cci'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_cci.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['cci'] = pd.Series(arr, index=df.index)
else:
_, values = calc_cci(df['High'], df['Low'], df['Close'])
features_dict['cci'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: cci')
_, values = calc_cci(df['High'], df['Low'], df['Close'])
features_dict['cci'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# Williams %R
feature_file = f'../data/{csv_prefix}_williams_r.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['williams_r'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_williams_r.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['williams_r'] = pd.Series(arr, index=df.index)
else:
_, values = calc_williamsr(df['High'], df['Low'], df['Close'])
features_dict['williams_r'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: williams_r')
_, values = calc_williamsr(df['High'], df['Low'], df['Close'])
features_dict['williams_r'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# EMA 14
feature_file = f'../data/{csv_prefix}_ema_14.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['ema_14'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_ema_14.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['ema_14'] = pd.Series(arr, index=df.index)
else:
_, values = calc_ema(df['Close'])
features_dict['ema_14'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: ema_14')
_, values = calc_ema(df['Close'])
features_dict['ema_14'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# OBV
feature_file = f'../data/{csv_prefix}_obv.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['obv'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_obv.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['obv'] = pd.Series(arr, index=df.index)
else:
_, values = calc_obv(df['Close'], df['Volume'])
features_dict['obv'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: obv')
_, values = calc_obv(df['Close'], df['Volume'])
features_dict['obv'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# CMF
feature_file = f'../data/{csv_prefix}_cmf.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['cmf'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_cmf.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['cmf'] = pd.Series(arr, index=df.index)
else:
_, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
features_dict['cmf'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: cmf')
_, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
features_dict['cmf'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# ROC 10
feature_file = f'../data/{csv_prefix}_roc_10.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['roc_10'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_roc_10.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['roc_10'] = pd.Series(arr, index=df.index)
else:
_, values = calc_roc(df['Close'])
features_dict['roc_10'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: roc_10')
_, values = calc_roc(df['Close'])
features_dict['roc_10'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# DPO 20
feature_file = f'../data/{csv_prefix}_dpo_20.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['dpo_20'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_dpo_20.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['dpo_20'] = pd.Series(arr, index=df.index)
else:
_, values = calc_dpo(df['Close'])
features_dict['dpo_20'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: dpo_20')
_, values = calc_dpo(df['Close'])
features_dict['dpo_20'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# Ultimate Oscillator
feature_file = f'../data/{csv_prefix}_ultimate_osc.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['ultimate_osc'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_ultimate_osc.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['ultimate_osc'] = pd.Series(arr, index=df.index)
else:
_, values = calc_ultimate(df['High'], df['Low'], df['Close'])
features_dict['ultimate_osc'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: ultimate_osc')
_, values = calc_ultimate(df['High'], df['Low'], df['Close'])
features_dict['ultimate_osc'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# Daily Return
feature_file = f'../data/{csv_prefix}_daily_return.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['daily_return'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_daily_return.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['daily_return'] = pd.Series(arr, index=df.index)
else:
_, values = calc_daily_return(df['Close'])
features_dict['daily_return'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: daily_return')
_, values = calc_daily_return(df['Close'])
features_dict['daily_return'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# Multi-column indicators
# Bollinger Bands
result = calc_bollinger(df['Close'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# Stochastic Oscillator
result = calc_stochastic(df['High'], df['Low'], df['Close'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# SMA
result = calc_sma(df['Close'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# PSAR
result = calc_psar(df['High'], df['Low'], df['Close'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# Donchian Channel
result = calc_donchian(df['High'], df['Low'], df['Close'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# Keltner Channel
result = calc_keltner(df['High'], df['Low'], df['Close'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# Ichimoku
result = calc_ichimoku(df['High'], df['Low'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# Elder Ray
result = calc_elder_ray(df['Close'], df['Low'], df['High'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# Prepare lags, rolling stats, log returns, and volatility features sequentially
# Lags
@@ -253,14 +312,17 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
for lag in range(1, lags + 1):
feature_name = f'{col}_lag{lag}'
feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
if csv_prefix:
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
else:
result = compute_lag(df, col, lag)
features_dict[feature_name] = result
np.save(feature_file, result.values)
else:
print(f'Computing lag feature: {feature_name}')
result = compute_lag(df, col, lag)
features_dict[feature_name] = result
np.save(feature_file, result.values)
print(f'Saved feature: {feature_file}')
# Rolling statistics
for col in ohlcv_cols:
for window in window_sizes:
@@ -275,44 +337,52 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
for stat in ['mean', 'std', 'min', 'max']:
feature_name = f'{col}_roll_{stat}_{window}'
feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
if csv_prefix:
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
else:
result = compute_rolling(df, col, stat, window)
features_dict[feature_name] = result
np.save(feature_file, result.values)
else:
print(f'Computing rolling stat feature: {feature_name}')
result = compute_rolling(df, col, stat, window)
features_dict[feature_name] = result
np.save(feature_file, result.values)
print(f'Saved feature: {feature_file}')
# Log returns for different horizons
for horizon in [5, 15, 30]:
feature_name = f'log_return_{horizon}'
feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
if csv_prefix:
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
else:
result = compute_log_return(df, horizon)
features_dict[feature_name] = result
np.save(feature_file, result.values)
else:
print(f'Computing log return feature: {feature_name}')
result = compute_log_return(df, horizon)
features_dict[feature_name] = result
np.save(feature_file, result.values)
print(f'Saved feature: {feature_file}')
# Volatility
for window in window_sizes:
feature_name = f'volatility_{window}'
feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
if csv_prefix:
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
else:
result = compute_volatility(df, window)
features_dict[feature_name] = result
np.save(feature_file, result.values)
else:
print(f'Computing volatility feature: {feature_name}')
result = compute_volatility(df, window)
features_dict[feature_name] = result
np.save(feature_file, result.values)
print(f'Saved feature: {feature_file}')
# --- Additional Technical Indicator Features ---
# ADX
adx_names = ['adx', 'adx_pos', 'adx_neg']
adx_files = [f'../data/{csv_prefix}_{name}.npy' for name in adx_names]
if all(os.path.exists(f) for f in adx_files):
if csv_prefix and all(os.path.exists(f) for f in adx_files):
for name, f in zip(adx_names, adx_files):
arr = np.load(f)
features_dict[name] = pd.Series(arr, index=df.index)
@@ -321,20 +391,22 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
if csv_prefix:
np.save(sub_feature_file, values.values)
# Force Index
feature_file = f'../data/{csv_prefix}_force_index.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['force_index'] = pd.Series(arr, index=df.index)
if csv_prefix:
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['force_index'] = pd.Series(arr, index=df.index)
else:
_, values = calc_force_index(df['Close'], df['Volume'])
features_dict['force_index'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: force_index')
_, values = calc_force_index(df['Close'], df['Volume'])
features_dict['force_index'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# Supertrend indicators (simplified implementation)
for period, multiplier in [(12, 3.0), (10, 1.0), (11, 2.0)]:
@@ -342,26 +414,23 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
st_trend_name = f'supertrend_trend_{period}_{multiplier}'
st_file = f'../data/{csv_prefix}_{st_name}.npy'
st_trend_file = f'../data/{csv_prefix}_{st_trend_name}.npy'
if os.path.exists(st_file) and os.path.exists(st_trend_file):
if csv_prefix and os.path.exists(st_file) and os.path.exists(st_trend_file):
features_dict[st_name] = pd.Series(np.load(st_file), index=df.index)
features_dict[st_trend_name] = pd.Series(np.load(st_trend_file), index=df.index)
else:
print(f'Calculating Supertrend indicator: {st_name}')
# Simple supertrend alternative using ATR and moving averages
from ta.volatility import AverageTrueRange
atr = AverageTrueRange(df['High'], df['Low'], df['Close'], window=period).average_true_range()
hl_avg = (df['High'] + df['Low']) / 2
basic_ub = hl_avg + (multiplier * atr)
basic_lb = hl_avg - (multiplier * atr)
# Simplified supertrend calculation
supertrend = hl_avg.copy()
trend = pd.Series(1, index=df.index) # 1 for uptrend, -1 for downtrend
features_dict[st_name] = supertrend
features_dict[st_trend_name] = trend
np.save(st_file, features_dict[st_name].values)
np.save(st_trend_file, features_dict[st_trend_name].values)
print(f'Saved features: {st_file}, {st_trend_file}')
if csv_prefix:
np.save(st_file, features_dict[st_name].values)
np.save(st_trend_file, features_dict[st_trend_name].values)
return features_dict
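Every cached indicator above repeats the same cache-or-compute shape. Distilled into a sketch (the `cached` helper and the lambda are illustrative, not part of the commit):
```python
import os
import numpy as np
import pandas as pd

def cached(name, compute, df, csv_prefix):
    """Cache-or-compute pattern each indicator block repeats by hand.

    compute: zero-argument callable returning a pd.Series aligned to df.index.
    When csv_prefix is falsy, nothing touches disk and the feature is
    recomputed in memory, matching the new inference-friendly behaviour.
    """
    if csv_prefix:
        path = f'../data/{csv_prefix}_{name}.npy'
        if os.path.exists(path):
            return pd.Series(np.load(path), index=df.index)
        values = compute()
        np.save(path, values.values)
        return values
    return compute()

# e.g. features_dict['rsi'] = cached('rsi', lambda: calc_rsi(df['Close'])[1],
#                                    df, csv_prefix)
```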

inference_example.py (new file, +299)

@@ -0,0 +1,299 @@
"""
Complete example showing how to use the OHLCVPredictor for making predictions.
This example demonstrates:
1. Loading a trained model
2. Preparing sample OHLCV data
3. Making log return predictions
4. Making price predictions
5. Evaluating and displaying results
"""
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from predictor import OHLCVPredictor
def create_sample_ohlcv_data(num_samples=200):
"""
Create realistic sample OHLCV data for demonstration.
In practice, replace this with your actual data loading.
Returns:
pd.DataFrame: DataFrame with OHLCV data
"""
print("Creating sample OHLCV data for demonstration...")
# Start with a base price and simulate realistic price movements
np.random.seed(42) # For reproducible results
base_price = 50000.0 # Base Bitcoin price
# Generate timestamps (1-minute intervals)
start_time = datetime(2024, 1, 1)
timestamps = [start_time + timedelta(minutes=i) for i in range(num_samples)]
# Generate realistic price movements
returns = np.random.normal(0, 0.001, num_samples) # Small random returns
prices = [base_price]
for i in range(1, num_samples):
# Add some trending behavior
trend = 0.0001 * np.sin(i / 50.0) # Gentle sinusoidal trend
price_change = returns[i] + trend
new_price = prices[-1] * (1 + price_change)
prices.append(max(new_price, 1000)) # Minimum price floor
# Generate OHLCV data
data = []
for i in range(num_samples):
price = prices[i]
# Generate realistic OHLC within a reasonable range
volatility = abs(np.random.normal(0, 0.002)) # Random volatility
high = price * (1 + volatility)
low = price * (1 - volatility)
# Ensure OHLC relationships are correct
open_price = price * (1 + np.random.normal(0, 0.0005))
close_price = price * (1 + np.random.normal(0, 0.0005))
# Ensure high is highest and low is lowest
high = max(high, open_price, close_price)
low = min(low, open_price, close_price)
# Generate volume (typically higher during price movements)
base_volume = 100 + abs(np.random.normal(0, 50))
volume_multiplier = 1 + abs(open_price - close_price) / close_price * 10
volume = base_volume * volume_multiplier
data.append({
'Timestamp': timestamps[i],
'Open': round(open_price, 2),
'High': round(high, 2),
'Low': round(low, 2),
'Close': round(close_price, 2),
'Volume': round(volume, 2)
})
df = pd.DataFrame(data)
# Calculate log returns (required by feature engineering)
df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
print(f"Generated {len(df)} samples of OHLCV data")
print(f"Price range: ${df['Close'].min():.2f} - ${df['Close'].max():.2f}")
return df
def load_real_data_example():
"""
Example of how to load real OHLCV data.
Replace this with your actual data loading logic.
Returns:
pd.DataFrame or None: Real OHLCV data if available
"""
# Example paths where real data might be located
possible_paths = [
'../data/btcusd_1-min_data.csv',
'../data/sample_data.csv',
'data/crypto_data.csv'
]
for path in possible_paths:
if os.path.exists(path):
print(f"Loading real data from {path}...")
try:
df = pd.read_csv(path)
# Ensure required columns exist
required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']
if all(col in df.columns for col in required_cols):
# Filter out zero volume entries and calculate log returns
df = df[df['Volume'] != 0].reset_index(drop=True)
# Use only recent data and ensure proper data types
df = df.tail(500).reset_index(drop=True) # Get more data for better feature engineering
df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
print(f"Successfully loaded {len(df)} rows of real data")
return df.tail(200) # Use last 200 for final processing
else:
print(f"Missing required columns in {path}")
except Exception as e:
print(f"Error loading {path}: {e}")
return None
def display_prediction_results(df, log_return_preds, predicted_prices=None, actual_prices=None):
"""
Display prediction results in a readable format.
Args:
df: Original OHLCV DataFrame
log_return_preds: Array of log return predictions
predicted_prices: Array of predicted prices (optional)
actual_prices: Array of actual prices (optional)
"""
print("\n" + "="*60)
print("PREDICTION RESULTS")
print("="*60)
# Convert timestamps back to readable format for display
df_display = df.copy()
df_display['Timestamp'] = pd.to_datetime(df_display['Timestamp'], unit='s')
print(f"\nLog Return Predictions (first 10):")
print("-" * 40)
for i in range(min(10, len(log_return_preds))):
timestamp = df_display.iloc[i]['Timestamp']
close_price = df_display.iloc[i]['Close']
log_ret = log_return_preds[i]
direction = "UP" if log_ret > 0 else "DOWN"
print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
f"Close: ${close_price:8.2f} | "
f"Log Return: {log_ret:8.6f} | "
f"Direction: {direction}")
if predicted_prices is not None and actual_prices is not None:
print(f"\nPrice Predictions vs Actual (first 10):")
print("-" * 50)
for i in range(min(10, len(predicted_prices))):
timestamp = df_display.iloc[i]['Timestamp']
pred_price = predicted_prices[i]
actual_price = actual_prices[i]
error = abs(pred_price - actual_price)
error_pct = (error / actual_price) * 100
print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
f"Predicted: ${pred_price:8.2f} | "
f"Actual: ${actual_price:8.2f} | "
f"Error: {error_pct:5.2f}%")
# Statistics
print(f"\nPrediction Statistics:")
print("-" * 30)
print(f"Total predictions: {len(log_return_preds)}")
print(f"Mean log return: {np.mean(log_return_preds):.6f}")
print(f"Std log return: {np.std(log_return_preds):.6f}")
print(f"Positive predictions: {np.sum(log_return_preds > 0)} ({np.mean(log_return_preds > 0)*100:.1f}%)")
print(f"Negative predictions: {np.sum(log_return_preds < 0)} ({np.mean(log_return_preds < 0)*100:.1f}%)")
if predicted_prices is not None and actual_prices is not None:
mae = np.mean(np.abs(predicted_prices - actual_prices))
mape = np.mean(np.abs((predicted_prices - actual_prices) / actual_prices)) * 100
print(f"\nPrice Prediction Accuracy:")
print(f"Mean Absolute Error: ${mae:.2f}")
print(f"Mean Absolute Percentage Error: {mape:.2f}%")
def demonstrate_batch_prediction(predictor, df):
"""
Demonstrate batch prediction on multiple data chunks.
Args:
predictor: OHLCVPredictor instance
df: OHLCV DataFrame
"""
print("\n" + "="*60)
print("BATCH PREDICTION DEMONSTRATION")
print("="*60)
chunk_size = 50
num_chunks = min(3, len(df) // chunk_size)
for i in range(num_chunks):
start_idx = i * chunk_size
end_idx = start_idx + chunk_size
chunk_df = df.iloc[start_idx:end_idx].copy()
print(f"\nBatch {i+1}: Processing {len(chunk_df)} samples...")
try:
log_return_preds = predictor.predict(chunk_df, csv_prefix=f'batch_{i+1}')
print(f"Successfully predicted {len(log_return_preds)} log returns")
print(f"Batch {i+1} mean prediction: {np.mean(log_return_preds):.6f}")
except Exception as e:
print(f"Error in batch {i+1}: {e}")
def main():
"""
Main function demonstrating complete OHLCVPredictor usage.
"""
model_path = '../data/xgboost_model_all_features.json'
# Check if model exists
if not os.path.exists(model_path):
print("Model not found. Run main.py first to train the model.")
print(f"Expected model path: {model_path}")
return
try:
# Load predictor
print("Loading predictor...")
predictor = OHLCVPredictor(model_path)
print("Predictor loaded successfully!")
# Try to load real data first, fall back to synthetic data
df = load_real_data_example()
if df is None:
df = create_sample_ohlcv_data(200)
print(f"\nDataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Data range: {len(df)} samples")
# Demonstrate log return predictions
print("\n" + "="*60)
print("LOG RETURN PREDICTIONS")
print("="*60)
log_return_preds = predictor.predict(df, csv_prefix='inference_demo')
print(f"Generated {len(log_return_preds)} log return predictions")
# Demonstrate price predictions
print("\n" + "="*60)
print("PRICE PREDICTIONS")
print("="*60)
predicted_prices, actual_prices = predictor.predict_prices(df, csv_prefix='price_demo')
print(f"Generated {len(predicted_prices)} price predictions")
# Display results
display_prediction_results(df, log_return_preds, predicted_prices, actual_prices)
# Demonstrate batch processing
demonstrate_batch_prediction(predictor, df)
print("\n" + "="*60)
print("USAGE EXAMPLES FOR OTHER PROJECTS")
print("="*60)
print("""
# Basic usage:
from predictor import OHLCVPredictor
# Load your trained model
predictor = OHLCVPredictor('path/to/your/model.json')
# Prepare your OHLCV data (pandas DataFrame with columns):
# ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']
# Get log return predictions
log_returns = predictor.predict(your_dataframe)
# Get price predictions
predicted_prices, actual_prices = predictor.predict_prices(your_dataframe)
# Required files for deployment:
# - predictor.py
# - custom_xgboost.py
# - feature_engineering.py
# - technical_indicator_functions.py
# - your_trained_model.json
""")
except FileNotFoundError as e:
print(f"File not found: {e}")
print("Make sure the model file exists and the path is correct.")
except Exception as e:
print(f"Error during prediction: {e}")
print("Check your data format and model compatibility.")
if __name__ == '__main__':
main()

predictor.py (new file, +97)

@@ -0,0 +1,97 @@
import pandas as pd
import numpy as np
import os
try:
from .custom_xgboost import CustomXGBoostGPU
except ImportError:
from custom_xgboost import CustomXGBoostGPU
try:
from .feature_engineering import feature_engineering
except ImportError:
from feature_engineering import feature_engineering
class OHLCVPredictor:
def __init__(self, model_path):
if not os.path.exists(model_path):
raise FileNotFoundError(f"Model file not found: {model_path}")
self.model = CustomXGBoostGPU.load_model(model_path)
self.exclude_cols = self._get_excluded_features()
def _get_excluded_features(self):
"""Get the list of features to exclude (copied from main.py)"""
exclude_cols = ['Timestamp', 'Close']
exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30']
exclude_cols += ['bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband',
'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
'Open_lag1', 'Open_lag2', 'Open_lag3', 'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2',
'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15',
'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15',
'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30',
'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30',
'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15',
'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0']
return exclude_cols
def predict(self, df, csv_prefix=None):
# Validate input DataFrame
required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
raise ValueError(f"Missing required columns: {missing_cols}")
# Make a copy and preprocess
df = df.copy()
df = df[df['Volume'] != 0].reset_index(drop=True)
# Convert timestamps
if df['Timestamp'].dtype == 'object':
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
else:
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
# Feature engineering
ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, 3, [5, 15, 30])
features_df = pd.DataFrame(features_dict)
df = pd.concat([df, features_df], axis=1)
# Downcast and add time features (exclude Timestamp to preserve datetime)
for col in df.columns:
if col != 'Timestamp': # Don't convert Timestamp to numeric
try:
df[col] = pd.to_numeric(df[col], downcast='float')
except Exception:
pass
df['hour'] = df['Timestamp'].dt.hour
# Handle NaNs
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
if df[col].isna().any():
df[col] = df[col].fillna(df[col].mean())
# Defragment DataFrame after all columns have been added
df = df.copy()
# Select features and predict
feature_cols = [col for col in df.columns if col not in self.exclude_cols]
X = df[feature_cols].values.astype(np.float32)
return self.model.predict(X)
def predict_prices(self, df, csv_prefix=None):
log_return_preds = self.predict(df, csv_prefix)
df_clean = df[df['Volume'] != 0].copy()
close_prices = df_clean['Close'].values
predicted_prices = [close_prices[0]]
for i, log_ret in enumerate(log_return_preds[1:], 1):
if i < len(close_prices):
predicted_prices.append(predicted_prices[-1] * np.exp(log_ret))
return np.array(predicted_prices), close_prices[:len(predicted_prices)]
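The reconstruction in `predict_prices` seeds the series with the first observed close and compounds each subsequent predicted log return, so p_i = p_{i-1} * exp(r_i). A small numeric sketch of the same arithmetic, vectorized (the seed price and returns are made up):
```python
import numpy as np

preds = np.array([0.0012, 0.0008, -0.0005, 0.0021])  # model log-return outputs
prices = np.empty_like(preds)
prices[0] = 50_000.0                                  # first observed close
# Skip preds[0], exactly as the loop in predict_prices does.
prices[1:] = prices[0] * np.exp(np.cumsum(preds[1:]))
```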

pyproject.toml

@@ -12,3 +12,11 @@ dependencies = [
"ta>=0.11.0",
"xgboost>=3.0.2",
]
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
include = ["ohlcvpredictor*"]
exclude = ["charts*"]

uv.lock (generated, 4 lines changed)

@@ -1,5 +1,5 @@
version = 1
revision = 2
revision = 3
requires-python = ">=3.12"
[[package]]
@@ -309,7 +309,7 @@ wheels = [
[[package]]
name = "ohlcvpredictor"
version = "0.1.0"
source = { virtual = "." }
source = { editable = "." }
dependencies = [
{ name = "dash" },
{ name = "numba" },