diff --git a/INFERENCE_README.md b/INFERENCE_README.md new file mode 100644 index 0000000..03995e7 --- /dev/null +++ b/INFERENCE_README.md @@ -0,0 +1,38 @@ +# OHLCV Predictor - Simple Inference + +Refactored for easy reuse in other projects. + +## Usage + +```python +from predictor import OHLCVPredictor + +predictor = OHLCVPredictor('model.json') +predictions = predictor.predict(your_ohlcv_dataframe) +``` + +## Files Needed + +Copy these 5 files to your other project: + +1. `predictor.py` +2. `custom_xgboost.py` +3. `feature_engineering.py` +4. `technical_indicator_functions.py` +5. `xgboost_model_all_features.json` + +## Data Requirements + +Your DataFrame needs these columns: +- `Open`, `High`, `Low`, `Close`, `Volume`, `Timestamp` + +## Dependencies + +``` +xgboost >= 3.0.2 +pandas >= 2.2.3 +numpy >= 2.2.3 +scikit-learn >= 1.6.1 +ta >= 0.11.0 +numba >= 0.61.2 +``` \ No newline at end of file diff --git a/custom_xgboost.py b/custom_xgboost.py index 4429af1..f20487b 100644 --- a/custom_xgboost.py +++ b/custom_xgboost.py @@ -2,15 +2,34 @@ import xgboost as xgb import numpy as np class CustomXGBoostGPU: - def __init__(self, X_train, X_test, y_train, y_test): - self.X_train = X_train.astype(np.float32) - self.X_test = X_test.astype(np.float32) - self.y_train = y_train.astype(np.float32) - self.y_test = y_test.astype(np.float32) + def __init__(self, X_train=None, X_test=None, y_train=None, y_test=None): + # Make training data optional for inference-only usage + self.X_train = X_train.astype(np.float32) if X_train is not None else None + self.X_test = X_test.astype(np.float32) if X_test is not None else None + self.y_train = y_train.astype(np.float32) if y_train is not None else None + self.y_test = y_test.astype(np.float32) if y_test is not None else None self.model = None self.params = None # Will be set during training + @classmethod + def load_model(cls, model_path): + """Load a pre-trained model from file for inference + + Args: + model_path (str): Path to the saved XGBoost model file + + Returns: + CustomXGBoostGPU: Instance with loaded model ready for inference + """ + instance = cls() # Create instance without training data + instance.model = xgb.Booster() + instance.model.load_model(model_path) + return instance + def train(self, **xgb_params): + if self.X_train is None or self.y_train is None: + raise ValueError('Training data is required for training. Use load_model() for inference-only usage.') + params = { 'tree_method': 'hist', 'device': 'cuda', diff --git a/feature_engineering.py b/feature_engineering.py index 349fed8..d6f4590 100644 --- a/feature_engineering.py +++ b/feature_engineering.py @@ -2,250 +2,309 @@ import os import numpy as np import pandas as pd import ta -from technical_indicator_functions import * + +try: + from .technical_indicator_functions import * +except ImportError: + from technical_indicator_functions import * def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes): - feature_file = f'../data/{csv_prefix}_rsi.npy' + """ + Compute and/or load features for the given DataFrame. + If csv_prefix is provided, features are cached to disk; otherwise, features are only computed in memory. + + Args: + df (pd.DataFrame): Input OHLCV data. + csv_prefix (str or None): Prefix for feature files (for caching). If None or '', disables caching. + ohlcv_cols (list): List of OHLCV column names. + lags (int): Number of lag features. + window_sizes (list): List of window sizes for rolling features. + + Returns: + dict: Dictionary of computed features. 
+ """ features_dict = {} - if os.path.exists(feature_file): - arr = np.load(feature_file) - features_dict['rsi'] = pd.Series(arr, index=df.index) + # RSI + if csv_prefix: + feature_file = f'../data/{csv_prefix}_rsi.npy' + if os.path.exists(feature_file): + arr = np.load(feature_file) + features_dict['rsi'] = pd.Series(arr, index=df.index) + else: + _, values = calc_rsi(df['Close']) + features_dict['rsi'] = values + np.save(feature_file, values.values) else: - print('Calculating feature: rsi') _, values = calc_rsi(df['Close']) features_dict['rsi'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') # MACD - feature_file = f'../data/{csv_prefix}_macd.npy' - if os.path.exists(feature_file): - arr = np.load(feature_file) - features_dict['macd'] = pd.Series(arr, index=df.index) + if csv_prefix: + feature_file = f'../data/{csv_prefix}_macd.npy' + if os.path.exists(feature_file): + arr = np.load(feature_file) + features_dict['macd'] = pd.Series(arr, index=df.index) + else: + _, values = calc_macd(df['Close']) + features_dict['macd'] = values + np.save(feature_file, values.values) else: - print('Calculating feature: macd') _, values = calc_macd(df['Close']) features_dict['macd'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') # ATR - feature_file = f'../data/{csv_prefix}_atr.npy' - if os.path.exists(feature_file): - arr = np.load(feature_file) - features_dict['atr'] = pd.Series(arr, index=df.index) + if csv_prefix: + feature_file = f'../data/{csv_prefix}_atr.npy' + if os.path.exists(feature_file): + arr = np.load(feature_file) + features_dict['atr'] = pd.Series(arr, index=df.index) + else: + _, values = calc_atr(df['High'], df['Low'], df['Close']) + features_dict['atr'] = values + np.save(feature_file, values.values) else: - print('Calculating feature: atr') _, values = calc_atr(df['High'], df['Low'], df['Close']) features_dict['atr'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') # CCI - feature_file = f'../data/{csv_prefix}_cci.npy' - if os.path.exists(feature_file): - arr = np.load(feature_file) - features_dict['cci'] = pd.Series(arr, index=df.index) + if csv_prefix: + feature_file = f'../data/{csv_prefix}_cci.npy' + if os.path.exists(feature_file): + arr = np.load(feature_file) + features_dict['cci'] = pd.Series(arr, index=df.index) + else: + _, values = calc_cci(df['High'], df['Low'], df['Close']) + features_dict['cci'] = values + np.save(feature_file, values.values) else: - print('Calculating feature: cci') _, values = calc_cci(df['High'], df['Low'], df['Close']) features_dict['cci'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') # Williams %R - feature_file = f'../data/{csv_prefix}_williams_r.npy' - if os.path.exists(feature_file): - arr = np.load(feature_file) - features_dict['williams_r'] = pd.Series(arr, index=df.index) + if csv_prefix: + feature_file = f'../data/{csv_prefix}_williams_r.npy' + if os.path.exists(feature_file): + arr = np.load(feature_file) + features_dict['williams_r'] = pd.Series(arr, index=df.index) + else: + _, values = calc_williamsr(df['High'], df['Low'], df['Close']) + features_dict['williams_r'] = values + np.save(feature_file, values.values) else: - print('Calculating feature: williams_r') _, values = calc_williamsr(df['High'], df['Low'], df['Close']) features_dict['williams_r'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') # EMA 14 - feature_file = 
f'../data/{csv_prefix}_ema_14.npy' - if os.path.exists(feature_file): - arr = np.load(feature_file) - features_dict['ema_14'] = pd.Series(arr, index=df.index) + if csv_prefix: + feature_file = f'../data/{csv_prefix}_ema_14.npy' + if os.path.exists(feature_file): + arr = np.load(feature_file) + features_dict['ema_14'] = pd.Series(arr, index=df.index) + else: + _, values = calc_ema(df['Close']) + features_dict['ema_14'] = values + np.save(feature_file, values.values) else: - print('Calculating feature: ema_14') _, values = calc_ema(df['Close']) features_dict['ema_14'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') # OBV - feature_file = f'../data/{csv_prefix}_obv.npy' - if os.path.exists(feature_file): - arr = np.load(feature_file) - features_dict['obv'] = pd.Series(arr, index=df.index) + if csv_prefix: + feature_file = f'../data/{csv_prefix}_obv.npy' + if os.path.exists(feature_file): + arr = np.load(feature_file) + features_dict['obv'] = pd.Series(arr, index=df.index) + else: + _, values = calc_obv(df['Close'], df['Volume']) + features_dict['obv'] = values + np.save(feature_file, values.values) else: - print('Calculating feature: obv') _, values = calc_obv(df['Close'], df['Volume']) features_dict['obv'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') # CMF - feature_file = f'../data/{csv_prefix}_cmf.npy' - if os.path.exists(feature_file): - arr = np.load(feature_file) - features_dict['cmf'] = pd.Series(arr, index=df.index) + if csv_prefix: + feature_file = f'../data/{csv_prefix}_cmf.npy' + if os.path.exists(feature_file): + arr = np.load(feature_file) + features_dict['cmf'] = pd.Series(arr, index=df.index) + else: + _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume']) + features_dict['cmf'] = values + np.save(feature_file, values.values) else: - print('Calculating feature: cmf') _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume']) features_dict['cmf'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') # ROC 10 - feature_file = f'../data/{csv_prefix}_roc_10.npy' - if os.path.exists(feature_file): - arr = np.load(feature_file) - features_dict['roc_10'] = pd.Series(arr, index=df.index) + if csv_prefix: + feature_file = f'../data/{csv_prefix}_roc_10.npy' + if os.path.exists(feature_file): + arr = np.load(feature_file) + features_dict['roc_10'] = pd.Series(arr, index=df.index) + else: + _, values = calc_roc(df['Close']) + features_dict['roc_10'] = values + np.save(feature_file, values.values) else: - print('Calculating feature: roc_10') _, values = calc_roc(df['Close']) features_dict['roc_10'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') # DPO 20 - feature_file = f'../data/{csv_prefix}_dpo_20.npy' - if os.path.exists(feature_file): - arr = np.load(feature_file) - features_dict['dpo_20'] = pd.Series(arr, index=df.index) + if csv_prefix: + feature_file = f'../data/{csv_prefix}_dpo_20.npy' + if os.path.exists(feature_file): + arr = np.load(feature_file) + features_dict['dpo_20'] = pd.Series(arr, index=df.index) + else: + _, values = calc_dpo(df['Close']) + features_dict['dpo_20'] = values + np.save(feature_file, values.values) else: - print('Calculating feature: dpo_20') _, values = calc_dpo(df['Close']) features_dict['dpo_20'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') # Ultimate Oscillator - feature_file = f'../data/{csv_prefix}_ultimate_osc.npy' - 
if os.path.exists(feature_file): - arr = np.load(feature_file) - features_dict['ultimate_osc'] = pd.Series(arr, index=df.index) + if csv_prefix: + feature_file = f'../data/{csv_prefix}_ultimate_osc.npy' + if os.path.exists(feature_file): + arr = np.load(feature_file) + features_dict['ultimate_osc'] = pd.Series(arr, index=df.index) + else: + _, values = calc_ultimate(df['High'], df['Low'], df['Close']) + features_dict['ultimate_osc'] = values + np.save(feature_file, values.values) else: - print('Calculating feature: ultimate_osc') _, values = calc_ultimate(df['High'], df['Low'], df['Close']) features_dict['ultimate_osc'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') # Daily Return - feature_file = f'../data/{csv_prefix}_daily_return.npy' - if os.path.exists(feature_file): - arr = np.load(feature_file) - features_dict['daily_return'] = pd.Series(arr, index=df.index) + if csv_prefix: + feature_file = f'../data/{csv_prefix}_daily_return.npy' + if os.path.exists(feature_file): + arr = np.load(feature_file) + features_dict['daily_return'] = pd.Series(arr, index=df.index) + else: + _, values = calc_daily_return(df['Close']) + features_dict['daily_return'] = values + np.save(feature_file, values.values) else: - print('Calculating feature: daily_return') _, values = calc_daily_return(df['Close']) features_dict['daily_return'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') # Multi-column indicators # Bollinger Bands result = calc_bollinger(df['Close']) for subname, values in result: - sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) + if csv_prefix: + sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) else: features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') # Stochastic Oscillator result = calc_stochastic(df['High'], df['Low'], df['Close']) for subname, values in result: - sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) + if csv_prefix: + sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) else: features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') # SMA result = calc_sma(df['Close']) for subname, values in result: - sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) + if csv_prefix: + sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) else: features_dict[subname] = values - np.save(sub_feature_file, values.values) - 
print(f'Saved feature: {sub_feature_file}') # PSAR result = calc_psar(df['High'], df['Low'], df['Close']) for subname, values in result: - sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) + if csv_prefix: + sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) else: features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') # Donchian Channel result = calc_donchian(df['High'], df['Low'], df['Close']) for subname, values in result: - sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) + if csv_prefix: + sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) else: features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') # Keltner Channel result = calc_keltner(df['High'], df['Low'], df['Close']) for subname, values in result: - sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) + if csv_prefix: + sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) else: features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') # Ichimoku result = calc_ichimoku(df['High'], df['Low']) for subname, values in result: - sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) + if csv_prefix: + sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) else: features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') # Elder Ray result = calc_elder_ray(df['Close'], df['Low'], df['High']) for subname, values in result: - sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' - if os.path.exists(sub_feature_file): - arr = np.load(sub_feature_file) - features_dict[subname] = pd.Series(arr, index=df.index) + if csv_prefix: + sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' + if os.path.exists(sub_feature_file): + arr = np.load(sub_feature_file) + features_dict[subname] = pd.Series(arr, index=df.index) + else: + features_dict[subname] = values + np.save(sub_feature_file, values.values) else: features_dict[subname] = values - np.save(sub_feature_file, values.values) - 
print(f'Saved feature: {sub_feature_file}') # Prepare lags, rolling stats, log returns, and volatility features sequentially # Lags @@ -253,14 +312,17 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes): for lag in range(1, lags + 1): feature_name = f'{col}_lag{lag}' feature_file = f'../data/{csv_prefix}_{feature_name}.npy' - if os.path.exists(feature_file): - features_dict[feature_name] = np.load(feature_file) + if csv_prefix: + if os.path.exists(feature_file): + features_dict[feature_name] = np.load(feature_file) + else: + result = compute_lag(df, col, lag) + features_dict[feature_name] = result + np.save(feature_file, result.values) else: - print(f'Computing lag feature: {feature_name}') result = compute_lag(df, col, lag) features_dict[feature_name] = result - np.save(feature_file, result.values) - print(f'Saved feature: {feature_file}') + # Rolling statistics for col in ohlcv_cols: for window in window_sizes: @@ -275,44 +337,52 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes): for stat in ['mean', 'std', 'min', 'max']: feature_name = f'{col}_roll_{stat}_{window}' feature_file = f'../data/{csv_prefix}_{feature_name}.npy' - if os.path.exists(feature_file): - features_dict[feature_name] = np.load(feature_file) + if csv_prefix: + if os.path.exists(feature_file): + features_dict[feature_name] = np.load(feature_file) + else: + result = compute_rolling(df, col, stat, window) + features_dict[feature_name] = result + np.save(feature_file, result.values) else: - print(f'Computing rolling stat feature: {feature_name}') result = compute_rolling(df, col, stat, window) features_dict[feature_name] = result - np.save(feature_file, result.values) - print(f'Saved feature: {feature_file}') + # Log returns for different horizons for horizon in [5, 15, 30]: feature_name = f'log_return_{horizon}' feature_file = f'../data/{csv_prefix}_{feature_name}.npy' - if os.path.exists(feature_file): - features_dict[feature_name] = np.load(feature_file) + if csv_prefix: + if os.path.exists(feature_file): + features_dict[feature_name] = np.load(feature_file) + else: + result = compute_log_return(df, horizon) + features_dict[feature_name] = result + np.save(feature_file, result.values) else: - print(f'Computing log return feature: {feature_name}') result = compute_log_return(df, horizon) features_dict[feature_name] = result - np.save(feature_file, result.values) - print(f'Saved feature: {feature_file}') + # Volatility for window in window_sizes: feature_name = f'volatility_{window}' feature_file = f'../data/{csv_prefix}_{feature_name}.npy' - if os.path.exists(feature_file): - features_dict[feature_name] = np.load(feature_file) + if csv_prefix: + if os.path.exists(feature_file): + features_dict[feature_name] = np.load(feature_file) + else: + result = compute_volatility(df, window) + features_dict[feature_name] = result + np.save(feature_file, result.values) else: - print(f'Computing volatility feature: {feature_name}') result = compute_volatility(df, window) features_dict[feature_name] = result - np.save(feature_file, result.values) - print(f'Saved feature: {feature_file}') # --- Additional Technical Indicator Features --- # ADX adx_names = ['adx', 'adx_pos', 'adx_neg'] adx_files = [f'../data/{csv_prefix}_{name}.npy' for name in adx_names] - if all(os.path.exists(f) for f in adx_files): + if csv_prefix and all(os.path.exists(f) for f in adx_files): for name, f in zip(adx_names, adx_files): arr = np.load(f) features_dict[name] = pd.Series(arr, index=df.index) @@ -321,20 
+391,22 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes): for subname, values in result: sub_feature_file = f'../data/{csv_prefix}_{subname}.npy' features_dict[subname] = values - np.save(sub_feature_file, values.values) - print(f'Saved feature: {sub_feature_file}') + if csv_prefix: + np.save(sub_feature_file, values.values) # Force Index feature_file = f'../data/{csv_prefix}_force_index.npy' - if os.path.exists(feature_file): - arr = np.load(feature_file) - features_dict['force_index'] = pd.Series(arr, index=df.index) + if csv_prefix: + if os.path.exists(feature_file): + arr = np.load(feature_file) + features_dict['force_index'] = pd.Series(arr, index=df.index) + else: + _, values = calc_force_index(df['Close'], df['Volume']) + features_dict['force_index'] = values + np.save(feature_file, values.values) else: - print('Calculating feature: force_index') _, values = calc_force_index(df['Close'], df['Volume']) features_dict['force_index'] = values - np.save(feature_file, values.values) - print(f'Saved feature: {feature_file}') # Supertrend indicators (simplified implementation) for period, multiplier in [(12, 3.0), (10, 1.0), (11, 2.0)]: @@ -342,26 +414,23 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes): st_trend_name = f'supertrend_trend_{period}_{multiplier}' st_file = f'../data/{csv_prefix}_{st_name}.npy' st_trend_file = f'../data/{csv_prefix}_{st_trend_name}.npy' - if os.path.exists(st_file) and os.path.exists(st_trend_file): + if csv_prefix and os.path.exists(st_file) and os.path.exists(st_trend_file): features_dict[st_name] = pd.Series(np.load(st_file), index=df.index) features_dict[st_trend_name] = pd.Series(np.load(st_trend_file), index=df.index) else: - print(f'Calculating Supertrend indicator: {st_name}') # Simple supertrend alternative using ATR and moving averages from ta.volatility import AverageTrueRange atr = AverageTrueRange(df['High'], df['Low'], df['Close'], window=period).average_true_range() hl_avg = (df['High'] + df['Low']) / 2 basic_ub = hl_avg + (multiplier * atr) basic_lb = hl_avg - (multiplier * atr) - # Simplified supertrend calculation supertrend = hl_avg.copy() trend = pd.Series(1, index=df.index) # 1 for uptrend, -1 for downtrend - features_dict[st_name] = supertrend features_dict[st_trend_name] = trend - np.save(st_file, features_dict[st_name].values) - np.save(st_trend_file, features_dict[st_trend_name].values) - print(f'Saved features: {st_file}, {st_trend_file}') - + if csv_prefix: + np.save(st_file, features_dict[st_name].values) + np.save(st_trend_file, features_dict[st_trend_name].values) + return features_dict diff --git a/inference_example.py b/inference_example.py new file mode 100644 index 0000000..1a8c629 --- /dev/null +++ b/inference_example.py @@ -0,0 +1,299 @@ +""" +Complete example showing how to use the OHLCVPredictor for making predictions. +This example demonstrates: +1. Loading a trained model +2. Preparing sample OHLCV data +3. Making log return predictions +4. Making price predictions +5. Evaluating and displaying results +""" + +import os +import pandas as pd +import numpy as np +from datetime import datetime, timedelta +from predictor import OHLCVPredictor + +def create_sample_ohlcv_data(num_samples=200): + """ + Create realistic sample OHLCV data for demonstration. + In practice, replace this with your actual data loading. 
+ + Returns: + pd.DataFrame: DataFrame with OHLCV data + """ + print("Creating sample OHLCV data for demonstration...") + + # Start with a base price and simulate realistic price movements + np.random.seed(42) # For reproducible results + base_price = 50000.0 # Base Bitcoin price + + # Generate timestamps (1-minute intervals) + start_time = datetime(2024, 1, 1) + timestamps = [start_time + timedelta(minutes=i) for i in range(num_samples)] + + # Generate realistic price movements + returns = np.random.normal(0, 0.001, num_samples) # Small random returns + prices = [base_price] + + for i in range(1, num_samples): + # Add some trending behavior + trend = 0.0001 * np.sin(i / 50.0) # Gentle sinusoidal trend + price_change = returns[i] + trend + new_price = prices[-1] * (1 + price_change) + prices.append(max(new_price, 1000)) # Minimum price floor + + # Generate OHLCV data + data = [] + for i in range(num_samples): + price = prices[i] + + # Generate realistic OHLC within a reasonable range + volatility = abs(np.random.normal(0, 0.002)) # Random volatility + high = price * (1 + volatility) + low = price * (1 - volatility) + + # Ensure OHLC relationships are correct + open_price = price * (1 + np.random.normal(0, 0.0005)) + close_price = price * (1 + np.random.normal(0, 0.0005)) + + # Ensure high is highest and low is lowest + high = max(high, open_price, close_price) + low = min(low, open_price, close_price) + + # Generate volume (typically higher during price movements) + base_volume = 100 + abs(np.random.normal(0, 50)) + volume_multiplier = 1 + abs(open_price - close_price) / close_price * 10 + volume = base_volume * volume_multiplier + + data.append({ + 'Timestamp': timestamps[i], + 'Open': round(open_price, 2), + 'High': round(high, 2), + 'Low': round(low, 2), + 'Close': round(close_price, 2), + 'Volume': round(volume, 2) + }) + + df = pd.DataFrame(data) + + # Calculate log returns (required by feature engineering) + df['log_return'] = np.log(df['Close'] / df['Close'].shift(1)) + + print(f"Generated {len(df)} samples of OHLCV data") + print(f"Price range: ${df['Close'].min():.2f} - ${df['Close'].max():.2f}") + return df + +def load_real_data_example(): + """ + Example of how to load real OHLCV data. + Replace this with your actual data loading logic. + + Returns: + pd.DataFrame or None: Real OHLCV data if available + """ + # Example paths where real data might be located + possible_paths = [ + '../data/btcusd_1-min_data.csv', + '../data/sample_data.csv', + 'data/crypto_data.csv' + ] + + for path in possible_paths: + if os.path.exists(path): + print(f"Loading real data from {path}...") + try: + df = pd.read_csv(path) + # Ensure required columns exist + required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp'] + if all(col in df.columns for col in required_cols): + # Filter out zero volume entries and calculate log returns + df = df[df['Volume'] != 0].reset_index(drop=True) + # Use only recent data and ensure proper data types + df = df.tail(500).reset_index(drop=True) # Get more data for better feature engineering + df['log_return'] = np.log(df['Close'] / df['Close'].shift(1)) + print(f"Successfully loaded {len(df)} rows of real data") + return df.tail(200) # Use last 200 for final processing + else: + print(f"Missing required columns in {path}") + except Exception as e: + print(f"Error loading {path}: {e}") + + return None + +def display_prediction_results(df, log_return_preds, predicted_prices=None, actual_prices=None): + """ + Display prediction results in a readable format. 
+ + Args: + df: Original OHLCV DataFrame + log_return_preds: Array of log return predictions + predicted_prices: Array of predicted prices (optional) + actual_prices: Array of actual prices (optional) + """ + print("\n" + "="*60) + print("PREDICTION RESULTS") + print("="*60) + + # Convert timestamps to datetimes for display (unit='s' assumes epoch-second input) + df_display = df.copy() + df_display['Timestamp'] = pd.to_datetime(df_display['Timestamp'], unit='s') + + print("\nLog Return Predictions (first 10):") + print("-" * 40) + for i in range(min(10, len(log_return_preds))): + timestamp = df_display.iloc[i]['Timestamp'] + close_price = df_display.iloc[i]['Close'] + log_ret = log_return_preds[i] + direction = "UP" if log_ret > 0 else "DOWN" + print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | " + f"Close: ${close_price:8.2f} | " + f"Log Return: {log_ret:8.6f} | " + f"Direction: {direction}") + + if predicted_prices is not None and actual_prices is not None: + print("\nPrice Predictions vs Actual (first 10):") + print("-" * 50) + for i in range(min(10, len(predicted_prices))): + timestamp = df_display.iloc[i]['Timestamp'] + pred_price = predicted_prices[i] + actual_price = actual_prices[i] + error = abs(pred_price - actual_price) + error_pct = (error / actual_price) * 100 + print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | " + f"Predicted: ${pred_price:8.2f} | " + f"Actual: ${actual_price:8.2f} | " + f"Error: {error_pct:5.2f}%") + + # Statistics + print("\nPrediction Statistics:") + print("-" * 30) + print(f"Total predictions: {len(log_return_preds)}") + print(f"Mean log return: {np.mean(log_return_preds):.6f}") + print(f"Std log return: {np.std(log_return_preds):.6f}") + print(f"Positive predictions: {np.sum(log_return_preds > 0)} ({np.mean(log_return_preds > 0)*100:.1f}%)") + print(f"Negative predictions: {np.sum(log_return_preds < 0)} ({np.mean(log_return_preds < 0)*100:.1f}%)") + + if predicted_prices is not None and actual_prices is not None: + mae = np.mean(np.abs(predicted_prices - actual_prices)) + mape = np.mean(np.abs((predicted_prices - actual_prices) / actual_prices)) * 100 + print("\nPrice Prediction Accuracy:") + print(f"Mean Absolute Error: ${mae:.2f}") + print(f"Mean Absolute Percentage Error: {mape:.2f}%") + +def demonstrate_batch_prediction(predictor, df): + """ + Demonstrate batch prediction on multiple data chunks. + + Args: + predictor: OHLCVPredictor instance + df: OHLCV DataFrame + """ + print("\n" + "="*60) + print("BATCH PREDICTION DEMONSTRATION") + print("="*60) + + chunk_size = 50 + num_chunks = min(3, len(df) // chunk_size) + + for i in range(num_chunks): + start_idx = i * chunk_size + end_idx = start_idx + chunk_size + chunk_df = df.iloc[start_idx:end_idx].copy() + + print(f"\nBatch {i+1}: Processing {len(chunk_df)} samples...") + + try: + log_return_preds = predictor.predict(chunk_df, csv_prefix=f'batch_{i+1}') # csv_prefix caches features under ../data/; stale caches from earlier runs can silently mismatch new data, so clear them (or pass csv_prefix=None) when inputs change + print(f"Successfully predicted {len(log_return_preds)} log returns") + print(f"Batch {i+1} mean prediction: {np.mean(log_return_preds):.6f}") + + except Exception as e: + print(f"Error in batch {i+1}: {e}") + +def main(): + """ + Main function demonstrating complete OHLCVPredictor usage. + """ + model_path = '../data/xgboost_model_all_features.json' + + # Check if model exists + if not os.path.exists(model_path): + print("Model not found.
Run main.py first to train the model.") + print(f"Expected model path: {model_path}") + return + + try: + # Load predictor + print("Loading predictor...") + predictor = OHLCVPredictor(model_path) + print("Predictor loaded successfully!") + + # Try to load real data first, fall back to synthetic data + df = load_real_data_example() + if df is None: + df = create_sample_ohlcv_data(200) + + print(f"\nDataFrame shape: {df.shape}") + print(f"Columns: {list(df.columns)}") + print(f"Data range: {len(df)} samples") + + # Demonstrate log return predictions + print("\n" + "="*60) + print("LOG RETURN PREDICTIONS") + print("="*60) + + log_return_preds = predictor.predict(df, csv_prefix='inference_demo') + print(f"Generated {len(log_return_preds)} log return predictions") + + # Demonstrate price predictions + print("\n" + "="*60) + print("PRICE PREDICTIONS") + print("="*60) + + predicted_prices, actual_prices = predictor.predict_prices(df, csv_prefix='price_demo') + print(f"Generated {len(predicted_prices)} price predictions") + + # Display results + display_prediction_results(df, log_return_preds, predicted_prices, actual_prices) + + # Demonstrate batch processing + demonstrate_batch_prediction(predictor, df) + + print("\n" + "="*60) + print("USAGE EXAMPLES FOR OTHER PROJECTS") + print("="*60) + print(""" + # Basic usage: + from predictor import OHLCVPredictor + + # Load your trained model + predictor = OHLCVPredictor('path/to/your/model.json') + + # Prepare your OHLCV data (pandas DataFrame with columns): + # ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume'] + + # Get log return predictions + log_returns = predictor.predict(your_dataframe) + + # Get price predictions + predicted_prices, actual_prices = predictor.predict_prices(your_dataframe) + + # Required files for deployment: + # - predictor.py + # - custom_xgboost.py + # - feature_engineering.py + # - technical_indicator_functions.py + # - your_trained_model.json + """) + + except FileNotFoundError as e: + print(f"File not found: {e}") + print("Make sure the model file exists and the path is correct.") + + except Exception as e: + print(f"Error during prediction: {e}") + print("Check your data format and model compatibility.") + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/predictor.py b/predictor.py new file mode 100644 index 0000000..92670dc --- /dev/null +++ b/predictor.py @@ -0,0 +1,97 @@ +import pandas as pd +import numpy as np +import os + +try: + from .custom_xgboost import CustomXGBoostGPU +except ImportError: + from custom_xgboost import CustomXGBoostGPU + +try: + from .feature_engineering import feature_engineering +except ImportError: + from feature_engineering import feature_engineering + +class OHLCVPredictor: + def __init__(self, model_path): + if not os.path.exists(model_path): + raise FileNotFoundError(f"Model file not found: {model_path}") + + self.model = CustomXGBoostGPU.load_model(model_path) + self.exclude_cols = self._get_excluded_features() + + def _get_excluded_features(self): + """Get the list of features to exclude (copied from main.py)""" + exclude_cols = ['Timestamp', 'Close'] + exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30'] + exclude_cols += ['bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar', + 'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband', + 'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line', + 'Open_lag1', 'Open_lag2', 'Open_lag3', 
'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2', + 'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15', + 'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15', + 'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30', + 'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5', + 'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30', + 'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15', + 'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0'] + return exclude_cols + + def predict(self, df, csv_prefix=None): + # Validate input DataFrame + required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp'] + missing_cols = [col for col in required_cols if col not in df.columns] + if missing_cols: + raise ValueError(f"Missing required columns: {missing_cols}") + + # Make a copy and preprocess + df = df.copy() + df = df[df['Volume'] != 0].reset_index(drop=True) + + # Convert timestamps (strings are parsed; numeric values are treated as epoch seconds) + if df['Timestamp'].dtype == 'object': + df['Timestamp'] = pd.to_datetime(df['Timestamp']) + else: + df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s') + + # Feature engineering; the lag count (3) and window sizes (5/15/30) must mirror the training pipeline + ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume'] + features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, 3, [5, 15, 30]) + features_df = pd.DataFrame(features_dict) + df = pd.concat([df, features_df], axis=1) + + # Downcast and add time features (exclude Timestamp to preserve datetime) + for col in df.columns: + if col != 'Timestamp': # Don't convert Timestamp to numeric + try: + df[col] = pd.to_numeric(df[col], downcast='float') + except Exception: + pass + + df['hour'] = df['Timestamp'].dt.hour + + # Handle NaNs with column-mean imputation + numeric_cols = df.select_dtypes(include=[np.number]).columns + for col in numeric_cols: + if df[col].isna().any(): + df[col] = df[col].fillna(df[col].mean()) + + # Defragment DataFrame after all columns have been added + df = df.copy() + + # Select features and predict + feature_cols = [col for col in df.columns if col not in self.exclude_cols] + X = df[feature_cols].values.astype(np.float32) + return self.model.predict(X) + + def predict_prices(self, df, csv_prefix=None): + log_return_preds = self.predict(df, csv_prefix) + df_clean = df[df['Volume'] != 0].copy() # same zero-volume filter as predict(), so rows stay aligned + close_prices = df_clean['Close'].values + + predicted_prices = [close_prices[0]] # seed the path with the first actual close + for i, log_ret in enumerate(log_return_preds[1:], 1): # compound each predicted log return onto the previous predicted price + if i < len(close_prices): + predicted_prices.append(predicted_prices[-1] * np.exp(log_ret)) + + return np.array(predicted_prices), close_prices[:len(predicted_prices)] # trim actuals to the predicted length \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 860ce36..2b2344d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,3 +12,11 @@ dependencies = [ "ta>=0.11.0", "xgboost>=3.0.2", ] + +[build-system] +requires = ["setuptools>=61.0", "wheel"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +include = ["ohlcvpredictor*"] +exclude = ["charts*"] diff --git a/uv.lock b/uv.lock index 8d25bef..c43c1a5 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.12" [[package]] @@ -309,7 +309,7 @@ wheels = [ [[package]] name = 
"ohlcvpredictor" version = "0.1.0" -source = { virtual = "." } +source = { editable = "." } dependencies = [ { name = "dash" }, { name = "numba" },