Enhance CustomXGBoostGPU for inference by making training data optional and adding model loading functionality. Update feature engineering to support caching and improve data handling. Introduce a new inference example and README for better usability in other projects.

Simon Moisy 2025-08-08 09:38:18 +08:00
parent b56d9ea3a1
commit a419764fff
7 changed files with 707 additions and 177 deletions

INFERENCE_README.md (new file, +38)

@@ -0,0 +1,38 @@
# OHLCV Predictor - Simple Inference
Refactored for easy reuse in other projects.
## Usage
```python
from predictor import OHLCVPredictor
predictor = OHLCVPredictor('model.json')
predictions = predictor.predict(your_ohlcv_dataframe)
```
## Files Needed
Copy these 5 files to your other project:
1. `predictor.py`
2. `custom_xgboost.py`
3. `feature_engineering.py`
4. `technical_indicator_functions.py`
5. `xgboost_model_all_features.json`
## Data Requirements
Your DataFrame needs these columns:
- `Open`, `High`, `Low`, `Close`, `Volume`, `Timestamp`
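A minimal frame satisfying these requirements, as a sketch (values are illustrative; epoch-second integers or datetime strings both work, since the predictor converts timestamps itself):
```python
import pandas as pd

# Illustrative rows only; real input should be long enough for the
# lag and rolling-window features (lags up to 3, windows up to 30).
df = pd.DataFrame({
    'Timestamp': [1704067200, 1704067260],  # epoch seconds; strings also work
    'Open':   [42000.0, 42010.0],
    'High':   [42050.0, 42030.0],
    'Low':    [41980.0, 41990.0],
    'Close':  [42010.0, 42005.0],
    'Volume': [12.5, 9.8],
})
```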
## Dependencies
```
xgboost >= 3.0.2
pandas >= 2.2.3
numpy >= 2.2.3
scikit-learn >= 1.6.1
ta >= 0.11.0
numba >= 0.61.2
```

custom_xgboost.py

@@ -2,15 +2,34 @@ import xgboost as xgb
import numpy as np
class CustomXGBoostGPU:
def __init__(self, X_train, X_test, y_train, y_test):
self.X_train = X_train.astype(np.float32)
self.X_test = X_test.astype(np.float32)
self.y_train = y_train.astype(np.float32)
self.y_test = y_test.astype(np.float32)
def __init__(self, X_train=None, X_test=None, y_train=None, y_test=None):
# Make training data optional for inference-only usage
self.X_train = X_train.astype(np.float32) if X_train is not None else None
self.X_test = X_test.astype(np.float32) if X_test is not None else None
self.y_train = y_train.astype(np.float32) if y_train is not None else None
self.y_test = y_test.astype(np.float32) if y_test is not None else None
self.model = None
self.params = None # Will be set during training
@classmethod
def load_model(cls, model_path):
"""Load a pre-trained model from file for inference
Args:
model_path (str): Path to the saved XGBoost model file
Returns:
CustomXGBoostGPU: Instance with loaded model ready for inference
"""
instance = cls() # Create instance without training data
instance.model = xgb.Booster()
instance.model.load_model(model_path)
return instance
def train(self, **xgb_params):
if self.X_train is None or self.y_train is None:
raise ValueError('Training data is required for training. Use load_model() for inference-only usage.')
params = {
'tree_method': 'hist',
'device': 'cuda',
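For context, a hedged sketch of the two entry points this change separates. The data, shapes, and model path are placeholders, and `predict` is assumed to accept a raw float32 array, which is how `predictor.py` calls it:
```python
import numpy as np
from custom_xgboost import CustomXGBoostGPU

rng = np.random.default_rng(0)
X = rng.random((256, 8), dtype=np.float32)  # placeholder features
y = rng.random(256, dtype=np.float32)       # placeholder targets

# Training path: data is required, as before.
trainer = CustomXGBoostGPU(X[:200], X[200:], y[:200], y[200:])
trainer.train()

# Inference-only path added by this commit: no training arrays needed.
model = CustomXGBoostGPU.load_model('../data/xgboost_model_all_features.json')
preds = model.predict(X[200:])
```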

feature_engineering.py

@@ -2,250 +2,309 @@ import os
import numpy as np
import pandas as pd
import ta
from technical_indicator_functions import *
try:
from .technical_indicator_functions import *
except ImportError:
from technical_indicator_functions import *
def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
feature_file = f'../data/{csv_prefix}_rsi.npy'
"""
Compute and/or load features for the given DataFrame.
If csv_prefix is provided, features are cached to disk; otherwise, features are only computed in memory.
Args:
df (pd.DataFrame): Input OHLCV data.
csv_prefix (str or None): Prefix for feature files (for caching). If None or '', disables caching.
ohlcv_cols (list): List of OHLCV column names.
lags (int): Number of lag features.
window_sizes (list): List of window sizes for rolling features.
Returns:
dict: Dictionary of computed features.
"""
features_dict = {}
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['rsi'] = pd.Series(arr, index=df.index)
# RSI
if csv_prefix:
feature_file = f'../data/{csv_prefix}_rsi.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['rsi'] = pd.Series(arr, index=df.index)
else:
_, values = calc_rsi(df['Close'])
features_dict['rsi'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: rsi')
_, values = calc_rsi(df['Close'])
features_dict['rsi'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# MACD
feature_file = f'../data/{csv_prefix}_macd.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['macd'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_macd.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['macd'] = pd.Series(arr, index=df.index)
else:
_, values = calc_macd(df['Close'])
features_dict['macd'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: macd')
_, values = calc_macd(df['Close'])
features_dict['macd'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# ATR
feature_file = f'../data/{csv_prefix}_atr.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['atr'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_atr.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['atr'] = pd.Series(arr, index=df.index)
else:
_, values = calc_atr(df['High'], df['Low'], df['Close'])
features_dict['atr'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: atr')
_, values = calc_atr(df['High'], df['Low'], df['Close'])
features_dict['atr'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# CCI
feature_file = f'../data/{csv_prefix}_cci.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['cci'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_cci.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['cci'] = pd.Series(arr, index=df.index)
else:
_, values = calc_cci(df['High'], df['Low'], df['Close'])
features_dict['cci'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: cci')
_, values = calc_cci(df['High'], df['Low'], df['Close'])
features_dict['cci'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# Williams %R
feature_file = f'../data/{csv_prefix}_williams_r.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['williams_r'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_williams_r.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['williams_r'] = pd.Series(arr, index=df.index)
else:
_, values = calc_williamsr(df['High'], df['Low'], df['Close'])
features_dict['williams_r'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: williams_r')
_, values = calc_williamsr(df['High'], df['Low'], df['Close'])
features_dict['williams_r'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# EMA 14
feature_file = f'../data/{csv_prefix}_ema_14.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['ema_14'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_ema_14.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['ema_14'] = pd.Series(arr, index=df.index)
else:
_, values = calc_ema(df['Close'])
features_dict['ema_14'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: ema_14')
_, values = calc_ema(df['Close'])
features_dict['ema_14'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# OBV
feature_file = f'../data/{csv_prefix}_obv.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['obv'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_obv.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['obv'] = pd.Series(arr, index=df.index)
else:
_, values = calc_obv(df['Close'], df['Volume'])
features_dict['obv'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: obv')
_, values = calc_obv(df['Close'], df['Volume'])
features_dict['obv'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# CMF
feature_file = f'../data/{csv_prefix}_cmf.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['cmf'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_cmf.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['cmf'] = pd.Series(arr, index=df.index)
else:
_, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
features_dict['cmf'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: cmf')
_, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
features_dict['cmf'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# ROC 10
feature_file = f'../data/{csv_prefix}_roc_10.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['roc_10'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_roc_10.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['roc_10'] = pd.Series(arr, index=df.index)
else:
_, values = calc_roc(df['Close'])
features_dict['roc_10'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: roc_10')
_, values = calc_roc(df['Close'])
features_dict['roc_10'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# DPO 20
feature_file = f'../data/{csv_prefix}_dpo_20.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['dpo_20'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_dpo_20.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['dpo_20'] = pd.Series(arr, index=df.index)
else:
_, values = calc_dpo(df['Close'])
features_dict['dpo_20'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: dpo_20')
_, values = calc_dpo(df['Close'])
features_dict['dpo_20'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# Ultimate Oscillator
feature_file = f'../data/{csv_prefix}_ultimate_osc.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['ultimate_osc'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_ultimate_osc.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['ultimate_osc'] = pd.Series(arr, index=df.index)
else:
_, values = calc_ultimate(df['High'], df['Low'], df['Close'])
features_dict['ultimate_osc'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: ultimate_osc')
_, values = calc_ultimate(df['High'], df['Low'], df['Close'])
features_dict['ultimate_osc'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# Daily Return
feature_file = f'../data/{csv_prefix}_daily_return.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['daily_return'] = pd.Series(arr, index=df.index)
if csv_prefix:
feature_file = f'../data/{csv_prefix}_daily_return.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['daily_return'] = pd.Series(arr, index=df.index)
else:
_, values = calc_daily_return(df['Close'])
features_dict['daily_return'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: daily_return')
_, values = calc_daily_return(df['Close'])
features_dict['daily_return'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# Multi-column indicators
# Bollinger Bands
result = calc_bollinger(df['Close'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# Stochastic Oscillator
result = calc_stochastic(df['High'], df['Low'], df['Close'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# SMA
result = calc_sma(df['Close'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# PSAR
result = calc_psar(df['High'], df['Low'], df['Close'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# Donchian Channel
result = calc_donchian(df['High'], df['Low'], df['Close'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# Keltner Channel
result = calc_keltner(df['High'], df['Low'], df['Close'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# Ichimoku
result = calc_ichimoku(df['High'], df['Low'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# Elder Ray
result = calc_elder_ray(df['Close'], df['Low'], df['High'])
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
if csv_prefix:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
if os.path.exists(sub_feature_file):
arr = np.load(sub_feature_file)
features_dict[subname] = pd.Series(arr, index=df.index)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
else:
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
# Prepare lags, rolling stats, log returns, and volatility features sequentially
# Lags
@@ -253,14 +312,17 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
for lag in range(1, lags + 1):
feature_name = f'{col}_lag{lag}'
feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
if csv_prefix:
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
else:
result = compute_lag(df, col, lag)
features_dict[feature_name] = result
np.save(feature_file, result.values)
else:
print(f'Computing lag feature: {feature_name}')
result = compute_lag(df, col, lag)
features_dict[feature_name] = result
np.save(feature_file, result.values)
print(f'Saved feature: {feature_file}')
# Rolling statistics
for col in ohlcv_cols:
for window in window_sizes:
@@ -275,44 +337,52 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
for stat in ['mean', 'std', 'min', 'max']:
feature_name = f'{col}_roll_{stat}_{window}'
feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
if csv_prefix:
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
else:
result = compute_rolling(df, col, stat, window)
features_dict[feature_name] = result
np.save(feature_file, result.values)
else:
print(f'Computing rolling stat feature: {feature_name}')
result = compute_rolling(df, col, stat, window)
features_dict[feature_name] = result
np.save(feature_file, result.values)
print(f'Saved feature: {feature_file}')
# Log returns for different horizons
for horizon in [5, 15, 30]:
feature_name = f'log_return_{horizon}'
feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
if csv_prefix:
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
else:
result = compute_log_return(df, horizon)
features_dict[feature_name] = result
np.save(feature_file, result.values)
else:
print(f'Computing log return feature: {feature_name}')
result = compute_log_return(df, horizon)
features_dict[feature_name] = result
np.save(feature_file, result.values)
print(f'Saved feature: {feature_file}')
# Volatility
for window in window_sizes:
feature_name = f'volatility_{window}'
feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
if csv_prefix:
if os.path.exists(feature_file):
features_dict[feature_name] = np.load(feature_file)
else:
result = compute_volatility(df, window)
features_dict[feature_name] = result
np.save(feature_file, result.values)
else:
print(f'Computing volatility feature: {feature_name}')
result = compute_volatility(df, window)
features_dict[feature_name] = result
np.save(feature_file, result.values)
print(f'Saved feature: {feature_file}')
# --- Additional Technical Indicator Features ---
# ADX
adx_names = ['adx', 'adx_pos', 'adx_neg']
adx_files = [f'../data/{csv_prefix}_{name}.npy' for name in adx_names]
if all(os.path.exists(f) for f in adx_files):
if csv_prefix and all(os.path.exists(f) for f in adx_files):
for name, f in zip(adx_names, adx_files):
arr = np.load(f)
features_dict[name] = pd.Series(arr, index=df.index)
@@ -321,20 +391,22 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
for subname, values in result:
sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
features_dict[subname] = values
np.save(sub_feature_file, values.values)
print(f'Saved feature: {sub_feature_file}')
if csv_prefix:
np.save(sub_feature_file, values.values)
# Force Index
feature_file = f'../data/{csv_prefix}_force_index.npy'
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['force_index'] = pd.Series(arr, index=df.index)
if csv_prefix:
if os.path.exists(feature_file):
arr = np.load(feature_file)
features_dict['force_index'] = pd.Series(arr, index=df.index)
else:
_, values = calc_force_index(df['Close'], df['Volume'])
features_dict['force_index'] = values
np.save(feature_file, values.values)
else:
print('Calculating feature: force_index')
_, values = calc_force_index(df['Close'], df['Volume'])
features_dict['force_index'] = values
np.save(feature_file, values.values)
print(f'Saved feature: {feature_file}')
# Supertrend indicators (simplified implementation)
for period, multiplier in [(12, 3.0), (10, 1.0), (11, 2.0)]:
@@ -342,26 +414,23 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
st_trend_name = f'supertrend_trend_{period}_{multiplier}'
st_file = f'../data/{csv_prefix}_{st_name}.npy'
st_trend_file = f'../data/{csv_prefix}_{st_trend_name}.npy'
if os.path.exists(st_file) and os.path.exists(st_trend_file):
if csv_prefix and os.path.exists(st_file) and os.path.exists(st_trend_file):
features_dict[st_name] = pd.Series(np.load(st_file), index=df.index)
features_dict[st_trend_name] = pd.Series(np.load(st_trend_file), index=df.index)
else:
print(f'Calculating Supertrend indicator: {st_name}')
# Simple supertrend alternative using ATR and moving averages
from ta.volatility import AverageTrueRange
atr = AverageTrueRange(df['High'], df['Low'], df['Close'], window=period).average_true_range()
hl_avg = (df['High'] + df['Low']) / 2
basic_ub = hl_avg + (multiplier * atr)
basic_lb = hl_avg - (multiplier * atr)
# Simplified supertrend calculation
supertrend = hl_avg.copy()
trend = pd.Series(1, index=df.index) # 1 for uptrend, -1 for downtrend
features_dict[st_name] = supertrend
features_dict[st_trend_name] = trend
np.save(st_file, features_dict[st_name].values)
np.save(st_trend_file, features_dict[st_trend_name].values)
print(f'Saved features: {st_file}, {st_trend_file}')
if csv_prefix:
np.save(st_file, features_dict[st_name].values)
np.save(st_trend_file, features_dict[st_trend_name].values)
return features_dict
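Every cached indicator above repeats the same cache-or-compute shape. Distilled into a sketch (the `cached` helper and the lambda are illustrative, not part of the commit):
```python
import os
import numpy as np
import pandas as pd

def cached(name, compute, df, csv_prefix):
    """Cache-or-compute pattern each indicator block repeats by hand.

    compute: zero-argument callable returning a pd.Series aligned to df.index.
    When csv_prefix is falsy, nothing touches disk and the feature is
    recomputed in memory, matching the new inference-friendly behaviour.
    """
    if csv_prefix:
        path = f'../data/{csv_prefix}_{name}.npy'
        if os.path.exists(path):
            return pd.Series(np.load(path), index=df.index)
        values = compute()
        np.save(path, values.values)
        return values
    return compute()

# e.g. features_dict['rsi'] = cached('rsi', lambda: calc_rsi(df['Close'])[1],
#                                    df, csv_prefix)
```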

inference_example.py (new file, +299)

@@ -0,0 +1,299 @@
"""
Complete example showing how to use the OHLCVPredictor for making predictions.
This example demonstrates:
1. Loading a trained model
2. Preparing sample OHLCV data
3. Making log return predictions
4. Making price predictions
5. Evaluating and displaying results
"""
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from predictor import OHLCVPredictor
def create_sample_ohlcv_data(num_samples=200):
"""
Create realistic sample OHLCV data for demonstration.
In practice, replace this with your actual data loading.
Returns:
pd.DataFrame: DataFrame with OHLCV data
"""
print("Creating sample OHLCV data for demonstration...")
# Start with a base price and simulate realistic price movements
np.random.seed(42) # For reproducible results
base_price = 50000.0 # Base Bitcoin price
# Generate timestamps (1-minute intervals)
start_time = datetime(2024, 1, 1)
timestamps = [start_time + timedelta(minutes=i) for i in range(num_samples)]
# Generate realistic price movements
returns = np.random.normal(0, 0.001, num_samples) # Small random returns
prices = [base_price]
for i in range(1, num_samples):
# Add some trending behavior
trend = 0.0001 * np.sin(i / 50.0) # Gentle sinusoidal trend
price_change = returns[i] + trend
new_price = prices[-1] * (1 + price_change)
prices.append(max(new_price, 1000)) # Minimum price floor
# Generate OHLCV data
data = []
for i in range(num_samples):
price = prices[i]
# Generate realistic OHLC within a reasonable range
volatility = abs(np.random.normal(0, 0.002)) # Random volatility
high = price * (1 + volatility)
low = price * (1 - volatility)
# Ensure OHLC relationships are correct
open_price = price * (1 + np.random.normal(0, 0.0005))
close_price = price * (1 + np.random.normal(0, 0.0005))
# Ensure high is highest and low is lowest
high = max(high, open_price, close_price)
low = min(low, open_price, close_price)
# Generate volume (typically higher during price movements)
base_volume = 100 + abs(np.random.normal(0, 50))
volume_multiplier = 1 + abs(open_price - close_price) / close_price * 10
volume = base_volume * volume_multiplier
data.append({
'Timestamp': timestamps[i],
'Open': round(open_price, 2),
'High': round(high, 2),
'Low': round(low, 2),
'Close': round(close_price, 2),
'Volume': round(volume, 2)
})
df = pd.DataFrame(data)
# Calculate log returns (required by feature engineering)
df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
print(f"Generated {len(df)} samples of OHLCV data")
print(f"Price range: ${df['Close'].min():.2f} - ${df['Close'].max():.2f}")
return df
def load_real_data_example():
"""
Example of how to load real OHLCV data.
Replace this with your actual data loading logic.
Returns:
pd.DataFrame or None: Real OHLCV data if available
"""
# Example paths where real data might be located
possible_paths = [
'../data/btcusd_1-min_data.csv',
'../data/sample_data.csv',
'data/crypto_data.csv'
]
for path in possible_paths:
if os.path.exists(path):
print(f"Loading real data from {path}...")
try:
df = pd.read_csv(path)
# Ensure required columns exist
required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']
if all(col in df.columns for col in required_cols):
# Filter out zero volume entries and calculate log returns
df = df[df['Volume'] != 0].reset_index(drop=True)
# Use only recent data and ensure proper data types
df = df.tail(500).reset_index(drop=True) # Get more data for better feature engineering
df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
print(f"Successfully loaded {len(df)} rows of real data")
return df.tail(200) # Use last 200 for final processing
else:
print(f"Missing required columns in {path}")
except Exception as e:
print(f"Error loading {path}: {e}")
return None
def display_prediction_results(df, log_return_preds, predicted_prices=None, actual_prices=None):
"""
Display prediction results in a readable format.
Args:
df: Original OHLCV DataFrame
log_return_preds: Array of log return predictions
predicted_prices: Array of predicted prices (optional)
actual_prices: Array of actual prices (optional)
"""
print("\n" + "="*60)
print("PREDICTION RESULTS")
print("="*60)
# Convert timestamps back to readable format for display
df_display = df.copy()
df_display['Timestamp'] = pd.to_datetime(df_display['Timestamp'], unit='s')
print(f"\nLog Return Predictions (first 10):")
print("-" * 40)
for i in range(min(10, len(log_return_preds))):
timestamp = df_display.iloc[i]['Timestamp']
close_price = df_display.iloc[i]['Close']
log_ret = log_return_preds[i]
direction = "UP" if log_ret > 0 else "DOWN"
print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
f"Close: ${close_price:8.2f} | "
f"Log Return: {log_ret:8.6f} | "
f"Direction: {direction}")
if predicted_prices is not None and actual_prices is not None:
print(f"\nPrice Predictions vs Actual (first 10):")
print("-" * 50)
for i in range(min(10, len(predicted_prices))):
timestamp = df_display.iloc[i]['Timestamp']
pred_price = predicted_prices[i]
actual_price = actual_prices[i]
error = abs(pred_price - actual_price)
error_pct = (error / actual_price) * 100
print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
f"Predicted: ${pred_price:8.2f} | "
f"Actual: ${actual_price:8.2f} | "
f"Error: {error_pct:5.2f}%")
# Statistics
print(f"\nPrediction Statistics:")
print("-" * 30)
print(f"Total predictions: {len(log_return_preds)}")
print(f"Mean log return: {np.mean(log_return_preds):.6f}")
print(f"Std log return: {np.std(log_return_preds):.6f}")
print(f"Positive predictions: {np.sum(log_return_preds > 0)} ({np.mean(log_return_preds > 0)*100:.1f}%)")
print(f"Negative predictions: {np.sum(log_return_preds < 0)} ({np.mean(log_return_preds < 0)*100:.1f}%)")
if predicted_prices is not None and actual_prices is not None:
mae = np.mean(np.abs(predicted_prices - actual_prices))
mape = np.mean(np.abs((predicted_prices - actual_prices) / actual_prices)) * 100
print(f"\nPrice Prediction Accuracy:")
print(f"Mean Absolute Error: ${mae:.2f}")
print(f"Mean Absolute Percentage Error: {mape:.2f}%")
def demonstrate_batch_prediction(predictor, df):
"""
Demonstrate batch prediction on multiple data chunks.
Args:
predictor: OHLCVPredictor instance
df: OHLCV DataFrame
"""
print("\n" + "="*60)
print("BATCH PREDICTION DEMONSTRATION")
print("="*60)
chunk_size = 50
num_chunks = min(3, len(df) // chunk_size)
for i in range(num_chunks):
start_idx = i * chunk_size
end_idx = start_idx + chunk_size
chunk_df = df.iloc[start_idx:end_idx].copy()
print(f"\nBatch {i+1}: Processing {len(chunk_df)} samples...")
try:
log_return_preds = predictor.predict(chunk_df, csv_prefix=f'batch_{i+1}')
print(f"Successfully predicted {len(log_return_preds)} log returns")
print(f"Batch {i+1} mean prediction: {np.mean(log_return_preds):.6f}")
except Exception as e:
print(f"Error in batch {i+1}: {e}")
def main():
"""
Main function demonstrating complete OHLCVPredictor usage.
"""
model_path = '../data/xgboost_model_all_features.json'
# Check if model exists
if not os.path.exists(model_path):
print("Model not found. Run main.py first to train the model.")
print(f"Expected model path: {model_path}")
return
try:
# Load predictor
print("Loading predictor...")
predictor = OHLCVPredictor(model_path)
print("Predictor loaded successfully!")
# Try to load real data first, fall back to synthetic data
df = load_real_data_example()
if df is None:
df = create_sample_ohlcv_data(200)
print(f"\nDataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Data range: {len(df)} samples")
# Demonstrate log return predictions
print("\n" + "="*60)
print("LOG RETURN PREDICTIONS")
print("="*60)
log_return_preds = predictor.predict(df, csv_prefix='inference_demo')
print(f"Generated {len(log_return_preds)} log return predictions")
# Demonstrate price predictions
print("\n" + "="*60)
print("PRICE PREDICTIONS")
print("="*60)
predicted_prices, actual_prices = predictor.predict_prices(df, csv_prefix='price_demo')
print(f"Generated {len(predicted_prices)} price predictions")
# Display results
display_prediction_results(df, log_return_preds, predicted_prices, actual_prices)
# Demonstrate batch processing
demonstrate_batch_prediction(predictor, df)
print("\n" + "="*60)
print("USAGE EXAMPLES FOR OTHER PROJECTS")
print("="*60)
print("""
# Basic usage:
from predictor import OHLCVPredictor
# Load your trained model
predictor = OHLCVPredictor('path/to/your/model.json')
# Prepare your OHLCV data (pandas DataFrame with columns):
# ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']
# Get log return predictions
log_returns = predictor.predict(your_dataframe)
# Get price predictions
predicted_prices, actual_prices = predictor.predict_prices(your_dataframe)
# Required files for deployment:
# - predictor.py
# - custom_xgboost.py
# - feature_engineering.py
# - technical_indicator_functions.py
# - your_trained_model.json
""")
except FileNotFoundError as e:
print(f"File not found: {e}")
print("Make sure the model file exists and the path is correct.")
except Exception as e:
print(f"Error during prediction: {e}")
print("Check your data format and model compatibility.")
if __name__ == '__main__':
main()

predictor.py (new file, +97)

@@ -0,0 +1,97 @@
import pandas as pd
import numpy as np
import os
try:
from .custom_xgboost import CustomXGBoostGPU
except ImportError:
from custom_xgboost import CustomXGBoostGPU
try:
from .feature_engineering import feature_engineering
except ImportError:
from feature_engineering import feature_engineering
class OHLCVPredictor:
def __init__(self, model_path):
if not os.path.exists(model_path):
raise FileNotFoundError(f"Model file not found: {model_path}")
self.model = CustomXGBoostGPU.load_model(model_path)
self.exclude_cols = self._get_excluded_features()
def _get_excluded_features(self):
"""Get the list of features to exclude (copied from main.py)"""
exclude_cols = ['Timestamp', 'Close']
exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30']
exclude_cols += ['bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband',
'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
'Open_lag1', 'Open_lag2', 'Open_lag3', 'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2',
'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15',
'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15',
'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30',
'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30',
'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15',
'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0']
return exclude_cols
def predict(self, df, csv_prefix=None):
# Validate input DataFrame
required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
raise ValueError(f"Missing required columns: {missing_cols}")
# Make a copy and preprocess
df = df.copy()
df = df[df['Volume'] != 0].reset_index(drop=True)
# Convert timestamps
if df['Timestamp'].dtype == 'object':
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
else:
df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
# Feature engineering
ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, 3, [5, 15, 30])
features_df = pd.DataFrame(features_dict)
df = pd.concat([df, features_df], axis=1)
# Downcast and add time features (exclude Timestamp to preserve datetime)
for col in df.columns:
if col != 'Timestamp': # Don't convert Timestamp to numeric
try:
df[col] = pd.to_numeric(df[col], downcast='float')
except Exception:
pass
df['hour'] = df['Timestamp'].dt.hour
# Handle NaNs
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
if df[col].isna().any():
df[col] = df[col].fillna(df[col].mean())
# Defragment DataFrame after all columns have been added
df = df.copy()
# Select features and predict
feature_cols = [col for col in df.columns if col not in self.exclude_cols]
X = df[feature_cols].values.astype(np.float32)
return self.model.predict(X)
def predict_prices(self, df, csv_prefix=None):
log_return_preds = self.predict(df, csv_prefix)
df_clean = df[df['Volume'] != 0].copy()
close_prices = df_clean['Close'].values
predicted_prices = [close_prices[0]]
for i, log_ret in enumerate(log_return_preds[1:], 1):
if i < len(close_prices):
predicted_prices.append(predicted_prices[-1] * np.exp(log_ret))
return np.array(predicted_prices), close_prices[:len(predicted_prices)]
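The reconstruction in `predict_prices` seeds the series with the first observed close and compounds each subsequent predicted log return, so p_i = p_{i-1} * exp(r_i). A small numeric sketch of the same arithmetic, vectorized (the seed price and returns are made up):
```python
import numpy as np

preds = np.array([0.0012, 0.0008, -0.0005, 0.0021])  # model log-return outputs
prices = np.empty_like(preds)
prices[0] = 50_000.0                                  # first observed close
# Skip preds[0], exactly as the loop in predict_prices does.
prices[1:] = prices[0] * np.exp(np.cumsum(preds[1:]))
```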

pyproject.toml

@@ -12,3 +12,11 @@ dependencies = [
"ta>=0.11.0",
"xgboost>=3.0.2",
]
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"
[tool.setuptools.packages.find]
include = ["ohlcvpredictor*"]
exclude = ["charts*"]

uv.lock (generated, 4 lines changed)

@@ -1,5 +1,5 @@
version = 1
revision = 2
revision = 3
requires-python = ">=3.12"
[[package]]
@@ -309,7 +309,7 @@ wheels = [
[[package]]
name = "ohlcvpredictor"
version = "0.1.0"
source = { virtual = "." }
source = { editable = "." }
dependencies = [
{ name = "dash" },
{ name = "numba" },