Enhance CustomXGBoostGPU for inference by making training data optional and adding model-loading functionality. Update feature engineering to make disk caching optional and improve data handling. Introduce a new inference example and README for easier reuse in other projects.
This commit is contained in:
parent b56d9ea3a1
commit a419764fff
INFERENCE_README.md (new file, 38 lines)
@@ -0,0 +1,38 @@
# OHLCV Predictor - Simple Inference

Refactored for easy reuse in other projects.

## Usage

```python
from predictor import OHLCVPredictor

predictor = OHLCVPredictor('model.json')
predictions = predictor.predict(your_ohlcv_dataframe)
```

## Files Needed

Copy these 5 files to your other project:

1. `predictor.py`
2. `custom_xgboost.py`
3. `feature_engineering.py`
4. `technical_indicator_functions.py`
5. `xgboost_model_all_features.json`

## Data Requirements

Your DataFrame needs these columns:

- `Open`, `High`, `Low`, `Close`, `Volume`, `Timestamp`

## Dependencies

```
xgboost >= 3.0.2
pandas >= 2.2.3
numpy >= 2.2.3
scikit-learn >= 1.6.1
ta >= 0.11.0
numba >= 0.61.2
```
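As a rough illustration of the data requirements above (values are invented; in practice pass at least a few hundred rows so the lags and 30-bar rolling windows in `feature_engineering.py` have data to work with), an input frame can be assembled like this:

```python
import pandas as pd

df = pd.DataFrame({
    'Timestamp': [1704067200, 1704067260, 1704067320],  # Unix seconds
    'Open':   [50000.0, 50010.0, 50005.0],
    'High':   [50020.0, 50030.0, 50015.0],
    'Low':    [49990.0, 50000.0, 49995.0],
    'Close':  [50010.0, 50005.0, 50012.0],
    'Volume': [120.5, 98.2, 143.7],
})
# predictions = OHLCVPredictor('model.json').predict(df)
```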
custom_xgboost.py
@@ -2,15 +2,34 @@ import xgboost as xgb
 import numpy as np
 
 class CustomXGBoostGPU:
-    def __init__(self, X_train, X_test, y_train, y_test):
-        self.X_train = X_train.astype(np.float32)
-        self.X_test = X_test.astype(np.float32)
-        self.y_train = y_train.astype(np.float32)
-        self.y_test = y_test.astype(np.float32)
+    def __init__(self, X_train=None, X_test=None, y_train=None, y_test=None):
+        # Make training data optional for inference-only usage
+        self.X_train = X_train.astype(np.float32) if X_train is not None else None
+        self.X_test = X_test.astype(np.float32) if X_test is not None else None
+        self.y_train = y_train.astype(np.float32) if y_train is not None else None
+        self.y_test = y_test.astype(np.float32) if y_test is not None else None
         self.model = None
         self.params = None  # Will be set during training
 
+    @classmethod
+    def load_model(cls, model_path):
+        """Load a pre-trained model from file for inference
+
+        Args:
+            model_path (str): Path to the saved XGBoost model file
+
+        Returns:
+            CustomXGBoostGPU: Instance with loaded model ready for inference
+        """
+        instance = cls()  # Create instance without training data
+        instance.model = xgb.Booster()
+        instance.model.load_model(model_path)
+        return instance
+
     def train(self, **xgb_params):
+        if self.X_train is None or self.y_train is None:
+            raise ValueError('Training data is required for training. Use load_model() for inference-only usage.')
+
         params = {
             'tree_method': 'hist',
             'device': 'cuda',
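A minimal sketch of the inference-only path this hunk enables. Assumptions: the model JSON exists at the path used elsewhere in this commit, and predictions go through the underlying `xgb.Booster`, since the class's own predict wrapper is not part of this hunk; the feature matrix below is a placeholder and would need the same columns the model was trained on.

```python
import numpy as np
import xgboost as xgb
from custom_xgboost import CustomXGBoostGPU

model = CustomXGBoostGPU.load_model('../data/xgboost_model_all_features.json')

# Placeholder features; in the real pipeline these come from feature_engineering().
X_new = np.zeros((4, 10), dtype=np.float32)
preds = model.model.predict(xgb.DMatrix(X_new))  # model.model is the loaded xgb.Booster
```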
feature_engineering.py
@@ -2,158 +2,202 @@ import os
 import numpy as np
 import pandas as pd
 import ta
-from technical_indicator_functions import *
+try:
+    from .technical_indicator_functions import *
+except ImportError:
+    from technical_indicator_functions import *
 
 def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
-    feature_file = f'../data/{csv_prefix}_rsi.npy'
+    """
+    Compute and/or load features for the given DataFrame.
+    If csv_prefix is provided, features are cached to disk; otherwise, features are only computed in memory.
+
+    Args:
+        df (pd.DataFrame): Input OHLCV data.
+        csv_prefix (str or None): Prefix for feature files (for caching). If None or '', disables caching.
+        ohlcv_cols (list): List of OHLCV column names.
+        lags (int): Number of lag features.
+        window_sizes (list): List of window sizes for rolling features.
+
+    Returns:
+        dict: Dictionary of computed features.
+    """
     features_dict = {}
+
+    # RSI
+    if csv_prefix:
+        feature_file = f'../data/{csv_prefix}_rsi.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['rsi'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: rsi')
             _, values = calc_rsi(df['Close'])
             features_dict['rsi'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_rsi(df['Close'])
+        features_dict['rsi'] = values
 
     # MACD
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_macd.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['macd'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: macd')
             _, values = calc_macd(df['Close'])
             features_dict['macd'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_macd(df['Close'])
+        features_dict['macd'] = values
 
     # ATR
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_atr.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['atr'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: atr')
             _, values = calc_atr(df['High'], df['Low'], df['Close'])
             features_dict['atr'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_atr(df['High'], df['Low'], df['Close'])
+        features_dict['atr'] = values
 
     # CCI
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_cci.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['cci'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: cci')
             _, values = calc_cci(df['High'], df['Low'], df['Close'])
             features_dict['cci'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_cci(df['High'], df['Low'], df['Close'])
+        features_dict['cci'] = values
 
     # Williams %R
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_williams_r.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['williams_r'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: williams_r')
             _, values = calc_williamsr(df['High'], df['Low'], df['Close'])
             features_dict['williams_r'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_williamsr(df['High'], df['Low'], df['Close'])
+        features_dict['williams_r'] = values
 
     # EMA 14
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_ema_14.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['ema_14'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: ema_14')
             _, values = calc_ema(df['Close'])
             features_dict['ema_14'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_ema(df['Close'])
+        features_dict['ema_14'] = values
 
     # OBV
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_obv.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['obv'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: obv')
             _, values = calc_obv(df['Close'], df['Volume'])
             features_dict['obv'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_obv(df['Close'], df['Volume'])
+        features_dict['obv'] = values
 
     # CMF
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_cmf.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['cmf'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: cmf')
             _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
             features_dict['cmf'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
+        features_dict['cmf'] = values
 
     # ROC 10
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_roc_10.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['roc_10'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: roc_10')
             _, values = calc_roc(df['Close'])
             features_dict['roc_10'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_roc(df['Close'])
+        features_dict['roc_10'] = values
 
     # DPO 20
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_dpo_20.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['dpo_20'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: dpo_20')
             _, values = calc_dpo(df['Close'])
             features_dict['dpo_20'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_dpo(df['Close'])
+        features_dict['dpo_20'] = values
 
     # Ultimate Oscillator
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_ultimate_osc.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['ultimate_osc'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: ultimate_osc')
             _, values = calc_ultimate(df['High'], df['Low'], df['Close'])
             features_dict['ultimate_osc'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_ultimate(df['High'], df['Low'], df['Close'])
+        features_dict['ultimate_osc'] = values
 
     # Daily Return
+    if csv_prefix:
         feature_file = f'../data/{csv_prefix}_daily_return.npy'
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['daily_return'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: daily_return')
             _, values = calc_daily_return(df['Close'])
             features_dict['daily_return'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_daily_return(df['Close'])
+        features_dict['daily_return'] = values
 
     # Multi-column indicators
     # Bollinger Bands
     result = calc_bollinger(df['Close'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -161,11 +205,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # Stochastic Oscillator
     result = calc_stochastic(df['High'], df['Low'], df['Close'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -173,11 +219,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # SMA
     result = calc_sma(df['Close'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -185,11 +233,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # PSAR
     result = calc_psar(df['High'], df['Low'], df['Close'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -197,11 +247,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # Donchian Channel
     result = calc_donchian(df['High'], df['Low'], df['Close'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -209,11 +261,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # Keltner Channel
     result = calc_keltner(df['High'], df['Low'], df['Close'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -221,11 +275,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # Ichimoku
     result = calc_ichimoku(df['High'], df['Low'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -233,11 +289,13 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # Elder Ray
     result = calc_elder_ray(df['Close'], df['Low'], df['High'])
     for subname, values in result:
+        if csv_prefix:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             if os.path.exists(sub_feature_file):
                 arr = np.load(sub_feature_file)
@@ -245,7 +303,8 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             else:
                 features_dict[subname] = values
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
+        else:
+            features_dict[subname] = values
 
     # Prepare lags, rolling stats, log returns, and volatility features sequentially
     # Lags
@@ -253,14 +312,17 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
         for lag in range(1, lags + 1):
             feature_name = f'{col}_lag{lag}'
             feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
+            if csv_prefix:
                 if os.path.exists(feature_file):
                     features_dict[feature_name] = np.load(feature_file)
                 else:
-                    print(f'Computing lag feature: {feature_name}')
                     result = compute_lag(df, col, lag)
                     features_dict[feature_name] = result
                     np.save(feature_file, result.values)
-                    print(f'Saved feature: {feature_file}')
+            else:
+                result = compute_lag(df, col, lag)
+                features_dict[feature_name] = result
 
     # Rolling statistics
     for col in ohlcv_cols:
         for window in window_sizes:
@@ -275,44 +337,52 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
             for stat in ['mean', 'std', 'min', 'max']:
                 feature_name = f'{col}_roll_{stat}_{window}'
                 feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
+                if csv_prefix:
                     if os.path.exists(feature_file):
                         features_dict[feature_name] = np.load(feature_file)
                     else:
-                        print(f'Computing rolling stat feature: {feature_name}')
                         result = compute_rolling(df, col, stat, window)
                         features_dict[feature_name] = result
                         np.save(feature_file, result.values)
-                        print(f'Saved feature: {feature_file}')
+                else:
+                    result = compute_rolling(df, col, stat, window)
+                    features_dict[feature_name] = result
 
     # Log returns for different horizons
     for horizon in [5, 15, 30]:
         feature_name = f'log_return_{horizon}'
         feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
+        if csv_prefix:
             if os.path.exists(feature_file):
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Computing log return feature: {feature_name}')
                 result = compute_log_return(df, horizon)
                 features_dict[feature_name] = result
                 np.save(feature_file, result.values)
-                print(f'Saved feature: {feature_file}')
+        else:
+            result = compute_log_return(df, horizon)
+            features_dict[feature_name] = result
 
     # Volatility
     for window in window_sizes:
         feature_name = f'volatility_{window}'
         feature_file = f'../data/{csv_prefix}_{feature_name}.npy'
+        if csv_prefix:
             if os.path.exists(feature_file):
                 features_dict[feature_name] = np.load(feature_file)
             else:
-                print(f'Computing volatility feature: {feature_name}')
                 result = compute_volatility(df, window)
                 features_dict[feature_name] = result
                 np.save(feature_file, result.values)
-                print(f'Saved feature: {feature_file}')
+        else:
+            result = compute_volatility(df, window)
+            features_dict[feature_name] = result
 
     # --- Additional Technical Indicator Features ---
     # ADX
     adx_names = ['adx', 'adx_pos', 'adx_neg']
     adx_files = [f'../data/{csv_prefix}_{name}.npy' for name in adx_names]
-    if all(os.path.exists(f) for f in adx_files):
+    if csv_prefix and all(os.path.exists(f) for f in adx_files):
         for name, f in zip(adx_names, adx_files):
             arr = np.load(f)
             features_dict[name] = pd.Series(arr, index=df.index)
@@ -321,20 +391,22 @@ def feature_engineering(df, csv_prefix, ohlcv_cols, lags, window_sizes):
         for subname, values in result:
             sub_feature_file = f'../data/{csv_prefix}_{subname}.npy'
             features_dict[subname] = values
+            if csv_prefix:
                 np.save(sub_feature_file, values.values)
-                print(f'Saved feature: {sub_feature_file}')
 
     # Force Index
     feature_file = f'../data/{csv_prefix}_force_index.npy'
+    if csv_prefix:
         if os.path.exists(feature_file):
             arr = np.load(feature_file)
             features_dict['force_index'] = pd.Series(arr, index=df.index)
         else:
-            print('Calculating feature: force_index')
             _, values = calc_force_index(df['Close'], df['Volume'])
             features_dict['force_index'] = values
             np.save(feature_file, values.values)
-            print(f'Saved feature: {feature_file}')
+    else:
+        _, values = calc_force_index(df['Close'], df['Volume'])
+        features_dict['force_index'] = values
 
     # Supertrend indicators (simplified implementation)
     for period, multiplier in [(12, 3.0), (10, 1.0), (11, 2.0)]:
@@ -342,26 +414,23 @@
         st_trend_name = f'supertrend_trend_{period}_{multiplier}'
         st_file = f'../data/{csv_prefix}_{st_name}.npy'
         st_trend_file = f'../data/{csv_prefix}_{st_trend_name}.npy'
-        if os.path.exists(st_file) and os.path.exists(st_trend_file):
+        if csv_prefix and os.path.exists(st_file) and os.path.exists(st_trend_file):
             features_dict[st_name] = pd.Series(np.load(st_file), index=df.index)
             features_dict[st_trend_name] = pd.Series(np.load(st_trend_file), index=df.index)
         else:
-            print(f'Calculating Supertrend indicator: {st_name}')
             # Simple supertrend alternative using ATR and moving averages
             from ta.volatility import AverageTrueRange
             atr = AverageTrueRange(df['High'], df['Low'], df['Close'], window=period).average_true_range()
             hl_avg = (df['High'] + df['Low']) / 2
             basic_ub = hl_avg + (multiplier * atr)
             basic_lb = hl_avg - (multiplier * atr)
 
             # Simplified supertrend calculation
             supertrend = hl_avg.copy()
             trend = pd.Series(1, index=df.index)  # 1 for uptrend, -1 for downtrend
 
             features_dict[st_name] = supertrend
             features_dict[st_trend_name] = trend
+            if csv_prefix:
                 np.save(st_file, features_dict[st_name].values)
                 np.save(st_trend_file, features_dict[st_trend_name].values)
-                print(f'Saved features: {st_file}, {st_trend_file}')
 
     return features_dict
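Every indicator block above now repeats the same load-or-compute-and-cache logic keyed on `csv_prefix`. A condensed sketch of that pattern; the helper name `cached_feature` is illustrative only and is not part of the repository:

```python
import os
import numpy as np
import pandas as pd

def cached_feature(name, csv_prefix, index, compute):
    """Return a feature Series, using ../data/{csv_prefix}_{name}.npy as a cache when csv_prefix is set."""
    if csv_prefix:
        path = f'../data/{csv_prefix}_{name}.npy'
        if os.path.exists(path):
            return pd.Series(np.load(path), index=index)
        values = compute()
        np.save(path, values.values)
        return values
    return compute()

# e.g. features_dict['rsi'] = cached_feature('rsi', csv_prefix, df.index, lambda: calc_rsi(df['Close'])[1])
```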
inference_example.py (new file, 299 lines)
@@ -0,0 +1,299 @@
"""
Complete example showing how to use the OHLCVPredictor for making predictions.
This example demonstrates:
1. Loading a trained model
2. Preparing sample OHLCV data
3. Making log return predictions
4. Making price predictions
5. Evaluating and displaying results
"""

import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from predictor import OHLCVPredictor

def create_sample_ohlcv_data(num_samples=200):
    """
    Create realistic sample OHLCV data for demonstration.
    In practice, replace this with your actual data loading.

    Returns:
        pd.DataFrame: DataFrame with OHLCV data
    """
    print("Creating sample OHLCV data for demonstration...")

    # Start with a base price and simulate realistic price movements
    np.random.seed(42)  # For reproducible results
    base_price = 50000.0  # Base Bitcoin price

    # Generate timestamps (1-minute intervals)
    start_time = datetime(2024, 1, 1)
    timestamps = [start_time + timedelta(minutes=i) for i in range(num_samples)]

    # Generate realistic price movements
    returns = np.random.normal(0, 0.001, num_samples)  # Small random returns
    prices = [base_price]

    for i in range(1, num_samples):
        # Add some trending behavior
        trend = 0.0001 * np.sin(i / 50.0)  # Gentle sinusoidal trend
        price_change = returns[i] + trend
        new_price = prices[-1] * (1 + price_change)
        prices.append(max(new_price, 1000))  # Minimum price floor

    # Generate OHLCV data
    data = []
    for i in range(num_samples):
        price = prices[i]

        # Generate realistic OHLC within a reasonable range
        volatility = abs(np.random.normal(0, 0.002))  # Random volatility
        high = price * (1 + volatility)
        low = price * (1 - volatility)

        # Ensure OHLC relationships are correct
        open_price = price * (1 + np.random.normal(0, 0.0005))
        close_price = price * (1 + np.random.normal(0, 0.0005))

        # Ensure high is highest and low is lowest
        high = max(high, open_price, close_price)
        low = min(low, open_price, close_price)

        # Generate volume (typically higher during price movements)
        base_volume = 100 + abs(np.random.normal(0, 50))
        volume_multiplier = 1 + abs(open_price - close_price) / close_price * 10
        volume = base_volume * volume_multiplier

        data.append({
            'Timestamp': timestamps[i],
            'Open': round(open_price, 2),
            'High': round(high, 2),
            'Low': round(low, 2),
            'Close': round(close_price, 2),
            'Volume': round(volume, 2)
        })

    df = pd.DataFrame(data)

    # Calculate log returns (required by feature engineering)
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))

    print(f"Generated {len(df)} samples of OHLCV data")
    print(f"Price range: ${df['Close'].min():.2f} - ${df['Close'].max():.2f}")
    return df

def load_real_data_example():
    """
    Example of how to load real OHLCV data.
    Replace this with your actual data loading logic.

    Returns:
        pd.DataFrame or None: Real OHLCV data if available
    """
    # Example paths where real data might be located
    possible_paths = [
        '../data/btcusd_1-min_data.csv',
        '../data/sample_data.csv',
        'data/crypto_data.csv'
    ]

    for path in possible_paths:
        if os.path.exists(path):
            print(f"Loading real data from {path}...")
            try:
                df = pd.read_csv(path)
                # Ensure required columns exist
                required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']
                if all(col in df.columns for col in required_cols):
                    # Filter out zero volume entries and calculate log returns
                    df = df[df['Volume'] != 0].reset_index(drop=True)
                    # Use only recent data and ensure proper data types
                    df = df.tail(500).reset_index(drop=True)  # Get more data for better feature engineering
                    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
                    print(f"Successfully loaded {len(df)} rows of real data")
                    return df.tail(200)  # Use last 200 for final processing
                else:
                    print(f"Missing required columns in {path}")
            except Exception as e:
                print(f"Error loading {path}: {e}")

    return None
def display_prediction_results(df, log_return_preds, predicted_prices=None, actual_prices=None):
    """
    Display prediction results in a readable format.

    Args:
        df: Original OHLCV DataFrame
        log_return_preds: Array of log return predictions
        predicted_prices: Array of predicted prices (optional)
        actual_prices: Array of actual prices (optional)
    """
    print("\n" + "="*60)
    print("PREDICTION RESULTS")
    print("="*60)

    # Convert timestamps back to readable format for display
    df_display = df.copy()
    df_display['Timestamp'] = pd.to_datetime(df_display['Timestamp'], unit='s')

    print(f"\nLog Return Predictions (first 10):")
    print("-" * 40)
    for i in range(min(10, len(log_return_preds))):
        timestamp = df_display.iloc[i]['Timestamp']
        close_price = df_display.iloc[i]['Close']
        log_ret = log_return_preds[i]
        direction = "UP" if log_ret > 0 else "DOWN"
        print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
              f"Close: ${close_price:8.2f} | "
              f"Log Return: {log_ret:8.6f} | "
              f"Direction: {direction}")

    if predicted_prices is not None and actual_prices is not None:
        print(f"\nPrice Predictions vs Actual (first 10):")
        print("-" * 50)
        for i in range(min(10, len(predicted_prices))):
            timestamp = df_display.iloc[i]['Timestamp']
            pred_price = predicted_prices[i]
            actual_price = actual_prices[i]
            error = abs(pred_price - actual_price)
            error_pct = (error / actual_price) * 100
            print(f"{timestamp.strftime('%Y-%m-%d %H:%M')} | "
                  f"Predicted: ${pred_price:8.2f} | "
                  f"Actual: ${actual_price:8.2f} | "
                  f"Error: {error_pct:5.2f}%")

    # Statistics
    print(f"\nPrediction Statistics:")
    print("-" * 30)
    print(f"Total predictions: {len(log_return_preds)}")
    print(f"Mean log return: {np.mean(log_return_preds):.6f}")
    print(f"Std log return: {np.std(log_return_preds):.6f}")
    print(f"Positive predictions: {np.sum(log_return_preds > 0)} ({np.mean(log_return_preds > 0)*100:.1f}%)")
    print(f"Negative predictions: {np.sum(log_return_preds < 0)} ({np.mean(log_return_preds < 0)*100:.1f}%)")

    if predicted_prices is not None and actual_prices is not None:
        mae = np.mean(np.abs(predicted_prices - actual_prices))
        mape = np.mean(np.abs((predicted_prices - actual_prices) / actual_prices)) * 100
        print(f"\nPrice Prediction Accuracy:")
        print(f"Mean Absolute Error: ${mae:.2f}")
        print(f"Mean Absolute Percentage Error: {mape:.2f}%")

def demonstrate_batch_prediction(predictor, df):
    """
    Demonstrate batch prediction on multiple data chunks.

    Args:
        predictor: OHLCVPredictor instance
        df: OHLCV DataFrame
    """
    print("\n" + "="*60)
    print("BATCH PREDICTION DEMONSTRATION")
    print("="*60)

    chunk_size = 50
    num_chunks = min(3, len(df) // chunk_size)

    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = start_idx + chunk_size
        chunk_df = df.iloc[start_idx:end_idx].copy()

        print(f"\nBatch {i+1}: Processing {len(chunk_df)} samples...")

        try:
            log_return_preds = predictor.predict(chunk_df, csv_prefix=f'batch_{i+1}')
            print(f"Successfully predicted {len(log_return_preds)} log returns")
            print(f"Batch {i+1} mean prediction: {np.mean(log_return_preds):.6f}")

        except Exception as e:
            print(f"Error in batch {i+1}: {e}")

def main():
    """
    Main function demonstrating complete OHLCVPredictor usage.
    """
    model_path = '../data/xgboost_model_all_features.json'

    # Check if model exists
    if not os.path.exists(model_path):
        print("Model not found. Run main.py first to train the model.")
        print(f"Expected model path: {model_path}")
        return

    try:
        # Load predictor
        print("Loading predictor...")
        predictor = OHLCVPredictor(model_path)
        print("Predictor loaded successfully!")

        # Try to load real data first, fall back to synthetic data
        df = load_real_data_example()
        if df is None:
            df = create_sample_ohlcv_data(200)

        print(f"\nDataFrame shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        print(f"Data range: {len(df)} samples")

        # Demonstrate log return predictions
        print("\n" + "="*60)
        print("LOG RETURN PREDICTIONS")
        print("="*60)

        log_return_preds = predictor.predict(df, csv_prefix='inference_demo')
        print(f"Generated {len(log_return_preds)} log return predictions")

        # Demonstrate price predictions
        print("\n" + "="*60)
        print("PRICE PREDICTIONS")
        print("="*60)

        predicted_prices, actual_prices = predictor.predict_prices(df, csv_prefix='price_demo')
        print(f"Generated {len(predicted_prices)} price predictions")

        # Display results
        display_prediction_results(df, log_return_preds, predicted_prices, actual_prices)

        # Demonstrate batch processing
        demonstrate_batch_prediction(predictor, df)

        print("\n" + "="*60)
        print("USAGE EXAMPLES FOR OTHER PROJECTS")
        print("="*60)
        print("""
# Basic usage:
from predictor import OHLCVPredictor

# Load your trained model
predictor = OHLCVPredictor('path/to/your/model.json')

# Prepare your OHLCV data (pandas DataFrame with columns):
# ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']

# Get log return predictions
log_returns = predictor.predict(your_dataframe)

# Get price predictions
predicted_prices, actual_prices = predictor.predict_prices(your_dataframe)

# Required files for deployment:
# - predictor.py
# - custom_xgboost.py
# - feature_engineering.py
# - technical_indicator_functions.py
# - your_trained_model.json
        """)

    except FileNotFoundError as e:
        print(f"File not found: {e}")
        print("Make sure the model file exists and the path is correct.")

    except Exception as e:
        print(f"Error during prediction: {e}")
        print("Check your data format and model compatibility.")

if __name__ == '__main__':
    main()
predictor.py (new file, 97 lines)
@@ -0,0 +1,97 @@
import pandas as pd
import numpy as np
import os

try:
    from .custom_xgboost import CustomXGBoostGPU
except ImportError:
    from custom_xgboost import CustomXGBoostGPU

try:
    from .feature_engineering import feature_engineering
except ImportError:
    from feature_engineering import feature_engineering

class OHLCVPredictor:
    def __init__(self, model_path):
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file not found: {model_path}")

        self.model = CustomXGBoostGPU.load_model(model_path)
        self.exclude_cols = self._get_excluded_features()

    def _get_excluded_features(self):
        """Get the list of features to exclude (copied from main.py)"""
        exclude_cols = ['Timestamp', 'Close']
        exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30']
        exclude_cols += ['bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
                         'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband',
                         'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
                         'Open_lag1', 'Open_lag2', 'Open_lag3', 'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2',
                         'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15',
                         'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15',
                         'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30',
                         'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
                         'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30',
                         'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15',
                         'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0']
        return exclude_cols

    def predict(self, df, csv_prefix=None):
        # Validate input DataFrame
        required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        # Make a copy and preprocess
        df = df.copy()
        df = df[df['Volume'] != 0].reset_index(drop=True)

        # Convert timestamps
        if df['Timestamp'].dtype == 'object':
            df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        else:
            df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')

        # Feature engineering
        ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
        features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, 3, [5, 15, 30])
        features_df = pd.DataFrame(features_dict)
        df = pd.concat([df, features_df], axis=1)

        # Downcast and add time features (exclude Timestamp to preserve datetime)
        for col in df.columns:
            if col != 'Timestamp':  # Don't convert Timestamp to numeric
                try:
                    df[col] = pd.to_numeric(df[col], downcast='float')
                except Exception:
                    pass

        df['hour'] = df['Timestamp'].dt.hour

        # Handle NaNs
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if df[col].isna().any():
                df[col] = df[col].fillna(df[col].mean())

        # Defragment DataFrame after all columns have been added
        df = df.copy()

        # Select features and predict
        feature_cols = [col for col in df.columns if col not in self.exclude_cols]
        X = df[feature_cols].values.astype(np.float32)
        return self.model.predict(X)

    def predict_prices(self, df, csv_prefix=None):
        log_return_preds = self.predict(df, csv_prefix)
        df_clean = df[df['Volume'] != 0].copy()
        close_prices = df_clean['Close'].values

        predicted_prices = [close_prices[0]]
        for i, log_ret in enumerate(log_return_preds[1:], 1):
            if i < len(close_prices):
                predicted_prices.append(predicted_prices[-1] * np.exp(log_ret))

        return np.array(predicted_prices), close_prices[:len(predicted_prices)]
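For reference, `predict_prices` above rebuilds a price path by compounding `exp(log_return)` onto the previous predicted price rather than onto each actual close. A toy illustration of that update rule (numbers invented, not model output):

```python
import numpy as np

close_0 = 50_000.0
log_returns = [0.001, -0.0005, 0.002]  # hypothetical predicted log returns

path = [close_0]
for r in log_returns:
    # Same update as predict_prices: compound on the previous *predicted* price
    path.append(path[-1] * np.exp(r))
```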
pyproject.toml
@@ -12,3 +12,11 @@ dependencies = [
     "ta>=0.11.0",
     "xgboost>=3.0.2",
 ]
+
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+include = ["ohlcvpredictor*"]
+exclude = ["charts*"]
|||||||
4
uv.lock
generated
4
uv.lock
generated
@ -1,5 +1,5 @@
|
|||||||
version = 1
|
version = 1
|
||||||
revision = 2
|
revision = 3
|
||||||
requires-python = ">=3.12"
|
requires-python = ">=3.12"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -309,7 +309,7 @@ wheels = [
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "ohlcvpredictor"
|
name = "ohlcvpredictor"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
source = { virtual = "." }
|
source = { editable = "." }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "dash" },
|
{ name = "dash" },
|
||||||
{ name = "numba" },
|
{ name = "numba" },
|
||||||
|
|||||||