OHLCVPredictor/predictor.py

import pandas as pd
import numpy as np
import os
import json

# Support both package-relative imports and running the file as a plain script.
try:
    from .custom_xgboost import CustomXGBoostGPU
except ImportError:
    from custom_xgboost import CustomXGBoostGPU

try:
    from .feature_engineering import feature_engineering
except ImportError:
    from feature_engineering import feature_engineering


class OHLCVPredictor:
    """Loads a trained CustomXGBoostGPU model and runs predictions on raw OHLCV data."""

    def __init__(self, model_path):
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file not found: {model_path}")
        self.model = CustomXGBoostGPU.load_model(model_path)
        self.exclude_cols = self._get_excluded_features()
        self._feature_names = self._load_trained_feature_names(model_path)

    def _load_trained_feature_names(self, model_path: str):
        """Load the exact feature list saved during training, if present."""
        try:
            features_path = os.path.splitext(model_path)[0] + "_features.json"
            if os.path.exists(features_path):
                with open(features_path, "r") as f:
                    data = json.load(f)
                names = data.get("feature_names")
                if isinstance(names, list) and all(isinstance(x, str) for x in names):
                    return names
        except Exception:
            pass
        return None
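
    # Illustrative note (an assumption, not taken from the training code): the
    # sidecar read above is expected to sit next to the model file, e.g.
    # "my_model_features.json", and only its "feature_names" key is consumed:
    #
    #     {"feature_names": ["Open", "High", "Low", "Volume", "hour"]}
    #
    # The feature names shown here are hypothetical placeholders.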

    def _get_excluded_features(self):
        """Get the list of features to exclude (copied from main.py)"""
        exclude_cols = ['Timestamp', 'Close']
        exclude_cols += ['log_return_5', 'volatility_5', 'volatility_15', 'volatility_30']
        exclude_cols += [
            'bb_bbm', 'bb_bbh', 'bb_bbl', 'stoch_k', 'sma_50', 'sma_200', 'psar',
            'donchian_hband', 'donchian_lband', 'donchian_mband', 'keltner_hband', 'keltner_lband',
            'keltner_mband', 'ichimoku_a', 'ichimoku_b', 'ichimoku_base_line', 'ichimoku_conversion_line',
            'Open_lag1', 'Open_lag2', 'Open_lag3', 'High_lag1', 'High_lag2', 'High_lag3', 'Low_lag1', 'Low_lag2',
            'Low_lag3', 'Close_lag1', 'Close_lag2', 'Close_lag3', 'Open_roll_mean_15', 'Open_roll_std_15', 'Open_roll_min_15',
            'Open_roll_max_15', 'Open_roll_mean_30', 'Open_roll_min_30', 'Open_roll_max_30', 'High_roll_mean_15', 'High_roll_std_15',
            'High_roll_min_15', 'High_roll_max_15', 'Low_roll_mean_5', 'Low_roll_min_5', 'Low_roll_max_5', 'Low_roll_mean_30',
            'Low_roll_std_30', 'Low_roll_min_30', 'Low_roll_max_30', 'Close_roll_mean_5', 'Close_roll_min_5', 'Close_roll_max_5',
            'Close_roll_mean_15', 'Close_roll_std_15', 'Close_roll_min_15', 'Close_roll_max_15', 'Close_roll_mean_30',
            'Close_roll_std_30', 'Close_roll_min_30', 'Close_roll_max_30', 'Volume_roll_max_5', 'Volume_roll_max_15',
            'Volume_roll_max_30', 'supertrend_12_3.0', 'supertrend_10_1.0', 'supertrend_11_2.0',
        ]
        return exclude_cols

    def predict(self, df, csv_prefix=None):
        """Engineer features from raw OHLCV rows and return the model's predictions
        (one per nonzero-volume row; predict_prices interprets them as log returns)."""
        # Validate input DataFrame
        required_cols = ['Open', 'High', 'Low', 'Close', 'Volume', 'Timestamp']
        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        # Make a copy and preprocess
        df = df.copy()
        df = df[df['Volume'] != 0].reset_index(drop=True)

        # Convert timestamps
        if df['Timestamp'].dtype == 'object':
            df['Timestamp'] = pd.to_datetime(df['Timestamp'])
        else:
            df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')

        # Feature engineering
        ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
        features_dict = feature_engineering(df, csv_prefix, ohlcv_cols, 3, [5, 15, 30])
        features_df = pd.DataFrame(features_dict)
        df = pd.concat([df, features_df], axis=1)

        # Downcast and add time features (exclude Timestamp to preserve datetime)
        for col in df.columns:
            if col != 'Timestamp':  # Don't convert Timestamp to numeric
                try:
                    df[col] = pd.to_numeric(df[col], downcast='float')
                except Exception:
                    pass
        df['hour'] = df['Timestamp'].dt.hour

        # Handle NaNs
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            if df[col].isna().any():
                df[col] = df[col].fillna(df[col].mean())

        # Defragment DataFrame after all columns have been added
        df = df.copy()

        # Select features and predict
        if self._feature_names is not None:
            # Use the exact training feature names and order
            missing = [c for c in self._feature_names if c not in df.columns]
            if missing:
                raise ValueError(
                    f"Input is missing required trained features: "
                    f"{missing[:10]}{'...' if len(missing) > 10 else ''}"
                )
            feature_cols = self._feature_names
        else:
            feature_cols = [col for col in df.columns if col not in self.exclude_cols]
        X = df[feature_cols].values.astype(np.float32)
        return self.model.predict(X)

    def predict_prices(self, df, csv_prefix=None):
        """Compound predicted log returns from the first observed close into a price
        path; returns (predicted_prices, actual_close_prices) of equal length."""
        log_return_preds = self.predict(df, csv_prefix)
        df_clean = df[df['Volume'] != 0].copy()
        close_prices = df_clean['Close'].values

        # Seed with the first actual close, then roll each predicted log return
        # forward onto the previous predicted price.
        predicted_prices = [close_prices[0]]
        for i, log_ret in enumerate(log_return_preds[1:], 1):
            if i < len(close_prices):
                predicted_prices.append(predicted_prices[-1] * np.exp(log_ret))
        return np.array(predicted_prices), close_prices[:len(predicted_prices)]
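

if __name__ == "__main__":
    # Minimal usage sketch added for illustration only: the file paths and the
    # csv_prefix value below are hypothetical, not part of the original project.
    predictor = OHLCVPredictor("models/ohlcv_model.json")  # assumed path to a trained model
    raw = pd.read_csv("data/ohlcv.csv")  # needs Open/High/Low/Close/Volume/Timestamp columns
    predicted, actual = predictor.predict_prices(raw, csv_prefix="ohlcv")
    print("first predicted closes:", predicted[:5])
    print("first actual closes:", actual[:5])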