Files
lowkey_backtest/strategies/multi_pair/feature_engine.py
Simon Moisy df37366603 feat: Multi-Pair Divergence Selection Strategy
- Extend regime detection to top 10 cryptocurrencies (45 pairs)
- Dynamic pair selection based on divergence score (|z_score| * probability)
- Universal ML model trained on all pairs
- Correlation-based filtering to avoid redundant positions
- Funding rate integration from OKX for all 10 assets
- ATR-based dynamic stop-loss and take-profit
- Walk-forward training with 70/30 split

Performance: +35.69% return (vs +28.66% baseline), 63.6% win rate
2026-01-15 20:47:23 +08:00

434 lines
16 KiB
Python

"""
Feature Engineering for Multi-Pair Divergence Strategy.
Calculates features for all pairs in the universe, including
spread technicals, volatility, and on-chain data.
"""
import pandas as pd
import numpy as np
import ta
from engine.logging_config import get_logger
from engine.data_manager import DataManager
from engine.market import MarketType
from .config import MultiPairConfig
from .pair_scanner import TradingPair
from .funding import FundingRateFetcher
logger = get_logger(__name__)
class MultiPairFeatureEngine:
    """
    Calculates features for multiple trading pairs.

    Generates consistent feature sets across all pairs for
    the universal ML model.
    """

    def __init__(self, config: MultiPairConfig):
        """
        Initialize the engine with a strategy configuration.

        Args:
            config: Multi-pair strategy settings (asset universe,
                exchange id, timeframe, rolling-window lengths).
        """
        # Strategy configuration shared by all feature calculations.
        self.config = config
        # OHLCV loader for the configured exchange/timeframe.
        self.dm = DataManager()
        # Fetches funding rates (per SOURCE comments, from OKX).
        self.funding_fetcher = FundingRateFetcher()
        # Funding-rate cache populated by load_funding_data(); None until
        # that method has been called.
        self._funding_data: pd.DataFrame | None = None
def load_all_assets(
    self,
    start_date: str | None = None,
    end_date: str | None = None
) -> dict[str, pd.DataFrame]:
    """
    Load OHLCV data for all assets in the universe.

    Assets whose files are missing, whose load raises, or which have
    fewer than 200 bars after date filtering are skipped and logged;
    one bad asset never aborts the whole load.

    Args:
        start_date: Start date filter (YYYY-MM-DD)
        end_date: End date filter (YYYY-MM-DD)

    Returns:
        Dictionary mapping symbol to OHLCV DataFrame
    """
    loaded: dict[str, pd.DataFrame] = {}
    for symbol in self.config.assets:
        try:
            frame = self.dm.load_data(
                self.config.exchange_id,
                symbol,
                self.config.timeframe,
                MarketType.PERPETUAL
            )
            # Trim to the requested window; filters are inclusive and
            # compared against tz-aware UTC timestamps.
            if start_date:
                frame = frame[frame.index >= pd.Timestamp(start_date, tz="UTC")]
            if end_date:
                frame = frame[frame.index <= pd.Timestamp(end_date, tz="UTC")]
            # 200 bars is the minimum history for the rolling features.
            if len(frame) < 200:
                logger.warning(
                    "Skipping %s: insufficient data (%d bars)",
                    symbol, len(frame)
                )
            else:
                loaded[symbol] = frame
                logger.debug("Loaded %s: %d bars", symbol, len(frame))
        except FileNotFoundError:
            logger.warning("Data not found for %s", symbol)
        except Exception as e:
            # Best-effort: log and continue with the remaining assets.
            logger.error("Error loading %s: %s", symbol, e)
    logger.info("Loaded %d/%d assets", len(loaded), len(self.config.assets))
    return loaded
def load_funding_data(
    self,
    start_date: str | None = None,
    end_date: str | None = None,
    use_cache: bool = True
) -> pd.DataFrame | None:
    """
    Load funding rate data for all assets.

    The result is also cached on ``self._funding_data`` so that
    subsequent feature calculations can attach funding columns.

    Args:
        start_date: Start date filter
        end_date: End date filter
        use_cache: Whether to use cached data

    Returns:
        DataFrame with funding rates for all assets, or None when the
        fetcher produced no data.
    """
    # Fix: the return annotation previously claimed `pd.DataFrame`, but
    # the fetcher can yield None — the None-check below already handles
    # that path, and we return `self._funding_data` unconditionally.
    self._funding_data = self.funding_fetcher.get_funding_data(
        self.config.assets,
        start_date=start_date,
        end_date=end_date,
        use_cache=use_cache
    )
    if self._funding_data is not None and not self._funding_data.empty:
        logger.info(
            "Loaded funding data: %d rows, %d assets",
            len(self._funding_data),
            len(self._funding_data.columns)
        )
    else:
        logger.warning("No funding data available")
    return self._funding_data
def calculate_pair_features(
    self,
    pair: TradingPair,
    asset_data: dict[str, pd.DataFrame],
    on_chain_data: pd.DataFrame | None = None
) -> pd.DataFrame | None:
    """
    Calculate features for a single pair.

    The spread is defined as base close / quote close; every rolling
    statistic is computed only on bars where both legs have data.

    Args:
        pair: Trading pair
        asset_data: Dictionary of OHLCV DataFrames by symbol
        on_chain_data: Optional on-chain data (funding, inflows)

    Returns:
        DataFrame with features, or None if insufficient data
        (a missing leg, or fewer than 200 aligned bars)
    """
    base = pair.base_asset
    quote = pair.quote_asset
    # Both legs must have been loaded; otherwise the pair is unusable.
    if base not in asset_data or quote not in asset_data:
        return None
    df_base = asset_data[base]
    df_quote = asset_data[quote]
    # Align indices: keep only timestamps present in BOTH legs.
    common_idx = df_base.index.intersection(df_quote.index)
    if len(common_idx) < 200:
        logger.debug("Pair %s: insufficient aligned data", pair.name)
        return None
    df_a = df_base.loc[common_idx]
    df_b = df_quote.loc[common_idx]
    # Calculate spread (base / quote)
    spread = df_a['close'] / df_b['close']
    # Z-Score of the spread over the configured rolling window.
    z_window = self.config.z_window
    rolling_mean = spread.rolling(window=z_window).mean()
    rolling_std = spread.rolling(window=z_window).std()
    z_score = (spread - rolling_mean) / rolling_std
    # Spread Technicals (RSI from the third-party `ta` package).
    spread_rsi = ta.momentum.RSIIndicator(spread, window=14).rsi()
    spread_roc = spread.pct_change(periods=5) * 100
    # NOTE(review): the "_1h" suffix assumes one bar == one hour; this
    # is just the 1-bar change — confirm against config.timeframe.
    spread_change_1h = spread.pct_change(periods=1)
    # Volume Analysis (1e-10 epsilon guards against division by zero).
    vol_ratio = df_a['volume'] / (df_b['volume'] + 1e-10)
    vol_ratio_ma = vol_ratio.rolling(window=12).mean()
    vol_ratio_rel = vol_ratio / (vol_ratio_ma + 1e-10)
    # Volatility of each leg's returns over the z-score window.
    ret_a = df_a['close'].pct_change()
    ret_b = df_b['close'].pct_change()
    vol_a = ret_a.rolling(window=z_window).std()
    vol_b = ret_b.rolling(window=z_window).std()
    vol_spread_ratio = vol_a / (vol_b + 1e-10)
    # Realized Volatility (for dynamic SL/TP)
    realized_vol_a = ret_a.rolling(window=self.config.volatility_window).std()
    realized_vol_b = ret_b.rolling(window=self.config.volatility_window).std()
    # ATR (Average True Range) for dynamic stops
    # ATR = average of max(high-low, |high-prev_close|, |low-prev_close|)
    high_a, low_a, close_a = df_a['high'], df_a['low'], df_a['close']
    high_b, low_b, close_b = df_b['high'], df_b['low'], df_b['close']
    # True Range for base asset (row-wise max across the three spans).
    tr_a = pd.concat([
        high_a - low_a,
        (high_a - close_a.shift(1)).abs(),
        (low_a - close_a.shift(1)).abs()
    ], axis=1).max(axis=1)
    atr_a = tr_a.rolling(window=self.config.atr_period).mean()
    # True Range for quote asset
    tr_b = pd.concat([
        high_b - low_b,
        (high_b - close_b.shift(1)).abs(),
        (low_b - close_b.shift(1)).abs()
    ], axis=1).max(axis=1)
    atr_b = tr_b.rolling(window=self.config.atr_period).mean()
    # ATR as percentage of price (normalized)
    atr_pct_a = atr_a / close_a
    atr_pct_b = atr_b / close_b
    # Build feature DataFrame on the aligned index.
    features = pd.DataFrame(index=common_idx)
    features['pair_id'] = pair.pair_id
    features['base_asset'] = base
    features['quote_asset'] = quote
    # Price data (for reference, not features)
    features['spread'] = spread
    features['base_close'] = df_a['close']
    features['quote_close'] = df_b['close']
    features['base_volume'] = df_a['volume']
    # Core Features
    features['z_score'] = z_score
    features['spread_rsi'] = spread_rsi
    features['spread_roc'] = spread_roc
    features['spread_change_1h'] = spread_change_1h
    features['vol_ratio'] = vol_ratio
    features['vol_ratio_rel'] = vol_ratio_rel
    features['vol_diff_ratio'] = vol_spread_ratio
    # Volatility for SL/TP
    features['realized_vol_base'] = realized_vol_a
    features['realized_vol_quote'] = realized_vol_b
    features['realized_vol_avg'] = (realized_vol_a + realized_vol_b) / 2
    # ATR for dynamic stops (in price units and as percentage)
    features['atr_base'] = atr_a
    features['atr_quote'] = atr_b
    features['atr_pct_base'] = atr_pct_a
    features['atr_pct_quote'] = atr_pct_b
    features['atr_pct_avg'] = (atr_pct_a + atr_pct_b) / 2
    # Pair encoding (for universal model)
    # Using base and quote indices for hierarchical encoding;
    # -1 marks an asset outside the configured universe.
    assets = self.config.assets
    features['base_idx'] = assets.index(base) if base in assets else -1
    features['quote_idx'] = assets.index(quote) if quote in assets else -1
    # Add funding and on-chain features
    # Funding data is always added from self._funding_data (OKX, all 10 assets)
    # On-chain data is optional (CryptoQuant, BTC/ETH only)
    features = self._add_on_chain_features(
        features, on_chain_data, base, quote
    )
    # Drop rows with NaN in core features only (not funding/on-chain):
    # rolling windows leave NaNs at the start of the history.
    core_cols = [
        'z_score', 'spread_rsi', 'spread_roc', 'spread_change_1h',
        'vol_ratio', 'vol_ratio_rel', 'vol_diff_ratio',
        'realized_vol_base', 'realized_vol_quote', 'realized_vol_avg',
        'atr_base', 'atr_pct_base'  # ATR is core for SL/TP
    ]
    features = features.dropna(subset=core_cols)
    # Fill missing funding/on-chain features with 0 (neutral)
    optional_cols = [
        'base_funding', 'quote_funding', 'funding_diff', 'funding_avg',
        'base_inflow', 'quote_inflow', 'inflow_ratio'
    ]
    for col in optional_cols:
        if col in features.columns:
            features[col] = features[col].fillna(0)
    return features
def calculate_all_pair_features(
    self,
    pairs: list[TradingPair],
    asset_data: dict[str, pd.DataFrame],
    on_chain_data: pd.DataFrame | None = None
) -> dict[str, pd.DataFrame]:
    """
    Calculate features for all pairs.

    Pairs for which calculate_pair_features returns None or an empty
    frame are omitted from the result.

    Args:
        pairs: List of trading pairs
        asset_data: Dictionary of OHLCV DataFrames
        on_chain_data: Optional on-chain data

    Returns:
        Dictionary mapping pair_id to feature DataFrame
    """
    results = {
        pair.pair_id: feats
        for pair in pairs
        if (feats := self.calculate_pair_features(
            pair, asset_data, on_chain_data
        )) is not None and len(feats) > 0
    }
    logger.info(
        "Calculated features for %d/%d pairs",
        len(results), len(pairs)
    )
    return results
def get_combined_features(
self,
pair_features: dict[str, pd.DataFrame],
timestamp: pd.Timestamp | None = None
) -> pd.DataFrame:
"""
Combine all pair features into a single DataFrame.
Useful for batch model prediction across all pairs.
Args:
pair_features: Dictionary of feature DataFrames by pair_id
timestamp: Optional specific timestamp to filter to
Returns:
Combined DataFrame with all pairs as rows
"""
if not pair_features:
return pd.DataFrame()
if timestamp is not None:
# Get latest row from each pair at or before timestamp
rows = []
for pair_id, features in pair_features.items():
valid = features[features.index <= timestamp]
if len(valid) > 0:
row = valid.iloc[-1:].copy()
rows.append(row)
if rows:
return pd.concat(rows, ignore_index=False)
return pd.DataFrame()
# Combine all features (for training)
return pd.concat(pair_features.values(), ignore_index=False)
def _add_on_chain_features(
self,
features: pd.DataFrame,
on_chain_data: pd.DataFrame | None,
base_asset: str,
quote_asset: str
) -> pd.DataFrame:
"""
Add on-chain and funding rate features for the pair.
Uses funding data from OKX (all 10 assets) and on-chain data
from CryptoQuant (BTC/ETH only for inflows).
"""
base_short = base_asset.replace('-USDT', '').lower()
quote_short = quote_asset.replace('-USDT', '').lower()
# Add funding rates from cached funding data
if self._funding_data is not None and not self._funding_data.empty:
funding_aligned = self._funding_data.reindex(
features.index, method='ffill'
)
base_funding_col = f'{base_short}_funding'
quote_funding_col = f'{quote_short}_funding'
if base_funding_col in funding_aligned.columns:
features['base_funding'] = funding_aligned[base_funding_col]
if quote_funding_col in funding_aligned.columns:
features['quote_funding'] = funding_aligned[quote_funding_col]
# Funding difference (positive = base has higher funding)
if 'base_funding' in features.columns and 'quote_funding' in features.columns:
features['funding_diff'] = (
features['base_funding'] - features['quote_funding']
)
# Funding sentiment: average of both assets
features['funding_avg'] = (
features['base_funding'] + features['quote_funding']
) / 2
# Add on-chain features from CryptoQuant (BTC/ETH only)
if on_chain_data is not None and not on_chain_data.empty:
cq_aligned = on_chain_data.reindex(features.index, method='ffill')
# Inflows (only available for BTC/ETH)
base_inflow_col = f'{base_short}_inflow'
quote_inflow_col = f'{quote_short}_inflow'
if base_inflow_col in cq_aligned.columns:
features['base_inflow'] = cq_aligned[base_inflow_col]
if quote_inflow_col in cq_aligned.columns:
features['quote_inflow'] = cq_aligned[quote_inflow_col]
if 'base_inflow' in features.columns and 'quote_inflow' in features.columns:
features['inflow_ratio'] = (
features['base_inflow'] /
(features['quote_inflow'] + 1)
)
return features
def get_feature_columns(self) -> list[str]:
    """
    Get list of feature columns for ML model.

    Excludes metadata and target-related columns. Order matters: the
    model is trained and queried with columns in exactly this order.

    Returns:
        List of feature column names
    """
    return [
        # Core spread/volatility features (always present)
        'z_score', 'spread_rsi', 'spread_roc', 'spread_change_1h',
        'vol_ratio', 'vol_ratio_rel', 'vol_diff_ratio',
        'realized_vol_base', 'realized_vol_quote', 'realized_vol_avg',
        # Pair-identity encoding for the universal model
        'base_idx', 'quote_idx',
        # Funding features (now available for all 10 assets via OKX)
        'base_funding', 'quote_funding', 'funding_diff', 'funding_avg',
        # On-chain features (BTC/ETH only via CryptoQuant)
        'base_inflow', 'quote_inflow', 'inflow_ratio',
    ]