"""
Feature Engineering for Multi-Pair Divergence Strategy.

Calculates features for all pairs in the universe, including
spread technicals, volatility, and on-chain data.
"""

import pandas as pd
import numpy as np
import ta

from engine.logging_config import get_logger
from engine.data_manager import DataManager
from engine.market import MarketType
from .config import MultiPairConfig
from .pair_scanner import TradingPair
from .funding import FundingRateFetcher

logger = get_logger(__name__)

# Minimum number of bars an asset (or an aligned pair) must have to be used.
_MIN_BARS = 200

# Small constant added to denominators to avoid division by zero.
_EPS = 1e-10

# Fixed look-back lengths (in bars) for the spread/volume technicals.
_RSI_WINDOW = 14
_ROC_PERIODS = 5
_VOLUME_MA_WINDOW = 12

# Optional columns: only present when the matching external data is available.
_FUNDING_FEATURES = (
    'base_funding', 'quote_funding', 'funding_diff', 'funding_avg',
)
_ONCHAIN_FEATURES = (
    'base_inflow', 'quote_inflow', 'inflow_ratio',
)


class MultiPairFeatureEngine:
    """
    Calculates features for multiple trading pairs.

    Generates consistent feature sets across all pairs
    for the universal ML model.
    """

    def __init__(self, config: MultiPairConfig):
        """
        Create the engine and its data-access helpers.

        Args:
            config: Strategy configuration. This class reads ``assets``,
                ``exchange_id``, ``timeframe``, ``z_window``,
                ``volatility_window`` and ``atr_period`` from it.
        """
        self.config = config
        self.dm = DataManager()
        self.funding_fetcher = FundingRateFetcher()
        # Populated by load_funding_data(); consumed by _add_on_chain_features().
        self._funding_data: pd.DataFrame | None = None

    def load_all_assets(
        self,
        start_date: str | None = None,
        end_date: str | None = None
    ) -> dict[str, pd.DataFrame]:
        """
        Load OHLCV data for all assets in the universe.

        Assets whose data is missing, fails to load, or has fewer than
        ``_MIN_BARS`` rows after date filtering are skipped (and logged)
        rather than aborting the whole load.

        Args:
            start_date: Start date filter (YYYY-MM-DD)
            end_date: End date filter (YYYY-MM-DD)

        Returns:
            Dictionary mapping symbol to OHLCV DataFrame
        """
        data: dict[str, pd.DataFrame] = {}
        market_type = MarketType.PERPETUAL

        for symbol in self.config.assets:
            try:
                df = self.dm.load_data(
                    self.config.exchange_id,
                    symbol,
                    self.config.timeframe,
                    market_type
                )

                # Apply date filters (bounds are interpreted as UTC).
                if start_date:
                    df = df[df.index >= pd.Timestamp(start_date, tz="UTC")]
                if end_date:
                    # NOTE(review): a date-only end_date resolves to 00:00 UTC,
                    # so intraday bars later on that day are excluded -
                    # confirm this is intended.
                    df = df[df.index <= pd.Timestamp(end_date, tz="UTC")]

                if len(df) >= _MIN_BARS:  # Minimum data requirement
                    data[symbol] = df
                    logger.debug("Loaded %s: %d bars", symbol, len(df))
                else:
                    logger.warning(
                        "Skipping %s: insufficient data (%d bars)",
                        symbol, len(df)
                    )
            except FileNotFoundError:
                logger.warning("Data not found for %s", symbol)
            except Exception as e:
                # Best-effort load: keep going, but keep the traceback so the
                # failure can actually be diagnosed.
                logger.exception("Error loading %s: %s", symbol, e)

        logger.info("Loaded %d/%d assets", len(data), len(self.config.assets))
        return data

    def load_funding_data(
        self,
        start_date: str | None = None,
        end_date: str | None = None,
        use_cache: bool = True
    ) -> pd.DataFrame | None:
        """
        Load funding rate data for all assets.

        The result is also stored on the instance so that later feature
        calculations can attach funding columns.

        Args:
            start_date: Start date filter
            end_date: End date filter
            use_cache: Whether to use cached data

        Returns:
            Whatever the funding fetcher returned: a DataFrame with funding
            rates, possibly empty. This method also tolerates ``None``.
        """
        self._funding_data = self.funding_fetcher.get_funding_data(
            self.config.assets,
            start_date=start_date,
            end_date=end_date,
            use_cache=use_cache
        )

        if self._funding_data is not None and not self._funding_data.empty:
            logger.info(
                "Loaded funding data: %d rows, %d assets",
                len(self._funding_data),
                len(self._funding_data.columns)
            )
        else:
            logger.warning("No funding data available")

        return self._funding_data

    @staticmethod
    def _average_true_range(ohlc: pd.DataFrame, period: int) -> pd.Series:
        """Return the rolling-mean ATR of *ohlc* over *period* bars.

        True Range = max(high-low, |high-prev_close|, |low-prev_close|).
        """
        high, low = ohlc['high'], ohlc['low']
        prev_close = ohlc['close'].shift(1)
        true_range = pd.concat(
            [
                high - low,
                (high - prev_close).abs(),
                (low - prev_close).abs(),
            ],
            axis=1
        ).max(axis=1)
        return true_range.rolling(window=period).mean()

    def calculate_pair_features(
        self,
        pair: TradingPair,
        asset_data: dict[str, pd.DataFrame],
        on_chain_data: pd.DataFrame | None = None
    ) -> pd.DataFrame | None:
        """
        Calculate features for a single pair.

        Both assets are aligned on their shared timestamps, the spread is
        defined as ``base close / quote close``, and rows where any core
        feature is NaN (rolling warm-up, degenerate windows) are dropped.

        Args:
            pair: Trading pair
            asset_data: Dictionary of OHLCV DataFrames by symbol
            on_chain_data: Optional on-chain data (funding, inflows)

        Returns:
            DataFrame with features, or None if either asset is missing or
            the aligned history is shorter than ``_MIN_BARS``
        """
        base = pair.base_asset
        quote = pair.quote_asset

        if base not in asset_data or quote not in asset_data:
            return None

        df_base = asset_data[base]
        df_quote = asset_data[quote]

        # Align indices
        common_idx = df_base.index.intersection(df_quote.index)
        if len(common_idx) < _MIN_BARS:
            logger.debug("Pair %s: insufficient aligned data", pair.name)
            return None

        df_a = df_base.loc[common_idx]
        df_b = df_quote.loc[common_idx]

        # Calculate spread (base / quote)
        spread = df_a['close'] / df_b['close']

        # Z-Score
        z_window = self.config.z_window
        rolling_mean = spread.rolling(window=z_window).mean()
        rolling_std = spread.rolling(window=z_window).std()
        # A zero-variance window would give +/-inf, which dropna() does not
        # remove. Mask it to NaN so the row is discarded with the other
        # incomplete core rows below.
        safe_std = rolling_std.where(rolling_std > 0, np.nan)
        z_score = (spread - rolling_mean) / safe_std

        # Spread Technicals
        spread_rsi = ta.momentum.RSIIndicator(spread, window=_RSI_WINDOW).rsi()
        spread_roc = spread.pct_change(periods=_ROC_PERIODS) * 100
        # One-bar change; this is "1h" only when the configured timeframe is
        # hourly.
        spread_change_1h = spread.pct_change(periods=1)

        # Volume Analysis
        vol_ratio = df_a['volume'] / (df_b['volume'] + _EPS)
        vol_ratio_ma = vol_ratio.rolling(window=_VOLUME_MA_WINDOW).mean()
        vol_ratio_rel = vol_ratio / (vol_ratio_ma + _EPS)

        # Volatility
        ret_a = df_a['close'].pct_change()
        ret_b = df_b['close'].pct_change()
        vol_a = ret_a.rolling(window=z_window).std()
        vol_b = ret_b.rolling(window=z_window).std()
        vol_spread_ratio = vol_a / (vol_b + _EPS)

        # Realized Volatility (for dynamic SL/TP)
        vol_window = self.config.volatility_window
        realized_vol_a = ret_a.rolling(window=vol_window).std()
        realized_vol_b = ret_b.rolling(window=vol_window).std()

        # ATR (Average True Range) for dynamic stops
        atr_period = self.config.atr_period
        atr_a = self._average_true_range(df_a, atr_period)
        atr_b = self._average_true_range(df_b, atr_period)

        # ATR as percentage of price (normalized)
        atr_pct_a = atr_a / df_a['close']
        atr_pct_b = atr_b / df_b['close']

        # Build feature DataFrame
        features = pd.DataFrame(index=common_idx)
        features['pair_id'] = pair.pair_id
        features['base_asset'] = base
        features['quote_asset'] = quote

        # Price data (for reference, not features)
        features['spread'] = spread
        features['base_close'] = df_a['close']
        features['quote_close'] = df_b['close']
        features['base_volume'] = df_a['volume']

        # Core Features
        features['z_score'] = z_score
        features['spread_rsi'] = spread_rsi
        features['spread_roc'] = spread_roc
        features['spread_change_1h'] = spread_change_1h
        features['vol_ratio'] = vol_ratio
        features['vol_ratio_rel'] = vol_ratio_rel
        features['vol_diff_ratio'] = vol_spread_ratio

        # Volatility for SL/TP
        features['realized_vol_base'] = realized_vol_a
        features['realized_vol_quote'] = realized_vol_b
        features['realized_vol_avg'] = (realized_vol_a + realized_vol_b) / 2

        # ATR for dynamic stops (in price units and as percentage)
        features['atr_base'] = atr_a
        features['atr_quote'] = atr_b
        features['atr_pct_base'] = atr_pct_a
        features['atr_pct_quote'] = atr_pct_b
        features['atr_pct_avg'] = (atr_pct_a + atr_pct_b) / 2

        # Pair encoding (for universal model)
        # Position of each asset in the configured universe; -1 if absent.
        assets = self.config.assets
        features['base_idx'] = assets.index(base) if base in assets else -1
        features['quote_idx'] = assets.index(quote) if quote in assets else -1

        # Add funding and on-chain features.
        # Funding comes from self._funding_data (set by load_funding_data);
        # on-chain data is optional and passed in by the caller.
        features = self._add_on_chain_features(
            features, on_chain_data, base, quote
        )

        # Drop rows with NaN in core features only (not funding/on-chain)
        core_cols = [
            'z_score', 'spread_rsi', 'spread_roc', 'spread_change_1h',
            'vol_ratio', 'vol_ratio_rel', 'vol_diff_ratio',
            'realized_vol_base', 'realized_vol_quote', 'realized_vol_avg',
            'atr_base', 'atr_pct_base'  # ATR is core for SL/TP
        ]
        features = features.dropna(subset=core_cols)

        # Fill gaps in funding/on-chain features with 0 (neutral).
        # NOTE(review): columns that were never added (no data for this
        # asset) stay absent, so pairs can end up with different column
        # sets - confirm downstream code handles that.
        for col in (*_FUNDING_FEATURES, *_ONCHAIN_FEATURES):
            if col in features.columns:
                features[col] = features[col].fillna(0)

        return features

    def calculate_all_pair_features(
        self,
        pairs: list[TradingPair],
        asset_data: dict[str, pd.DataFrame],
        on_chain_data: pd.DataFrame | None = None
    ) -> dict[str, pd.DataFrame]:
        """
        Calculate features for all pairs.

        Pairs that yield no features (missing asset, too little aligned
        data, or every row dropped) are omitted from the result.

        Args:
            pairs: List of trading pairs
            asset_data: Dictionary of OHLCV DataFrames
            on_chain_data: Optional on-chain data

        Returns:
            Dictionary mapping pair_id to feature DataFrame
        """
        all_features: dict[str, pd.DataFrame] = {}

        for pair in pairs:
            features = self.calculate_pair_features(
                pair, asset_data, on_chain_data
            )
            if features is not None and len(features) > 0:
                all_features[pair.pair_id] = features

        logger.info(
            "Calculated features for %d/%d pairs",
            len(all_features), len(pairs)
        )
        return all_features

    def get_combined_features(
        self,
        pair_features: dict[str, pd.DataFrame],
        timestamp: pd.Timestamp | None = None
    ) -> pd.DataFrame:
        """
        Combine all pair features into a single DataFrame.

        Useful for batch model prediction across all pairs.

        Args:
            pair_features: Dictionary of feature DataFrames by pair_id
            timestamp: Optional specific timestamp to filter to. When given,
                only the latest row at or before it is taken from each pair.

        Returns:
            Combined DataFrame with all pairs as rows (original indices are
            kept); empty DataFrame when there is nothing to combine
        """
        if not pair_features:
            return pd.DataFrame()

        if timestamp is None:
            # Combine all features (for training)
            return pd.concat(pair_features.values(), ignore_index=False)

        # Get latest row from each pair at or before timestamp
        rows = []
        for features in pair_features.values():
            valid = features[features.index <= timestamp]
            if len(valid) > 0:
                rows.append(valid.iloc[-1:].copy())

        if rows:
            return pd.concat(rows, ignore_index=False)
        return pd.DataFrame()

    def _add_on_chain_features(
        self,
        features: pd.DataFrame,
        on_chain_data: pd.DataFrame | None,
        base_asset: str,
        quote_asset: str
    ) -> pd.DataFrame:
        """
        Add on-chain and funding rate features for the pair.

        Source columns are looked up as ``<asset>_funding`` and
        ``<asset>_inflow``, where ``<asset>`` is the symbol with ``-USDT``
        removed and lower-cased. Both sources are forward-filled onto the
        feature index. A column is only added when its source column exists.

        Args:
            features: Feature frame to extend (modified in place)
            on_chain_data: Optional on-chain data with inflow columns
            base_asset: Base symbol of the pair
            quote_asset: Quote symbol of the pair

        Returns:
            The same ``features`` frame with any available columns added
        """
        base_short = base_asset.replace('-USDT', '').lower()
        quote_short = quote_asset.replace('-USDT', '').lower()

        # Add funding rates from cached funding data
        if self._funding_data is not None and not self._funding_data.empty:
            funding_aligned = self._funding_data.reindex(
                features.index, method='ffill'
            )

            base_funding_col = f'{base_short}_funding'
            quote_funding_col = f'{quote_short}_funding'

            if base_funding_col in funding_aligned.columns:
                features['base_funding'] = funding_aligned[base_funding_col]
            if quote_funding_col in funding_aligned.columns:
                features['quote_funding'] = funding_aligned[quote_funding_col]

            # Derived funding features need both legs.
            if 'base_funding' in features.columns and 'quote_funding' in features.columns:
                # Funding difference (positive = base has higher funding)
                features['funding_diff'] = (
                    features['base_funding'] - features['quote_funding']
                )
                # Funding sentiment: average of both assets
                features['funding_avg'] = (
                    features['base_funding'] + features['quote_funding']
                ) / 2

        # Add on-chain inflow features when on-chain data was supplied
        if on_chain_data is not None and not on_chain_data.empty:
            cq_aligned = on_chain_data.reindex(features.index, method='ffill')

            base_inflow_col = f'{base_short}_inflow'
            quote_inflow_col = f'{quote_short}_inflow'

            if base_inflow_col in cq_aligned.columns:
                features['base_inflow'] = cq_aligned[base_inflow_col]
            if quote_inflow_col in cq_aligned.columns:
                features['quote_inflow'] = cq_aligned[quote_inflow_col]

            if 'base_inflow' in features.columns and 'quote_inflow' in features.columns:
                # +1 keeps the ratio finite when the quote inflow is zero.
                features['inflow_ratio'] = (
                    features['base_inflow'] / (features['quote_inflow'] + 1)
                )

        return features

    def get_feature_columns(self) -> list[str]:
        """
        Get list of feature columns for ML model.

        Excludes metadata and target-related columns. The reference price
        columns and the ATR columns are not included either.

        Returns:
            List of feature column names (a new list on every call)
        """
        # Core features (always present)
        core_features = [
            'z_score', 'spread_rsi', 'spread_roc', 'spread_change_1h',
            'vol_ratio', 'vol_ratio_rel', 'vol_diff_ratio',
            'realized_vol_base', 'realized_vol_quote', 'realized_vol_avg',
            'base_idx', 'quote_idx'
        ]

        # Funding and on-chain names are shared with calculate_pair_features
        # so the two lists cannot drift apart.
        return core_features + list(_FUNDING_FEATURES) + list(_ONCHAIN_FEATURES)