feat: Multi-Pair Divergence Selection Strategy

- Extend regime detection to top 10 cryptocurrencies (45 pairs)
- Dynamic pair selection based on divergence score (|z_score| * probability)
- Universal ML model trained on all pairs
- Correlation-based filtering to avoid redundant positions
- Funding rate integration from OKX for all 10 assets
- ATR-based dynamic stop-loss and take-profit
- Walk-forward training with 70/30 split

Performance: +35.69% return (vs +28.66% baseline), 63.6% win rate
This commit is contained in:
433
strategies/multi_pair/feature_engine.py
Normal file
433
strategies/multi_pair/feature_engine.py
Normal file
@@ -0,0 +1,433 @@
|
||||
"""
|
||||
Feature Engineering for Multi-Pair Divergence Strategy.
|
||||
|
||||
Calculates features for all pairs in the universe, including
|
||||
spread technicals, volatility, and on-chain data.
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import ta
|
||||
|
||||
from engine.logging_config import get_logger
|
||||
from engine.data_manager import DataManager
|
||||
from engine.market import MarketType
|
||||
from .config import MultiPairConfig
|
||||
from .pair_scanner import TradingPair
|
||||
from .funding import FundingRateFetcher
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class MultiPairFeatureEngine:
    """
    Calculates features for multiple trading pairs.

    Generates consistent feature sets across all pairs for
    the universal ML model.
    """

    def __init__(self, config: MultiPairConfig):
        self.config = config
        self.dm = DataManager()
        self.funding_fetcher = FundingRateFetcher()
        # Funding-rate frame cached by load_funding_data(); None until loaded.
        self._funding_data: pd.DataFrame | None = None

    def load_all_assets(
        self,
        start_date: str | None = None,
        end_date: str | None = None
    ) -> dict[str, pd.DataFrame]:
        """
        Load OHLCV data for all assets in the universe.

        Assets that are missing on disk, fail to load, or have fewer
        than 200 bars after date filtering are skipped (logged, not
        raised), so a partial universe is still usable.

        Args:
            start_date: Start date filter (YYYY-MM-DD)
            end_date: End date filter (YYYY-MM-DD)

        Returns:
            Dictionary mapping symbol to OHLCV DataFrame
        """
        data = {}
        market_type = MarketType.PERPETUAL

        for symbol in self.config.assets:
            try:
                df = self.dm.load_data(
                    self.config.exchange_id,
                    symbol,
                    self.config.timeframe,
                    market_type
                )

                # Apply date filters (index assumed tz-aware UTC).
                if start_date:
                    df = df[df.index >= pd.Timestamp(start_date, tz="UTC")]
                if end_date:
                    df = df[df.index <= pd.Timestamp(end_date, tz="UTC")]

                if len(df) >= 200:  # Minimum data requirement
                    data[symbol] = df
                    logger.debug("Loaded %s: %d bars", symbol, len(df))
                else:
                    logger.warning(
                        "Skipping %s: insufficient data (%d bars)",
                        symbol, len(df)
                    )
            except FileNotFoundError:
                logger.warning("Data not found for %s", symbol)
            except Exception as e:
                logger.error("Error loading %s: %s", symbol, e)

        logger.info("Loaded %d/%d assets", len(data), len(self.config.assets))
        return data

    def load_funding_data(
        self,
        start_date: str | None = None,
        end_date: str | None = None,
        use_cache: bool = True
    ) -> pd.DataFrame | None:
        """
        Load funding rate data for all assets.

        The result is cached on the instance (``self._funding_data``)
        so that _add_on_chain_features() can reuse it per pair.

        Args:
            start_date: Start date filter
            end_date: End date filter
            use_cache: Whether to use cached data

        Returns:
            DataFrame with funding rates for all assets, or None if the
            fetcher returned nothing.
        """
        self._funding_data = self.funding_fetcher.get_funding_data(
            self.config.assets,
            start_date=start_date,
            end_date=end_date,
            use_cache=use_cache
        )

        if self._funding_data is not None and not self._funding_data.empty:
            logger.info(
                "Loaded funding data: %d rows, %d assets",
                len(self._funding_data),
                len(self._funding_data.columns)
            )
        else:
            logger.warning("No funding data available")

        return self._funding_data

    def calculate_pair_features(
        self,
        pair: TradingPair,
        asset_data: dict[str, pd.DataFrame],
        on_chain_data: pd.DataFrame | None = None
    ) -> pd.DataFrame | None:
        """
        Calculate features for a single pair.

        Args:
            pair: Trading pair
            asset_data: Dictionary of OHLCV DataFrames by symbol
            on_chain_data: Optional on-chain data (funding, inflows)

        Returns:
            DataFrame with features, or None if insufficient data
        """
        base = pair.base_asset
        quote = pair.quote_asset

        if base not in asset_data or quote not in asset_data:
            return None

        df_base = asset_data[base]
        df_quote = asset_data[quote]

        # Align indices: features are only defined where both legs have bars.
        common_idx = df_base.index.intersection(df_quote.index)
        if len(common_idx) < 200:
            logger.debug("Pair %s: insufficient aligned data", pair.name)
            return None

        df_a = df_base.loc[common_idx]
        df_b = df_quote.loc[common_idx]

        # Calculate spread (base / quote)
        spread = df_a['close'] / df_b['close']

        # Z-Score of the spread over a rolling window.
        z_window = self.config.z_window
        rolling_mean = spread.rolling(window=z_window).mean()
        rolling_std = spread.rolling(window=z_window).std()
        z_score = (spread - rolling_mean) / rolling_std

        # Spread Technicals
        spread_rsi = ta.momentum.RSIIndicator(spread, window=14).rsi()
        spread_roc = spread.pct_change(periods=5) * 100
        spread_change_1h = spread.pct_change(periods=1)

        # Volume Analysis (epsilon guards against zero-volume bars)
        vol_ratio = df_a['volume'] / (df_b['volume'] + 1e-10)
        vol_ratio_ma = vol_ratio.rolling(window=12).mean()
        vol_ratio_rel = vol_ratio / (vol_ratio_ma + 1e-10)

        # Volatility of each leg's returns over the z-score window.
        ret_a = df_a['close'].pct_change()
        ret_b = df_b['close'].pct_change()
        vol_a = ret_a.rolling(window=z_window).std()
        vol_b = ret_b.rolling(window=z_window).std()
        vol_spread_ratio = vol_a / (vol_b + 1e-10)

        # Realized Volatility (for dynamic SL/TP)
        realized_vol_a = ret_a.rolling(window=self.config.volatility_window).std()
        realized_vol_b = ret_b.rolling(window=self.config.volatility_window).std()

        # ATR (Average True Range) for dynamic stops
        # ATR = average of max(high-low, |high-prev_close|, |low-prev_close|)
        high_a, low_a, close_a = df_a['high'], df_a['low'], df_a['close']
        high_b, low_b, close_b = df_b['high'], df_b['low'], df_b['close']

        # True Range for base asset
        tr_a = pd.concat([
            high_a - low_a,
            (high_a - close_a.shift(1)).abs(),
            (low_a - close_a.shift(1)).abs()
        ], axis=1).max(axis=1)
        atr_a = tr_a.rolling(window=self.config.atr_period).mean()

        # True Range for quote asset
        tr_b = pd.concat([
            high_b - low_b,
            (high_b - close_b.shift(1)).abs(),
            (low_b - close_b.shift(1)).abs()
        ], axis=1).max(axis=1)
        atr_b = tr_b.rolling(window=self.config.atr_period).mean()

        # ATR as percentage of price (normalized)
        atr_pct_a = atr_a / close_a
        atr_pct_b = atr_b / close_b

        # Build feature DataFrame
        features = pd.DataFrame(index=common_idx)
        features['pair_id'] = pair.pair_id
        features['base_asset'] = base
        features['quote_asset'] = quote

        # Price data (for reference, not features)
        features['spread'] = spread
        features['base_close'] = df_a['close']
        features['quote_close'] = df_b['close']
        features['base_volume'] = df_a['volume']

        # Core Features
        features['z_score'] = z_score
        features['spread_rsi'] = spread_rsi
        features['spread_roc'] = spread_roc
        features['spread_change_1h'] = spread_change_1h
        features['vol_ratio'] = vol_ratio
        features['vol_ratio_rel'] = vol_ratio_rel
        features['vol_diff_ratio'] = vol_spread_ratio

        # Volatility for SL/TP
        features['realized_vol_base'] = realized_vol_a
        features['realized_vol_quote'] = realized_vol_b
        features['realized_vol_avg'] = (realized_vol_a + realized_vol_b) / 2

        # ATR for dynamic stops (in price units and as percentage)
        features['atr_base'] = atr_a
        features['atr_quote'] = atr_b
        features['atr_pct_base'] = atr_pct_a
        features['atr_pct_quote'] = atr_pct_b
        features['atr_pct_avg'] = (atr_pct_a + atr_pct_b) / 2

        # Pair encoding (for universal model)
        # Using base and quote indices for hierarchical encoding
        assets = self.config.assets
        features['base_idx'] = assets.index(base) if base in assets else -1
        features['quote_idx'] = assets.index(quote) if quote in assets else -1

        # Add funding and on-chain features
        # Funding data is always added from self._funding_data (OKX, all 10 assets)
        # On-chain data is optional (CryptoQuant, BTC/ETH only)
        features = self._add_on_chain_features(
            features, on_chain_data, base, quote
        )

        # Drop rows with NaN in core features only (not funding/on-chain)
        core_cols = [
            'z_score', 'spread_rsi', 'spread_roc', 'spread_change_1h',
            'vol_ratio', 'vol_ratio_rel', 'vol_diff_ratio',
            'realized_vol_base', 'realized_vol_quote', 'realized_vol_avg',
            'atr_base', 'atr_pct_base'  # ATR is core for SL/TP
        ]
        features = features.dropna(subset=core_cols)

        # Fill missing funding/on-chain features with 0 (neutral)
        optional_cols = [
            'base_funding', 'quote_funding', 'funding_diff', 'funding_avg',
            'base_inflow', 'quote_inflow', 'inflow_ratio'
        ]
        for col in optional_cols:
            if col in features.columns:
                features[col] = features[col].fillna(0)

        return features

    def calculate_all_pair_features(
        self,
        pairs: list[TradingPair],
        asset_data: dict[str, pd.DataFrame],
        on_chain_data: pd.DataFrame | None = None
    ) -> dict[str, pd.DataFrame]:
        """
        Calculate features for all pairs.

        Pairs with insufficient or missing data are silently dropped
        (calculate_pair_features returns None for them).

        Args:
            pairs: List of trading pairs
            asset_data: Dictionary of OHLCV DataFrames
            on_chain_data: Optional on-chain data

        Returns:
            Dictionary mapping pair_id to feature DataFrame
        """
        all_features = {}

        for pair in pairs:
            features = self.calculate_pair_features(
                pair, asset_data, on_chain_data
            )
            if features is not None and len(features) > 0:
                all_features[pair.pair_id] = features

        logger.info(
            "Calculated features for %d/%d pairs",
            len(all_features), len(pairs)
        )

        return all_features

    def get_combined_features(
        self,
        pair_features: dict[str, pd.DataFrame],
        timestamp: pd.Timestamp | None = None
    ) -> pd.DataFrame:
        """
        Combine all pair features into a single DataFrame.

        Useful for batch model prediction across all pairs.

        Args:
            pair_features: Dictionary of feature DataFrames by pair_id
            timestamp: Optional specific timestamp to filter to

        Returns:
            Combined DataFrame with all pairs as rows
        """
        if not pair_features:
            return pd.DataFrame()

        if timestamp is not None:
            # Get latest row from each pair at or before timestamp
            rows = []
            for pair_id, features in pair_features.items():
                valid = features[features.index <= timestamp]
                if len(valid) > 0:
                    # iloc[-1:] keeps the row as a 1-row frame (index intact).
                    row = valid.iloc[-1:].copy()
                    rows.append(row)

            if rows:
                return pd.concat(rows, ignore_index=False)
            return pd.DataFrame()

        # Combine all features (for training)
        return pd.concat(pair_features.values(), ignore_index=False)

    def _add_on_chain_features(
        self,
        features: pd.DataFrame,
        on_chain_data: pd.DataFrame | None,
        base_asset: str,
        quote_asset: str
    ) -> pd.DataFrame:
        """
        Add on-chain and funding rate features for the pair.

        Uses funding data from OKX (all 10 assets) and on-chain data
        from CryptoQuant (BTC/ETH only for inflows).

        Mutates and returns *features*. Columns are only added when the
        corresponding source column exists, so callers must treat all
        funding/on-chain columns as optional.
        """
        # e.g. 'BTC-USDT' -> 'btc', used to build source column names.
        base_short = base_asset.replace('-USDT', '').lower()
        quote_short = quote_asset.replace('-USDT', '').lower()

        # Add funding rates from cached funding data
        if self._funding_data is not None and not self._funding_data.empty:
            # Forward-fill onto the feature index (funding updates are sparse).
            funding_aligned = self._funding_data.reindex(
                features.index, method='ffill'
            )

            base_funding_col = f'{base_short}_funding'
            quote_funding_col = f'{quote_short}_funding'

            if base_funding_col in funding_aligned.columns:
                features['base_funding'] = funding_aligned[base_funding_col]
            if quote_funding_col in funding_aligned.columns:
                features['quote_funding'] = funding_aligned[quote_funding_col]

            # Funding difference (positive = base has higher funding)
            if 'base_funding' in features.columns and 'quote_funding' in features.columns:
                features['funding_diff'] = (
                    features['base_funding'] - features['quote_funding']
                )

                # Funding sentiment: average of both assets
                features['funding_avg'] = (
                    features['base_funding'] + features['quote_funding']
                ) / 2

        # Add on-chain features from CryptoQuant (BTC/ETH only)
        if on_chain_data is not None and not on_chain_data.empty:
            cq_aligned = on_chain_data.reindex(features.index, method='ffill')

            # Inflows (only available for BTC/ETH)
            base_inflow_col = f'{base_short}_inflow'
            quote_inflow_col = f'{quote_short}_inflow'

            if base_inflow_col in cq_aligned.columns:
                features['base_inflow'] = cq_aligned[base_inflow_col]
            if quote_inflow_col in cq_aligned.columns:
                features['quote_inflow'] = cq_aligned[quote_inflow_col]

            if 'base_inflow' in features.columns and 'quote_inflow' in features.columns:
                # +1 in denominator avoids division by zero on quiet days.
                features['inflow_ratio'] = (
                    features['base_inflow'] /
                    (features['quote_inflow'] + 1)
                )

        return features

    def get_feature_columns(self) -> list[str]:
        """
        Get list of feature columns for ML model.

        Excludes metadata and target-related columns. Note that ATR
        columns are intentionally excluded: they are used for SL/TP
        sizing, not as model inputs.

        Returns:
            List of feature column names
        """
        # Core features (always present)
        core_features = [
            'z_score', 'spread_rsi', 'spread_roc', 'spread_change_1h',
            'vol_ratio', 'vol_ratio_rel', 'vol_diff_ratio',
            'realized_vol_base', 'realized_vol_quote', 'realized_vol_avg',
            'base_idx', 'quote_idx'
        ]

        # Funding features (now available for all 10 assets via OKX)
        funding_features = [
            'base_funding', 'quote_funding', 'funding_diff', 'funding_avg'
        ]

        # On-chain features (BTC/ETH only via CryptoQuant)
        onchain_features = [
            'base_inflow', 'quote_inflow', 'inflow_ratio'
        ]

        return core_features + funding_features + onchain_features
|
||||
Reference in New Issue
Block a user