feat: Multi-Pair Divergence Selection Strategy

- Extend regime detection to top 10 cryptocurrencies (45 pairs) - Dynamic pair selection based on divergence score (|z_score| * probability) - Universal ML model trained on all pairs - Correlation-based filtering to avoid redundant positions - Funding rate integration from OKX for all 10 assets - ATR-based dynamic stop-loss and take-profit - Walk-forward training with 70/30 split Performance: +35.69% return (vs +28.66% baseline), 63.6% win rate
2026-01-15 20:47:23 +08:00
parent 7e4a6874a2
commit df37366603
13 changed files with 2531 additions and 0 deletions
--- a/strategies/multi_pair/divergence_scorer.py
+++ b/strategies/multi_pair/divergence_scorer.py
@@ -0,0 +1,311 @@
+"""
+Divergence Scorer for Multi-Pair Strategy.
+
+Ranks pairs by divergence score and selects the best candidate.
+"""
+from dataclasses import dataclass
+from typing import Optional
+
+import pandas as pd
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+import pickle
+from pathlib import Path
+
+from engine.logging_config import get_logger
+from .config import MultiPairConfig
+from .pair_scanner import TradingPair
+
+logger = get_logger(__name__)
+
+
+@dataclass
+class DivergenceSignal:
+    """
+    Signal for a divergent pair.
+    
+    Attributes:
+        pair: Trading pair
+        z_score: Current Z-Score of the spread
+        probability: ML model probability of profitable reversion
+        divergence_score: Combined score (|z_score| * probability)
+        direction: 'long' or 'short' (relative to base asset)
+        base_price: Current price of base asset
+        quote_price: Current price of quote asset
+        atr: Average True Range in price units
+        atr_pct: ATR as percentage of price
+    """
+    pair: TradingPair
+    z_score: float
+    probability: float
+    divergence_score: float
+    direction: str
+    base_price: float
+    quote_price: float
+    atr: float
+    atr_pct: float
+    timestamp: pd.Timestamp
+
+
+class DivergenceScorer:
+    """
+    Scores and ranks pairs by divergence potential.
+    
+    Uses ML model predictions combined with Z-Score magnitude
+    to identify the most promising mean-reversion opportunity.
+    """
+    
+    def __init__(self, config: MultiPairConfig, model_path: str = "data/multi_pair_model.pkl"):
+        self.config = config
+        self.model_path = Path(model_path)
+        self.model: RandomForestClassifier | None = None
+        self.feature_cols: list[str] | None = None
+        self._load_model()
+    
+    def _load_model(self) -> None:
+        """Load pre-trained model if available."""
+        if self.model_path.exists():
+            try:
+                with open(self.model_path, 'rb') as f:
+                    saved = pickle.load(f)
+                    self.model = saved['model']
+                    self.feature_cols = saved['feature_cols']
+                logger.info("Loaded model from %s", self.model_path)
+            except Exception as e:
+                logger.warning("Could not load model: %s", e)
+    
+    def save_model(self) -> None:
+        """Save trained model."""
+        if self.model is None:
+            return
+        
+        self.model_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(self.model_path, 'wb') as f:
+            pickle.dump({
+                'model': self.model,
+                'feature_cols': self.feature_cols,
+            }, f)
+        logger.info("Saved model to %s", self.model_path)
+    
+    def train_model(
+        self,
+        combined_features: pd.DataFrame,
+        pair_features: dict[str, pd.DataFrame]
+    ) -> None:
+        """
+        Train universal model on all pairs.
+        
+        Args:
+            combined_features: Combined feature DataFrame from all pairs
+            pair_features: Individual pair feature DataFrames (for target calculation)
+        """
+        logger.info("Training universal model on %d samples...", len(combined_features))
+        
+        z_thresh = self.config.z_entry_threshold
+        horizon = self.config.horizon
+        profit_target = self.config.profit_target
+        
+        # Calculate targets for each pair
+        all_targets = []
+        all_features = []
+        
+        for pair_id, features in pair_features.items():
+            if len(features) < horizon + 50:
+                continue
+            
+            spread = features['spread']
+            z_score = features['z_score']
+            
+            # Future price movements
+            future_min = spread.rolling(window=horizon).min().shift(-horizon)
+            future_max = spread.rolling(window=horizon).max().shift(-horizon)
+            
+            # Target labels
+            target_short = spread * (1 - profit_target)
+            target_long = spread * (1 + profit_target)
+            
+            success_short = (z_score > z_thresh) & (future_min < target_short)
+            success_long = (z_score < -z_thresh) & (future_max > target_long)
+            
+            targets = np.select([success_short, success_long], [1, 1], default=0)
+            
+            # Valid mask (exclude rows without complete future data)
+            valid_mask = future_min.notna() & future_max.notna()
+            
+            # Collect valid samples
+            valid_features = features[valid_mask]
+            valid_targets = targets[valid_mask.values]
+            
+            if len(valid_features) > 0:
+                all_features.append(valid_features)
+                all_targets.extend(valid_targets)
+        
+        if not all_features:
+            logger.warning("No valid training samples")
+            return
+        
+        # Combine all training data
+        X_df = pd.concat(all_features, ignore_index=True)
+        y = np.array(all_targets)
+        
+        # Get feature columns
+        exclude_cols = [
+            'pair_id', 'base_asset', 'quote_asset',
+            'spread', 'base_close', 'quote_close', 'base_volume'
+        ]
+        self.feature_cols = [c for c in X_df.columns if c not in exclude_cols]
+        
+        # Prepare features
+        X = X_df[self.feature_cols].fillna(0)
+        X = X.replace([np.inf, -np.inf], 0)
+        
+        # Train model
+        self.model = RandomForestClassifier(
+            n_estimators=300,
+            max_depth=5,
+            min_samples_leaf=30,
+            class_weight={0: 1, 1: 3},
+            random_state=42
+        )
+        self.model.fit(X, y)
+        
+        logger.info(
+            "Model trained on %d samples, %d features, %.1f%% positive class",
+            len(X), len(self.feature_cols), y.mean() * 100
+        )
+        self.save_model()
+    
+    def score_pairs(
+        self,
+        pair_features: dict[str, pd.DataFrame],
+        pairs: list[TradingPair],
+        timestamp: pd.Timestamp | None = None
+    ) -> list[DivergenceSignal]:
+        """
+        Score all pairs and return ranked signals.
+        
+        Args:
+            pair_features: Feature DataFrames by pair_id
+            pairs: List of TradingPair objects
+            timestamp: Current timestamp for feature extraction
+            
+        Returns:
+            List of DivergenceSignal sorted by score (descending)
+        """
+        if self.model is None:
+            logger.warning("Model not trained, returning empty signals")
+            return []
+        
+        signals = []
+        pair_map = {p.pair_id: p for p in pairs}
+        
+        for pair_id, features in pair_features.items():
+            if pair_id not in pair_map:
+                continue
+            
+            pair = pair_map[pair_id]
+            
+            # Get latest features
+            if timestamp is not None:
+                valid = features[features.index <= timestamp]
+                if len(valid) == 0:
+                    continue
+                latest = valid.iloc[-1]
+                ts = valid.index[-1]
+            else:
+                latest = features.iloc[-1]
+                ts = features.index[-1]
+            
+            z_score = latest['z_score']
+            
+            # Skip if Z-score below threshold
+            if abs(z_score) < self.config.z_entry_threshold:
+                continue
+            
+            # Prepare features for prediction
+            feature_row = latest[self.feature_cols].fillna(0).infer_objects(copy=False)
+            feature_row = feature_row.replace([np.inf, -np.inf], 0)
+            X = pd.DataFrame([feature_row.values], columns=self.feature_cols)
+            
+            # Predict probability
+            prob = self.model.predict_proba(X)[0, 1]
+            
+            # Skip if probability below threshold
+            if prob < self.config.prob_threshold:
+                continue
+            
+            # Apply funding rate filter
+            # Block trades where funding opposes our direction
+            base_funding = latest.get('base_funding', 0) or 0
+            funding_thresh = self.config.funding_threshold
+            
+            if z_score > 0:  # Short signal
+                # High negative funding = shorts are paying -> skip
+                if base_funding < -funding_thresh:
+                    logger.debug(
+                        "Skipping %s short: funding too negative (%.4f)",
+                        pair.name, base_funding
+                    )
+                    continue
+            else:  # Long signal
+                # High positive funding = longs are paying -> skip
+                if base_funding > funding_thresh:
+                    logger.debug(
+                        "Skipping %s long: funding too positive (%.4f)",
+                        pair.name, base_funding
+                    )
+                    continue
+            
+            # Calculate divergence score
+            divergence_score = abs(z_score) * prob
+            
+            # Determine direction
+            # Z > 0: Spread high (base expensive vs quote) -> Short base
+            # Z < 0: Spread low (base cheap vs quote) -> Long base
+            direction = 'short' if z_score > 0 else 'long'
+            
+            signal = DivergenceSignal(
+                pair=pair,
+                z_score=z_score,
+                probability=prob,
+                divergence_score=divergence_score,
+                direction=direction,
+                base_price=latest['base_close'],
+                quote_price=latest['quote_close'],
+                atr=latest.get('atr_base', 0),
+                atr_pct=latest.get('atr_pct_base', 0.02),
+                timestamp=ts
+            )
+            signals.append(signal)
+        
+        # Sort by divergence score (highest first)
+        signals.sort(key=lambda s: s.divergence_score, reverse=True)
+        
+        if signals:
+            logger.debug(
+                "Scored %d pairs, top: %s (score=%.3f, z=%.2f, p=%.2f)",
+                len(signals),
+                signals[0].pair.name,
+                signals[0].divergence_score,
+                signals[0].z_score,
+                signals[0].probability
+            )
+        
+        return signals
+    
+    def select_best_pair(
+        self,
+        signals: list[DivergenceSignal]
+    ) -> DivergenceSignal | None:
+        """
+        Select the best pair from scored signals.
+        
+        Args:
+            signals: List of DivergenceSignal (pre-sorted by score)
+            
+        Returns:
+            Best signal or None if no valid candidates
+        """
+        if not signals:
+            return None
+        return signals[0]