""" Divergence Scorer for Multi-Pair Strategy. Ranks pairs by divergence score and selects the best candidate. """ from dataclasses import dataclass from typing import Optional import pandas as pd import numpy as np from sklearn.ensemble import RandomForestClassifier import pickle from pathlib import Path from engine.logging_config import get_logger from .config import MultiPairConfig from .pair_scanner import TradingPair logger = get_logger(__name__) @dataclass class DivergenceSignal: """ Signal for a divergent pair. Attributes: pair: Trading pair z_score: Current Z-Score of the spread probability: ML model probability of profitable reversion divergence_score: Combined score (|z_score| * probability) direction: 'long' or 'short' (relative to base asset) base_price: Current price of base asset quote_price: Current price of quote asset atr: Average True Range in price units atr_pct: ATR as percentage of price """ pair: TradingPair z_score: float probability: float divergence_score: float direction: str base_price: float quote_price: float atr: float atr_pct: float timestamp: pd.Timestamp class DivergenceScorer: """ Scores and ranks pairs by divergence potential. Uses ML model predictions combined with Z-Score magnitude to identify the most promising mean-reversion opportunity. """ def __init__(self, config: MultiPairConfig, model_path: str = "data/multi_pair_model.pkl"): self.config = config self.model_path = Path(model_path) self.model: RandomForestClassifier | None = None self.feature_cols: list[str] | None = None self._load_model() def _load_model(self) -> None: """Load pre-trained model if available.""" if self.model_path.exists(): try: with open(self.model_path, 'rb') as f: saved = pickle.load(f) self.model = saved['model'] self.feature_cols = saved['feature_cols'] logger.info("Loaded model from %s", self.model_path) except Exception as e: logger.warning("Could not load model: %s", e) def save_model(self) -> None: """Save trained model.""" if self.model is None: return self.model_path.parent.mkdir(parents=True, exist_ok=True) with open(self.model_path, 'wb') as f: pickle.dump({ 'model': self.model, 'feature_cols': self.feature_cols, }, f) logger.info("Saved model to %s", self.model_path) def train_model( self, combined_features: pd.DataFrame, pair_features: dict[str, pd.DataFrame] ) -> None: """ Train universal model on all pairs. Args: combined_features: Combined feature DataFrame from all pairs pair_features: Individual pair feature DataFrames (for target calculation) """ logger.info("Training universal model on %d samples...", len(combined_features)) z_thresh = self.config.z_entry_threshold horizon = self.config.horizon profit_target = self.config.profit_target # Calculate targets for each pair all_targets = [] all_features = [] for pair_id, features in pair_features.items(): if len(features) < horizon + 50: continue spread = features['spread'] z_score = features['z_score'] # Future price movements future_min = spread.rolling(window=horizon).min().shift(-horizon) future_max = spread.rolling(window=horizon).max().shift(-horizon) # Target labels target_short = spread * (1 - profit_target) target_long = spread * (1 + profit_target) success_short = (z_score > z_thresh) & (future_min < target_short) success_long = (z_score < -z_thresh) & (future_max > target_long) targets = np.select([success_short, success_long], [1, 1], default=0) # Valid mask (exclude rows without complete future data) valid_mask = future_min.notna() & future_max.notna() # Collect valid samples valid_features = features[valid_mask] valid_targets = targets[valid_mask.values] if len(valid_features) > 0: all_features.append(valid_features) all_targets.extend(valid_targets) if not all_features: logger.warning("No valid training samples") return # Combine all training data X_df = pd.concat(all_features, ignore_index=True) y = np.array(all_targets) # Get feature columns exclude_cols = [ 'pair_id', 'base_asset', 'quote_asset', 'spread', 'base_close', 'quote_close', 'base_volume' ] self.feature_cols = [c for c in X_df.columns if c not in exclude_cols] # Prepare features X = X_df[self.feature_cols].fillna(0) X = X.replace([np.inf, -np.inf], 0) # Train model self.model = RandomForestClassifier( n_estimators=300, max_depth=5, min_samples_leaf=30, class_weight={0: 1, 1: 3}, random_state=42 ) self.model.fit(X, y) logger.info( "Model trained on %d samples, %d features, %.1f%% positive class", len(X), len(self.feature_cols), y.mean() * 100 ) self.save_model() def score_pairs( self, pair_features: dict[str, pd.DataFrame], pairs: list[TradingPair], timestamp: pd.Timestamp | None = None ) -> list[DivergenceSignal]: """ Score all pairs and return ranked signals. Args: pair_features: Feature DataFrames by pair_id pairs: List of TradingPair objects timestamp: Current timestamp for feature extraction Returns: List of DivergenceSignal sorted by score (descending) """ if self.model is None: logger.warning("Model not trained, returning empty signals") return [] signals = [] pair_map = {p.pair_id: p for p in pairs} for pair_id, features in pair_features.items(): if pair_id not in pair_map: continue pair = pair_map[pair_id] # Get latest features if timestamp is not None: valid = features[features.index <= timestamp] if len(valid) == 0: continue latest = valid.iloc[-1] ts = valid.index[-1] else: latest = features.iloc[-1] ts = features.index[-1] z_score = latest['z_score'] # Skip if Z-score below threshold if abs(z_score) < self.config.z_entry_threshold: continue # Prepare features for prediction feature_row = latest[self.feature_cols].fillna(0).infer_objects(copy=False) feature_row = feature_row.replace([np.inf, -np.inf], 0) X = pd.DataFrame([feature_row.values], columns=self.feature_cols) # Predict probability prob = self.model.predict_proba(X)[0, 1] # Skip if probability below threshold if prob < self.config.prob_threshold: continue # Apply funding rate filter # Block trades where funding opposes our direction base_funding = latest.get('base_funding', 0) or 0 funding_thresh = self.config.funding_threshold if z_score > 0: # Short signal # High negative funding = shorts are paying -> skip if base_funding < -funding_thresh: logger.debug( "Skipping %s short: funding too negative (%.4f)", pair.name, base_funding ) continue else: # Long signal # High positive funding = longs are paying -> skip if base_funding > funding_thresh: logger.debug( "Skipping %s long: funding too positive (%.4f)", pair.name, base_funding ) continue # Calculate divergence score divergence_score = abs(z_score) * prob # Determine direction # Z > 0: Spread high (base expensive vs quote) -> Short base # Z < 0: Spread low (base cheap vs quote) -> Long base direction = 'short' if z_score > 0 else 'long' signal = DivergenceSignal( pair=pair, z_score=z_score, probability=prob, divergence_score=divergence_score, direction=direction, base_price=latest['base_close'], quote_price=latest['quote_close'], atr=latest.get('atr_base', 0), atr_pct=latest.get('atr_pct_base', 0.02), timestamp=ts ) signals.append(signal) # Sort by divergence score (highest first) signals.sort(key=lambda s: s.divergence_score, reverse=True) if signals: logger.debug( "Scored %d pairs, top: %s (score=%.3f, z=%.2f, p=%.2f)", len(signals), signals[0].pair.name, signals[0].divergence_score, signals[0].z_score, signals[0].probability ) return signals def select_best_pair( self, signals: list[DivergenceSignal] ) -> DivergenceSignal | None: """ Select the best pair from scored signals. Args: signals: List of DivergenceSignal (pre-sorted by score) Returns: Best signal or None if no valid candidates """ if not signals: return None return signals[0]