Files
lowkey_backtest/strategies/multi_pair/divergence_scorer.py
Simon Moisy df37366603 feat: Multi-Pair Divergence Selection Strategy
- Extend regime detection to top 10 cryptocurrencies (45 pairs)
- Dynamic pair selection based on divergence score (|z_score| * probability)
- Universal ML model trained on all pairs
- Correlation-based filtering to avoid redundant positions
- Funding rate integration from OKX for all 10 assets
- ATR-based dynamic stop-loss and take-profit
- Walk-forward training with 70/30 split

Performance: +35.69% return (vs +28.66% baseline), 63.6% win rate
2026-01-15 20:47:23 +08:00

312 lines
10 KiB
Python

"""
Divergence Scorer for Multi-Pair Strategy.
Ranks pairs by divergence score and selects the best candidate.
"""
from dataclasses import dataclass
from typing import Optional
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pickle
from pathlib import Path
from engine.logging_config import get_logger
from .config import MultiPairConfig
from .pair_scanner import TradingPair
logger = get_logger(__name__)
@dataclass
class DivergenceSignal:
"""
Signal for a divergent pair.
Attributes:
pair: Trading pair
z_score: Current Z-Score of the spread
probability: ML model probability of profitable reversion
divergence_score: Combined score (|z_score| * probability)
direction: 'long' or 'short' (relative to base asset)
base_price: Current price of base asset
quote_price: Current price of quote asset
atr: Average True Range in price units
atr_pct: ATR as percentage of price
"""
pair: TradingPair
z_score: float
probability: float
divergence_score: float
direction: str
base_price: float
quote_price: float
atr: float
atr_pct: float
timestamp: pd.Timestamp
class DivergenceScorer:
"""
Scores and ranks pairs by divergence potential.
Uses ML model predictions combined with Z-Score magnitude
to identify the most promising mean-reversion opportunity.
"""
def __init__(self, config: MultiPairConfig, model_path: str = "data/multi_pair_model.pkl"):
self.config = config
self.model_path = Path(model_path)
self.model: RandomForestClassifier | None = None
self.feature_cols: list[str] | None = None
self._load_model()
def _load_model(self) -> None:
"""Load pre-trained model if available."""
if self.model_path.exists():
try:
with open(self.model_path, 'rb') as f:
saved = pickle.load(f)
self.model = saved['model']
self.feature_cols = saved['feature_cols']
logger.info("Loaded model from %s", self.model_path)
except Exception as e:
logger.warning("Could not load model: %s", e)
def save_model(self) -> None:
"""Save trained model."""
if self.model is None:
return
self.model_path.parent.mkdir(parents=True, exist_ok=True)
with open(self.model_path, 'wb') as f:
pickle.dump({
'model': self.model,
'feature_cols': self.feature_cols,
}, f)
logger.info("Saved model to %s", self.model_path)
def train_model(
self,
combined_features: pd.DataFrame,
pair_features: dict[str, pd.DataFrame]
) -> None:
"""
Train universal model on all pairs.
Args:
combined_features: Combined feature DataFrame from all pairs
pair_features: Individual pair feature DataFrames (for target calculation)
"""
logger.info("Training universal model on %d samples...", len(combined_features))
z_thresh = self.config.z_entry_threshold
horizon = self.config.horizon
profit_target = self.config.profit_target
# Calculate targets for each pair
all_targets = []
all_features = []
for pair_id, features in pair_features.items():
if len(features) < horizon + 50:
continue
spread = features['spread']
z_score = features['z_score']
# Future price movements
future_min = spread.rolling(window=horizon).min().shift(-horizon)
future_max = spread.rolling(window=horizon).max().shift(-horizon)
# Target labels
target_short = spread * (1 - profit_target)
target_long = spread * (1 + profit_target)
success_short = (z_score > z_thresh) & (future_min < target_short)
success_long = (z_score < -z_thresh) & (future_max > target_long)
targets = np.select([success_short, success_long], [1, 1], default=0)
# Valid mask (exclude rows without complete future data)
valid_mask = future_min.notna() & future_max.notna()
# Collect valid samples
valid_features = features[valid_mask]
valid_targets = targets[valid_mask.values]
if len(valid_features) > 0:
all_features.append(valid_features)
all_targets.extend(valid_targets)
if not all_features:
logger.warning("No valid training samples")
return
# Combine all training data
X_df = pd.concat(all_features, ignore_index=True)
y = np.array(all_targets)
# Get feature columns
exclude_cols = [
'pair_id', 'base_asset', 'quote_asset',
'spread', 'base_close', 'quote_close', 'base_volume'
]
self.feature_cols = [c for c in X_df.columns if c not in exclude_cols]
# Prepare features
X = X_df[self.feature_cols].fillna(0)
X = X.replace([np.inf, -np.inf], 0)
# Train model
self.model = RandomForestClassifier(
n_estimators=300,
max_depth=5,
min_samples_leaf=30,
class_weight={0: 1, 1: 3},
random_state=42
)
self.model.fit(X, y)
logger.info(
"Model trained on %d samples, %d features, %.1f%% positive class",
len(X), len(self.feature_cols), y.mean() * 100
)
self.save_model()
def score_pairs(
self,
pair_features: dict[str, pd.DataFrame],
pairs: list[TradingPair],
timestamp: pd.Timestamp | None = None
) -> list[DivergenceSignal]:
"""
Score all pairs and return ranked signals.
Args:
pair_features: Feature DataFrames by pair_id
pairs: List of TradingPair objects
timestamp: Current timestamp for feature extraction
Returns:
List of DivergenceSignal sorted by score (descending)
"""
if self.model is None:
logger.warning("Model not trained, returning empty signals")
return []
signals = []
pair_map = {p.pair_id: p for p in pairs}
for pair_id, features in pair_features.items():
if pair_id not in pair_map:
continue
pair = pair_map[pair_id]
# Get latest features
if timestamp is not None:
valid = features[features.index <= timestamp]
if len(valid) == 0:
continue
latest = valid.iloc[-1]
ts = valid.index[-1]
else:
latest = features.iloc[-1]
ts = features.index[-1]
z_score = latest['z_score']
# Skip if Z-score below threshold
if abs(z_score) < self.config.z_entry_threshold:
continue
# Prepare features for prediction
feature_row = latest[self.feature_cols].fillna(0).infer_objects(copy=False)
feature_row = feature_row.replace([np.inf, -np.inf], 0)
X = pd.DataFrame([feature_row.values], columns=self.feature_cols)
# Predict probability
prob = self.model.predict_proba(X)[0, 1]
# Skip if probability below threshold
if prob < self.config.prob_threshold:
continue
# Apply funding rate filter
# Block trades where funding opposes our direction
base_funding = latest.get('base_funding', 0) or 0
funding_thresh = self.config.funding_threshold
if z_score > 0: # Short signal
# High negative funding = shorts are paying -> skip
if base_funding < -funding_thresh:
logger.debug(
"Skipping %s short: funding too negative (%.4f)",
pair.name, base_funding
)
continue
else: # Long signal
# High positive funding = longs are paying -> skip
if base_funding > funding_thresh:
logger.debug(
"Skipping %s long: funding too positive (%.4f)",
pair.name, base_funding
)
continue
# Calculate divergence score
divergence_score = abs(z_score) * prob
# Determine direction
# Z > 0: Spread high (base expensive vs quote) -> Short base
# Z < 0: Spread low (base cheap vs quote) -> Long base
direction = 'short' if z_score > 0 else 'long'
signal = DivergenceSignal(
pair=pair,
z_score=z_score,
probability=prob,
divergence_score=divergence_score,
direction=direction,
base_price=latest['base_close'],
quote_price=latest['quote_close'],
atr=latest.get('atr_base', 0),
atr_pct=latest.get('atr_pct_base', 0.02),
timestamp=ts
)
signals.append(signal)
# Sort by divergence score (highest first)
signals.sort(key=lambda s: s.divergence_score, reverse=True)
if signals:
logger.debug(
"Scored %d pairs, top: %s (score=%.3f, z=%.2f, p=%.2f)",
len(signals),
signals[0].pair.name,
signals[0].divergence_score,
signals[0].z_score,
signals[0].probability
)
return signals
def select_best_pair(
self,
signals: list[DivergenceSignal]
) -> DivergenceSignal | None:
"""
Select the best pair from scored signals.
Args:
signals: List of DivergenceSignal (pre-sorted by score)
Returns:
Best signal or None if no valid candidates
"""
if not signals:
return None
return signals[0]