# Changelog / design notes:
# - Extend regime detection to top 10 cryptocurrencies (45 pairs)
# - Dynamic pair selection based on divergence score (|z_score| * probability)
# - Universal ML model trained on all pairs
# - Correlation-based filtering to avoid redundant positions
# - Funding rate integration from OKX for all 10 assets
# - ATR-based dynamic stop-loss and take-profit
# - Walk-forward training with 70/30 split
# Performance: +35.69% return (vs +28.66% baseline), 63.6% win rate
"""
|
|
Divergence Scorer for Multi-Pair Strategy.
|
|
|
|
Ranks pairs by divergence score and selects the best candidate.
|
|
"""
|
|
from dataclasses import dataclass
|
|
from typing import Optional
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
import pickle
|
|
from pathlib import Path
|
|
|
|
from engine.logging_config import get_logger
|
|
from .config import MultiPairConfig
|
|
from .pair_scanner import TradingPair
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
@dataclass
class DivergenceSignal:
    """
    Signal for a divergent pair.

    Immutable snapshot of one scoring pass for a single pair; produced by
    DivergenceScorer.score_pairs and consumed by position sizing / entry logic.

    Attributes:
        pair: Trading pair the signal refers to.
        z_score: Current Z-Score of the spread at signal time.
        probability: ML model probability of a profitable mean reversion.
        divergence_score: Combined ranking score (|z_score| * probability).
        direction: 'long' or 'short' (relative to the base asset).
        base_price: Current price of the base asset.
        quote_price: Current price of the quote asset.
        atr: Average True Range in price units (base asset).
        atr_pct: ATR as a fraction of price (e.g. 0.02 == 2%).
        timestamp: Bar timestamp the signal was computed from.
    """
    pair: TradingPair
    z_score: float
    probability: float
    divergence_score: float
    direction: str
    base_price: float
    quote_price: float
    atr: float
    atr_pct: float
    timestamp: pd.Timestamp
|
|
|
|
|
|
class DivergenceScorer:
    """
    Scores and ranks pairs by divergence potential.

    Uses ML model predictions combined with Z-Score magnitude
    to identify the most promising mean-reversion opportunity.
    """

    def __init__(self, config: MultiPairConfig, model_path: str = "data/multi_pair_model.pkl"):
        """
        Args:
            config: Strategy configuration (entry thresholds, horizon, etc.).
            model_path: Path of the pickled model file on disk.
        """
        self.config = config
        self.model_path = Path(model_path)
        self.model: RandomForestClassifier | None = None
        self.feature_cols: list[str] | None = None
        self._load_model()

    def _load_model(self) -> None:
        """Load a pre-trained model from ``self.model_path`` if it exists.

        On any failure the scorer stays untrained (model is None) and a
        warning is logged; callers must handle the untrained state.
        """
        if not self.model_path.exists():
            return
        try:
            # SECURITY NOTE(review): pickle.load executes arbitrary code from
            # the file — only load model files from trusted locations.
            with open(self.model_path, 'rb') as f:
                saved = pickle.load(f)
            self.model = saved['model']
            self.feature_cols = saved['feature_cols']
            logger.info("Loaded model from %s", self.model_path)
        except Exception as e:
            # Best-effort load: a corrupt/incompatible file must not crash
            # startup, it just leaves the scorer untrained.
            logger.warning("Could not load model: %s", e)

    def save_model(self) -> None:
        """Persist the trained model and its feature column list.

        No-op when no model has been trained yet.
        """
        if self.model is None:
            return

        self.model_path.parent.mkdir(parents=True, exist_ok=True)
        with open(self.model_path, 'wb') as f:
            pickle.dump({
                'model': self.model,
                'feature_cols': self.feature_cols,
            }, f)
        logger.info("Saved model to %s", self.model_path)

    def train_model(
        self,
        combined_features: pd.DataFrame,
        pair_features: dict[str, pd.DataFrame]
    ) -> None:
        """
        Train universal model on all pairs.

        Labels each row 1 when an entry-grade divergence (|z| above the
        entry threshold) reverted by ``profit_target`` within ``horizon``
        bars, else 0, then fits one RandomForest across all pairs.

        Args:
            combined_features: Combined feature DataFrame from all pairs
                (used only for the sample-count log line).
            pair_features: Individual pair feature DataFrames keyed by
                pair_id (used for target calculation and training rows).
        """
        logger.info("Training universal model on %d samples...", len(combined_features))

        z_thresh = self.config.z_entry_threshold
        horizon = self.config.horizon
        profit_target = self.config.profit_target

        # Calculate targets for each pair
        all_targets = []
        all_features = []

        for pair_id, features in pair_features.items():
            # Need enough history beyond the label horizon to be useful.
            if len(features) < horizon + 50:
                continue

            spread = features['spread']
            z_score = features['z_score']

            # Extremes of the spread over the NEXT `horizon` bars:
            # rolling over the past window, then shifted back by -horizon.
            future_min = spread.rolling(window=horizon).min().shift(-horizon)
            future_max = spread.rolling(window=horizon).max().shift(-horizon)

            # Profit-target price levels relative to the current spread.
            target_short = spread * (1 - profit_target)
            target_long = spread * (1 + profit_target)

            # A short setup succeeds when the spread later drops to the
            # target; a long setup when it later rises to the target.
            success_short = (z_score > z_thresh) & (future_min < target_short)
            success_long = (z_score < -z_thresh) & (future_max > target_long)

            # FIX: np.select([a, b], [1, 1], default=0) was a roundabout
            # boolean OR — express it directly (identical result).
            targets = (success_short | success_long).to_numpy(dtype=int)

            # Valid mask (exclude rows without complete future data)
            valid_mask = future_min.notna() & future_max.notna()

            # Collect valid samples
            valid_features = features[valid_mask]
            valid_targets = targets[valid_mask.values]

            if len(valid_features) > 0:
                all_features.append(valid_features)
                all_targets.extend(valid_targets)

        if not all_features:
            logger.warning("No valid training samples")
            return

        # Combine all training data
        X_df = pd.concat(all_features, ignore_index=True)
        y = np.array(all_targets)

        # Identifier/raw-price columns must never leak into the model input.
        exclude_cols = [
            'pair_id', 'base_asset', 'quote_asset',
            'spread', 'base_close', 'quote_close', 'base_volume'
        ]
        self.feature_cols = [c for c in X_df.columns if c not in exclude_cols]

        # Prepare features: RandomForest cannot handle NaN/inf.
        X = X_df[self.feature_cols].fillna(0)
        X = X.replace([np.inf, -np.inf], 0)

        # Shallow, regularized forest; positive class up-weighted because
        # successful reversions are the minority class.
        self.model = RandomForestClassifier(
            n_estimators=300,
            max_depth=5,
            min_samples_leaf=30,
            class_weight={0: 1, 1: 3},
            random_state=42
        )
        self.model.fit(X, y)

        logger.info(
            "Model trained on %d samples, %d features, %.1f%% positive class",
            len(X), len(self.feature_cols), y.mean() * 100
        )
        self.save_model()

    def score_pairs(
        self,
        pair_features: dict[str, pd.DataFrame],
        pairs: list[TradingPair],
        timestamp: pd.Timestamp | None = None
    ) -> list[DivergenceSignal]:
        """
        Score all pairs and return ranked signals.

        Args:
            pair_features: Feature DataFrames by pair_id.
            pairs: List of TradingPair objects.
            timestamp: Current timestamp for feature extraction; when given,
                only features at or before it are used (no lookahead).

        Returns:
            List of DivergenceSignal sorted by score (descending). Empty
            when the model is untrained or nothing passes the filters.
        """
        # FIX: also guard feature_cols — a model set without its column list
        # would previously raise a TypeError further down.
        if self.model is None or self.feature_cols is None:
            logger.warning("Model not trained, returning empty signals")
            return []

        signals = []
        pair_map = {p.pair_id: p for p in pairs}

        for pair_id, features in pair_features.items():
            if pair_id not in pair_map:
                continue

            pair = pair_map[pair_id]

            # Get latest features (capped at `timestamp` to avoid lookahead).
            if timestamp is not None:
                valid = features[features.index <= timestamp]
                if len(valid) == 0:
                    continue
                latest = valid.iloc[-1]
                ts = valid.index[-1]
            else:
                latest = features.iloc[-1]
                ts = features.index[-1]

            z_score = latest['z_score']

            # Skip if Z-score below the entry threshold.
            if abs(z_score) < self.config.z_entry_threshold:
                continue

            # Prepare features for prediction (model can't take NaN/inf).
            feature_row = latest[self.feature_cols].fillna(0).infer_objects(copy=False)
            feature_row = feature_row.replace([np.inf, -np.inf], 0)
            X = pd.DataFrame([feature_row.values], columns=self.feature_cols)

            # Probability of a profitable reversion (positive class).
            prob = self.model.predict_proba(X)[0, 1]

            # Skip if probability below threshold.
            if prob < self.config.prob_threshold:
                continue

            # Apply funding rate filter:
            # block trades where funding opposes our direction.
            raw_funding = latest.get('base_funding', 0)
            # FIX: the previous `raw_funding or 0` let NaN through (NaN is
            # truthy), which silently disabled this filter — both threshold
            # comparisons are False for NaN.
            base_funding = 0.0 if raw_funding is None or pd.isna(raw_funding) else float(raw_funding)
            funding_thresh = self.config.funding_threshold

            if z_score > 0:  # Short signal
                # High negative funding = shorts are paying -> skip
                if base_funding < -funding_thresh:
                    logger.debug(
                        "Skipping %s short: funding too negative (%.4f)",
                        pair.name, base_funding
                    )
                    continue
            else:  # Long signal
                # High positive funding = longs are paying -> skip
                if base_funding > funding_thresh:
                    logger.debug(
                        "Skipping %s long: funding too positive (%.4f)",
                        pair.name, base_funding
                    )
                    continue

            # Calculate divergence score
            divergence_score = abs(z_score) * prob

            # Determine direction:
            # Z > 0: Spread high (base expensive vs quote) -> Short base
            # Z < 0: Spread low (base cheap vs quote) -> Long base
            direction = 'short' if z_score > 0 else 'long'

            signal = DivergenceSignal(
                pair=pair,
                z_score=z_score,
                probability=prob,
                divergence_score=divergence_score,
                direction=direction,
                base_price=latest['base_close'],
                quote_price=latest['quote_close'],
                atr=latest.get('atr_base', 0),
                atr_pct=latest.get('atr_pct_base', 0.02),
                timestamp=ts
            )
            signals.append(signal)

        # Sort by divergence score (highest first)
        signals.sort(key=lambda s: s.divergence_score, reverse=True)

        if signals:
            logger.debug(
                "Scored %d pairs, top: %s (score=%.3f, z=%.2f, p=%.2f)",
                len(signals),
                signals[0].pair.name,
                signals[0].divergence_score,
                signals[0].z_score,
                signals[0].probability
            )

        return signals

    def select_best_pair(
        self,
        signals: list[DivergenceSignal]
    ) -> DivergenceSignal | None:
        """
        Select the best pair from scored signals.

        Args:
            signals: List of DivergenceSignal (pre-sorted by score).

        Returns:
            Best signal or None if no valid candidates.
        """
        if not signals:
            return None
        return signals[0]
|