feat: Multi-Pair Divergence Selection Strategy

- Extend regime detection to top 10 cryptocurrencies (45 pairs)
- Dynamic pair selection based on divergence score (|z_score| * probability)
- Universal ML model trained on all pairs
- Correlation-based filtering to avoid redundant positions
- Funding rate integration from OKX for all 10 assets
- ATR-based dynamic stop-loss and take-profit
- Walk-forward training with 70/30 split

Performance: +35.69% return (vs +28.66% baseline), 63.6% win rate
This commit is contained in:
433
strategies/multi_pair/feature_engine.py
Normal file
433
strategies/multi_pair/feature_engine.py
Normal file
@@ -0,0 +1,433 @@
|
||||
"""
|
||||
Feature Engineering for Multi-Pair Divergence Strategy.
|
||||
|
||||
Calculates features for all pairs in the universe, including
|
||||
spread technicals, volatility, and on-chain data.
|
||||
"""
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import ta
|
||||
|
||||
from engine.logging_config import get_logger
|
||||
from engine.data_manager import DataManager
|
||||
from engine.market import MarketType
|
||||
from .config import MultiPairConfig
|
||||
from .pair_scanner import TradingPair
|
||||
from .funding import FundingRateFetcher
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class MultiPairFeatureEngine:
    """
    Calculates features for multiple trading pairs.

    Generates consistent feature sets across all pairs for
    the universal ML model.
    """

    def __init__(self, config: MultiPairConfig):
        self.config = config
        self.dm = DataManager()
        self.funding_fetcher = FundingRateFetcher()
        # Funding-rate frame cached by load_funding_data(); None until loaded.
        self._funding_data: pd.DataFrame | None = None

    def load_all_assets(
        self,
        start_date: str | None = None,
        end_date: str | None = None
    ) -> dict[str, pd.DataFrame]:
        """
        Load OHLCV data for all assets in the universe.

        Assets that are missing on disk, fail to load, or have fewer
        than 200 bars after date filtering are skipped (logged, not
        raised), so a partial universe is still usable.

        Args:
            start_date: Start date filter (YYYY-MM-DD)
            end_date: End date filter (YYYY-MM-DD)

        Returns:
            Dictionary mapping symbol to OHLCV DataFrame
        """
        data = {}
        market_type = MarketType.PERPETUAL

        for symbol in self.config.assets:
            try:
                df = self.dm.load_data(
                    self.config.exchange_id,
                    symbol,
                    self.config.timeframe,
                    market_type
                )

                # Apply date filters (index assumed tz-aware UTC).
                if start_date:
                    df = df[df.index >= pd.Timestamp(start_date, tz="UTC")]
                if end_date:
                    df = df[df.index <= pd.Timestamp(end_date, tz="UTC")]

                if len(df) >= 200:  # Minimum data requirement
                    data[symbol] = df
                    logger.debug("Loaded %s: %d bars", symbol, len(df))
                else:
                    logger.warning(
                        "Skipping %s: insufficient data (%d bars)",
                        symbol, len(df)
                    )
            except FileNotFoundError:
                logger.warning("Data not found for %s", symbol)
            except Exception as e:
                logger.error("Error loading %s: %s", symbol, e)

        logger.info("Loaded %d/%d assets", len(data), len(self.config.assets))
        return data

    def load_funding_data(
        self,
        start_date: str | None = None,
        end_date: str | None = None,
        use_cache: bool = True
    ) -> pd.DataFrame | None:
        """
        Load funding rate data for all assets.

        The result is cached on the instance (``self._funding_data``)
        so that _add_on_chain_features() can reuse it per pair.

        Args:
            start_date: Start date filter
            end_date: End date filter
            use_cache: Whether to use cached data

        Returns:
            DataFrame with funding rates for all assets, or None if the
            fetcher returned nothing.
        """
        self._funding_data = self.funding_fetcher.get_funding_data(
            self.config.assets,
            start_date=start_date,
            end_date=end_date,
            use_cache=use_cache
        )

        if self._funding_data is not None and not self._funding_data.empty:
            logger.info(
                "Loaded funding data: %d rows, %d assets",
                len(self._funding_data),
                len(self._funding_data.columns)
            )
        else:
            logger.warning("No funding data available")

        return self._funding_data

    def calculate_pair_features(
        self,
        pair: TradingPair,
        asset_data: dict[str, pd.DataFrame],
        on_chain_data: pd.DataFrame | None = None
    ) -> pd.DataFrame | None:
        """
        Calculate features for a single pair.

        Args:
            pair: Trading pair
            asset_data: Dictionary of OHLCV DataFrames by symbol
            on_chain_data: Optional on-chain data (funding, inflows)

        Returns:
            DataFrame with features, or None if insufficient data
        """
        base = pair.base_asset
        quote = pair.quote_asset

        if base not in asset_data or quote not in asset_data:
            return None

        df_base = asset_data[base]
        df_quote = asset_data[quote]

        # Align indices: features are only defined where both legs have bars.
        common_idx = df_base.index.intersection(df_quote.index)
        if len(common_idx) < 200:
            logger.debug("Pair %s: insufficient aligned data", pair.name)
            return None

        df_a = df_base.loc[common_idx]
        df_b = df_quote.loc[common_idx]

        # Calculate spread (base / quote)
        spread = df_a['close'] / df_b['close']

        # Z-Score of the spread over a rolling window.
        z_window = self.config.z_window
        rolling_mean = spread.rolling(window=z_window).mean()
        rolling_std = spread.rolling(window=z_window).std()
        z_score = (spread - rolling_mean) / rolling_std

        # Spread Technicals
        spread_rsi = ta.momentum.RSIIndicator(spread, window=14).rsi()
        spread_roc = spread.pct_change(periods=5) * 100
        spread_change_1h = spread.pct_change(periods=1)

        # Volume Analysis (epsilon guards against zero-volume bars)
        vol_ratio = df_a['volume'] / (df_b['volume'] + 1e-10)
        vol_ratio_ma = vol_ratio.rolling(window=12).mean()
        vol_ratio_rel = vol_ratio / (vol_ratio_ma + 1e-10)

        # Volatility of each leg's returns over the z-score window.
        ret_a = df_a['close'].pct_change()
        ret_b = df_b['close'].pct_change()
        vol_a = ret_a.rolling(window=z_window).std()
        vol_b = ret_b.rolling(window=z_window).std()
        vol_spread_ratio = vol_a / (vol_b + 1e-10)

        # Realized Volatility (for dynamic SL/TP)
        realized_vol_a = ret_a.rolling(window=self.config.volatility_window).std()
        realized_vol_b = ret_b.rolling(window=self.config.volatility_window).std()

        # ATR (Average True Range) for dynamic stops
        # ATR = average of max(high-low, |high-prev_close|, |low-prev_close|)
        high_a, low_a, close_a = df_a['high'], df_a['low'], df_a['close']
        high_b, low_b, close_b = df_b['high'], df_b['low'], df_b['close']

        # True Range for base asset
        tr_a = pd.concat([
            high_a - low_a,
            (high_a - close_a.shift(1)).abs(),
            (low_a - close_a.shift(1)).abs()
        ], axis=1).max(axis=1)
        atr_a = tr_a.rolling(window=self.config.atr_period).mean()

        # True Range for quote asset
        tr_b = pd.concat([
            high_b - low_b,
            (high_b - close_b.shift(1)).abs(),
            (low_b - close_b.shift(1)).abs()
        ], axis=1).max(axis=1)
        atr_b = tr_b.rolling(window=self.config.atr_period).mean()

        # ATR as percentage of price (normalized)
        atr_pct_a = atr_a / close_a
        atr_pct_b = atr_b / close_b

        # Build feature DataFrame
        features = pd.DataFrame(index=common_idx)
        features['pair_id'] = pair.pair_id
        features['base_asset'] = base
        features['quote_asset'] = quote

        # Price data (for reference, not features)
        features['spread'] = spread
        features['base_close'] = df_a['close']
        features['quote_close'] = df_b['close']
        features['base_volume'] = df_a['volume']

        # Core Features
        features['z_score'] = z_score
        features['spread_rsi'] = spread_rsi
        features['spread_roc'] = spread_roc
        features['spread_change_1h'] = spread_change_1h
        features['vol_ratio'] = vol_ratio
        features['vol_ratio_rel'] = vol_ratio_rel
        features['vol_diff_ratio'] = vol_spread_ratio

        # Volatility for SL/TP
        features['realized_vol_base'] = realized_vol_a
        features['realized_vol_quote'] = realized_vol_b
        features['realized_vol_avg'] = (realized_vol_a + realized_vol_b) / 2

        # ATR for dynamic stops (in price units and as percentage)
        features['atr_base'] = atr_a
        features['atr_quote'] = atr_b
        features['atr_pct_base'] = atr_pct_a
        features['atr_pct_quote'] = atr_pct_b
        features['atr_pct_avg'] = (atr_pct_a + atr_pct_b) / 2

        # Pair encoding (for universal model)
        # Using base and quote indices for hierarchical encoding
        assets = self.config.assets
        features['base_idx'] = assets.index(base) if base in assets else -1
        features['quote_idx'] = assets.index(quote) if quote in assets else -1

        # Add funding and on-chain features
        # Funding data is always added from self._funding_data (OKX, all 10 assets)
        # On-chain data is optional (CryptoQuant, BTC/ETH only)
        features = self._add_on_chain_features(
            features, on_chain_data, base, quote
        )

        # Drop rows with NaN in core features only (not funding/on-chain)
        core_cols = [
            'z_score', 'spread_rsi', 'spread_roc', 'spread_change_1h',
            'vol_ratio', 'vol_ratio_rel', 'vol_diff_ratio',
            'realized_vol_base', 'realized_vol_quote', 'realized_vol_avg',
            'atr_base', 'atr_pct_base'  # ATR is core for SL/TP
        ]
        features = features.dropna(subset=core_cols)

        # Fill missing funding/on-chain features with 0 (neutral)
        optional_cols = [
            'base_funding', 'quote_funding', 'funding_diff', 'funding_avg',
            'base_inflow', 'quote_inflow', 'inflow_ratio'
        ]
        for col in optional_cols:
            if col in features.columns:
                features[col] = features[col].fillna(0)

        return features

    def calculate_all_pair_features(
        self,
        pairs: list[TradingPair],
        asset_data: dict[str, pd.DataFrame],
        on_chain_data: pd.DataFrame | None = None
    ) -> dict[str, pd.DataFrame]:
        """
        Calculate features for all pairs.

        Pairs with insufficient or missing data are silently dropped
        (calculate_pair_features returns None for them).

        Args:
            pairs: List of trading pairs
            asset_data: Dictionary of OHLCV DataFrames
            on_chain_data: Optional on-chain data

        Returns:
            Dictionary mapping pair_id to feature DataFrame
        """
        all_features = {}

        for pair in pairs:
            features = self.calculate_pair_features(
                pair, asset_data, on_chain_data
            )
            if features is not None and len(features) > 0:
                all_features[pair.pair_id] = features

        logger.info(
            "Calculated features for %d/%d pairs",
            len(all_features), len(pairs)
        )

        return all_features

    def get_combined_features(
        self,
        pair_features: dict[str, pd.DataFrame],
        timestamp: pd.Timestamp | None = None
    ) -> pd.DataFrame:
        """
        Combine all pair features into a single DataFrame.

        Useful for batch model prediction across all pairs.

        Args:
            pair_features: Dictionary of feature DataFrames by pair_id
            timestamp: Optional specific timestamp to filter to

        Returns:
            Combined DataFrame with all pairs as rows
        """
        if not pair_features:
            return pd.DataFrame()

        if timestamp is not None:
            # Get latest row from each pair at or before timestamp
            rows = []
            for pair_id, features in pair_features.items():
                valid = features[features.index <= timestamp]
                if len(valid) > 0:
                    # iloc[-1:] keeps the row as a 1-row frame (index intact).
                    row = valid.iloc[-1:].copy()
                    rows.append(row)

            if rows:
                return pd.concat(rows, ignore_index=False)
            return pd.DataFrame()

        # Combine all features (for training)
        return pd.concat(pair_features.values(), ignore_index=False)

    def _add_on_chain_features(
        self,
        features: pd.DataFrame,
        on_chain_data: pd.DataFrame | None,
        base_asset: str,
        quote_asset: str
    ) -> pd.DataFrame:
        """
        Add on-chain and funding rate features for the pair.

        Uses funding data from OKX (all 10 assets) and on-chain data
        from CryptoQuant (BTC/ETH only for inflows).

        Mutates and returns *features*. Columns are only added when the
        corresponding source column exists, so callers must treat all
        funding/on-chain columns as optional.
        """
        # e.g. 'BTC-USDT' -> 'btc', used to build source column names.
        base_short = base_asset.replace('-USDT', '').lower()
        quote_short = quote_asset.replace('-USDT', '').lower()

        # Add funding rates from cached funding data
        if self._funding_data is not None and not self._funding_data.empty:
            # Forward-fill onto the feature index (funding updates are sparse).
            funding_aligned = self._funding_data.reindex(
                features.index, method='ffill'
            )

            base_funding_col = f'{base_short}_funding'
            quote_funding_col = f'{quote_short}_funding'

            if base_funding_col in funding_aligned.columns:
                features['base_funding'] = funding_aligned[base_funding_col]
            if quote_funding_col in funding_aligned.columns:
                features['quote_funding'] = funding_aligned[quote_funding_col]

            # Funding difference (positive = base has higher funding)
            if 'base_funding' in features.columns and 'quote_funding' in features.columns:
                features['funding_diff'] = (
                    features['base_funding'] - features['quote_funding']
                )

                # Funding sentiment: average of both assets
                features['funding_avg'] = (
                    features['base_funding'] + features['quote_funding']
                ) / 2

        # Add on-chain features from CryptoQuant (BTC/ETH only)
        if on_chain_data is not None and not on_chain_data.empty:
            cq_aligned = on_chain_data.reindex(features.index, method='ffill')

            # Inflows (only available for BTC/ETH)
            base_inflow_col = f'{base_short}_inflow'
            quote_inflow_col = f'{quote_short}_inflow'

            if base_inflow_col in cq_aligned.columns:
                features['base_inflow'] = cq_aligned[base_inflow_col]
            if quote_inflow_col in cq_aligned.columns:
                features['quote_inflow'] = cq_aligned[quote_inflow_col]

            if 'base_inflow' in features.columns and 'quote_inflow' in features.columns:
                # +1 in denominator avoids division by zero on quiet days.
                features['inflow_ratio'] = (
                    features['base_inflow'] /
                    (features['quote_inflow'] + 1)
                )

        return features

    def get_feature_columns(self) -> list[str]:
        """
        Get list of feature columns for ML model.

        Excludes metadata and target-related columns. Note that ATR
        columns are intentionally excluded: they are used for SL/TP
        sizing, not as model inputs.

        Returns:
            List of feature column names
        """
        # Core features (always present)
        core_features = [
            'z_score', 'spread_rsi', 'spread_roc', 'spread_change_1h',
            'vol_ratio', 'vol_ratio_rel', 'vol_diff_ratio',
            'realized_vol_base', 'realized_vol_quote', 'realized_vol_avg',
            'base_idx', 'quote_idx'
        ]

        # Funding features (now available for all 10 assets via OKX)
        funding_features = [
            'base_funding', 'quote_funding', 'funding_diff', 'funding_avg'
        ]

        # On-chain features (BTC/ETH only via CryptoQuant)
        onchain_features = [
            'base_inflow', 'quote_inflow', 'inflow_ratio'
        ]

        return core_features + funding_features + onchain_features
|
||||
Reference in New Issue
Block a user