- Extend regime detection to top 10 cryptocurrencies (45 pairs) - Dynamic pair selection based on divergence score (|z_score| * probability) - Universal ML model trained on all pairs - Correlation-based filtering to avoid redundant positions - Funding rate integration from OKX for all 10 assets - ATR-based dynamic stop-loss and take-profit - Walk-forward training with 70/30 split Performance: +35.69% return (vs +28.66% baseline), 63.6% win rate
434 lines
16 KiB
Python
"""
|
|
Feature Engineering for Multi-Pair Divergence Strategy.
|
|
|
|
Calculates features for all pairs in the universe, including
|
|
spread technicals, volatility, and on-chain data.
|
|
"""
|
|
import pandas as pd
|
|
import numpy as np
|
|
import ta
|
|
|
|
from engine.logging_config import get_logger
|
|
from engine.data_manager import DataManager
|
|
from engine.market import MarketType
|
|
from .config import MultiPairConfig
|
|
from .pair_scanner import TradingPair
|
|
from .funding import FundingRateFetcher
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
class MultiPairFeatureEngine:
    """
    Calculates features for multiple trading pairs.

    Generates consistent feature sets across all pairs for
    the universal ML model.
    """

    def __init__(self, config: MultiPairConfig):
        self.config = config
        self.dm = DataManager()
        self.funding_fetcher = FundingRateFetcher()
        # Funding-rate cache; populated by load_funding_data() and read by
        # _add_on_chain_features() when building per-pair features.
        self._funding_data: pd.DataFrame | None = None

    def load_all_assets(
        self,
        start_date: str | None = None,
        end_date: str | None = None
    ) -> dict[str, pd.DataFrame]:
        """
        Load OHLCV data for all assets in the universe.

        Assets with missing files, load errors, or fewer than 200 bars
        after date filtering are skipped (logged, not raised).

        Args:
            start_date: Start date filter (YYYY-MM-DD)
            end_date: End date filter (YYYY-MM-DD)

        Returns:
            Dictionary mapping symbol to OHLCV DataFrame
        """
        data = {}
        market_type = MarketType.PERPETUAL

        for symbol in self.config.assets:
            try:
                df = self.dm.load_data(
                    self.config.exchange_id,
                    symbol,
                    self.config.timeframe,
                    market_type
                )

                # Apply date filters (index assumed tz-aware UTC; filters
                # are built as UTC timestamps to match).
                if start_date:
                    df = df[df.index >= pd.Timestamp(start_date, tz="UTC")]
                if end_date:
                    df = df[df.index <= pd.Timestamp(end_date, tz="UTC")]

                if len(df) >= 200:  # Minimum data requirement
                    data[symbol] = df
                    logger.debug("Loaded %s: %d bars", symbol, len(df))
                else:
                    logger.warning(
                        "Skipping %s: insufficient data (%d bars)",
                        symbol, len(df)
                    )
            except FileNotFoundError:
                logger.warning("Data not found for %s", symbol)
            except Exception as e:
                # Best-effort loading: one bad asset must not abort the rest.
                logger.error("Error loading %s: %s", symbol, e)

        logger.info("Loaded %d/%d assets", len(data), len(self.config.assets))
        return data

    def load_funding_data(
        self,
        start_date: str | None = None,
        end_date: str | None = None,
        use_cache: bool = True
    ) -> pd.DataFrame | None:
        """
        Load funding rate data for all assets.

        The result is also cached on the instance so that
        calculate_pair_features() can merge funding columns.

        Args:
            start_date: Start date filter
            end_date: End date filter
            use_cache: Whether to use cached data

        Returns:
            DataFrame with funding rates for all assets, or None if the
            fetcher returned nothing.
        """
        self._funding_data = self.funding_fetcher.get_funding_data(
            self.config.assets,
            start_date=start_date,
            end_date=end_date,
            use_cache=use_cache
        )

        if self._funding_data is not None and not self._funding_data.empty:
            logger.info(
                "Loaded funding data: %d rows, %d assets",
                len(self._funding_data),
                len(self._funding_data.columns)
            )
        else:
            logger.warning("No funding data available")

        return self._funding_data

    def calculate_pair_features(
        self,
        pair: TradingPair,
        asset_data: dict[str, pd.DataFrame],
        on_chain_data: pd.DataFrame | None = None
    ) -> pd.DataFrame | None:
        """
        Calculate features for a single pair.

        Args:
            pair: Trading pair
            asset_data: Dictionary of OHLCV DataFrames by symbol
            on_chain_data: Optional on-chain data (funding, inflows)

        Returns:
            DataFrame with features, or None if insufficient data
        """
        base = pair.base_asset
        quote = pair.quote_asset

        if base not in asset_data or quote not in asset_data:
            return None

        df_base = asset_data[base]
        df_quote = asset_data[quote]

        # Align indices: features are only meaningful on timestamps where
        # both legs have a bar.
        common_idx = df_base.index.intersection(df_quote.index)
        if len(common_idx) < 200:
            logger.debug("Pair %s: insufficient aligned data", pair.name)
            return None

        df_a = df_base.loc[common_idx]
        df_b = df_quote.loc[common_idx]

        # Calculate spread (base / quote)
        spread = df_a['close'] / df_b['close']

        # Z-Score of the spread over the configured rolling window.
        # NOTE(review): a zero rolling std would yield inf/NaN here; such
        # rows are removed by the core-feature dropna below.
        z_window = self.config.z_window
        rolling_mean = spread.rolling(window=z_window).mean()
        rolling_std = spread.rolling(window=z_window).std()
        z_score = (spread - rolling_mean) / rolling_std

        # Spread Technicals
        spread_rsi = ta.momentum.RSIIndicator(spread, window=14).rsi()
        spread_roc = spread.pct_change(periods=5) * 100
        spread_change_1h = spread.pct_change(periods=1)

        # Volume Analysis (epsilon guards against zero-volume bars)
        vol_ratio = df_a['volume'] / (df_b['volume'] + 1e-10)
        vol_ratio_ma = vol_ratio.rolling(window=12).mean()
        vol_ratio_rel = vol_ratio / (vol_ratio_ma + 1e-10)

        # Volatility of each leg's returns over the z-score window
        ret_a = df_a['close'].pct_change()
        ret_b = df_b['close'].pct_change()
        vol_a = ret_a.rolling(window=z_window).std()
        vol_b = ret_b.rolling(window=z_window).std()
        vol_spread_ratio = vol_a / (vol_b + 1e-10)

        # Realized Volatility (for dynamic SL/TP)
        realized_vol_a = ret_a.rolling(window=self.config.volatility_window).std()
        realized_vol_b = ret_b.rolling(window=self.config.volatility_window).std()

        # ATR (Average True Range) for dynamic stops
        # ATR = average of max(high-low, |high-prev_close|, |low-prev_close|)
        high_a, low_a, close_a = df_a['high'], df_a['low'], df_a['close']
        high_b, low_b, close_b = df_b['high'], df_b['low'], df_b['close']

        # True Range for base asset
        tr_a = pd.concat([
            high_a - low_a,
            (high_a - close_a.shift(1)).abs(),
            (low_a - close_a.shift(1)).abs()
        ], axis=1).max(axis=1)
        atr_a = tr_a.rolling(window=self.config.atr_period).mean()

        # True Range for quote asset
        tr_b = pd.concat([
            high_b - low_b,
            (high_b - close_b.shift(1)).abs(),
            (low_b - close_b.shift(1)).abs()
        ], axis=1).max(axis=1)
        atr_b = tr_b.rolling(window=self.config.atr_period).mean()

        # ATR as percentage of price (normalized)
        atr_pct_a = atr_a / close_a
        atr_pct_b = atr_b / close_b

        # Build feature DataFrame
        features = pd.DataFrame(index=common_idx)
        features['pair_id'] = pair.pair_id
        features['base_asset'] = base
        features['quote_asset'] = quote

        # Price data (for reference, not features)
        features['spread'] = spread
        features['base_close'] = df_a['close']
        features['quote_close'] = df_b['close']
        features['base_volume'] = df_a['volume']

        # Core Features
        features['z_score'] = z_score
        features['spread_rsi'] = spread_rsi
        features['spread_roc'] = spread_roc
        features['spread_change_1h'] = spread_change_1h
        features['vol_ratio'] = vol_ratio
        features['vol_ratio_rel'] = vol_ratio_rel
        features['vol_diff_ratio'] = vol_spread_ratio

        # Volatility for SL/TP
        features['realized_vol_base'] = realized_vol_a
        features['realized_vol_quote'] = realized_vol_b
        features['realized_vol_avg'] = (realized_vol_a + realized_vol_b) / 2

        # ATR for dynamic stops (in price units and as percentage)
        features['atr_base'] = atr_a
        features['atr_quote'] = atr_b
        features['atr_pct_base'] = atr_pct_a
        features['atr_pct_quote'] = atr_pct_b
        features['atr_pct_avg'] = (atr_pct_a + atr_pct_b) / 2

        # Pair encoding (for universal model)
        # Using base and quote indices for hierarchical encoding
        assets = self.config.assets
        features['base_idx'] = assets.index(base) if base in assets else -1
        features['quote_idx'] = assets.index(quote) if quote in assets else -1

        # Add funding and on-chain features
        # Funding data is always added from self._funding_data (OKX, all 10 assets)
        # On-chain data is optional (CryptoQuant, BTC/ETH only)
        features = self._add_on_chain_features(
            features, on_chain_data, base, quote
        )

        # Drop rows with NaN in core features only (not funding/on-chain)
        core_cols = [
            'z_score', 'spread_rsi', 'spread_roc', 'spread_change_1h',
            'vol_ratio', 'vol_ratio_rel', 'vol_diff_ratio',
            'realized_vol_base', 'realized_vol_quote', 'realized_vol_avg',
            'atr_base', 'atr_pct_base'  # ATR is core for SL/TP
        ]
        features = features.dropna(subset=core_cols)

        # Fill missing funding/on-chain features with 0 (neutral)
        optional_cols = [
            'base_funding', 'quote_funding', 'funding_diff', 'funding_avg',
            'base_inflow', 'quote_inflow', 'inflow_ratio'
        ]
        for col in optional_cols:
            if col in features.columns:
                features[col] = features[col].fillna(0)

        return features

    def calculate_all_pair_features(
        self,
        pairs: list[TradingPair],
        asset_data: dict[str, pd.DataFrame],
        on_chain_data: pd.DataFrame | None = None
    ) -> dict[str, pd.DataFrame]:
        """
        Calculate features for all pairs.

        Pairs with insufficient data are silently omitted from the result.

        Args:
            pairs: List of trading pairs
            asset_data: Dictionary of OHLCV DataFrames
            on_chain_data: Optional on-chain data

        Returns:
            Dictionary mapping pair_id to feature DataFrame
        """
        all_features = {}

        for pair in pairs:
            features = self.calculate_pair_features(
                pair, asset_data, on_chain_data
            )
            if features is not None and len(features) > 0:
                all_features[pair.pair_id] = features

        logger.info(
            "Calculated features for %d/%d pairs",
            len(all_features), len(pairs)
        )

        return all_features

    def get_combined_features(
        self,
        pair_features: dict[str, pd.DataFrame],
        timestamp: pd.Timestamp | None = None
    ) -> pd.DataFrame:
        """
        Combine all pair features into a single DataFrame.

        Useful for batch model prediction across all pairs.

        Args:
            pair_features: Dictionary of feature DataFrames by pair_id
            timestamp: Optional specific timestamp to filter to

        Returns:
            Combined DataFrame with all pairs as rows
        """
        if not pair_features:
            return pd.DataFrame()

        if timestamp is not None:
            # Get latest row from each pair at or before timestamp
            rows = []
            for features in pair_features.values():
                valid = features[features.index <= timestamp]
                if len(valid) > 0:
                    # iloc[-1:] keeps a one-row DataFrame (preserves dtypes
                    # and index), unlike iloc[-1] which yields a Series.
                    rows.append(valid.iloc[-1:].copy())

            if rows:
                return pd.concat(rows, ignore_index=False)
            return pd.DataFrame()

        # Combine all features (for training)
        return pd.concat(pair_features.values(), ignore_index=False)

    def _add_on_chain_features(
        self,
        features: pd.DataFrame,
        on_chain_data: pd.DataFrame | None,
        base_asset: str,
        quote_asset: str
    ) -> pd.DataFrame:
        """
        Add on-chain and funding rate features for the pair.

        Uses funding data from OKX (all 10 assets) and on-chain data
        from CryptoQuant (BTC/ETH only for inflows).

        Columns are only added when the corresponding source column
        exists; missing ones are later zero-filled by the caller.
        """
        # Map 'BTC-USDT' -> 'btc' to match funding/on-chain column naming.
        base_short = base_asset.replace('-USDT', '').lower()
        quote_short = quote_asset.replace('-USDT', '').lower()

        # Add funding rates from cached funding data
        if self._funding_data is not None and not self._funding_data.empty:
            # Forward-fill aligns sparse funding timestamps (e.g. 8h) onto
            # the feature index without look-ahead.
            funding_aligned = self._funding_data.reindex(
                features.index, method='ffill'
            )

            base_funding_col = f'{base_short}_funding'
            quote_funding_col = f'{quote_short}_funding'

            if base_funding_col in funding_aligned.columns:
                features['base_funding'] = funding_aligned[base_funding_col]
            if quote_funding_col in funding_aligned.columns:
                features['quote_funding'] = funding_aligned[quote_funding_col]

            # Funding difference (positive = base has higher funding)
            if 'base_funding' in features.columns and 'quote_funding' in features.columns:
                features['funding_diff'] = (
                    features['base_funding'] - features['quote_funding']
                )

                # Funding sentiment: average of both assets
                features['funding_avg'] = (
                    features['base_funding'] + features['quote_funding']
                ) / 2

        # Add on-chain features from CryptoQuant (BTC/ETH only)
        if on_chain_data is not None and not on_chain_data.empty:
            cq_aligned = on_chain_data.reindex(features.index, method='ffill')

            # Inflows (only available for BTC/ETH)
            base_inflow_col = f'{base_short}_inflow'
            quote_inflow_col = f'{quote_short}_inflow'

            if base_inflow_col in cq_aligned.columns:
                features['base_inflow'] = cq_aligned[base_inflow_col]
            if quote_inflow_col in cq_aligned.columns:
                features['quote_inflow'] = cq_aligned[quote_inflow_col]

            if 'base_inflow' in features.columns and 'quote_inflow' in features.columns:
                # +1 guards against division by zero on zero-inflow bars.
                features['inflow_ratio'] = (
                    features['base_inflow'] /
                    (features['quote_inflow'] + 1)
                )

        return features

    def get_feature_columns(self) -> list[str]:
        """
        Get list of feature columns for ML model.

        Excludes metadata and target-related columns.

        Returns:
            List of feature column names
        """
        # Core features (always present)
        core_features = [
            'z_score', 'spread_rsi', 'spread_roc', 'spread_change_1h',
            'vol_ratio', 'vol_ratio_rel', 'vol_diff_ratio',
            'realized_vol_base', 'realized_vol_quote', 'realized_vol_avg',
            'base_idx', 'quote_idx'
        ]

        # Funding features (now available for all 10 assets via OKX)
        funding_features = [
            'base_funding', 'quote_funding', 'funding_diff', 'funding_avg'
        ]

        # On-chain features (BTC/ETH only via CryptoQuant)
        onchain_features = [
            'base_inflow', 'quote_inflow', 'inflow_ratio'
        ]

        return core_features + funding_features + onchain_features
|