Files
lowkey_backtest/strategies/multi_pair/feature_engine.py
Simon Moisy df37366603 feat: Multi-Pair Divergence Selection Strategy
- Extend regime detection to top 10 cryptocurrencies (45 pairs)
- Dynamic pair selection based on divergence score (|z_score| * probability)
- Universal ML model trained on all pairs
- Correlation-based filtering to avoid redundant positions
- Funding rate integration from OKX for all 10 assets
- ATR-based dynamic stop-loss and take-profit
- Walk-forward training with 70/30 split

Performance: +35.69% return (vs +28.66% baseline), 63.6% win rate
2026-01-15 20:47:23 +08:00

434 lines
16 KiB
Python

"""
Feature Engineering for Multi-Pair Divergence Strategy.
Calculates features for all pairs in the universe, including
spread technicals, volatility, and on-chain data.
"""
import pandas as pd
import numpy as np
import ta
from engine.logging_config import get_logger
from engine.data_manager import DataManager
from engine.market import MarketType
from .config import MultiPairConfig
from .pair_scanner import TradingPair
from .funding import FundingRateFetcher
logger = get_logger(__name__)
class MultiPairFeatureEngine:
    """
    Calculates features for multiple trading pairs.

    Generates consistent feature sets across all pairs for
    the universal ML model.
    """

    def __init__(self, config: MultiPairConfig):
        """
        Initialize the engine with a strategy configuration.

        Args:
            config: Multi-pair strategy settings (asset universe,
                exchange id, timeframe, rolling-window lengths).
        """
        # Strategy configuration shared by all feature calculations.
        self.config = config
        # OHLCV loader for the configured exchange/timeframe.
        self.dm = DataManager()
        # Fetches funding rates (per SOURCE comments, from OKX).
        self.funding_fetcher = FundingRateFetcher()
        # Funding-rate cache populated by load_funding_data(); None until
        # that method has been called.
        self._funding_data: pd.DataFrame | None = None
def load_all_assets(
    self,
    start_date: str | None = None,
    end_date: str | None = None
) -> dict[str, pd.DataFrame]:
    """
    Load OHLCV data for all assets in the universe.

    Assets whose files are missing, whose load raises, or which have
    fewer than 200 bars after date filtering are skipped and logged;
    one bad asset never aborts the whole load.

    Args:
        start_date: Start date filter (YYYY-MM-DD)
        end_date: End date filter (YYYY-MM-DD)

    Returns:
        Dictionary mapping symbol to OHLCV DataFrame
    """
    loaded: dict[str, pd.DataFrame] = {}
    for symbol in self.config.assets:
        try:
            frame = self.dm.load_data(
                self.config.exchange_id,
                symbol,
                self.config.timeframe,
                MarketType.PERPETUAL
            )
            # Trim to the requested window; filters are inclusive and
            # compared against tz-aware UTC timestamps.
            if start_date:
                frame = frame[frame.index >= pd.Timestamp(start_date, tz="UTC")]
            if end_date:
                frame = frame[frame.index <= pd.Timestamp(end_date, tz="UTC")]
            # 200 bars is the minimum history for the rolling features.
            if len(frame) < 200:
                logger.warning(
                    "Skipping %s: insufficient data (%d bars)",
                    symbol, len(frame)
                )
            else:
                loaded[symbol] = frame
                logger.debug("Loaded %s: %d bars", symbol, len(frame))
        except FileNotFoundError:
            logger.warning("Data not found for %s", symbol)
        except Exception as e:
            # Best-effort: log and continue with the remaining assets.
            logger.error("Error loading %s: %s", symbol, e)
    logger.info("Loaded %d/%d assets", len(loaded), len(self.config.assets))
    return loaded
def load_funding_data(
    self,
    start_date: str | None = None,
    end_date: str | None = None,
    use_cache: bool = True
) -> pd.DataFrame | None:
    """
    Load funding rate data for all assets.

    The result is also cached on ``self._funding_data`` so that
    subsequent feature calculations can attach funding columns.

    Args:
        start_date: Start date filter
        end_date: End date filter
        use_cache: Whether to use cached data

    Returns:
        DataFrame with funding rates for all assets, or None when the
        fetcher produced no data.
    """
    # Fix: the return annotation previously claimed `pd.DataFrame`, but
    # the fetcher can yield None — the None-check below already handles
    # that path, and we return `self._funding_data` unconditionally.
    self._funding_data = self.funding_fetcher.get_funding_data(
        self.config.assets,
        start_date=start_date,
        end_date=end_date,
        use_cache=use_cache
    )
    if self._funding_data is not None and not self._funding_data.empty:
        logger.info(
            "Loaded funding data: %d rows, %d assets",
            len(self._funding_data),
            len(self._funding_data.columns)
        )
    else:
        logger.warning("No funding data available")
    return self._funding_data
def calculate_pair_features(
    self,
    pair: TradingPair,
    asset_data: dict[str, pd.DataFrame],
    on_chain_data: pd.DataFrame | None = None
) -> pd.DataFrame | None:
    """
    Calculate features for a single pair.

    The spread is defined as base close / quote close; every rolling
    statistic is computed only on bars where both legs have data.

    Args:
        pair: Trading pair
        asset_data: Dictionary of OHLCV DataFrames by symbol
        on_chain_data: Optional on-chain data (funding, inflows)

    Returns:
        DataFrame with features, or None if insufficient data
        (a missing leg, or fewer than 200 aligned bars)
    """
    base = pair.base_asset
    quote = pair.quote_asset
    # Both legs must have been loaded; otherwise the pair is unusable.
    if base not in asset_data or quote not in asset_data:
        return None
    df_base = asset_data[base]
    df_quote = asset_data[quote]
    # Align indices: keep only timestamps present in BOTH legs.
    common_idx = df_base.index.intersection(df_quote.index)
    if len(common_idx) < 200:
        logger.debug("Pair %s: insufficient aligned data", pair.name)
        return None
    df_a = df_base.loc[common_idx]
    df_b = df_quote.loc[common_idx]
    # Calculate spread (base / quote)
    spread = df_a['close'] / df_b['close']
    # Z-Score of the spread over the configured rolling window.
    z_window = self.config.z_window
    rolling_mean = spread.rolling(window=z_window).mean()
    rolling_std = spread.rolling(window=z_window).std()
    z_score = (spread - rolling_mean) / rolling_std
    # Spread Technicals (RSI from the third-party `ta` package).
    spread_rsi = ta.momentum.RSIIndicator(spread, window=14).rsi()
    spread_roc = spread.pct_change(periods=5) * 100
    # NOTE(review): the "_1h" suffix assumes one bar == one hour; this
    # is just the 1-bar change — confirm against config.timeframe.
    spread_change_1h = spread.pct_change(periods=1)
    # Volume Analysis (1e-10 epsilon guards against division by zero).
    vol_ratio = df_a['volume'] / (df_b['volume'] + 1e-10)
    vol_ratio_ma = vol_ratio.rolling(window=12).mean()
    vol_ratio_rel = vol_ratio / (vol_ratio_ma + 1e-10)
    # Volatility of each leg's returns over the z-score window.
    ret_a = df_a['close'].pct_change()
    ret_b = df_b['close'].pct_change()
    vol_a = ret_a.rolling(window=z_window).std()
    vol_b = ret_b.rolling(window=z_window).std()
    vol_spread_ratio = vol_a / (vol_b + 1e-10)
    # Realized Volatility (for dynamic SL/TP)
    realized_vol_a = ret_a.rolling(window=self.config.volatility_window).std()
    realized_vol_b = ret_b.rolling(window=self.config.volatility_window).std()
    # ATR (Average True Range) for dynamic stops
    # ATR = average of max(high-low, |high-prev_close|, |low-prev_close|)
    high_a, low_a, close_a = df_a['high'], df_a['low'], df_a['close']
    high_b, low_b, close_b = df_b['high'], df_b['low'], df_b['close']
    # True Range for base asset (row-wise max across the three spans).
    tr_a = pd.concat([
        high_a - low_a,
        (high_a - close_a.shift(1)).abs(),
        (low_a - close_a.shift(1)).abs()
    ], axis=1).max(axis=1)
    atr_a = tr_a.rolling(window=self.config.atr_period).mean()
    # True Range for quote asset
    tr_b = pd.concat([
        high_b - low_b,
        (high_b - close_b.shift(1)).abs(),
        (low_b - close_b.shift(1)).abs()
    ], axis=1).max(axis=1)
    atr_b = tr_b.rolling(window=self.config.atr_period).mean()
    # ATR as percentage of price (normalized)
    atr_pct_a = atr_a / close_a
    atr_pct_b = atr_b / close_b
    # Build feature DataFrame on the aligned index.
    features = pd.DataFrame(index=common_idx)
    features['pair_id'] = pair.pair_id
    features['base_asset'] = base
    features['quote_asset'] = quote
    # Price data (for reference, not features)
    features['spread'] = spread
    features['base_close'] = df_a['close']
    features['quote_close'] = df_b['close']
    features['base_volume'] = df_a['volume']
    # Core Features
    features['z_score'] = z_score
    features['spread_rsi'] = spread_rsi
    features['spread_roc'] = spread_roc
    features['spread_change_1h'] = spread_change_1h
    features['vol_ratio'] = vol_ratio
    features['vol_ratio_rel'] = vol_ratio_rel
    features['vol_diff_ratio'] = vol_spread_ratio
    # Volatility for SL/TP
    features['realized_vol_base'] = realized_vol_a
    features['realized_vol_quote'] = realized_vol_b
    features['realized_vol_avg'] = (realized_vol_a + realized_vol_b) / 2
    # ATR for dynamic stops (in price units and as percentage)
    features['atr_base'] = atr_a
    features['atr_quote'] = atr_b
    features['atr_pct_base'] = atr_pct_a
    features['atr_pct_quote'] = atr_pct_b
    features['atr_pct_avg'] = (atr_pct_a + atr_pct_b) / 2
    # Pair encoding (for universal model)
    # Using base and quote indices for hierarchical encoding;
    # -1 marks an asset outside the configured universe.
    assets = self.config.assets
    features['base_idx'] = assets.index(base) if base in assets else -1
    features['quote_idx'] = assets.index(quote) if quote in assets else -1
    # Add funding and on-chain features
    # Funding data is always added from self._funding_data (OKX, all 10 assets)
    # On-chain data is optional (CryptoQuant, BTC/ETH only)
    features = self._add_on_chain_features(
        features, on_chain_data, base, quote
    )
    # Drop rows with NaN in core features only (not funding/on-chain):
    # rolling windows leave NaNs at the start of the history.
    core_cols = [
        'z_score', 'spread_rsi', 'spread_roc', 'spread_change_1h',
        'vol_ratio', 'vol_ratio_rel', 'vol_diff_ratio',
        'realized_vol_base', 'realized_vol_quote', 'realized_vol_avg',
        'atr_base', 'atr_pct_base'  # ATR is core for SL/TP
    ]
    features = features.dropna(subset=core_cols)
    # Fill missing funding/on-chain features with 0 (neutral)
    optional_cols = [
        'base_funding', 'quote_funding', 'funding_diff', 'funding_avg',
        'base_inflow', 'quote_inflow', 'inflow_ratio'
    ]
    for col in optional_cols:
        if col in features.columns:
            features[col] = features[col].fillna(0)
    return features
def calculate_all_pair_features(
    self,
    pairs: list[TradingPair],
    asset_data: dict[str, pd.DataFrame],
    on_chain_data: pd.DataFrame | None = None
) -> dict[str, pd.DataFrame]:
    """
    Calculate features for all pairs.

    Pairs for which calculate_pair_features returns None or an empty
    frame are omitted from the result.

    Args:
        pairs: List of trading pairs
        asset_data: Dictionary of OHLCV DataFrames
        on_chain_data: Optional on-chain data

    Returns:
        Dictionary mapping pair_id to feature DataFrame
    """
    results = {
        pair.pair_id: feats
        for pair in pairs
        if (feats := self.calculate_pair_features(
            pair, asset_data, on_chain_data
        )) is not None and len(feats) > 0
    }
    logger.info(
        "Calculated features for %d/%d pairs",
        len(results), len(pairs)
    )
    return results
def get_combined_features(
self,
pair_features: dict[str, pd.DataFrame],
timestamp: pd.Timestamp | None = None
) -> pd.DataFrame:
"""
Combine all pair features into a single DataFrame.
Useful for batch model prediction across all pairs.
Args:
pair_features: Dictionary of feature DataFrames by pair_id
timestamp: Optional specific timestamp to filter to
Returns:
Combined DataFrame with all pairs as rows
"""
if not pair_features:
return pd.DataFrame()
if timestamp is not None:
# Get latest row from each pair at or before timestamp
rows = []
for pair_id, features in pair_features.items():
valid = features[features.index <= timestamp]
if len(valid) > 0:
row = valid.iloc[-1:].copy()
rows.append(row)
if rows:
return pd.concat(rows, ignore_index=False)
return pd.DataFrame()
# Combine all features (for training)
return pd.concat(pair_features.values(), ignore_index=False)
def _add_on_chain_features(
self,
features: pd.DataFrame,
on_chain_data: pd.DataFrame | None,
base_asset: str,
quote_asset: str
) -> pd.DataFrame:
"""
Add on-chain and funding rate features for the pair.
Uses funding data from OKX (all 10 assets) and on-chain data
from CryptoQuant (BTC/ETH only for inflows).
"""
base_short = base_asset.replace('-USDT', '').lower()
quote_short = quote_asset.replace('-USDT', '').lower()
# Add funding rates from cached funding data
if self._funding_data is not None and not self._funding_data.empty:
funding_aligned = self._funding_data.reindex(
features.index, method='ffill'
)
base_funding_col = f'{base_short}_funding'
quote_funding_col = f'{quote_short}_funding'
if base_funding_col in funding_aligned.columns:
features['base_funding'] = funding_aligned[base_funding_col]
if quote_funding_col in funding_aligned.columns:
features['quote_funding'] = funding_aligned[quote_funding_col]
# Funding difference (positive = base has higher funding)
if 'base_funding' in features.columns and 'quote_funding' in features.columns:
features['funding_diff'] = (
features['base_funding'] - features['quote_funding']
)
# Funding sentiment: average of both assets
features['funding_avg'] = (
features['base_funding'] + features['quote_funding']
) / 2
# Add on-chain features from CryptoQuant (BTC/ETH only)
if on_chain_data is not None and not on_chain_data.empty:
cq_aligned = on_chain_data.reindex(features.index, method='ffill')
# Inflows (only available for BTC/ETH)
base_inflow_col = f'{base_short}_inflow'
quote_inflow_col = f'{quote_short}_inflow'
if base_inflow_col in cq_aligned.columns:
features['base_inflow'] = cq_aligned[base_inflow_col]
if quote_inflow_col in cq_aligned.columns:
features['quote_inflow'] = cq_aligned[quote_inflow_col]
if 'base_inflow' in features.columns and 'quote_inflow' in features.columns:
features['inflow_ratio'] = (
features['base_inflow'] /
(features['quote_inflow'] + 1)
)
return features
def get_feature_columns(self) -> list[str]:
    """
    Get list of feature columns for ML model.

    Excludes metadata and target-related columns. Order matters: the
    model is trained and queried with columns in exactly this order.

    Returns:
        List of feature column names
    """
    return [
        # Core spread/volatility features (always present)
        'z_score', 'spread_rsi', 'spread_roc', 'spread_change_1h',
        'vol_ratio', 'vol_ratio_rel', 'vol_diff_ratio',
        'realized_vol_base', 'realized_vol_quote', 'realized_vol_avg',
        # Pair-identity encoding for the universal model
        'base_idx', 'quote_idx',
        # Funding features (now available for all 10 assets via OKX)
        'base_funding', 'quote_funding', 'funding_diff', 'funding_avg',
        # On-chain features (BTC/ETH only via CryptoQuant)
        'base_inflow', 'quote_inflow', 'inflow_ratio',
    ]