Files
lowkey_backtest/strategies/regime_strategy.py
Simon Moisy 1e4cb87da3 Add check_symbols.py for ETH perpetuals filtering and enhance backtester with size handling
- Introduced `check_symbols.py` to load and filter ETH perpetual markets from the OKX exchange using CCXT.
- Updated the backtester to normalize signals to a 5-tuple format, incorporating size management for trades.
- Enhanced portfolio functions to support variable size and leverage adjustments based on initial capital.
- Added a new method in `CryptoQuantClient` for chunked historical data fetching to avoid API limits.
- Improved market symbol normalization in `market.py` to handle different formats.
- Updated regime strategy parameters based on recent research findings for optimal performance.
2026-01-14 09:46:51 +08:00

366 lines
16 KiB
Python

import pandas as pd
import numpy as np
import ta
import vectorbt as vbt
from sklearn.ensemble import RandomForestClassifier
from strategies.base import BaseStrategy
from engine.market import MarketType
from engine.data_manager import DataManager
from engine.logging_config import get_logger
# Module-level logger, obtained from the project's logging helper and keyed by this module's name.
logger = get_logger(__name__)
class RegimeReversionStrategy(BaseStrategy):
    """
    ML-based regime detection and mean-reversion strategy.

    Logic:
    1. Tracks the ETH/BTC spread (ETH close divided by BTC close) and its
       rolling Z-score over `z_window` bars.
    2. Uses a Random Forest model to predict whether an extreme Z-score will
       revert by at least `profit_target` within `horizon` bars.
    3. Features: spread technicals (RSI, ROC), volume/volatility ratios and,
       when the CSV is available, on-chain columns (funding, inflow).
    4. Entry: model probability > 0.5 AND |Z| > `z_entry_threshold`
       (Z high -> short ETH, Z low -> long ETH), optionally gated by a BTC
       SMA trend filter and a funding-rate filter.
    5. Exit: Z-score crossing back through 0.

    Walk-forward training:
    - Trains on the first `train_ratio` share of the feature rows.
    - Generates entry signals only for the remaining (test) rows.
    - Exit signals are derived from the Z-score over the whole feature range.

    NOTE(review): `model_path`, `stop_loss`, `take_profit` and
    `train_end_idx` are stored on the instance but are not read or written
    by any method visible in this class - presumably they are consumed
    elsewhere (e.g. by the backtester); confirm.
    """

    # Optimal parameters from walk-forward research (2025-10 to 2025-12)
    # Research: research/horizon_optimization_results.csv
    OPTIMAL_HORIZON = 102  # 4.25 days - best Net PnL (+232%)
    OPTIMAL_Z_WINDOW = 24  # 24h rolling window for spread Z-score
    OPTIMAL_TRAIN_RATIO = 0.7  # 70% train / 30% test split
    OPTIMAL_PROFIT_TARGET = 0.005  # 0.5% profit threshold for target definition
    OPTIMAL_Z_ENTRY = 1.0  # Enter when |Z| > 1.0

    def __init__(self,
                 model_path: str = "data/regime_model.pkl",
                 horizon: int = OPTIMAL_HORIZON,
                 z_window: int = OPTIMAL_Z_WINDOW,
                 z_entry_threshold: float = OPTIMAL_Z_ENTRY,
                 profit_target: float = OPTIMAL_PROFIT_TARGET,
                 stop_loss: float = 0.06,  # 6% - accommodates 1.95% avg MAE
                 take_profit: float = 0.05,  # 5% swing target
                 train_ratio: float = OPTIMAL_TRAIN_RATIO,
                 trend_window: int = 0,  # Disable SMA filter
                 use_funding_filter: bool = True,  # Enable Funding Rate filter
                 funding_threshold: float = 0.005  # original note: "tightened to 0.005%" - confirm units
                 ):
        """
        Store the strategy configuration and create the data manager.

        Args:
            model_path: Path kept on the instance (not used in this class).
            horizon: Look-ahead window, in bars, used to label training rows.
            z_window: Rolling window, in bars, for the spread Z-score and
                the return-volatility features.
            z_entry_threshold: Minimum |Z| required for an entry.
            profit_target: Fractional spread move that counts as a
                successful reversion when labelling training rows.
            stop_loss: Stored only; not applied in this class.
            take_profit: Stored only; not applied in this class.
            train_ratio: Share of feature rows used for training.
            trend_window: BTC SMA length for the trend filter; 0 disables it.
            use_funding_filter: Whether to gate entries on `btc_funding`.
            funding_threshold: Absolute funding level beyond which one side
                is blocked; compared directly against the `btc_funding`
                column values.
        """
        super().__init__()
        self.model_path = model_path
        self.horizon = horizon
        self.z_window = z_window
        self.z_entry_threshold = z_entry_threshold
        self.profit_target = profit_target
        self.stop_loss = stop_loss
        self.take_profit = take_profit
        self.train_ratio = train_ratio
        self.trend_window = trend_window
        self.use_funding_filter = use_funding_filter
        self.funding_threshold = funding_threshold

        # Default Strategy Config
        self.default_market_type = MarketType.PERPETUAL
        self.default_leverage = 1

        self.dm = DataManager()
        self.model = None  # trained lazily on the first call to run()
        self.feature_cols = None  # column order the model was fitted on
        self.train_end_idx = None  # reserved for the training cutoff point

    def run(self, close, **kwargs):
        """
        Execute the strategy logic.

        The strategy is assumed to run on ETH-USDT (the active asset) on 1h
        candles; BTC-USDT perpetual data is loaded internally to build the
        spread.

        Args:
            close: ETH close prices as a pandas Series whose index defines
                the output index.
            **kwargs: Optional extra series; only `volume` is read.

        Returns:
            Tuple `(long_entries, long_exits, short_entries, short_exits,
            size)`, each aligned to `close.index`. When context data cannot
            be loaded or there are not enough feature rows to split, all
            four signal series are the empty signals from
            `create_empty_signals` and `size` is 1.0 everywhere.
        """
        # 1. Load BTC context aligned with the incoming ETH 'close' series.
        logger.info("Fetching BTC context data...")
        try:
            # Research was done on 1h candles, so strategy should be run on 1h.
            # Use PERPETUAL data to match the trading instrument (ETH Perp).
            df_btc = self.dm.load_data("okx", "BTC-USDT", "1h", MarketType.PERPETUAL)
            df_btc = df_btc.reindex(close.index, method='ffill')
            btc_close = df_btc['close']
        except Exception as e:
            logger.error(f"Failed to load BTC context: {e}")
            return self._no_trade_signals(close)

        # 2. Build the two frames consumed by prepare_features.
        # Only close and volume are needed for the current feature set.
        eth_vol = kwargs.get('volume')
        if eth_vol is None:
            logger.warning("Volume data missing. Feature calculation might fail.")
            eth_vol = pd.Series(0, index=close.index)

        df_a = pd.DataFrame({'close': btc_close, 'volume': df_btc['volume']})
        df_b = pd.DataFrame({'close': close, 'volume': eth_vol})

        # 3. Load on-chain data (CryptoQuant) from the saved CSV.
        cq_df = self._load_onchain_data()

        # 4. Calculate features.
        features = self.prepare_features(df_a, df_b, cq_df)

        # 5. Walk-forward split: train on the first `train_ratio` of rows.
        n_samples = len(features)
        train_size = int(n_samples * self.train_ratio)
        if train_size <= 0 or train_size >= n_samples:
            # Either no usable feature rows (e.g. everything dropped as NaN)
            # or one side of the split is empty; nothing can be traded.
            logger.error(
                f"Not enough feature rows for walk-forward split "
                f"(rows={n_samples}, train={train_size}). No signals generated."
            )
            return self._no_trade_signals(close)

        train_features = features.iloc[:train_size]
        test_features = features.iloc[train_size:]
        train_end_date = train_features.index[-1]
        test_start_date = test_features.index[0]
        logger.info(
            f"Walk-Forward Split: Train={len(train_features)} bars "
            f"(until {train_end_date.strftime('%Y-%m-%d')}), "
            f"Test={len(test_features)} bars "
            f"(from {test_start_date.strftime('%Y-%m-%d')})"
        )

        # 6. Train model on the training period ONLY (once per instance).
        if self.model is None:
            logger.info("Training Regime Model on training period only...")
            self.model, self.feature_cols = self.train_model(train_features)

        # 7. Predict on the TEST period ONLY, with the same cleaning as training.
        X_test = test_features[self.feature_cols].fillna(0)
        X_test = X_test.replace([np.inf, -np.inf], 0)
        probs = self._positive_class_proba(X_test)

        # 8. Entry signals (test period only).
        # Z > threshold (spread high, ETH expensive) -> short ETH
        # Z < -threshold (spread low, ETH cheap)     -> long ETH
        z_thresh = self.z_entry_threshold
        z_test = test_features['z_score'].values
        is_confident = probs > 0.5
        raw_short = is_confident & (z_test > z_thresh)
        raw_long = is_confident & (z_test < -z_thresh)
        short_signal_test = raw_short
        long_signal_test = raw_long

        # 8b. Trend filter: long only if BTC > SMA, short only if BTC < SMA.
        if self.trend_window > 0:
            # SMA is computed on the full BTC history, then cut to the test rows.
            btc_sma = btc_close.rolling(window=self.trend_window).mean()
            test_btc_close = btc_close.reindex(test_features.index)
            test_btc_sma = btc_sma.reindex(test_features.index)
            is_bull = (test_btc_close > test_btc_sma).values
            is_bear = (test_btc_close < test_btc_sma).values
            long_signal_test = long_signal_test & is_bull
            short_signal_test = short_signal_test & is_bear

        # 8c. Funding-rate filter.
        # High positive funding: longs pay shorts -> block longs.
        # Strongly negative funding: shorts pay longs -> block shorts.
        if self.use_funding_filter and 'btc_funding' in test_features.columns:
            funding = test_features['btc_funding'].values
            thresh = self.funding_threshold
            is_overheated = funding > thresh
            is_oversold = funding < -thresh

            long_signal_test = long_signal_test & (~is_overheated)
            short_signal_test = short_signal_test & (~is_oversold)

            # Counts are taken against the unfiltered model/Z signals.
            n_blocked_long = (is_overheated & raw_long).sum()
            n_blocked_short = (is_oversold & raw_short).sum()
            if n_blocked_long > 0 or n_blocked_short > 0:
                logger.info(f"Funding Filter: Blocked {n_blocked_long} Longs, {n_blocked_short} Shorts")

        # 9. Probability-based position sizing.
        # size = 1.0 + (prob - 0.5) * 2, clipped to [1.0, 2.0]
        # e.g. prob=0.6 -> 1.2, prob=0.8 -> 1.6; bars outside the test
        # period get prob 0.0 and therefore the floor of 1.0.
        test_idx = test_features.index
        probs_aligned = pd.Series(probs, index=test_idx).reindex(close.index, fill_value=0.0)
        size = (1.0 + (probs_aligned - 0.5) * 2.0).clip(lower=1.0, upper=2.0)

        # Full-length entry series: False everywhere outside the test period.
        long_entries = (
            pd.Series(long_signal_test, index=test_idx)
            .reindex(close.index, fill_value=False)
            .astype(bool)
        )
        short_entries = (
            pd.Series(short_signal_test, index=test_idx)
            .reindex(close.index, fill_value=False)
            .astype(bool)
        )

        # 10. Exits: Z-score crossing back through 0 (mean reversion complete).
        # Bars without a feature row get Z = 0, i.e. no exit signal.
        z_reindexed = features['z_score'].reindex(close.index, fill_value=0)
        long_exits = z_reindexed > 0
        short_exits = z_reindexed < 0

        # Log signal counts for verification
        n_long = long_entries.sum()
        n_short = short_entries.sum()
        logger.info(f"Generated {n_long} long signals, {n_short} short signals (test period only)")

        return long_entries, long_exits, short_entries, short_exits, size

    def _no_trade_signals(self, close):
        """Return the 5-tuple used when no signals can be produced (size fixed at 1.0)."""
        empty = self.create_empty_signals(close)
        neutral_size = pd.Series(1.0, index=close.index)
        return empty, empty, empty, empty, neutral_size

    def _load_onchain_data(self, cq_path="data/cq_training_data.csv"):
        """Load the CryptoQuant CSV indexed by UTC timestamp, or return None on any failure."""
        try:
            cq_df = pd.read_csv(cq_path, index_col='timestamp', parse_dates=True)
            if cq_df.index.tz is None:
                cq_df.index = cq_df.index.tz_localize('UTC')
        except FileNotFoundError:
            logger.warning("CryptoQuant data not found. Running without on-chain features.")
            return None
        except Exception as e:
            # Malformed file, missing 'timestamp' column, unparseable dates, ...
            # Never hand a half-loaded frame to feature engineering.
            logger.warning(
                f"CryptoQuant data could not be loaded ({e}). Running without on-chain features."
            )
            return None
        return cq_df

    def _positive_class_proba(self, X):
        """Return P(class == 1) per row, or zeros if the model never saw class 1."""
        proba = self.model.predict_proba(X)
        classes = list(self.model.classes_)
        if 1 not in classes:
            # Training window had no successful reversions, so predict_proba
            # has no column for the positive class.
            logger.warning("Model was trained without positive samples; no entries will be generated.")
            return np.zeros(len(X))
        return proba[:, classes.index(1)]

    def prepare_features(self, df_btc, df_eth, cq_df=None):
        """
        Build the feature frame used for both training and inference.

        Args:
            df_btc: Frame with `close` and `volume` columns for BTC.
            df_eth: Frame with `close` and `volume` columns for ETH.
            cq_df: Optional on-chain frame; forward-filled onto the feature
                index and joined as extra columns.

        Returns:
            DataFrame indexed by the timestamps common to both inputs, with
            every row containing a NaN removed. Infinite values are kept.
        """
        # Restrict both frames to their shared timestamps.
        common = df_btc.index.intersection(df_eth.index)
        df_a = df_btc.loc[common].copy()
        df_b = df_eth.loc[common].copy()

        # Spread = ETH close / BTC close, and its rolling Z-score.
        spread = df_b['close'] / df_a['close']
        rolling_mean = spread.rolling(window=self.z_window).mean()
        rolling_std = spread.rolling(window=self.z_window).std()
        z_score = (spread - rolling_mean) / rolling_std

        # Spread technicals.
        spread_rsi = ta.momentum.RSIIndicator(spread, window=14).rsi()
        spread_roc = spread.pct_change(periods=5) * 100
        spread_change_1h = spread.pct_change(periods=1)

        # Relative volume (ETH over BTC) and its 12-bar mean.
        vol_ratio = df_b['volume'] / df_a['volume']
        vol_ratio_ma = vol_ratio.rolling(window=12).mean()

        # Relative return volatility (ETH over BTC) over z_window bars.
        ret_a = df_a['close'].pct_change()
        ret_b = df_b['close'].pct_change()
        vol_a = ret_a.rolling(window=self.z_window).std()
        vol_b = ret_b.rolling(window=self.z_window).std()
        vol_spread_ratio = vol_b / vol_a

        features = pd.DataFrame(index=spread.index)
        features['spread'] = spread
        features['z_score'] = z_score
        features['spread_rsi'] = spread_rsi
        features['spread_roc'] = spread_roc
        features['spread_change_1h'] = spread_change_1h
        features['vol_ratio'] = vol_ratio
        features['vol_ratio_rel'] = vol_ratio / vol_ratio_ma
        features['vol_diff_ratio'] = vol_spread_ratio

        # On-chain merge: forward-fill onto the feature index, add derived columns.
        if cq_df is not None:
            cq_aligned = cq_df.reindex(features.index, method='ffill')
            if 'btc_funding' in cq_aligned.columns and 'eth_funding' in cq_aligned.columns:
                cq_aligned['funding_diff'] = cq_aligned['eth_funding'] - cq_aligned['btc_funding']
            if 'btc_inflow' in cq_aligned.columns and 'eth_inflow' in cq_aligned.columns:
                # +1 in the denominator avoids division by zero inflow.
                cq_aligned['inflow_ratio'] = cq_aligned['eth_inflow'] / (cq_aligned['btc_inflow'] + 1)
            features = features.join(cq_aligned)

        return features.dropna()

    def train_model(self, train_features):
        """
        Train the Random Forest on the training subset only.

        A row is labelled 1 when it is an entry candidate (|Z| beyond the
        entry threshold) AND the spread moves by at least `profit_target`
        in the reverting direction within the next `horizon` rows; all other
        rows are labelled 0. Rows whose look-ahead window runs past the end
        of the training data are dropped, so no test-period data is used.

        Args:
            train_features: Feature frame for the training period only; must
                contain `spread` and `z_score`.

        Returns:
            Tuple `(model, cols)`: the fitted classifier and the list of
            feature column names it was trained on (all columns except
            `spread`).

        Raises:
            ValueError: If no row has a complete look-ahead window.
        """
        threshold = self.profit_target
        horizon = self.horizon
        z_thresh = self.z_entry_threshold

        spread = train_features['spread']
        z_score = train_features['z_score']

        # rolling(horizon) at row t+horizon covers rows t+1..t+horizon, so
        # shifting by -horizon yields the extreme over the NEXT `horizon` rows.
        future_min = spread.rolling(window=horizon).min().shift(-horizon)
        future_max = spread.rolling(window=horizon).max().shift(-horizon)

        # Short spread (Z high): did the spread fall below the target?
        success_short = (z_score > z_thresh) & (future_min < spread * (1 - threshold))
        # Long spread (Z low): did the spread rise above the target?
        success_long = (z_score < -z_thresh) & (future_max > spread * (1 + threshold))
        targets = (success_short | success_long).astype(int).to_numpy()

        model = RandomForestClassifier(
            n_estimators=300, max_depth=5, min_samples_leaf=30,
            class_weight={0: 1, 1: 3}, random_state=42
        )

        # Every column except the raw spread level is a feature.
        exclude = ['spread']
        cols = [c for c in train_features.columns if c not in exclude]

        # Clean features: NaN and +/-inf both become 0.
        X_train = train_features[cols].fillna(0)
        X_train = X_train.replace([np.inf, -np.inf], 0)

        # Keep only rows whose look-ahead window is fully inside the training data.
        valid_mask = future_min.notna().values & future_max.notna().values
        X_train_clean = X_train[valid_mask]
        targets_clean = targets[valid_mask]

        logger.info(f"Training on {len(X_train_clean)} valid samples (removed {len(X_train) - len(X_train_clean)} with incomplete future data)")

        if len(X_train_clean) == 0:
            raise ValueError(
                f"No training rows with a complete {horizon}-bar look-ahead window "
                f"(training rows={len(X_train)})."
            )

        model.fit(X_train_clean, targets_clean)
        return model, cols