- Deleted `install_cron.sh`, `setup_schedule.sh`, and `train_daily.sh` as part of the transition to a new scheduling mechanism.
- Removed the associated systemd service and timer files for daily model training.
- Updated `live_regime_strategy.py` and `main.py` to reflect changes in model training and scheduling logic.
- Adjusted `regime_strategy.py` to align with new target calculation methods and updated optimal parameters.
- Enhanced `regime_detection.py` to incorporate path-dependent labeling for target calculations.
408 lines
17 KiB
Python
408 lines
17 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
import ta
|
|
import vectorbt as vbt
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
|
|
from strategies.base import BaseStrategy
|
|
from engine.market import MarketType
|
|
from engine.data_manager import DataManager
|
|
from engine.logging_config import get_logger
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
class RegimeReversionStrategy(BaseStrategy):
    """
    ML-Based Regime Detection & Mean Reversion Strategy.

    Logic:
    1. Tracks the ETH/BTC spread (ETH close / BTC close) and its rolling
       Z-Score (`z_window` bars, 24 by default).
    2. Uses a Random Forest model to predict if an extreme Z-Score will revert profitably.
    3. Features: Spread Technicals (RSI, ROC) + On-Chain Flows (Inflow, Funding).
    4. Entry: When Model Probability > 0.5 and |Z| exceeds `z_entry_threshold`.
    5. Exit: Z-Score reversion through 0. The `stop_loss` / `take_profit`
       values are stored on the instance; within this class only `stop_loss`
       is used (for training labels).

    Walk-Forward Training:
    - Trains on first `train_ratio` of data (default 70%)
    - Generates signals only for remaining test period (30%)
    - Eliminates look-ahead bias for realistic backtest results
    """

    # Optimal parameters from walk-forward research (2025-10 to 2025-12)
    # Research: research/horizon_optimization_results.csv
    OPTIMAL_HORIZON = 54  # Updated from 102h based on corrected labeling
    OPTIMAL_Z_WINDOW = 24  # 24h rolling window for spread Z-score
    OPTIMAL_TRAIN_RATIO = 0.7  # 70% train / 30% test split
    OPTIMAL_PROFIT_TARGET = 0.005  # 0.5% profit threshold for target definition
    OPTIMAL_Z_ENTRY = 1.0  # Enter when |Z| > 1.0

    def __init__(self,
                 model_path: str = "data/regime_model.pkl",
                 horizon: int = OPTIMAL_HORIZON,
                 z_window: int = OPTIMAL_Z_WINDOW,
                 z_entry_threshold: float = OPTIMAL_Z_ENTRY,
                 profit_target: float = OPTIMAL_PROFIT_TARGET,
                 stop_loss: float = 0.06,  # 6% - accommodates 1.95% avg MAE
                 take_profit: float = 0.05,  # 5% swing target
                 train_ratio: float = OPTIMAL_TRAIN_RATIO,
                 trend_window: int = 0,  # Disable SMA filter
                 use_funding_filter: bool = True,  # Enable Funding Rate filter
                 funding_threshold: float = 0.005  # Tightened to 0.005%
                 ):
        """
        Store the strategy configuration and create the data manager.

        Args:
            model_path: Path stored on the instance; not read by this class.
            horizon: Number of future bars inspected when labelling training rows.
            z_window: Rolling window (bars) for the spread Z-score and volatility.
            z_entry_threshold: Absolute Z-score a bar must exceed to be an entry candidate.
            profit_target: Fractional spread move that counts as a profitable label.
            stop_loss: Fractional adverse spread move that invalidates a label.
            take_profit: Stored on the instance; not used by this class.
            train_ratio: Fraction of the feature rows used for training.
            trend_window: BTC SMA window for the trend filter; 0 disables it.
            use_funding_filter: Whether to block entries on extreme funding.
            funding_threshold: Compared directly against the `btc_funding`
                column (NOTE(review): units depend on the CSV - confirm).
        """
        super().__init__()
        self.model_path = model_path
        self.horizon = horizon
        self.z_window = z_window
        self.z_entry_threshold = z_entry_threshold
        self.profit_target = profit_target
        self.stop_loss = stop_loss
        self.take_profit = take_profit
        self.train_ratio = train_ratio
        self.trend_window = trend_window
        self.use_funding_filter = use_funding_filter
        self.funding_threshold = funding_threshold

        # Default Strategy Config
        self.default_market_type = MarketType.PERPETUAL
        self.default_leverage = 1

        self.dm = DataManager()
        self.model = None
        self.feature_cols = None
        self.train_end_idx = None  # Will store the training cutoff point

    def _empty_result(self, close):
        """Return the 5-tuple `run` produces when no signal can be generated."""
        empty = self.create_empty_signals(close)
        # 1.0 is the size `run` assigns to every bar without a prediction,
        # so the fallback has the same shape and meaning as the normal path.
        neutral_size = pd.Series(1.0, index=close.index)
        return empty, empty, empty, empty, neutral_size

    @staticmethod
    def _load_onchain_data():
        """Load the saved CryptoQuant CSV, or return None if it cannot be read."""
        # We use the saved CSV for training/inference.
        # In a live setting, this would query the API for recent data.
        try:
            cq_path = "data/cq_training_data.csv"
            cq_df = pd.read_csv(cq_path, index_col='timestamp', parse_dates=True)
            if cq_df.index.tz is None:
                cq_df.index = cq_df.index.tz_localize('UTC')
            return cq_df
        except Exception:
            # Deliberate best-effort: the strategy still runs on price features.
            logger.warning("CryptoQuant data not found. Running without on-chain features.")
            return None

    def run(self, close, **kwargs):
        """
        Execute the strategy logic.

        We assume this strategy is run on ETH-USDT (the active asset).
        We will fetch BTC-USDT internally to calculate the spread.

        Args:
            close: ETH close prices indexed by timestamp.
            **kwargs: Optional extra series; only `volume` is read.

        Returns:
            Tuple `(long_entries, long_exits, short_entries, short_exits, size)`.
            Entries are False outside the walk-forward test period. The same
            5-tuple shape is returned when BTC data cannot be loaded or when
            there are too few feature rows to split.
        """
        # 1. Load BTC context aligned with the incoming ETH 'close' series
        logger.info("Fetching BTC context data...")
        try:
            # Load BTC data (Context) - Must match the timeframe of the backtest
            # Research was done on 1h candles, so strategy should be run on 1h
            # Use PERPETUAL data to match the trading instrument (ETH Perp)
            df_btc = self.dm.load_data("okx", "BTC-USDT", "1h", MarketType.PERPETUAL)

            # Align BTC to ETH (close)
            df_btc = df_btc.reindex(close.index, method='ffill')
            btc_close = df_btc['close']
        except Exception as e:
            logger.error(f"Failed to load BTC context: {e}")
            return self._empty_result(close)

        # 2. Construct DataFrames for Feature Engineering
        # 'run' primarily receives 'close'; volume arrives via kwargs if the
        # backtester passes it.
        eth_vol = kwargs.get('volume')
        if eth_vol is None:
            logger.warning("Volume data missing. Feature calculation might fail.")
            # With zero volume the relative-volume feature is NaN on every row,
            # so prepare_features returns no rows; the guard below handles it.
            eth_vol = pd.Series(0, index=close.index)

        # We only really need Close and Volume for the current feature set
        df_a = pd.DataFrame({'close': btc_close, 'volume': df_btc['volume']})
        df_b = pd.DataFrame({'close': close, 'volume': eth_vol})

        # 3. Load On-Chain Data (CryptoQuant)
        cq_df = self._load_onchain_data()

        # 4. Calculate Features
        features = self.prepare_features(df_a, df_b, cq_df)

        # 5. Walk-Forward Split
        # Train on first `train_ratio` of data, test on remainder
        n_samples = len(features)
        train_size = int(n_samples * self.train_ratio)

        if train_size == 0 or train_size >= n_samples:
            # Either side of the split would be empty; indexing [-1] / [0]
            # below would raise IndexError, so report and emit no signals.
            logger.warning(
                f"Not enough feature rows for a walk-forward split "
                f"(rows={n_samples}, train={train_size}). No signals generated."
            )
            return self._empty_result(close)

        train_features = features.iloc[:train_size]
        test_features = features.iloc[train_size:]

        train_end_date = train_features.index[-1]
        test_start_date = test_features.index[0]

        logger.info(
            f"Walk-Forward Split: Train={len(train_features)} bars "
            f"(until {train_end_date.strftime('%Y-%m-%d')}), "
            f"Test={len(test_features)} bars "
            f"(from {test_start_date.strftime('%Y-%m-%d')})"
        )

        # 6. Train Model on Training Period ONLY
        if self.model is None:
            logger.info("Training Regime Model on training period only...")
            self.model, self.feature_cols = self.train_model(train_features)

        # 7. Predict on TEST Period ONLY (same cleaning as in training)
        X_test = test_features[self.feature_cols].fillna(0)
        X_test = X_test.replace([np.inf, -np.inf], 0)

        # Look up the column for class 1 instead of assuming it is column 1:
        # a model fitted on a single class has only one probability column.
        model_classes = list(self.model.classes_)
        if 1 in model_classes:
            probs = self.model.predict_proba(X_test)[:, model_classes.index(1)]
        else:
            probs = np.zeros(len(X_test))

        # 8. Generate Entry Signals (TEST period only)
        # If Z > threshold (Spread High, ETH Expensive) -> Short ETH
        # If Z < -threshold (Spread Low, ETH Cheap) -> Long ETH
        z_thresh = self.z_entry_threshold
        z_test = test_features['z_score'].values
        confident = probs > 0.5

        short_candidates = confident & (z_test > z_thresh)
        long_candidates = confident & (z_test < -z_thresh)

        short_signal_test = short_candidates
        long_signal_test = long_candidates

        # 8b. Apply Trend Filter (Macro Regime)
        # Rule: Long only if BTC > SMA (Bull), Short only if BTC < SMA (Bear)
        if self.trend_window > 0:
            # Calculate SMA on full BTC history first
            btc_sma = btc_close.rolling(window=self.trend_window).mean()

            # Align with test period
            test_btc_close = btc_close.reindex(test_features.index)
            test_btc_sma = btc_sma.reindex(test_features.index)

            # Define Regimes
            is_bull = (test_btc_close > test_btc_sma).values
            is_bear = (test_btc_close < test_btc_sma).values

            # Apply Filter
            long_signal_test = long_signal_test & is_bull
            short_signal_test = short_signal_test & is_bear

        # 8c. Apply Funding Rate Filter
        # Positive funding means longs pay shorts: a long would pay the yield
        # and high funding often marks a top, so longs are blocked.
        # Negative funding is the mirror case (short-squeeze risk), so shorts
        # are blocked.
        if self.use_funding_filter and 'btc_funding' in test_features.columns:
            funding = test_features['btc_funding'].values
            thresh = self.funding_threshold

            is_overheated = funding > thresh
            is_oversold = funding < -thresh

            long_signal_test = long_signal_test & (~is_overheated)
            short_signal_test = short_signal_test & (~is_oversold)

            # Counts are taken against the raw model/Z candidates (before the
            # trend filter), matching what the funding rule alone would block.
            n_blocked_long = (is_overheated & long_candidates).sum()
            n_blocked_short = (is_oversold & short_candidates).sum()

            if n_blocked_long > 0 or n_blocked_short > 0:
                logger.info(f"Funding Filter: Blocked {n_blocked_long} Longs, {n_blocked_short} Shorts")

        # 9. Calculate Position Sizing (Probability-Based)
        # Base size = 1.0 (100% of equity)
        # Scale: 1.0 + (Prob - 0.5) * 2
        # Example: Prob=0.6 -> Size=1.2, Prob=0.8 -> Size=1.6
        test_idx = test_features.index
        probs_aligned = pd.Series(probs, index=test_idx).reindex(close.index, fill_value=0.0)

        dynamic_size = 1.0 + (probs_aligned - 0.5) * 2.0
        # Cap leverage between 1x and 2x (bars without a prediction get 1.0)
        size = dynamic_size.clip(lower=1.0, upper=2.0)

        # Full-length entry series: False everywhere outside the test period
        long_entries = pd.Series(long_signal_test, index=test_idx).reindex(
            close.index, fill_value=False
        )
        short_entries = pd.Series(short_signal_test, index=test_idx).reindex(
            close.index, fill_value=False
        )

        # 10. Generate Exits
        # Exit when Z-Score crosses back through 0 (mean reversion complete)
        z_reindexed = features['z_score'].reindex(close.index, fill_value=0)

        # Exit Long when Z > 0, Exit Short when Z < 0
        long_exits = z_reindexed > 0
        short_exits = z_reindexed < 0

        # Log signal counts for verification
        n_long = long_entries.sum()
        n_short = short_entries.sum()
        logger.info(f"Generated {n_long} long signals, {n_short} short signals (test period only)")

        return long_entries, long_exits, short_entries, short_exits, size

    def prepare_features(self, df_btc, df_eth, cq_df=None):
        """
        Replicate research feature engineering.

        Args:
            df_btc: BTC frame with `close` and `volume` columns.
            df_eth: ETH frame with `close` and `volume` columns.
            cq_df: Optional on-chain frame; forward-filled onto the price index.

        Returns:
            DataFrame of features on the common index, with every row that
            contains a NaN removed (rolling warm-up rows, and rows where an
            on-chain column is missing).
        """
        # Align both assets on their shared timestamps
        common = df_btc.index.intersection(df_eth.index)
        df_a = df_btc.loc[common].copy()
        df_b = df_eth.loc[common].copy()

        # Spread: ETH priced in BTC
        spread = df_b['close'] / df_a['close']

        # Z-Score of the spread over the rolling window
        rolling_mean = spread.rolling(window=self.z_window).mean()
        rolling_std = spread.rolling(window=self.z_window).std()
        z_score = (spread - rolling_mean) / rolling_std

        # Technicals
        spread_rsi = ta.momentum.RSIIndicator(spread, window=14).rsi()
        spread_roc = spread.pct_change(periods=5) * 100
        spread_change_1h = spread.pct_change(periods=1)

        # Volume
        vol_ratio = df_b['volume'] / df_a['volume']
        vol_ratio_ma = vol_ratio.rolling(window=12).mean()

        # Volatility
        ret_a = df_a['close'].pct_change()
        ret_b = df_b['close'].pct_change()
        vol_a = ret_a.rolling(window=self.z_window).std()
        vol_b = ret_b.rolling(window=self.z_window).std()
        vol_spread_ratio = vol_b / vol_a

        # Column order is preserved; it determines the model's feature order.
        features = pd.DataFrame(index=spread.index)
        features['spread'] = spread
        features['z_score'] = z_score
        features['spread_rsi'] = spread_rsi
        features['spread_roc'] = spread_roc
        features['spread_change_1h'] = spread_change_1h
        features['vol_ratio'] = vol_ratio
        features['vol_ratio_rel'] = vol_ratio / vol_ratio_ma
        features['vol_diff_ratio'] = vol_spread_ratio

        # CQ Merge
        if cq_df is not None:
            cq_aligned = cq_df.reindex(features.index, method='ffill')
            if 'btc_funding' in cq_aligned.columns and 'eth_funding' in cq_aligned.columns:
                cq_aligned['funding_diff'] = cq_aligned['eth_funding'] - cq_aligned['btc_funding']
            if 'btc_inflow' in cq_aligned.columns and 'eth_inflow' in cq_aligned.columns:
                # +1 keeps the ratio finite when BTC inflow is zero
                cq_aligned['inflow_ratio'] = cq_aligned['eth_inflow'] / (cq_aligned['btc_inflow'] + 1)
            features = features.join(cq_aligned)

        return features.dropna()

    @staticmethod
    def _first_touch_is_profit(hit_tp, hit_sl):
        """Return 1 if the profit target is touched before the stop, else 0."""
        if not np.any(hit_tp):
            return 0
        if not np.any(hit_sl):
            return 1
        # argmax on a boolean array gives the index of the first True
        return int(np.argmax(hit_tp) < np.argmax(hit_sl))

    def train_model(self, train_features):
        """
        Train Random Forest on training data only.

        This method receives ONLY the training subset of features,
        ensuring no look-ahead bias. The model learns from historical
        patterns and is then applied to unseen test data.

        Labels are path-dependent: a candidate row (|Z| above the entry
        threshold) is labelled 1 only if the spread reaches the profit target
        within `horizon` bars before it reaches the stop. All other rows are
        labelled 0.

        Args:
            train_features: DataFrame containing features for training period only

        Returns:
            Tuple `(model, cols)`: the fitted classifier and the list of
            feature column names it was trained on.

        Raises:
            ValueError: If the training set has no row with a complete
                `horizon`-bar look-ahead window.
        """
        threshold = self.profit_target
        stop_loss_pct = self.stop_loss
        horizon = self.horizon
        z_thresh = self.z_entry_threshold

        spread = train_features['spread'].values
        z_score = train_features['z_score'].values
        n = len(spread)

        # Rows with a full look-ahead window. Clamp at 0: a negative slice end
        # would otherwise select rows from the front when n < horizon.
        n_labelable = max(n - horizon, 0)
        if n_labelable == 0:
            raise ValueError(
                f"Need more than {horizon} training rows to label targets, got {n}"
            )

        targets = np.zeros(n, dtype=int)

        # Only iterate relevant rows for efficiency
        candidates = np.where((z_score > z_thresh) | (z_score < -z_thresh))[0]
        candidates = candidates[candidates < n_labelable]

        for i in candidates:
            entry_price = spread[i]
            future_prices = spread[i + 1: i + 1 + horizon]

            if z_score[i] > z_thresh:  # Short: profit when the spread falls
                hit_tp = future_prices <= entry_price * (1 - threshold)
                hit_sl = future_prices >= entry_price * (1 + stop_loss_pct)
            else:  # Long: profit when the spread rises
                hit_tp = future_prices >= entry_price * (1 + threshold)
                hit_sl = future_prices <= entry_price * (1 - stop_loss_pct)

            targets[i] = self._first_touch_is_profit(hit_tp, hit_sl)

        # Build model
        model = RandomForestClassifier(
            n_estimators=300, max_depth=5, min_samples_leaf=30,
            class_weight={0: 1, 1: 3}, random_state=42
        )

        # Exclude non-feature columns
        exclude = ['spread']
        cols = [c for c in train_features.columns if c not in exclude]

        # Clean features
        X_train = train_features[cols].fillna(0)
        X_train = X_train.replace([np.inf, -np.inf], 0)

        # Use rows where we had enough data to look ahead
        X_train_clean = X_train.iloc[:n_labelable]
        targets_clean = targets[:n_labelable]

        logger.info(f"Training on {len(X_train_clean)} valid samples (removed {len(X_train) - len(X_train_clean)} with incomplete future data)")

        model.fit(X_train_clean, targets_clean)
        return model, cols