Files
lowkey_backtest/research/regime_detection.py
Simon Moisy 1e4cb87da3 Add check_symbols.py for ETH perpetuals filtering and enhance backtester with size handling
- Introduced `check_symbols.py` to load and filter ETH perpetual markets from the OKX exchange using CCXT.
- Updated the backtester to normalize signals to a 5-tuple format, incorporating size management for trades.
- Enhanced portfolio functions to support variable size and leverage adjustments based on initial capital.
- Added a new method in `CryptoQuantClient` for chunked historical data fetching to avoid API limits.
- Improved market symbol normalization in `market.py` to handle different formats.
- Updated regime strategy parameters based on recent research findings for optimal performance.
2026-01-14 09:46:51 +08:00

343 lines
11 KiB
Python

"""
Regime Detection Research Script with Walk-Forward Training.
Tests multiple holding horizons to find optimal parameters
without look-ahead bias.
"""
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import pandas as pd
import numpy as np
import ta
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from engine.data_manager import DataManager
from engine.market import MarketType
from engine.logging_config import get_logger
logger = get_logger(__name__)
# Configuration
TRAIN_RATIO = 0.7 # 70% train, 30% test
PROFIT_THRESHOLD = 0.005 # 0.5% profit target
Z_WINDOW = 24
FEE_RATE = 0.001 # 0.1% round-trip fee
def load_data():
"""Load and align BTC/ETH data."""
dm = DataManager()
df_btc = dm.load_data("okx", "BTC-USDT", "1h", MarketType.SPOT)
df_eth = dm.load_data("okx", "ETH-USDT", "1h", MarketType.SPOT)
# Filter to Oct-Dec 2025
start = pd.Timestamp("2025-10-01", tz="UTC")
end = pd.Timestamp("2025-12-31", tz="UTC")
df_btc = df_btc[(df_btc.index >= start) & (df_btc.index <= end)]
df_eth = df_eth[(df_eth.index >= start) & (df_eth.index <= end)]
# Align indices
common = df_btc.index.intersection(df_eth.index)
df_btc = df_btc.loc[common]
df_eth = df_eth.loc[common]
logger.info(f"Loaded {len(common)} aligned hourly bars")
return df_btc, df_eth
def load_cryptoquant_data():
"""Load CryptoQuant on-chain data if available."""
try:
cq_path = "data/cq_training_data.csv"
cq_df = pd.read_csv(cq_path, index_col='timestamp', parse_dates=True)
if cq_df.index.tz is None:
cq_df.index = cq_df.index.tz_localize('UTC')
logger.info(f"Loaded CryptoQuant data: {len(cq_df)} rows")
return cq_df
except Exception as e:
logger.warning(f"CryptoQuant data not available: {e}")
return None
def calculate_features(df_btc, df_eth, cq_df=None):
"""Calculate all features for the model."""
spread = df_eth['close'] / df_btc['close']
# Z-Score
rolling_mean = spread.rolling(window=Z_WINDOW).mean()
rolling_std = spread.rolling(window=Z_WINDOW).std()
z_score = (spread - rolling_mean) / rolling_std
# Technicals
spread_rsi = ta.momentum.RSIIndicator(spread, window=14).rsi()
spread_roc = spread.pct_change(periods=5) * 100
spread_change_1h = spread.pct_change(periods=1)
# Volume
vol_ratio = df_eth['volume'] / df_btc['volume']
vol_ratio_ma = vol_ratio.rolling(window=12).mean()
# Volatility
ret_btc = df_btc['close'].pct_change()
ret_eth = df_eth['close'].pct_change()
vol_btc = ret_btc.rolling(window=Z_WINDOW).std()
vol_eth = ret_eth.rolling(window=Z_WINDOW).std()
vol_spread_ratio = vol_eth / vol_btc
features = pd.DataFrame(index=spread.index)
features['spread'] = spread
features['z_score'] = z_score
features['spread_rsi'] = spread_rsi
features['spread_roc'] = spread_roc
features['spread_change_1h'] = spread_change_1h
features['vol_ratio'] = vol_ratio
features['vol_ratio_rel'] = vol_ratio / vol_ratio_ma
features['vol_diff_ratio'] = vol_spread_ratio
# Add CQ features if available
if cq_df is not None:
cq_aligned = cq_df.reindex(features.index, method='ffill')
if 'btc_funding' in cq_aligned.columns and 'eth_funding' in cq_aligned.columns:
cq_aligned['funding_diff'] = cq_aligned['eth_funding'] - cq_aligned['btc_funding']
if 'btc_inflow' in cq_aligned.columns and 'eth_inflow' in cq_aligned.columns:
cq_aligned['inflow_ratio'] = cq_aligned['eth_inflow'] / (cq_aligned['btc_inflow'] + 1)
features = features.join(cq_aligned)
return features.dropna()
def calculate_targets(features, horizon):
"""Calculate target labels for a given horizon."""
spread = features['spread']
z_score = features['z_score']
# For Short (Z > 1): Did spread drop below target?
future_min = spread.rolling(window=horizon).min().shift(-horizon)
target_short = spread * (1 - PROFIT_THRESHOLD)
success_short = (z_score > 1.0) & (future_min < target_short)
# For Long (Z < -1): Did spread rise above target?
future_max = spread.rolling(window=horizon).max().shift(-horizon)
target_long = spread * (1 + PROFIT_THRESHOLD)
success_long = (z_score < -1.0) & (future_max > target_long)
targets = np.select([success_short, success_long], [1, 1], default=0)
# Create valid mask (rows with complete future data)
valid_mask = future_min.notna() & future_max.notna()
return targets, valid_mask, future_min, future_max
def calculate_mae(features, predictions, test_idx, horizon):
"""Calculate Maximum Adverse Excursion for predicted trades."""
test_features = features.loc[test_idx]
spread = test_features['spread']
z_score = test_features['z_score']
mae_values = []
for i, (idx, pred) in enumerate(zip(test_idx, predictions)):
if pred != 1:
continue
entry_spread = spread.loc[idx]
z = z_score.loc[idx]
# Get future spread values
future_idx = features.index.get_loc(idx)
future_end = min(future_idx + horizon, len(features))
future_spreads = features['spread'].iloc[future_idx:future_end]
if len(future_spreads) < 2:
continue
if z > 1.0: # Short trade
max_adverse = (future_spreads.max() - entry_spread) / entry_spread
else: # Long trade
max_adverse = (entry_spread - future_spreads.min()) / entry_spread
mae_values.append(max_adverse * 100) # As percentage
return np.mean(mae_values) if mae_values else 0.0
def calculate_net_profit(features, predictions, test_idx, horizon):
"""Calculate estimated net profit including fees."""
test_features = features.loc[test_idx]
spread = test_features['spread']
z_score = test_features['z_score']
total_pnl = 0.0
n_trades = 0
for i, (idx, pred) in enumerate(zip(test_idx, predictions)):
if pred != 1:
continue
entry_spread = spread.loc[idx]
z = z_score.loc[idx]
# Get future spread values
future_idx = features.index.get_loc(idx)
future_end = min(future_idx + horizon, len(features))
future_spreads = features['spread'].iloc[future_idx:future_end]
if len(future_spreads) < 2:
continue
# Calculate PnL based on direction
if z > 1.0: # Short trade - profit if spread drops
exit_spread = future_spreads.iloc[-1] # Exit at horizon
pnl = (entry_spread - exit_spread) / entry_spread
else: # Long trade - profit if spread rises
exit_spread = future_spreads.iloc[-1]
pnl = (exit_spread - entry_spread) / entry_spread
# Subtract fees
net_pnl = pnl - FEE_RATE
total_pnl += net_pnl
n_trades += 1
return total_pnl, n_trades
def test_horizon(features, horizon):
"""Test a single horizon with walk-forward training."""
# Calculate targets
targets, valid_mask, _, _ = calculate_targets(features, horizon)
# Walk-forward split
n_samples = len(features)
train_size = int(n_samples * TRAIN_RATIO)
train_features = features.iloc[:train_size]
test_features = features.iloc[train_size:]
train_targets = targets[:train_size]
test_targets = targets[train_size:]
train_valid = valid_mask.iloc[:train_size]
test_valid = valid_mask.iloc[train_size:]
# Prepare training data (only valid rows)
exclude = ['spread']
cols = [c for c in features.columns if c not in exclude]
X_train = train_features[cols].fillna(0).replace([np.inf, -np.inf], 0)
X_train_valid = X_train[train_valid]
y_train_valid = train_targets[train_valid]
if len(X_train_valid) < 50:
return None # Not enough training data
# Train model
model = RandomForestClassifier(
n_estimators=300, max_depth=5, min_samples_leaf=30,
class_weight={0: 1, 1: 3}, random_state=42
)
model.fit(X_train_valid, y_train_valid)
# Predict on test set
X_test = test_features[cols].fillna(0).replace([np.inf, -np.inf], 0)
predictions = model.predict(X_test)
# Only evaluate on valid test rows (those with complete future data)
test_valid_mask = test_valid.values
y_test_valid = test_targets[test_valid_mask]
pred_valid = predictions[test_valid_mask]
if len(y_test_valid) < 10:
return None
# Calculate metrics
f1 = f1_score(y_test_valid, pred_valid, zero_division=0)
# Calculate MAE and Net Profit on ALL test predictions (not just valid targets)
test_idx = test_features.index
avg_mae = calculate_mae(features, predictions, test_idx, horizon)
net_pnl, n_trades = calculate_net_profit(features, predictions, test_idx, horizon)
return {
'horizon': horizon,
'f1_score': f1,
'avg_mae': avg_mae,
'net_pnl': net_pnl,
'n_trades': n_trades,
'train_samples': len(X_train_valid),
'test_samples': len(X_test)
}
def test_horizons(features, horizons):
"""Test multiple horizons and return comparison."""
results = []
print("\n" + "=" * 80)
print("WALK-FORWARD HORIZON OPTIMIZATION")
print(f"Train Ratio: {TRAIN_RATIO*100:.0f}% | Profit Target: {PROFIT_THRESHOLD*100:.1f}% | Fee Rate: {FEE_RATE*100:.2f}%")
print("=" * 80)
for h in horizons:
result = test_horizon(features, h)
if result:
results.append(result)
print(f"Horizon {h:3d}h: F1={result['f1_score']:.3f}, "
f"MAE={result['avg_mae']:.2f}%, "
f"Net PnL={result['net_pnl']*100:.2f}%, "
f"Trades={result['n_trades']}")
return results
def main():
"""Main research function."""
# Load data
df_btc, df_eth = load_data()
cq_df = load_cryptoquant_data()
# Calculate features
features = calculate_features(df_btc, df_eth, cq_df)
logger.info(f"Calculated {len(features)} feature rows with {len(features.columns)} columns")
# Test horizons from 6h to 150h
horizons = list(range(6, 151, 6)) # 6, 12, 18, ..., 150
results = test_horizons(features, horizons)
if not results:
print("No valid results!")
return
# Find best by different metrics
results_df = pd.DataFrame(results)
print("\n" + "=" * 80)
print("BEST HORIZONS BY METRIC")
print("=" * 80)
best_f1 = results_df.loc[results_df['f1_score'].idxmax()]
print(f"Best F1 Score: {best_f1['horizon']:.0f}h (F1={best_f1['f1_score']:.3f})")
best_pnl = results_df.loc[results_df['net_pnl'].idxmax()]
print(f"Best Net PnL: {best_pnl['horizon']:.0f}h (PnL={best_pnl['net_pnl']*100:.2f}%)")
lowest_mae = results_df.loc[results_df['avg_mae'].idxmin()]
print(f"Lowest MAE: {lowest_mae['horizon']:.0f}h (MAE={lowest_mae['avg_mae']:.2f}%)")
# Save results
output_path = "research/horizon_optimization_results.csv"
results_df.to_csv(output_path, index=False)
print(f"\nResults saved to {output_path}")
return results_df
if __name__ == "__main__":
main()