import argparse
import os
import pickle
from pathlib import Path

import pandas as pd

import strategy_config as config
from trade import TradeState, enter_long, exit_long
from logging_utils import write_trade_log
from metrics import compute_metrics


def backtest_mvrv(
    df_features: pd.DataFrame,
    df_1min: pd.DataFrame,
    initial_cash: float = 10000.0,
    log_path: Path | None = None,
    test_only: bool = True,  # NEW: backtest on the test set only to avoid train/test leakage
):
    print("--- Starting MVRV Strategy Backtest ---")

    # 1. Load model and generate predictions
    print(f"Loading model from {config.MODEL_PATH}...")
    with open(config.MODEL_PATH, 'rb') as f:
        model = pickle.load(f)

    # Load split info to identify the test-set boundary
    split_info_path = config.MODEL_PATH.replace('.pkl', '_split.pkl')
    if test_only and os.path.exists(split_info_path):
        with open(split_info_path, 'rb') as f:
            split_info = pickle.load(f)
        test_start_idx = split_info['test_start_idx']
        print(f"Filtering to TEST SET ONLY (starting at index {test_start_idx})")
        print(f"  Train size was: {split_info['train_size']}, Test size: {split_info['test_size']}")

        # Restrict features to the test set
        df_features = df_features.iloc[test_start_idx:].copy()

        # Restrict the 1m data to the same period
        test_start_ts = df_features.index[0]
        df_1min = df_1min[df_1min['Timestamp'] >= test_start_ts].copy()
        print(f"Backtest period: {df_features.index[0]} to {df_features.index[-1]}")
    elif test_only:
        print("WARNING: Split info not found. Running on FULL dataset (includes training data!).")

    # The model expects exactly the columns it was trained on, so select them
    # via config.FEATURE_NAMES rather than passing df_features wholesale.
    X = df_features[config.FEATURE_NAMES]

    print("Generating predictions...")
    probs = model.predict_proba(X)[:, 1]
    df_features['signal_prob'] = probs

    # 2. Set up the backtest loop
    state = TradeState(
        cash=initial_cash,
        fee_bps=config.FEES_PERCENT * 10000,       # decimal fraction -> bps
        slippage_bps=config.SLIPPAGE_PERCENT * 10000,
    )
    equity = []
    trades = []

    # Dynamic SL/TP levels for the open position
    current_sl_price = 0.0
    current_tp_price = 0.0

    # Pre-compute entry signals to keep the loop fast.
    # Entry: prob > threshold AND funding > filter AND regime not overheated.
    # The regime filter follows the source strategy: overheated when the MVRV
    # Z-score ('mvrv_z', the normalized form produced by features.py) exceeds
    # MVRV_Z_THRESH or raw 'nupl' exceeds NUPL_THRESH.
    # Missing columns fall back to all-zero series, which disables the
    # corresponding filter rather than crashing.
    s_prob = df_features['signal_prob']
    funding = df_features['funding_rate'] if 'funding_rate' in df_features.columns else pd.Series(0, index=df_features.index)
    mvrv_z = df_features['mvrv_z'] if 'mvrv_z' in df_features.columns else pd.Series(0, index=df_features.index)
    nupl = df_features['nupl'] if 'nupl' in df_features.columns else pd.Series(0, index=df_features.index)

    # Regime filter: trade only when the market is not overheated
    is_overheated = (mvrv_z > config.MVRV_Z_THRESH) | (nupl > config.NUPL_THRESH)
    regime_can_trade = ~is_overheated

    # Entry signal
    entry_signals = (
        (s_prob > config.PROB_THRESHOLD)
        & (funding > config.FUNDING_FILTER)
        & regime_can_trade
    )
    df_features['entry_signal'] = entry_signals
    print(f"Total Entry Signals: {entry_signals.sum()}")
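    # NEW: fail loudly when the pipeline produces nothing to trade. A zero
    # count usually means PROB_THRESHOLD / FUNDING_FILTER are miscalibrated,
    # or a missing feature column fell back to its all-zero default above.
    if entry_signals.sum() == 0:
        print("WARNING: no entry signals generated -- check PROB_THRESHOLD, "
              "FUNDING_FILTER and the regime thresholds in strategy_config.")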
    # Main loop: df_features is hourly, df_1min is 1-minute.
    # We step through the hourly bars; while a position is open we scan the
    # 1m bars inside the hour for SL/TP hits, otherwise we look for an entry.
    #
    # Execution convention: prepare_data uses resample('1h'), which labels
    # bins on the left by default, so the bar labeled 10:00 spans
    # [10:00, 11:00) and its close is only known at 11:00. A signal computed
    # on bar i-1 is therefore executable at the start of bar i, i.e. at the
    # first 1m open of the current interval.
    for i in range(len(df_features) - 1):
        row = df_features.iloc[i]          # completed 1H bar being processed
        next_row = df_features.iloc[i + 1]
        ts_start = row.name                # timestamp of this bar (e.g. 10:00)
        ts_end = next_row.name             # timestamp of the next bar (e.g. 11:00)

        # 1m bars inside the half-open interval [ts_start, ts_end);
        # df_1min['Timestamp'] must already be datetime.
        mask = (df_1min['Timestamp'] >= ts_start) & (df_1min['Timestamp'] < ts_end)
        chunk_1min = df_1min.loc[mask]

        # 1. Manage the existing position (exit logic).
        # Remember whether we entered the hour with a position so that an
        # exit inside this candle cannot be followed by a same-candle re-entry.
        started_with_position = state.qty > 0
        if state.qty > 0:
            # Scan the 1m bars for SL/TP hits. The stop is checked before the
            # target, so a 1m bar that spans both levels fills the stop --
            # the pessimistic assumption, since the intrabar path is unknown.
            for _, m_row in chunk_1min.iterrows():
                m_high = m_row['High']
                m_low = m_row['Low']
                m_ts = m_row['Timestamp']

                # Check SL
                if m_low <= current_sl_price:
                    evt = exit_long(state, current_sl_price)  # execute at SL price
                    if evt:
                        prev = trades[-1]  # the matching entry event
                        pnl = (evt["price"] - prev["price"]) * prev["qty"]
                        evt.update({"t": m_ts.isoformat(), "reason": "stop_loss", "pnl": pnl})
                        trades.append(evt)
                    break  # position closed; stop scanning this hour

                # Check TP
                if m_high >= current_tp_price:
                    evt = exit_long(state, current_tp_price)  # execute at TP price
                    if evt:
                        prev = trades[-1]
                        pnl = (evt["price"] - prev["price"]) * prev["qty"]
                        evt.update({"t": m_ts.isoformat(), "reason": "take_profit", "pnl": pnl})
                        trades.append(evt)
                    break
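        # NEW: during 1m data gaps the SL/TP scan above is a no-op; flag it so
        # silent gaps don't masquerade as "no stop hit". (Assumes gaps are
        # rare; downgrade to a debug log if your feed has many.)
        if started_with_position and state.qty > 0 and chunk_1min.empty:
            print(f"WARNING: no 1m data in [{ts_start}, {ts_end}); SL/TP not evaluated.")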
        # 2. Check for a new entry (no position, and none exited this candle).
        if state.qty <= 0 and not started_with_position:
            if i > 0:
                prev_row = df_features.iloc[i - 1]
                if prev_row['entry_signal']:
                    # Enter long at the first available price of this interval:
                    # the first 1m open, falling back to the hourly open.
                    entry_price = row['open']
                    if not chunk_1min.empty:
                        entry_price = chunk_1min.iloc[0]['Open']

                    # ATR-based SL/TP, sized off the signal bar's ATR
                    atr = prev_row['atr']
                    if pd.isna(atr) or atr == 0:
                        atr = row['open'] * 0.01  # fallback: 1% of price
                    sl_dist = atr * config.SL_ATR_MULT
                    tp_dist = atr * config.TP_ATR_MULT
                    current_sl_price = entry_price - sl_dist
                    current_tp_price = entry_price + tp_dist

                    evt = enter_long(state, entry_price)
                    if evt:
                        evt.update({
                            "t": ts_start.isoformat(),
                            "reason": "signal_entry",
                            "sl": current_sl_price,
                            "tp": current_tp_price,
                        })
                        trades.append(evt)

        # Mark-to-market equity at the close of the hour
        current_price = row['close']
        val = state.cash + (state.qty * current_price)
        equity.append({'timestamp': ts_start, 'equity': val})

    # Build the equity series
    equity_df = pd.DataFrame(equity).set_index('timestamp')
    equity_curve = equity_df['equity']

    # Save the trade log
    if log_path:
        log_path.parent.mkdir(parents=True, exist_ok=True)  # NEW: assumes write_trade_log does not create the directory
        write_trade_log(trades, log_path)

    # Metrics (hourly bars: 252 trading days * 24 hours = 6048 periods/year)
    perf = compute_metrics(equity_curve, trades, periods_per_year=252 * 24)

    # Print summary
    print("\n--- Backtest Summary ---")
    print(f"Total Return: {perf.total_return * 100:.2f}%")
    print(f"Sharpe Ratio: {perf.sharpe_ratio:.2f}")
    print(f"Max Drawdown: {perf.max_drawdown * 100:.2f}%")
    print(f"Total Trades: {perf.num_trades}")

    return perf, equity_curve, trades


def run():
    parser = argparse.ArgumentParser()
    parser.add_argument("--csv", required=True, help="Path to 1m/15m OHLCV CSV")
    args = parser.parse_args()

    # Load the intraday data
    print(f"Loading 1m/15m data from {args.csv}...")
    df_1min = pd.read_csv(args.csv)

    # Normalize the timestamp column. Integer epochs are disambiguated by
    # magnitude: values below ~3e9 are seconds (valid until 2065), below
    # ~3e12 milliseconds; anything larger is left to pandas' default (ns).
    if 'Timestamp' in df_1min.columns:
        ts_max = df_1min['Timestamp'].max()
        if ts_max < 3_000_000_000:
            unit = 's'
        elif ts_max < 3_000_000_000_000:
            unit = 'ms'
        else:
            unit = None
        df_1min['Timestamp'] = pd.to_datetime(df_1min['Timestamp'], unit=unit)
    elif 'Date' in df_1min.columns:
        df_1min['Timestamp'] = pd.to_datetime(df_1min['Date'])
    df_1min = df_1min.sort_values('Timestamp')

    # Load the hourly features
    print(f"Loading features from {config.FEATURES_PATH}...")
    if not os.path.exists(config.FEATURES_PATH):
        print("Error: features.csv not found. Run prepare_data.py first.")
        return
    df_features = pd.read_csv(config.FEATURES_PATH, parse_dates=['timestamp'], index_col='timestamp')

    # Run the backtest
    backtest_mvrv(df_features, df_1min, log_path=Path("logs/mvrv_trade_log.csv"))


if __name__ == "__main__":
    run()
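# NEW: usage sketch -- the module and CSV filenames below are placeholders,
# substitute your own:
#
#   python backtest.py --csv path/to/ohlcv_1m.csv
#
# Programmatic use returns the raw artifacts for further analysis:
#
#   perf, curve, trades = backtest_mvrv(df_features, df_1min,
#                                       log_path=None, test_only=True)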