diff --git a/.vscode/launch.json b/.vscode/launch.json index 653a133..9cfb4e3 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -10,12 +10,125 @@ "args": [ "--btc", "${workspaceFolder}/../data/btcusd_1-min_data.csv", "--eth", "${workspaceFolder}/../data/ethusd_1min_ohlc.csv", - // "--rules", "20min,21min,22min,23min,24min,25min,26min,27min,28min,29min,30min,31min,32min,33min,34min,35min,36min,37min,38min,39min,40min,41min,42min,43min,44min,45min,46min,47min,48min,49min,50min,51min,52min,53min,54min,55min,56min,57min,58min,59min,60min", - "--rules", "39min", + "--rules", "20min,21min,22min,23min,24min,25min,26min,27min,28min,29min,30min,31min,32min,33min,34min,35min,36min,37min,38min,39min,40min,41min,42min,43min,44min,45min,46min,47min,48min,49min,50min,51min,52min,53min,54min,55min,56min,57min,58min,59min,60min", "--states", "3", - "--cv_since", "2023-01-01", "--horizon", "60", - "--folder_save_path", "models" + "--cv_since", "2023-01-01", + "--cv_splits", "8", + "--cv_test_bars", "500", + "--cv_gap_bars", "24", + "--cv_seed", "7", + "--cv_method", "random", + "--feature_selection", "mutual_info", + "--n_features", "10" + ], + "console": "integratedTerminal", + "cwd": "${workspaceFolder}", + "justMyCode": true, + "env": { + "PYTHONUNBUFFERED": "1" + } + }, + { + "name": "Run ETH/BTC - Expanding Window CV", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/main.py", + "args": [ + "--btc", "${workspaceFolder}/../data/btcusd_1-min_data.csv", + "--eth", "${workspaceFolder}/../data/ethusd_1min_ohlc.csv", + "--rules", "30min,45min,1H", + "--states", "3", + "--horizon", "60", + "--cv_since", "2023-01-01", + "--cv_splits", "5", + "--cv_test_bars", "1000", + "--cv_gap_bars", "24", + "--cv_seed", "42", + "--cv_method", "expanding", + "--feature_selection", "rfe", + "--n_features", "12" + ], + "console": "integratedTerminal", + "cwd": "${workspaceFolder}", + "justMyCode": true, + "env": { + "PYTHONUNBUFFERED": "1" + } + }, + { + "name": "Run ETH/BTC - Rolling Window CV", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/main.py", + "args": [ + "--btc", "${workspaceFolder}/../data/btcusd_1-min_data.csv", + "--eth", "${workspaceFolder}/../data/ethusd_1min_ohlc.csv", + "--rules", "30min,1H,2H", + "--states", "4", + "--horizon", "120", + "--cv_since", "2023-01-01", + "--cv_splits", "6", + "--cv_test_bars", "800", + "--cv_gap_bars", "12", + "--cv_seed", "123", + "--cv_method", "rolling", + "--feature_selection", "random_forest", + "--n_features", "15" + ], + "console": "integratedTerminal", + "cwd": "${workspaceFolder}", + "justMyCode": true, + "env": { + "PYTHONUNBUFFERED": "1" + } + }, + { + "name": "Run ETH/BTC - Quick Test", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/main.py", + "args": [ + "--btc", "${workspaceFolder}/../data/btcusd_1-min_data.csv", + "--eth", "${workspaceFolder}/../data/ethusd_1min_ohlc.csv", + "--rules", "30min,1H", + "--states", "3", + "--horizon", "60", + "--cv_since", "2024-01-01", + "--cv_splits", "3", + "--cv_test_bars", "200", + "--cv_gap_bars", "12", + "--cv_seed", "7", + "--cv_method", "random", + "--feature_selection", "mutual_info", + "--n_features", "8" + ], + "console": "integratedTerminal", + "cwd": "${workspaceFolder}", + "justMyCode": true, + "env": { + "PYTHONUNBUFFERED": "1" + } + }, + { + "name": "Run ETH/BTC - No Feature Selection", + "type": "debugpy", + "request": "launch", + "program": "${workspaceFolder}/main.py", + "args": [ + "--btc", 
"${workspaceFolder}/../data/btcusd_1-min_data.csv", + "--eth", "${workspaceFolder}/../data/ethusd_1min_ohlc.csv", + "--rules", "30min,45min,1H", + "--states", "3", + "--horizon", "60", + "--cv_since", "2023-01-01", + "--cv_splits", "5", + "--cv_test_bars", "500", + "--cv_gap_bars", "24", + "--cv_seed", "7", + "--cv_method", "random", + "--feature_selection", "none", + "--n_features", "0" ], "console": "integratedTerminal", "cwd": "${workspaceFolder}", @@ -25,4 +138,4 @@ } } ] -} +} \ No newline at end of file diff --git a/main.py b/main.py index a7ecb75..80bae37 100644 --- a/main.py +++ b/main.py @@ -1,19 +1,19 @@ #!/usr/bin/env python3 from __future__ import annotations - import argparse from dataclasses import dataclass from pathlib import Path - import numpy as np import pandas as pd from hmmlearn.hmm import GaussianHMM from sklearn.preprocessing import StandardScaler -import joblib - - -# ============================== CLI ========================================== +from sklearn.feature_selection import SelectKBest, mutual_info_regression, RFE +from sklearn.linear_model import LinearRegression +from sklearn.ensemble import RandomForestRegressor +import warnings +warnings.filterwarnings('ignore') +# ------------------------------- CLI ----------------------------------------- @dataclass class CLI: btc_csv: Path @@ -21,400 +21,487 @@ class CLI: resample_rules: list[str] n_states: int horizon_min: int - folder_save_path: str | None # CV params cv_splits: int cv_test_bars: int cv_gap_bars: int cv_seed: int cv_since: str | None # restrict sampling to recent era - + # Feature selection params + feature_selection_method: str # 'mutual_info', 'rfe', 'random_forest', 'none' + n_features: int # number of features to select + # Enhanced CV params + cv_method: str # 'random', 'expanding', 'rolling' def parse_args() -> CLI: - p = argparse.ArgumentParser(description="BTC/ETH regime modeling with properly embargoed time splits") + p = argparse.ArgumentParser(description="BTC/ETH regime modeling with robust CV and feature selection") p.add_argument("--btc", type=Path, default=Path("btcusd_1-min_data.csv")) p.add_argument("--eth", type=Path, default=Path("ethusd_1min_ohlc.csv")) p.add_argument("--rules", default="30min,45min,1H", help="Comma-separated pandas offsets") p.add_argument("--states", type=int, default=3) - p.add_argument("--horizon", type=int, default=60, help="Forward horizon in minutes for the target") - p.add_argument("--folder_save_path", default=None, help="Folder path to save fitted HMM models (optional)") + p.add_argument("--horizon", type=int, default=60) # randomized CV controls p.add_argument("--cv_splits", type=int, default=8, help="number of random test windows") p.add_argument("--cv_test_bars", type=int, default=500, help="length of each test window in bars") - p.add_argument("--cv_gap_bars", type=int, default=24, help="extra embargo bars beyond the minimum computed gap") + p.add_argument("--cv_gap_bars", type=int, default=24, help="embargo gap before test window") p.add_argument("--cv_seed", type=int, default=7, help="rng seed for reproducibility") p.add_argument("--cv_since", default=None, help="only sample test starts at/after this date (e.g. 
2023-01-01)") + + # Feature selection + p.add_argument("--feature_selection", default="mutual_info", + choices=['mutual_info', 'rfe', 'random_forest', 'none'], + help="Feature selection method") + p.add_argument("--n_features", type=int, default=10, help="Number of features to select") + + # Enhanced CV method + p.add_argument("--cv_method", default="random", choices=['random', 'expanding', 'rolling'], + help="Cross-validation method") + a = p.parse_args() - rules = [r.strip() for r in a.rules.split(",") if r.strip()] - return CLI( - btc_csv=a.btc, - eth_csv=a.eth, - resample_rules=rules, - n_states=a.states, - horizon_min=a.horizon, - folder_save_path=a.folder_save_path, - cv_splits=a.cv_splits, - cv_test_bars=a.cv_test_bars, - cv_gap_bars=a.cv_gap_bars, - cv_seed=a.cv_seed, - cv_since=a.cv_since, - ) - - -# ============================ IO / CLEAN ===================================== + return CLI(a.btc, a.eth, rules, a.states, a.horizon, a.cv_splits, a.cv_test_bars, + a.cv_gap_bars, a.cv_seed, a.cv_since, a.feature_selection, a.n_features, a.cv_method) +# ------------------------------ IO / CLEAN ----------------------------------- def _norm_headers(df: pd.DataFrame) -> pd.DataFrame: df = df.rename(columns={c: c.strip().lower() for c in df.columns}) - if "unix" in df.columns: - df = df.rename(columns={"unix": "timestamp"}) - if "date" in df.columns: - df = df.rename(columns={"date": "timestamp"}) + if "unix" in df.columns: df = df.rename(columns={"unix": "timestamp"}) + if "date" in df.columns: df = df.rename(columns={"date": "timestamp"}) return df - def _load_bitstamp_csv(path: Path, prefix: str) -> pd.DataFrame: df = pd.read_csv(path) df = _norm_headers(df) - if "timestamp" not in df.columns: - raise ValueError(f"Missing timestamp in {path}") + if "timestamp" not in df.columns: raise ValueError(f"Missing timestamp in {path}") df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s", utc=True, errors="coerce") df = df.dropna(subset=["timestamp"]).set_index("timestamp").sort_index() - for c in ("open", "high", "low", "close", "volume"): - if c in df.columns: - df[c] = pd.to_numeric(df[c], errors="coerce", downcast="float") - df = df[["open", "high", "low", "close", "volume"]].dropna() + for c in ("open","high","low","close","volume"): + if c in df.columns: df[c] = pd.to_numeric(df[c], errors="coerce", downcast="float") + df = df[["open","high","low","close","volume"]].dropna() return df.add_prefix(prefix + "_") - def _align_minutely(btc: pd.DataFrame, eth: pd.DataFrame) -> pd.DataFrame: idx = btc.index.intersection(eth.index) df = btc.reindex(idx).join(eth.reindex(idx), how="inner") return df.ffill(limit=60).dropna() - -# ======================= FEATURES / TARGET =================================== - +# --------------------------- FEATURES (enhanced) ----------------------- def build_features(df: pd.DataFrame, rule: str, horizon_min: int) -> pd.DataFrame: df = df.copy() - - # base returns df["btc_ret"] = np.log(df["btc_close"]).diff() df["eth_ret"] = np.log(df["eth_close"]).diff() - df["ratio"] = df["eth_close"] / df["btc_close"] + df["ratio"] = df["eth_close"]/df["btc_close"] df["ratio_ret"] = np.log(df["ratio"]).diff() - - # volatility (minutes) - for win in (15, 30, 60, 120, 240, 360): + + # Volatility features + for win in (15,30,60,120,240,360): df[f"rv_{win}m"] = df["ratio_ret"].rolling(win, min_periods=win).std() - - # trend vs long MA (minutes) - for win in (60, 240, 1440): + + # Trend features + for win in (60,240,1440): ma = df["ratio"].rolling(win, min_periods=win).mean() - 
df[f"trend_{win}m"] = df["ratio"] / (ma + 1e-12) - 1.0 - - # rolling correlation (minutes) - for win in (60, 120, 240): + df[f"trend_{win}m"] = df["ratio"]/(ma+1e-12)-1.0 + + # Correlation features + for win in (60,120,240): df[f"corr_{win}m"] = df["btc_ret"].rolling(win, min_periods=win).corr(df["eth_ret"]) - - # beta-like measure over 120m + + # Beta and risk features cov_120 = df["eth_ret"].rolling(120, min_periods=120).cov(df["btc_ret"]) var_120 = df["btc_ret"].rolling(120, min_periods=120).var() - df["beta_2h"] = cov_120 / (var_120 + 1e-12) - - # divergence and volume structure + df["beta_2h"] = cov_120/(var_120+1e-12) + std_b = df["btc_ret"].rolling(120, min_periods=120).std() std_e = df["eth_ret"].rolling(120, min_periods=120).std() - df["divergence_2h"] = np.abs(df["btc_ret"] / (std_b + 1e-12) - df["eth_ret"] / (std_e + 1e-12)) - df["volratio"] = np.log((df["eth_volume"] + 1e-9) / (df["btc_volume"] + 1e-9)) - df["vol_sum"] = np.log(df["eth_volume"] + df["btc_volume"] + 1e-9) - df["vol_diff"] = (df["eth_volume"] - df["btc_volume"]) / (df["eth_volume"] + df["btc_volume"] + 1e-9) - - # convenience aliases - df["rv_2h"] = df.get("rv_120m", df["ratio_ret"].rolling(120, min_periods=120).std()) - df["corr_2h"] = df.get("corr_120m", df["btc_ret"].rolling(120, min_periods=120).corr(df["eth_ret"])) - df["ratio_trend"] = df.get( - "trend_1440m", - df["ratio"] / (df["ratio"].rolling(1440, min_periods=1440).mean() + 1e-12) - 1.0, - ) - - # aggregate to rule - agg = {"btc_close": "last", "eth_close": "last", "ratio": "last", "ratio_ret": "sum"} + df["divergence_2h"] = np.abs(df["btc_ret"]/(std_b+1e-12) - df["eth_ret"]/(std_e+1e-12)) + + # Volume features + df["volratio"] = np.log((df["eth_volume"]+1e-9)/(df["btc_volume"]+1e-9)) + df["vol_sum"] = np.log(df["eth_volume"]+df["btc_volume"]+1e-9) + df["vol_diff"] = (df["eth_volume"]-df["btc_volume"])/(df["eth_volume"]+df["btc_volume"]+1e-9) + + # Additional momentum features + df["momentum_1h"] = df["ratio_ret"].rolling(60).sum() + df["momentum_4h"] = df["ratio_ret"].rolling(240).sum() + + # Mean reversion features + for win in (60, 120, 240): + rolling_mean = df["ratio_ret"].rolling(win).mean() + rolling_std = df["ratio_ret"].rolling(win).std() + df[f"zscore_{win}m"] = (df["ratio_ret"] - rolling_mean) / (rolling_std + 1e-12) + + # Price position features + for win in (240, 1440): + high = df["ratio"].rolling(win).max() + low = df["ratio"].rolling(win).min() + df[f"position_{win}m"] = (df["ratio"] - low) / (high - low + 1e-12) + + # Aggregate to target timeframe + agg = {"btc_close":"last","eth_close":"last","ratio":"last","ratio_ret":"sum"} for c in df.columns: - if c not in agg: - agg[c] = "mean" - + if c not in agg: agg[c] = "mean" + g = df.resample(rule).agg(agg).dropna() - - step_min = max(1, int(pd.Timedelta(rule).total_seconds() // 60)) - ahead = max(1, int(round(horizon_min / step_min))) + step_min = max(1, int(pd.Timedelta(rule).total_seconds()//60)) + ahead = max(1, int(round(horizon_min/step_min))) g["fut_ret"] = g["ratio_ret"].shift(-ahead) - + return g.dropna() - -def feature_matrix(g: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, list[str]]: - ban = {"fut_ret", "btc_close", "eth_close", "ratio"} - keep = ("rv_", "corr_", "trend_", "beta_", "divergence_", "vol") - feats: list[str] = [] - - if "ratio_ret" in g.columns: +def feature_matrix(g: pd.DataFrame) -> tuple[np.ndarray,np.ndarray,list[str]]: + ban = {"fut_ret","btc_close","eth_close","ratio"} + keep = 
("rv_","corr_","trend_","beta_","divergence_","vol","momentum_","zscore_","position_") + feats = [] + + if "ratio_ret" in g.columns: feats.append("ratio_ret") - feats += [ - c for c in g.columns - if c not in ban and c != "ratio_ret" and any(c.startswith(p) for p in keep) - ] - - if not feats: - feats = ["ratio_ret", "rv_30m", "rv_2h", "corr_2h", "ratio_trend", "volratio"] - + feats += [c for c in g.columns if c not in ban and c!="ratio_ret" and any(c.startswith(p) for p in keep)] + + if not feats: + feats = ["ratio_ret","rv_30m","rv_2h","corr_2h","ratio_trend","volratio","momentum_1h","zscore_60m"] + X = g[feats].astype(np.float32).values y = g["fut_ret"].astype(np.float32).values + return X, y, feats +# ------------------------- Enhanced Feature Selection ------------------------ +def select_features(X_train: np.ndarray, y_train: np.ndarray, + X_test: np.ndarray, feature_names: list[str], + method: str, n_features: int) -> tuple[np.ndarray, np.ndarray, list[str]]: + """ + Apply feature selection to training and test data + """ + if method == "none" or n_features >= len(feature_names): + return X_train, X_test, feature_names + + if n_features <= 0: + n_features = max(1, len(feature_names) // 2) + + try: + if method == "mutual_info": + selector = SelectKBest(score_func=mutual_info_regression, k=n_features) + X_train_selected = selector.fit_transform(X_train, y_train) + X_test_selected = selector.transform(X_test) + selected_indices = selector.get_support(indices=True) + selected_features = [feature_names[i] for i in selected_indices] + + elif method == "rfe": + # Use linear regression as base estimator for RFE + estimator = LinearRegression() + selector = RFE(estimator, n_features_to_select=n_features, step=1) + X_train_selected = selector.fit_transform(X_train, y_train) + X_test_selected = selector.transform(X_test) + selected_indices = selector.get_support(indices=True) + selected_features = [feature_names[i] for i in selected_indices] + + elif method == "random_forest": + # Use feature importance from random forest + rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1) + rf.fit(X_train, y_train) + importances = rf.feature_importances_ + selected_indices = np.argsort(importances)[-n_features:] + X_train_selected = X_train[:, selected_indices] + X_test_selected = X_test[:, selected_indices] + selected_features = [feature_names[i] for i in selected_indices] + + else: + return X_train, X_test, feature_names + + print(f" Selected {len(selected_features)} features: {selected_features}") + return X_train_selected, X_test_selected, selected_features + + except Exception as e: + print(f" Feature selection failed: {e}, using all features") + return X_train, X_test, feature_names -# ====================== OVERLAP-/LEAKAGE-AWARE UTILITIES ===================== - -def max_lookback_minutes() -> int: - # From feature construction: the maximum rolling window is 1440 minutes. - return 1440 - - -def bars_from_minutes(rule: str, minutes: int) -> int: - step_min = max(1, int(pd.Timedelta(rule).total_seconds() // 60)) - return int(np.ceil(minutes / step_min)) - - +# ------------------------- Robust Cross-Validation Methods ------------------- def sample_random_splits( g: pd.DataFrame, - rule: str, n_splits: int, test_bars: int, - gap_bars_extra: int, + gap_bars: int, seed: int, - since: str | None, - horizon_min: int, + since: str | None ): - """ - Random test windows with an embargo that guarantees disjoint information sets. 
-    Embargo (in bars) = max(gap_bars_extra, ceil((max_lookback + horizon_min)/rule_minutes)).
-    Train uses only data strictly before (test_start - embargo).
-    """
+    """Random test windows; train ends gap_bars before each test window"""
     rng = np.random.default_rng(seed)
     idx = g.index
+    
     if since is not None:
         idx = idx[idx >= pd.Timestamp(since, tz="UTC")]
-
-    if len(idx) <= test_bars:
-        return
-
-    # Compute minimal embargo based on lookback + horizon
-    gap_min_bars = bars_from_minutes(rule, max_lookback_minutes() + horizon_min)
-    embargo_bars = int(max(gap_bars_extra, gap_min_bars))
-
-    # Valid start indices ensure full test window fits
+    
     valid = np.arange(len(idx) - test_bars)
     if len(valid) <= 0:
-        return
-
+        raise ValueError("Not enough data for requested test window")
+    
     starts = rng.choice(valid, size=min(n_splits, len(valid)), replace=False)
-    starts = np.sort(starts)
-    for s in starts:
+    for s in np.sort(starts):
         test_start = idx[s]
-        test_end = idx[s + test_bars - 1]
-
-        # Train: strictly before test_start - embargo_bars
-        left_end_pos = s - embargo_bars - 1
-        if left_end_pos < 0:
-            # No room for non-overlapping training information
-            continue
-
-        embargo_end = idx[left_end_pos]
-        train = g.loc[:embargo_end]
-        test = g.loc[test_start:test_end]
+        test_end = idx[s + test_bars - 1]
+        # Train ends gap_bars before the test window; 'since' only restricts
+        # test starts, so earlier history stays available for training
+        embargo_end = idx[s - gap_bars - 1] if s - gap_bars - 1 >= 0 else None
+        train = g.loc[:embargo_end] if embargo_end is not None else g.iloc[0:0]
+        test = g.loc[test_start:test_end]
         if len(train) == 0 or len(test) < test_bars:
             continue
-        yield train, test, (test_start, test_end), embargo_bars
+        yield train, test, (test_start, test_end)

-
-# ============================ MODEL / FIT ====================================
-
-def fit_and_predict_train_test(
-    train: pd.DataFrame,
-    test: pd.DataFrame,
-    n_states: int,
-    full_save_path: str | None = None,
+def sample_expanding_window_splits(
+    g: pd.DataFrame,
+    n_splits: int,
+    test_bars: int,
+    gap_bars: int,
+    since: str | None
 ):
+    """Expanding window CV - the training set grows over time"""
+    idx = g.index
+    if since is not None:
+        idx = idx[idx >= pd.Timestamp(since, tz="UTC")]
+        if len(idx) == 0:
+            raise ValueError("No data at/after --cv_since")
+        g = g.loc[idx[0]:]  # keep .iloc positions aligned with the filtered index
+    
+    total_bars = len(idx)
+    min_train_size = test_bars * 2  # Minimum training set size
+    
+    # Number of test windows that fit after the minimum train set and embargo
+    available_splits = (total_bars - min_train_size - gap_bars) // test_bars
+    n_splits = min(n_splits, available_splits)
+    
+    if n_splits <= 0:
+        raise ValueError("Not enough data for expanding window splits")
+    
+    for i in range(n_splits):
+        test_start_idx = min_train_size + gap_bars + (i * test_bars)
+        test_end_idx = test_start_idx + test_bars - 1
+        
+        if test_end_idx >= total_bars:
+            break
+        
+        train_end_idx = test_start_idx - gap_bars - 1
+        train = g.iloc[:train_end_idx + 1]
+        test = g.iloc[test_start_idx:test_end_idx + 1]
+        
+        if len(train) < min_train_size or len(test) < test_bars:
+            continue
+        
+        yield train, test, (idx[test_start_idx], idx[test_end_idx])
+
+def sample_rolling_window_splits(
+    g: pd.DataFrame,
+    n_splits: int,
+    test_bars: int,
+    gap_bars: int,
+    since: str | None
+):
+    """Rolling window CV - fixed training window size"""
+    idx = g.index
+    if since is not None:
+        idx = idx[idx >= pd.Timestamp(since, tz="UTC")]
+        if len(idx) == 0:
+            raise ValueError("No data at/after --cv_since")
+        g = g.loc[idx[0]:]  # keep .iloc positions aligned with the filtered index
+    
+    total_bars = len(idx)
+    train_bars = test_bars * 3  # Fixed training window size
+    
+    # Number of test windows that fit after one train window and the embargo
+    available_splits = (total_bars - train_bars - gap_bars) // test_bars
+    n_splits = min(n_splits, available_splits)
+    
+    if n_splits <= 0:
+        raise ValueError("Not enough data for rolling window splits")
+    
+    for i in range(n_splits):
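+        # Slide the fixed-size train window and its test window forward by one
+        # test-window length per split, with gap_bars embargoed between them.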
+ train_start_idx = i * test_bars + train_end_idx = train_start_idx + train_bars - 1 + test_start_idx = train_end_idx + gap_bars + 1 + test_end_idx = test_start_idx + test_bars - 1 + + if test_end_idx >= total_bars: + break + + train = g.iloc[train_start_idx:train_end_idx + 1] + test = g.iloc[test_start_idx:test_end_idx + 1] + + if len(train) < train_bars or len(test) < test_bars: + continue + + yield train, test, (idx[test_start_idx], idx[test_end_idx]) + +def get_cv_splits(g: pd.DataFrame, cv_method: str, n_splits: int, test_bars: int, + gap_bars: int, seed: int, since: str | None): + """Dispatch to appropriate CV method""" + if cv_method == "random": + return sample_random_splits(g, n_splits, test_bars, gap_bars, seed, since) + elif cv_method == "expanding": + return sample_expanding_window_splits(g, n_splits, test_bars, gap_bars, since) + elif cv_method == "rolling": + return sample_rolling_window_splits(g, n_splits, test_bars, gap_bars, since) + else: + raise ValueError(f"Unknown CV method: {cv_method}") + +# ------------------------------ Model / Fit ----------------------------------- +def fit_and_predict_train_test(train: pd.DataFrame, test: pd.DataFrame, + n_states: int, feature_selection_method: str, + n_features: int): Xtr, ytr, feats = feature_matrix(train) Xte, yte, _ = feature_matrix(test) - + + # Apply feature selection + Xtr_sel, Xte_sel, selected_feats = select_features( + Xtr, ytr, Xte, feats, feature_selection_method, n_features + ) + scaler = StandardScaler() - Xtr_s = scaler.fit_transform(Xtr) - Xte_s = scaler.transform(Xte) - + Xtr_s = scaler.fit_transform(Xtr_sel) + Xte_s = scaler.transform(Xte_sel) + hmm = GaussianHMM(n_components=n_states, covariance_type="diag", n_iter=300, random_state=7) hmm.fit(Xtr_s) - + st_tr = hmm.predict(Xtr_s) st_te = hmm.predict(Xte_s) - - # Map HMM states to stances using state-wise mean of future returns in TRAIN + means = {s: float(np.nanmean(ytr[st_tr == s])) for s in range(n_states)} small = np.nanpercentile(np.abs(list(means.values())), 30) state_to_stance = {s: (1 if m > +small else (-1 if m < -small else 0)) for s, m in means.items()} - preds = np.vectorize(state_to_stance.get)(st_te).astype(np.int8) + + return preds, yte, state_to_stance, selected_feats, hmm - if full_save_path: - Path(full_save_path).parent.mkdir(parents=True, exist_ok=True) - joblib.dump( - {"hmm": hmm, "scaler": scaler, "features": feats, "state_to_stance": state_to_stance}, - full_save_path, - ) - print(f"Model saved: {full_save_path}") - - return preds, yte, state_to_stance, feats - - -# ============================= METRICS ======================================= - -def metrics_nonoverlap(y: np.ndarray, preds: np.ndarray, rule: str, horizon_min: int) -> dict[str, float]: - """ - Score only every 'ahead'-th point to remove overlap of forward windows. - Adjust annualization for reduced sampling frequency. 
- """ - T = min(len(y), len(preds)) - if T == 0: - return {"hit_rate": np.nan, "ann_sharpe": np.nan, "n_points": 0} - - y = y[:T] - preds = preds[:T] - - step_min = max(1, int(pd.Timedelta(rule).total_seconds() // 60)) - ahead = max(1, int(round(horizon_min / step_min))) - - # Use the last index of each non-overlapping forward window - idx = np.arange(ahead - 1, T, ahead) - if len(idx) == 0: - return {"hit_rate": np.nan, "ann_sharpe": np.nan, "n_points": 0} - - y_s = y[idx] - p_s = preds[idx] - - pnl = p_s * y_s - hit = float((np.sign(p_s) == np.sign(y_s)).mean()) - - bars_per_day = int(round(24 * 60 / step_min)) - # We only take one observation per 'ahead' bars - eff_obs_per_day = bars_per_day / ahead - ann = np.sqrt(365 * max(eff_obs_per_day, 1e-12)) - +def metrics(y: np.ndarray, preds: np.ndarray, rule: str) -> dict[str,float]: + T = min(len(y), len(preds)); y, preds = y[:T], preds[:T] + pnl = preds * y + hit = (np.sign(preds) == np.sign(y)).mean() if T else np.nan + bars_per_day = int(round(24 * 60 / max(1, int(pd.Timedelta(rule).total_seconds() // 60)))) + ann = np.sqrt(365 * bars_per_day) sharpe = float(np.nanmean(pnl) / (np.nanstd(pnl) + 1e-12) * ann) - return {"hit_rate": hit, "ann_sharpe": sharpe, "n_points": int(len(idx))} + # Additional metrics + positive_returns = (pnl > 0).sum() + total_trades = len(pnl) + win_rate = positive_returns / total_trades if total_trades > 0 else 0 + profit_factor = abs(pnl[pnl > 0].sum() / (pnl[pnl < 0].sum() + 1e-12)) + + return { + "hit_rate": float(hit), + "ann_sharpe": sharpe, + "n_points": int(T), + "win_rate": win_rate, + "profit_factor": profit_factor, + "total_return": float(pnl.sum()) + } -# ============================== RUNNER ======================================= - -def run_rule_mc( - minute: pd.DataFrame, - rule: str, - n_states: int, - horizon_min: int, - cv: object, - folder_save_path: str | None, -) -> dict: +# ------------------------------ Runner --------------------------------------- +def run_rule_mc(minute: pd.DataFrame, rule: str, n_states: int, + horizon_min: int, cv, feature_selection_method: str, + n_features: int) -> dict: g = build_features(minute, rule, horizon_min) rows = [] + feature_importance = {} - for train, test, (ts, te), embargo_bars in sample_random_splits( - g=g, - rule=rule, - n_splits=cv.cv_splits, - test_bars=cv.cv_test_bars, - gap_bars_extra=cv.cv_gap_bars, - seed=cv.cv_seed, - since=cv.cv_since, - horizon_min=horizon_min, - ): - full_save_path = None - if folder_save_path: - full_save_path = f"{folder_save_path}/hmm_btc_eth_{rule}_{horizon_min}.joblib" - - preds, ytest, state_map, feats = fit_and_predict_train_test( - train, test, n_states, full_save_path + for i, (train, test, (ts, te)) in enumerate(get_cv_splits(g, cv.cv_method, cv.cv_splits, + cv.cv_test_bars, cv.cv_gap_bars, + cv.cv_seed, cv.cv_since)): + print(f" Split {i+1}: Train {len(train)} bars, Test {len(test)} bars") + + preds, ytest, state_map, feats, hmm = fit_and_predict_train_test( + train, test, n_states, feature_selection_method, n_features ) - m = metrics_nonoverlap(ytest, preds, rule, horizon_min) - rows.append( - { - "hit_rate": m["hit_rate"], - "ann_sharpe": m["ann_sharpe"], - "n_points": m["n_points"], - "test_span": (ts, te), - "embargo_bars": embargo_bars, - } - ) - + + m = metrics(ytest, preds, rule) + rows.append({ + "hit_rate": m["hit_rate"], + "ann_sharpe": m["ann_sharpe"], + "n_points": m["n_points"], + "test_span": (ts, te), + "win_rate": m["win_rate"], + "profit_factor": m["profit_factor"], + "total_return": 
m["total_return"] + }) + + # Track feature usage + for feat in feats: + feature_importance[feat] = feature_importance.get(feat, 0) + 1 + if not rows: return { - "rule": rule, - "hit_mean": np.nan, - "hit_std": np.nan, - "sharpe_mean": np.nan, - "sharpe_std": np.nan, - "splits": 0, + "rule": rule, "hit_mean": np.nan, "sharpe_mean": np.nan, + "splits": 0, "hit_std": np.nan, "sharpe_std": np.nan, + "win_rate_mean": np.nan, "profit_factor_mean": np.nan } - + hits = np.array([r["hit_rate"] for r in rows], dtype=float) sharpes = np.array([r["ann_sharpe"] for r in rows], dtype=float) - + win_rates = np.array([r["win_rate"] for r in rows], dtype=float) + profit_factors = np.array([r["profit_factor"] for r in rows], dtype=float) + + # Sort features by importance + sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True) + top_features = [f[0] for f in sorted_features[:5]] # Top 5 most frequently selected features + return { "rule": rule, "hit_mean": float(np.nanmean(hits)), "hit_std": float(np.nanstd(hits)), "sharpe_mean": float(np.nanmean(sharpes)), "sharpe_std": float(np.nanstd(sharpes)), + "win_rate_mean": float(np.nanmean(win_rates)), + "profit_factor_mean": float(np.nanmean(profit_factors)), "splits": len(rows), + "top_features": top_features } - +# ------------------------------ MAIN ----------------------------------------- def main(args: CLI) -> None: + print("Loading data...") btc = _load_bitstamp_csv(args.btc_csv, "btc") eth = _load_bitstamp_csv(args.eth_csv, "eth") minute = _align_minutely(btc, eth) + print(f"Aligned data: {len(minute)} minutes") - class CV: - pass - + class CV: pass cv = CV() cv.cv_splits = args.cv_splits cv.cv_test_bars = args.cv_test_bars cv.cv_gap_bars = args.cv_gap_bars cv.cv_seed = args.cv_seed cv.cv_since = args.cv_since + cv.cv_method = args.cv_method + + results = [] + for rule in args.resample_rules: + print(f"\nProcessing rule: {rule}") + result = run_rule_mc(minute, rule, args.n_states, args.horizon_min, cv, + args.feature_selection_method, args.n_features) + results.append(result) - results = [ - run_rule_mc(minute, rule, args.n_states, args.horizon_min, cv, args.folder_save_path) - for rule in args.resample_rules - ] df = pd.DataFrame(results).sort_values(by="sharpe_mean", ascending=False) - - print("# Randomized time-split comparison (embargo = max(user_gap, ceil((lookback+horizon)/rule)))") - print( - f"States={args.n_states} HorizonMin={args.horizon_min} Splits={args.cv_splits} " - f"TestBars={args.cv_test_bars} ExtraGapBars={args.cv_gap_bars} Since={args.cv_since}" - ) + + print("\n" + "="*80) + print("ENHANCED RESULTS: Randomized time-split comparison with Feature Selection") + print("="*80) + print(f"States={args.n_states} | HorizonMin={args.horizon_min} | Splits={args.cv_splits}") + print(f"TestBars={args.cv_test_bars} | GapBars={args.cv_gap_bars} | Since={args.cv_since}") + print(f"CV Method={args.cv_method} | Feature Selection={args.feature_selection_method} | N Features={args.n_features}") + print("="*80) + if not df.empty: df["hit"] = df["hit_mean"].round(4).astype(str) + " ± " + df["hit_std"].round(4).astype(str) df["sharpe"] = df["sharpe_mean"].round(4).astype(str) + " ± " + df["sharpe_std"].round(4).astype(str) - print(df[["rule", "splits", "hit", "sharpe"]].to_string(index=False)) + df["win_rate"] = (df["win_rate_mean"] * 100).round(2).astype(str) + "%" + df["profit_factor"] = df["profit_factor_mean"].round(3).astype(str) + + display_cols = ["rule", "splits", "hit", "sharpe", "win_rate", "profit_factor"] + 
print(df[display_cols].to_string(index=False)) + + # Show top features for best performing rule + best_rule = df.iloc[0] + print(f"\nTop features for best rule '{best_rule['rule']}':") + for i, feat in enumerate(best_rule.get('top_features', [])[:5]): + print(f" {i+1}. {feat}") else: print("No valid splits found") - if __name__ == "__main__": args = parse_args() - main(args) + main(args) \ No newline at end of file
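
Usage sketch (not part of the diff): a minimal way to drive the new pipeline programmatically, constructing the CLI dataclass directly instead of going through argparse. The CSV paths are placeholders; the field names and order follow the CLI dataclass in main.py above.

# illustrative_run.py -- hypothetical driver script, paths are assumptions
from pathlib import Path
from main import CLI, main

args = CLI(
    btc_csv=Path("../data/btcusd_1-min_data.csv"),  # placeholder path
    eth_csv=Path("../data/ethusd_1min_ohlc.csv"),   # placeholder path
    resample_rules=["30min", "1H"],
    n_states=3,
    horizon_min=60,
    cv_splits=5,
    cv_test_bars=500,
    cv_gap_bars=24,
    cv_seed=7,
    cv_since="2023-01-01",
    feature_selection_method="mutual_info",
    n_features=10,
    cv_method="expanding",
)
main(args)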