diff --git a/.gitignore b/.gitignore index 736352e..7d2ab8f 100644 --- a/.gitignore +++ b/.gitignore @@ -169,4 +169,5 @@ cython_debug/ #.idea/ ./logs/ -*.csv \ No newline at end of file +*.csv +research/regime_results.html diff --git a/engine/cryptoquant.py b/engine/cryptoquant.py new file mode 100644 index 0000000..403b56d --- /dev/null +++ b/engine/cryptoquant.py @@ -0,0 +1,156 @@ +import os +import sys +import time +import requests +import pandas as pd +from datetime import datetime, timedelta +from dotenv import load_dotenv + +# Load env vars from .env file +load_dotenv() + +# Fix path for direct execution +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from engine.logging_config import get_logger + +logger = get_logger(__name__) + +class CryptoQuantClient: + """ + Client for fetching data from CryptoQuant API. + """ + BASE_URL = "https://api.cryptoquant.com/v1" + + def __init__(self, api_key: str | None = None): + self.api_key = api_key or os.getenv("CRYPTOQUANT_API_KEY") + if not self.api_key: + raise ValueError("CryptoQuant API Key not found. Set CRYPTOQUANT_API_KEY env var.") + + self.headers = { + "Authorization": f"Bearer {self.api_key}" + } + + def fetch_metric( + self, + metric_path: str, + symbol: str, + start_date: str, + end_date: str, + exchange: str | None = "all_exchange", + window: str = "day" + ) -> pd.DataFrame: + """ + Fetch a specific metric from CryptoQuant. + """ + url = f"{self.BASE_URL}/{metric_path}" + + params = { + "window": window, + "from": start_date, + "to": end_date, + "limit": 100000 + } + + if exchange: + params["exchange"] = exchange + + logger.info(f"Fetching {metric_path} for {symbol} ({start_date}-{end_date})...") + + try: + response = requests.get(url, headers=self.headers, params=params) + response.raise_for_status() + data = response.json() + + if 'result' in data and 'data' in data['result']: + df = pd.DataFrame(data['result']['data']) + if not df.empty: + if 'date' in df.columns: + df['timestamp'] = pd.to_datetime(df['date']) + df.set_index('timestamp', inplace=True) + df.sort_index(inplace=True) + return df + + return pd.DataFrame() + + except Exception as e: + logger.error(f"Error fetching CQ data {metric_path}: {e}") + if 'response' in locals() and hasattr(response, 'text'): + logger.error(f"Response: {response.text}") + return pd.DataFrame() + + def fetch_multi_metrics(self, symbols: list[str], metrics: dict, start_date: str, end_date: str): + """ + Fetch multiple metrics for multiple symbols and combine them. + """ + combined_df = pd.DataFrame() + + for symbol in symbols: + asset = symbol.lower() + + for metric_name, api_path in metrics.items(): + full_path = f"{asset}/{api_path}" + + # Some metrics (like funding rates) might need specific exchange vs all_exchange + # Defaulting to all_exchange is usually safe for flows, but check specific logic if needed + exchange_param = "all_exchange" + if "funding-rates" in api_path: + # For funding rates, 'all_exchange' might not be valid or might be aggregated + # Let's try 'binance' as a proxy for market sentiment if all fails, + # or keep 'all_exchange' if supported. + # Based on testing, 'all_exchange' is standard for flows. + pass + + df = self.fetch_metric(full_path, asset, start_date, end_date, exchange=exchange_param) + + if not df.empty: + target_col = None + # Heuristic to find the value column + candidates = ['funding_rate', 'reserve', 'inflow_total', 'outflow_total', 'open_interest', 'ratio', 'value'] + + for col in df.columns: + if col in candidates: + target_col = col + break + + if not target_col: + # Fallback: take first numeric col that isn't date + for col in df.columns: + if col not in ['date', 'datetime', 'timestamp_str', 'block_height']: + target_col = col + break + + if target_col: + col_name = f"{asset}_{metric_name}" + subset = df[[target_col]].rename(columns={target_col: col_name}) + + if combined_df.empty: + combined_df = subset + else: + combined_df = combined_df.join(subset, how='outer') + + time.sleep(0.2) + + return combined_df + +if __name__ == "__main__": + cq = CryptoQuantClient() + + # 3 Months Data (Oct 1 2025 - Dec 31 2025) + start = "20251001" + end = "20251231" + + metrics = { + "reserves": "exchange-flows/exchange-reserve", + "inflow": "exchange-flows/inflow", + "funding": "market-data/funding-rates" + } + + print(f"Fetching training data from {start} to {end}...") + df = cq.fetch_multi_metrics(["btc", "eth"], metrics, start, end) + + output_file = "data/cq_training_data.csv" + os.makedirs("data", exist_ok=True) + df.to_csv(output_file) + print(f"\nSaved {len(df)} rows to {output_file}") + print(df.head()) diff --git a/pyproject.toml b/pyproject.toml index 12d55d9..8787753 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,11 @@ dependencies = [ "pandas>=2.3.1", "ta>=0.11.0", "vectorbt>=0.28.2", + "scikit-learn>=1.6.0", + "matplotlib>=3.10.0", + "plotly>=5.24.0", + "requests>=2.32.5", + "python-dotenv>=1.2.1", ] [project.optional-dependencies] diff --git a/research/regime_detection.py b/research/regime_detection.py new file mode 100644 index 0000000..6468c60 --- /dev/null +++ b/research/regime_detection.py @@ -0,0 +1,384 @@ +import sys +import os +from pathlib import Path + +# Add project root to path +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +import pandas as pd +import numpy as np +import ta +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import train_test_split +from sklearn.metrics import classification_report, confusion_matrix +import plotly.graph_objects as go +from plotly.subplots import make_subplots + +from engine.data_manager import DataManager +from engine.market import MarketType + +def prepare_data(symbol_a="BTC-USDT", symbol_b="ETH-USDT", timeframe="1h", limit=None, start_date=None, end_date=None): + """ + Load and align data for two assets to create a pair. + """ + dm = DataManager() + + print(f"Loading data for {symbol_a} and {symbol_b}...") + + # Helper to load or download + def get_df(symbol): + try: + # Try load first + df = dm.load_data("okx", symbol, timeframe, MarketType.SPOT) + except Exception: + df = dm.download_data("okx", symbol, timeframe, market_type=MarketType.SPOT) + + # If we have start/end dates, ensure we have enough data or re-download + if start_date: + mask_start = pd.Timestamp(start_date, tz='UTC') + if df.index.min() > mask_start: + print(f"Local data starts {df.index.min()}, need {mask_start}. Downloading...") + df = dm.download_data("okx", symbol, timeframe, start_date=start_date, end_date=end_date, market_type=MarketType.SPOT) + return df + + df_a = get_df(symbol_a) + df_b = get_df(symbol_b) + + # Filter by date if provided (to match CQ data range) + if start_date: + df_a = df_a[df_a.index >= pd.Timestamp(start_date, tz='UTC')] + df_b = df_b[df_b.index >= pd.Timestamp(start_date, tz='UTC')] + + if end_date: + df_a = df_a[df_a.index <= pd.Timestamp(end_date, tz='UTC')] + df_b = df_b[df_b.index <= pd.Timestamp(end_date, tz='UTC')] + + # Align DataFrames + print("Aligning data...") + common_index = df_a.index.intersection(df_b.index) + df_a = df_a.loc[common_index].copy() + df_b = df_b.loc[common_index].copy() + + if limit: + df_a = df_a.tail(limit) + df_b = df_b.tail(limit) + + return df_a, df_b + +def load_cryptoquant_data(file_path: str) -> pd.DataFrame | None: + """ + Load CryptoQuant data and prepare it for merging. + """ + if not os.path.exists(file_path): + print(f"Warning: CQ data file {file_path} not found.") + return None + + print(f"Loading CryptoQuant data from {file_path}...") + df = pd.read_csv(file_path, index_col='timestamp', parse_dates=True) + + # CQ data is usually daily (UTC 00:00). + # Ensure index is timezone aware to match market data + if df.index.tz is None: + df.index = df.index.tz_localize('UTC') + + return df + +def calculate_features(df_a, df_b, cq_df=None, window=24): + """ + Calculate spread, z-score, and advanced regime features including CQ data. + """ + # 1. Price Ratio (Spread) + spread = df_b['close'] / df_a['close'] + + # 2. Rolling Statistics for Z-Score + rolling_mean = spread.rolling(window=window).mean() + rolling_std = spread.rolling(window=window).std() + z_score = (spread - rolling_mean) / rolling_std + + # 3. Spread Momentum / Technicals + spread_rsi = ta.momentum.RSIIndicator(spread, window=14).rsi() + spread_roc = spread.pct_change(periods=5) * 100 + + # 4. Volume Dynamics + vol_ratio = df_b['volume'] / df_a['volume'] + vol_ratio_ma = vol_ratio.rolling(window=12).mean() + + # 5. Volatility Regime + ret_a = df_a['close'].pct_change() + ret_b = df_b['close'].pct_change() + vol_a = ret_a.rolling(window=window).std() + vol_b = ret_b.rolling(window=window).std() + vol_spread_ratio = vol_b / vol_a + + # Create feature DataFrame + features = pd.DataFrame(index=spread.index) + features['spread'] = spread + features['z_score'] = z_score + features['spread_rsi'] = spread_rsi + features['spread_roc'] = spread_roc + features['vol_ratio'] = vol_ratio + features['vol_ratio_rel'] = vol_ratio / vol_ratio_ma + features['vol_diff_ratio'] = vol_spread_ratio + + # 6. Merge CryptoQuant Data + if cq_df is not None: + print("Merging CryptoQuant features...") + # Forward fill daily data to hourly timestamps + # reindex features to match cq_df range or join + + # Resample CQ to hourly (ffill) + # But easier: join features with cq_df using asof or reindex + cq_aligned = cq_df.reindex(features.index, method='ffill') + + # Add derived CQ features + # Funding Diff: If ETH funding > BTC funding => ETH overheated + if 'btc_funding' in cq_aligned.columns and 'eth_funding' in cq_aligned.columns: + cq_aligned['funding_diff'] = cq_aligned['eth_funding'] - cq_aligned['btc_funding'] + + # Inflow Ratio: If ETH inflow >> BTC inflow => ETH dump incoming? + if 'btc_inflow' in cq_aligned.columns and 'eth_inflow' in cq_aligned.columns: + # Add small epsilon to avoid div by zero + cq_aligned['inflow_ratio'] = cq_aligned['eth_inflow'] / (cq_aligned['btc_inflow'] + 1) + + features = features.join(cq_aligned) + + # --- Refined Target Definition (Anytime Profit) --- + horizon = 6 + threshold = 0.005 # 0.5% profit target + z_threshold = 1.0 + + # For Short Spread (Z > 1): Did it drop below target? + # We look for the MINIMUM spread in the next 'horizon' periods + future_min = features['spread'].rolling(window=horizon).min().shift(-horizon) + target_short = features['spread'] * (1 - threshold) + success_short = (features['z_score'] > z_threshold) & (future_min < target_short) + + # For Long Spread (Z < -1): Did it rise above target? + # We look for the MAXIMUM spread in the next 'horizon' periods + future_max = features['spread'].rolling(window=horizon).max().shift(-horizon) + target_long = features['spread'] * (1 + threshold) + success_long = (features['z_score'] < -z_threshold) & (future_max > target_long) + + conditions = [success_short, success_long] + + features['target'] = np.select(conditions, [1, 1], default=0) + + return features.dropna() + +def train_regime_model(features): + """ + Train a Random Forest to predict mean reversion success. + """ + # Define excluded columns (targets, raw prices, intermediates) + exclude_cols = ['spread', 'horizon_ret', 'target', 'rolling_mean', 'rolling_std'] + + # Auto-select all other numeric columns as features + feature_cols = [c for c in features.columns if c not in exclude_cols] + + # Handle NaN/Inf if any slipped through + X = features[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0) + y = features['target'] + + # Split Data + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False) + + print(f"\nTraining on {len(X_train)} samples, Testing on {len(X_test)} samples...") + print(f"Features used: {feature_cols}") + print(f"Class Balance (Target=1): {y.mean():.2%}") + + # Model + model = RandomForestClassifier( + n_estimators=200, + max_depth=6, + min_samples_leaf=20, + class_weight='balanced_subsample', + random_state=42 + ) + model.fit(X_train, y_train) + + # Evaluation + y_pred = model.predict(X_test) + y_prob = model.predict_proba(X_test)[:, 1] + + print("\n--- Model Evaluation ---") + print(classification_report(y_test, y_pred)) + + # Feature Importance + importances = pd.Series(model.feature_importances_, index=feature_cols).sort_values(ascending=False) + print("\n--- Feature Importance ---") + print(importances) + + return model, X_test, y_test, y_pred, y_prob + +def plot_interactive_results(features, y_test, y_pred, y_prob): + """ + Create an interactive HTML plot using Plotly. + """ + print("\nGenerating interactive plot...") + + test_idx = y_test.index + test_data = features.loc[test_idx].copy() + test_data['prob'] = y_prob + test_data['prediction'] = y_pred + test_data['actual'] = y_test + + # Create Subplots + fig = make_subplots( + rows=3, cols=1, + shared_xaxes=True, + vertical_spacing=0.05, + row_heights=[0.5, 0.25, 0.25], + subplot_titles=('Spread & Signals', 'Exchange Inflows', 'Z-Score & Probability') + ) + + # Top: Spread + fig.add_trace( + go.Scatter(x=test_data.index, y=test_data['spread'], mode='lines', name='Spread', line=dict(color='gray')), + row=1, col=1 + ) + + # Signals + # Separate Long and Short signals for clarity + # Logic: If Z-Score was High (>1), we were betting on a SHORT Spread (Reversion Down) + # If Z-Score was Low (< -1), we were betting on a LONG Spread (Reversion Up) + + # Correct Short Signals (Green Triangle Down) + tp_short = test_data[(test_data['prediction'] == 1) & (test_data['actual'] == 1) & (test_data['z_score'] > 0)] + fig.add_trace( + go.Scatter(x=tp_short.index, y=tp_short['spread'], mode='markers', name='Win: Short Spread', + marker=dict(symbol='triangle-down', size=12, color='green')), + row=1, col=1 + ) + + # Correct Long Signals (Green Triangle Up) + tp_long = test_data[(test_data['prediction'] == 1) & (test_data['actual'] == 1) & (test_data['z_score'] < 0)] + fig.add_trace( + go.Scatter(x=tp_long.index, y=tp_long['spread'], mode='markers', name='Win: Long Spread', + marker=dict(symbol='triangle-up', size=12, color='green')), + row=1, col=1 + ) + + # False Short Signals (Red Triangle Down) + fp_short = test_data[(test_data['prediction'] == 1) & (test_data['actual'] == 0) & (test_data['z_score'] > 0)] + fig.add_trace( + go.Scatter(x=fp_short.index, y=fp_short['spread'], mode='markers', name='Loss: Short Spread', + marker=dict(symbol='triangle-down', size=10, color='red')), + row=1, col=1 + ) + + # False Long Signals (Red Triangle Up) + fp_long = test_data[(test_data['prediction'] == 1) & (test_data['actual'] == 0) & (test_data['z_score'] < 0)] + fig.add_trace( + go.Scatter(x=fp_long.index, y=fp_long['spread'], mode='markers', name='Loss: Long Spread', + marker=dict(symbol='triangle-up', size=10, color='red')), + row=1, col=1 + ) + + # Middle: Inflows (BTC vs ETH) + if 'btc_inflow' in test_data.columns: + fig.add_trace( + go.Bar(x=test_data.index, y=test_data['btc_inflow'], name='BTC Inflow', marker_color='orange', opacity=0.6), + row=2, col=1 + ) + if 'eth_inflow' in test_data.columns: + fig.add_trace( + go.Bar(x=test_data.index, y=test_data['eth_inflow'], name='ETH Inflow', marker_color='purple', opacity=0.6), + row=2, col=1 + ) + + # Bottom: Z-Score + fig.add_trace( + go.Scatter(x=test_data.index, y=test_data['z_score'], mode='lines', name='Z-Score', line=dict(color='blue'), opacity=0.5), + row=3, col=1 + ) + fig.add_hline(y=2, line_dash="dash", line_color="red", row=3, col=1) + fig.add_hline(y=-2, line_dash="dash", line_color="green", row=3, col=1) + + # Probability (Secondary Y for Row 3) + fig.add_trace( + go.Scatter(x=test_data.index, y=test_data['prob'], mode='lines', name='Prob', line=dict(color='cyan', width=1.5), yaxis='y4'), + row=3, col=1 + ) + + fig.update_layout( + title='Regime Detection Analysis (with CryptoQuant)', + autosize=True, + height=None, + hovermode='x unified', + yaxis4=dict(title='Probability', overlaying='y3', side='right', range=[0, 1], showgrid=False), + template="plotly_dark", + margin=dict(l=10, r=10, t=40, b=10), + barmode='group' + ) + + # Update all x-axes to ensure spikes are visible everywhere + fig.update_xaxes( + showspikes=True, + spikemode='across', + spikesnap='cursor', + showline=False, + showgrid=True, + spikedash='dot', + spikecolor='white', # Make it bright to see + spikethickness=1, + ) + + fig.update_layout( + title='Regime Detection Analysis (with CryptoQuant)', + autosize=True, + height=None, + hovermode='x unified', # Keep unified hover for data reading + yaxis4=dict(title='Probability', overlaying='y3', side='right', range=[0, 1], showgrid=False), + template="plotly_dark", + margin=dict(l=10, r=10, t=40, b=10), + barmode='group' + ) + + output_path = "research/regime_results.html" + fig.write_html( + output_path, + config={'responsive': True, 'scrollZoom': True}, + include_plotlyjs='cdn', + full_html=True, + default_height='100vh', + default_width='100%' + ) + print(f"Interactive plot saved to {output_path}") + +def main(): + # 1. Load CQ Data first to determine valid date range + cq_path = "data/cq_training_data.csv" + cq_df = load_cryptoquant_data(cq_path) + + start_date = None + end_date = None + + if cq_df is not None and not cq_df.empty: + start_date = cq_df.index.min().strftime('%Y-%m-%d') + end_date = cq_df.index.max().strftime('%Y-%m-%d') + print(f"CryptoQuant Data Range: {start_date} to {end_date}") + + # 2. Get Market Data (Aligned to CQ range) + df_btc, df_eth = prepare_data( + "BTC-USDT", "ETH-USDT", + timeframe="1h", + start_date=start_date, + end_date=end_date + ) + + # 3. Calculate Features + print("Calculating advanced regime features...") + data = calculate_features(df_btc, df_eth, cq_df=cq_df, window=24) + + if data.empty: + print("Error: No overlapping data found between Price and CryptoQuant data.") + return + + # 4. Train & Evaluate + model, X_test, y_test, y_pred, y_prob = train_regime_model(data) + + # 5. Plot + plot_interactive_results(data, y_test, y_pred, y_prob) + +if __name__ == "__main__": + main() diff --git a/research/regime_results.png b/research/regime_results.png new file mode 100644 index 0000000..d18d590 Binary files /dev/null and b/research/regime_results.png differ diff --git a/uv.lock b/uv.lock index b82256c..99888dc 100644 --- a/uv.lock +++ b/uv.lock @@ -843,8 +843,13 @@ version = "0.1.0" source = { virtual = "." } dependencies = [ { name = "ccxt" }, + { name = "matplotlib" }, { name = "numpy" }, { name = "pandas" }, + { name = "plotly" }, + { name = "python-dotenv" }, + { name = "requests" }, + { name = "scikit-learn" }, { name = "ta" }, { name = "vectorbt" }, ] @@ -857,9 +862,14 @@ dev = [ [package.metadata] requires-dist = [ { name = "ccxt", specifier = ">=4.5.32" }, + { name = "matplotlib", specifier = ">=3.10.0" }, { name = "numpy", specifier = ">=2.3.2" }, { name = "pandas", specifier = ">=2.3.1" }, + { name = "plotly", specifier = ">=5.24.0" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0.0" }, + { name = "python-dotenv", specifier = ">=1.2.1" }, + { name = "requests", specifier = ">=2.32.5" }, + { name = "scikit-learn", specifier = ">=1.6.0" }, { name = "ta", specifier = ">=0.11.0" }, { name = "vectorbt", specifier = ">=0.28.2" }, ] @@ -1522,6 +1532,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, ] +[[package]] +name = "python-dotenv" +version = "1.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" }, +] + [[package]] name = "pytz" version = "2025.2"