import argparse
import os
import time

import pandas as pd

import strategy_config as config
from cryptoquant_client import CryptoQuantClient
from features import create_features


def fetch_onchain_data(client, asset='BTC', days_back=365*2):
    """Fetch all on-chain metrics and align them on a lagged hourly index.

    Args:
        client: Object exposing ``fetch_all_onchain(asset, days_back=...)``
            whose result supports ``.items()`` yielding
            ``(metric_name, records)`` pairs; ``records`` is passed to
            ``pd.DataFrame``.
        asset: Asset symbol forwarded to the client.
        days_back: Days of history forwarded to the client.

    Returns:
        A DataFrame with one column per usable metric (named after the
        metric) on an hourly timestamp index, or an empty DataFrame when no
        metric produced usable data.
    """
    print(f"Fetching on-chain data for {asset}...")

    # Fetching every metric might take a while due to rate limits.
    raw_data = client.fetch_all_onchain(asset, days_back=days_back)

    dfs = []
    for metric_name, records in raw_data.items():
        if not records:
            print(f"⚠️ No data for {metric_name}")
            continue

        df = pd.DataFrame(records)

        # Standardize the date column; fall back to the first date-like one.
        if 'date' in df.columns:
            df['timestamp'] = pd.to_datetime(df['date'])
        else:
            cols = [c for c in df.columns if 'date' in c or 'time' in c]
            if not cols:
                print(f"❌ Could not find date column for {metric_name}")
                continue
            df['timestamp'] = pd.to_datetime(df[cols[0]])

        df = df.set_index('timestamp').sort_index()

        # The value column name varies by endpoint, so use a simple
        # heuristic: take the first numeric column.
        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) == 0:
            print(f"❌ No numeric data for {metric_name}")
            continue

        val_col = numeric_cols[0]
        df = df[[val_col]].rename(columns={val_col: metric_name})

        # Upsampling with ffill reindexes the frame, which fails on
        # duplicate labels; keep a single row per timestamp.
        df = df[~df.index.duplicated(keep='last')]

        # CAUTION: daily on-chain data is usually timestamped at 00:00 but
        # represents the FULL day's activity. Using it intraday on the same
        # date would be lookahead bias, so lag it by one day so that it is
        # only used AFTER it is available, then upsample to hourly.
        df = df.shift(1, freq='D')
        df = df.resample('1h').ffill()
        dfs.append(df)

    if not dfs:
        return pd.DataFrame()

    # Concatenate all on-chain metrics side by side.
    return pd.concat(dfs, axis=1)


def _infer_epoch_unit(ts_max):
    """Guess the unit of a numeric epoch timestamp from its magnitude."""
    if ts_max < 3_000_000_000:  # < 3e9: seconds (valid until ~2065)
        return 's'
    if ts_max < 3_000_000_000_000:  # < 3e12: milliseconds
        return 'ms'
    if ts_max < 3_000_000_000_000_000:  # < 3e15: microseconds
        return 'us'
    return 'ns'


def main():
    """Build the feature CSV from an OHLCV CSV plus on-chain metrics.

    Reads ``--csv`` (OHLCV with a ``Timestamp`` or ``Date`` column),
    resamples it to 1H bars, left-joins the on-chain metrics for
    ``config.ASSET``, runs ``create_features`` and writes the result to
    ``config.FEATURES_PATH``.

    Raises:
        ValueError: If the CSV has neither a ``Timestamp`` nor a ``Date``
            column.
    """
    parser = argparse.ArgumentParser(description="Prepare data for MVRV Strategy")
    parser.add_argument("--csv", required=True, help="Path to OHLCV CSV file")
    parser.add_argument("--days", type=int, default=365, help="Days of on-chain data to fetch")
    args = parser.parse_args()

    # 1. Load OHLCV
    print(f"Loading OHLCV from {args.csv}...")
    df_ohlcv = pd.read_csv(args.csv)

    # Standardize the OHLCV index from a Timestamp/Date column.
    if 'Timestamp' in df_ohlcv.columns:
        raw_ts = df_ohlcv['Timestamp']
        if pd.api.types.is_numeric_dtype(raw_ts):
            # Numeric epochs: detect the unit from the magnitude.
            unit = _infer_epoch_unit(raw_ts.max())
            df_ohlcv['timestamp'] = pd.to_datetime(raw_ts, unit=unit)
        else:
            # Date strings: comparing them with an int would raise.
            df_ohlcv['timestamp'] = pd.to_datetime(raw_ts)
    elif 'Date' in df_ohlcv.columns:
        df_ohlcv['timestamp'] = pd.to_datetime(df_ohlcv['Date'])
    else:
        raise ValueError(
            f"{args.csv} must contain a 'Timestamp' or 'Date' column; "
            f"found columns: {list(df_ohlcv.columns)}"
        )

    df_ohlcv = df_ohlcv.set_index('timestamp').sort_index()

    # Resample to 1H for feature engineering.
    df_1h = df_ohlcv.resample('1h').agg({
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last',
        'Volume': 'sum',
    }).dropna()

    # Rename to lowercase for features.py.
    df_1h = df_1h.rename(columns={
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'close',
        'Volume': 'volume',
    })

    print(f"OHLCV 1H shape: {df_1h.shape}")

    # 2. Fetch On-Chain
    client = CryptoQuantClient()
    df_onchain = fetch_onchain_data(client, asset=config.ASSET, days_back=args.days)

    print(f"On-Chain shape: {df_onchain.shape}")

    # 3. Merge: left join on the OHLCV index.
    df_merged = df_1h.join(df_onchain, how='left')

    # Forward fill on-chain data (it updates more slowly than price).
    df_merged = df_merged.ffill()

    # Drop rows that still have NaNs (start of data).
    df_merged = df_merged.dropna()

    print(f"Merged shape: {df_merged.shape}")

    # 4. Create Features
    print("Engineering features...")
    df_features = create_features(df_merged)
    df_features = df_features.dropna()

    # 5. Save
    print(f"Saving features to {config.FEATURES_PATH}...")
    df_features.to_csv(config.FEATURES_PATH)
    print("Done.")


if __name__ == "__main__":
    main()