lowkey_backtest/prepare_data.py


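"""Prepare merged OHLCV + on-chain feature data for the MVRV strategy backtest.

Example invocation (the CSV path is illustrative; --days 365 is the default):

    python prepare_data.py --csv <path-to-ohlcv.csv> --days 365
"""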
import pandas as pd
import strategy_config as config
from cryptoquant_client import CryptoQuantClient
from features import create_features
import argparse
import os
import time


def fetch_onchain_data(client, asset='BTC', days_back=365*2):
    """Fetch each on-chain metric for `asset`, lag it by one day, and align it on an hourly index."""
    print(f"Fetching on-chain data for {asset}...")

    # 1. Fetch all metrics
    # Note: this might take a while due to rate limits
    raw_data = client.fetch_all_onchain(asset, days_back=days_back)
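    # Assumed shape, inferred from how the result is consumed below (not from the
    # CryptoQuantClient docs): a dict mapping metric name -> list of record dicts,
    # e.g. {'mvrv': [{'date': '2024-01-01', 'mvrv': 2.1}, ...], ...}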
    dfs = []
    for metric_name, records in raw_data.items():
        if not records:
            print(f"⚠️ No data for {metric_name}")
            continue

        df = pd.DataFrame(records)

        # Standardize the date column
        if 'date' in df.columns:
            df['timestamp'] = pd.to_datetime(df['date'])
        else:
            # Fall back to any date-like column
            cols = [c for c in df.columns if 'date' in c or 'time' in c]
            if cols:
                df['timestamp'] = pd.to_datetime(df[cols[0]])
            else:
                print(f"❌ Could not find date column for {metric_name}")
                continue

        df = df.set_index('timestamp').sort_index()

        # Keep only the value column. Its name varies by endpoint: usually the
        # metric name, 'value', or an endpoint-specific name. Simple heuristic:
        # take the first numeric column (the timestamp is already the index).
        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) > 0:
            val_col = numeric_cols[0]
            df = df[[val_col]].rename(columns={val_col: metric_name})
            # Resample to hourly and forward-fill (on-chain data is daily/block-level).
            # CAUTION: daily data from CryptoQuant (e.g. Total Active Addresses) is usually
            # timestamped at 00:00 but represents the FULL day's activity. Using it at
            # 10:00 AM on the same day would be lookahead bias, so we SHIFT it by 1 day
            # to ensure we only use it AFTER it is available (the next day).
            # Funding rates might be 8h, but 'window=day' implies daily aggregation,
            # so it is safer to lag them by 24h as well.
            df = df.shift(1, freq='D')  # Shift the index forward by 1 day
            df = df.resample('1h').ffill()
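            # Worked example (hypothetical values): a daily row stamped
            # 2024-01-01 00:00 with mvrv = 2.1 describes Jan 1's full activity.
            # After shift(1, freq='D') it is stamped 2024-01-02 00:00, and after
            # resample('1h').ffill() every hourly bar from 2024-01-02 00:00 onward
            # carries 2.1, so no bar during Jan 1 ever sees Jan 1's own value.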
            dfs.append(df)
        else:
            print(f"❌ No numeric data for {metric_name}")

    if not dfs:
        return pd.DataFrame()

    # Concatenate all on-chain metrics
    onchain_df = pd.concat(dfs, axis=1)
    return onchain_df
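
# Minimal smoke test (hypothetical; assumes CryptoQuant credentials are already
# configured for CryptoQuantClient):
#     client = CryptoQuantClient()
#     onchain = fetch_onchain_data(client, asset='BTC', days_back=90)
#     print(onchain.tail())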


def main():
    parser = argparse.ArgumentParser(description="Prepare data for MVRV Strategy")
    parser.add_argument("--csv", required=True, help="Path to OHLCV CSV file")
    parser.add_argument("--days", type=int, default=365, help="Days of on-chain data to fetch")
    args = parser.parse_args()

    # 1. Load OHLCV
    print(f"Loading OHLCV from {args.csv}...")
    df_ohlcv = pd.read_csv(args.csv)

    # Standardize OHLCV columns/index; expect either an epoch 'Timestamp'
    # column or a parseable 'Date' column.
    if 'Timestamp' in df_ohlcv.columns:
        # Detect the epoch unit from the magnitude of the values
        ts_max = df_ohlcv['Timestamp'].max()
        if ts_max < 3_000_000_000:        # < 3e9: likely seconds (valid until ~2065)
            unit = 's'
        elif ts_max < 3_000_000_000_000:  # < 3e12: likely milliseconds
            unit = 'ms'
        else:
            unit = None  # Let pandas infer (nanoseconds?)
        df_ohlcv['timestamp'] = pd.to_datetime(df_ohlcv['Timestamp'], unit=unit)
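        # Illustrative unit detection (epoch values are examples, not from the data):
        #   1_700_000_000     -> seconds      -> 2023-11-14
        #   1_700_000_000_000 -> milliseconds -> 2023-11-14
        #   anything larger is assumed to already be in nanoseconds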
    elif 'Date' in df_ohlcv.columns:
        df_ohlcv['timestamp'] = pd.to_datetime(df_ohlcv['Date'])

    df_ohlcv = df_ohlcv.set_index('timestamp').sort_index()

    # Resample to 1h bars for feature engineering
    df_1h = df_ohlcv.resample('1h').agg({
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last',
        'Volume': 'sum'
    }).dropna()

    # Rename to lowercase for features.py
    df_1h = df_1h.rename(columns={
        'Open': 'open', 'High': 'high', 'Low': 'low', 'Close': 'close', 'Volume': 'volume'
    })
    print(f"OHLCV 1H shape: {df_1h.shape}")

    # 2. Fetch on-chain data
    client = CryptoQuantClient()
    df_onchain = fetch_onchain_data(client, asset=config.ASSET, days_back=args.days)
    print(f"On-Chain shape: {df_onchain.shape}")

    # 3. Merge: left join the on-chain metrics onto the OHLCV index
    df_merged = df_1h.join(df_onchain, how='left')
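    # Illustrative alignment (hypothetical values): the hourly bar at
    # 2024-01-02 10:00 picks up the on-chain row stamped 2024-01-02 00:00,
    # which, thanks to the 1-day shift above, describes 2024-01-01's activity.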
    # Forward-fill on-chain data (it updates more slowly than price)
    df_merged = df_merged.ffill()
    # Drop rows that still have NaNs (the start of the series, before on-chain coverage begins)
    df_merged = df_merged.dropna()
    print(f"Merged shape: {df_merged.shape}")

    # 4. Create Features
    print("Engineering features...")
    df_features = create_features(df_merged)
    df_features = df_features.dropna()

    # 5. Save
    print(f"Saving features to {config.FEATURES_PATH}...")
    df_features.to_csv(config.FEATURES_PATH)
    print("Done.")


if __name__ == "__main__":
    main()