lowkey_backtest/prepare_data.py


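"""Prepare merged OHLCV + on-chain feature data for the MVRV strategy backtest.

Example invocation (the CSV path is illustrative; --days 365 is the default):

    python prepare_data.py --csv <path-to-ohlcv.csv> --days 365
"""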
import pandas as pd
import strategy_config as config
from cryptoquant_client import CryptoQuantClient
from features import create_features
import argparse
import os
import time


def fetch_onchain_data(client, asset='BTC', days_back=365*2):
    """Fetch each on-chain metric for `asset`, lag it by one day, and align it on an hourly index."""
    print(f"Fetching on-chain data for {asset}...")

    # 1. Fetch all metrics
    # Note: this might take a while due to rate limits
    raw_data = client.fetch_all_onchain(asset, days_back=days_back)
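    # Assumed shape, inferred from how the result is consumed below (not from the
    # CryptoQuantClient docs): a dict mapping metric name -> list of record dicts,
    # e.g. {'mvrv': [{'date': '2024-01-01', 'mvrv': 2.1}, ...], ...}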
    dfs = []
    for metric_name, records in raw_data.items():
        if not records:
            print(f"⚠️ No data for {metric_name}")
            continue

        df = pd.DataFrame(records)

        # Standardize the date column
        if 'date' in df.columns:
            df['timestamp'] = pd.to_datetime(df['date'])
        else:
            # Fall back to any date-like column
            cols = [c for c in df.columns if 'date' in c or 'time' in c]
            if cols:
                df['timestamp'] = pd.to_datetime(df[cols[0]])
            else:
                print(f"❌ Could not find date column for {metric_name}")
                continue

        df = df.set_index('timestamp').sort_index()

        # Keep only the value column. Its name varies by endpoint: usually the
        # metric name, 'value', or an endpoint-specific name. Simple heuristic:
        # take the first numeric column (the timestamp is already the index).
        numeric_cols = df.select_dtypes(include=['number']).columns
        if len(numeric_cols) > 0:
            val_col = numeric_cols[0]
            df = df[[val_col]].rename(columns={val_col: metric_name})
            # Resample to hourly and forward-fill (on-chain data is daily/block-level).
            # CAUTION: daily data from CryptoQuant (e.g. Total Active Addresses) is usually
            # timestamped at 00:00 but represents the FULL day's activity. Using it at
            # 10:00 AM on the same day would be lookahead bias, so we SHIFT it by 1 day
            # to ensure we only use it AFTER it is available (the next day).
            # Funding rates might be 8h, but 'window=day' implies daily aggregation,
            # so it is safer to lag them by 24h as well.
            df = df.shift(1, freq='D')  # Shift the index forward by 1 day
            df = df.resample('1h').ffill()
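            # Worked example (hypothetical values): a daily row stamped
            # 2024-01-01 00:00 with mvrv = 2.1 describes Jan 1's full activity.
            # After shift(1, freq='D') it is stamped 2024-01-02 00:00, and after
            # resample('1h').ffill() every hourly bar from 2024-01-02 00:00 onward
            # carries 2.1, so no bar during Jan 1 ever sees Jan 1's own value.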
            dfs.append(df)
        else:
            print(f"❌ No numeric data for {metric_name}")

    if not dfs:
        return pd.DataFrame()

    # Concatenate all on-chain metrics
    onchain_df = pd.concat(dfs, axis=1)
    return onchain_df
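
# Minimal smoke test (hypothetical; assumes CryptoQuant credentials are already
# configured for CryptoQuantClient):
#     client = CryptoQuantClient()
#     onchain = fetch_onchain_data(client, asset='BTC', days_back=90)
#     print(onchain.tail())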


def main():
    parser = argparse.ArgumentParser(description="Prepare data for MVRV Strategy")
    parser.add_argument("--csv", required=True, help="Path to OHLCV CSV file")
    parser.add_argument("--days", type=int, default=365, help="Days of on-chain data to fetch")
    args = parser.parse_args()

    # 1. Load OHLCV
    print(f"Loading OHLCV from {args.csv}...")
    df_ohlcv = pd.read_csv(args.csv)

    # Standardize OHLCV columns/index; expect either an epoch 'Timestamp'
    # column or a parseable 'Date' column.
    if 'Timestamp' in df_ohlcv.columns:
        # Detect the epoch unit from the magnitude of the values
        ts_max = df_ohlcv['Timestamp'].max()
        if ts_max < 3_000_000_000:        # < 3e9: likely seconds (valid until ~2065)
            unit = 's'
        elif ts_max < 3_000_000_000_000:  # < 3e12: likely milliseconds
            unit = 'ms'
        else:
            unit = None  # Let pandas infer (nanoseconds?)
        df_ohlcv['timestamp'] = pd.to_datetime(df_ohlcv['Timestamp'], unit=unit)
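        # Illustrative unit detection (epoch values are examples, not from the data):
        #   1_700_000_000     -> seconds      -> 2023-11-14
        #   1_700_000_000_000 -> milliseconds -> 2023-11-14
        #   anything larger is assumed to already be in nanoseconds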
    elif 'Date' in df_ohlcv.columns:
        df_ohlcv['timestamp'] = pd.to_datetime(df_ohlcv['Date'])

    df_ohlcv = df_ohlcv.set_index('timestamp').sort_index()

    # Resample to 1h bars for feature engineering
    df_1h = df_ohlcv.resample('1h').agg({
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last',
        'Volume': 'sum'
    }).dropna()

    # Rename to lowercase for features.py
    df_1h = df_1h.rename(columns={
        'Open': 'open', 'High': 'high', 'Low': 'low', 'Close': 'close', 'Volume': 'volume'
    })
    print(f"OHLCV 1H shape: {df_1h.shape}")

    # 2. Fetch on-chain data
    client = CryptoQuantClient()
    df_onchain = fetch_onchain_data(client, asset=config.ASSET, days_back=args.days)
    print(f"On-Chain shape: {df_onchain.shape}")

    # 3. Merge: left join the on-chain metrics onto the OHLCV index
    df_merged = df_1h.join(df_onchain, how='left')
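    # Illustrative alignment (hypothetical values): the hourly bar at
    # 2024-01-02 10:00 picks up the on-chain row stamped 2024-01-02 00:00,
    # which, thanks to the 1-day shift above, describes 2024-01-01's activity.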
    # Forward-fill on-chain data (it updates more slowly than price)
    df_merged = df_merged.ffill()
    # Drop rows that still have NaNs (the start of the series, before on-chain coverage begins)
    df_merged = df_merged.dropna()
    print(f"Merged shape: {df_merged.shape}")

    # 4. Create Features
    print("Engineering features...")
    df_features = create_features(df_merged)
    df_features = df_features.dropna()

    # 5. Save
    print(f"Saving features to {config.FEATURES_PATH}...")
    df_features.to_csv(config.FEATURES_PATH)
    print("Done.")


if __name__ == "__main__":
    main()