"""Prepare merged OHLCV + on-chain feature data for the MVRV strategy."""
import argparse
import os
import time

import pandas as pd

import strategy_config as config
from cryptoquant_client import CryptoQuantClient
from features import create_features
|
|
def fetch_onchain_data(client, asset='BTC', days_back=365*2):
    """Fetch on-chain metrics, lag them by 1 day, and align to an hourly index.

    Parameters
    ----------
    client : CryptoQuantClient
        Client exposing ``fetch_all_onchain(asset, days_back=...)`` returning
        ``{metric_name: [record, ...]}`` where each record is a dict.
    asset : str
        Asset symbol to fetch (default ``'BTC'``).
    days_back : int
        History window in days.

    Returns
    -------
    pd.DataFrame
        One column per metric on an hourly DatetimeIndex, each series shifted
        forward by 1 day to avoid lookahead bias. Empty DataFrame if nothing
        usable was fetched.
    """
    print(f"Fetching on-chain data for {asset}...")

    # 1. Fetch all metrics (may take a while due to API rate limits).
    raw_data = client.fetch_all_onchain(asset, days_back=days_back)

    dfs = []
    for metric_name, records in raw_data.items():
        if not records:
            print(f"⚠️ No data for {metric_name}")
            continue

        df = pd.DataFrame(records)

        # Standardize the date column. Remember WHICH source column was used
        # so it can be excluded from the value-column search below — a
        # numeric date column (e.g. unix epoch) would otherwise be picked up
        # as the metric value by mistake.
        if 'date' in df.columns:
            date_col = 'date'
        else:
            # Fall back to any date-like column (case-insensitive, so
            # 'Date'/'Time' variants are also found).
            candidates = [c for c in df.columns
                          if 'date' in c.lower() or 'time' in c.lower()]
            date_col = candidates[0] if candidates else None
        if date_col is None:
            print(f"❌ Could not find date column for {metric_name}")
            continue
        df['timestamp'] = pd.to_datetime(df[date_col])

        df = df.set_index('timestamp').sort_index()

        # Keep only the value column. Its name varies by endpoint (usually
        # the metric name or 'value'), so use a simple heuristic: take the
        # first numeric column that is not the date source.
        numeric_cols = [c for c in df.select_dtypes(include=['number']).columns
                        if c != date_col]
        if numeric_cols:
            val_col = numeric_cols[0]
            df = df[[val_col]].rename(columns={val_col: metric_name})

            # CAUTION: Daily data from CryptoQuant (e.g. Total Active
            # Addresses) is usually timestamped at 00:00 but represents the
            # FULL day's activity. Using it intraday on the same day would be
            # lookahead bias, so SHIFT by 1 day to ensure we only use it
            # AFTER it is actually available (the next day). Funding rates
            # might be 8h, but 'window=day' implies daily aggregation, so a
            # 24h lag is the safe choice.
            df = df.shift(1, freq='D')  # Shift index by 1 Day

            # On-chain data is daily/block-level; upsample to hourly bars.
            df = df.resample('1h').ffill()
            dfs.append(df)
        else:
            print(f"❌ No numeric data for {metric_name}")

    if not dfs:
        return pd.DataFrame()

    # Concatenate side by side: one column per metric on a shared index.
    onchain_df = pd.concat(dfs, axis=1)
    return onchain_df
|
|
|
|
def main():
    """CLI entry point: load OHLCV, fetch on-chain data, merge, build features, save.

    Reads the OHLCV CSV given by ``--csv``, resamples it to 1h bars,
    left-joins the (1-day-lagged) hourly on-chain metrics, runs feature
    engineering, and writes the result to ``config.FEATURES_PATH``.

    Raises
    ------
    ValueError
        If the CSV has neither a 'Timestamp' nor a 'Date' column.
    """
    parser = argparse.ArgumentParser(description="Prepare data for MVRV Strategy")
    parser.add_argument("--csv", required=True, help="Path to OHLCV CSV file")
    parser.add_argument("--days", type=int, default=365, help="Days of on-chain data to fetch")
    args = parser.parse_args()

    # 1. Load OHLCV
    print(f"Loading OHLCV from {args.csv}...")
    df_ohlcv = pd.read_csv(args.csv)

    # Standardize OHLCV columns/index: expect an epoch 'Timestamp' or a
    # parseable 'Date' column.
    if 'Timestamp' in df_ohlcv.columns:
        # Smart detection of the epoch unit from its magnitude.
        ts_max = df_ohlcv['Timestamp'].max()
        if ts_max < 3000000000:  # < 3B, likely seconds (valid until ~2065)
            unit = 's'
        elif ts_max < 3000000000000:  # < 3T, likely milliseconds
            unit = 'ms'
        else:
            unit = None  # Let pandas guess (ns?)

        df_ohlcv['timestamp'] = pd.to_datetime(df_ohlcv['Timestamp'], unit=unit)
    elif 'Date' in df_ohlcv.columns:
        df_ohlcv['timestamp'] = pd.to_datetime(df_ohlcv['Date'])
    else:
        # Fail fast with a clear message instead of the cryptic KeyError
        # that set_index('timestamp') would raise below.
        raise ValueError(f"CSV {args.csv} must contain a 'Timestamp' or 'Date' column")

    df_ohlcv = df_ohlcv.set_index('timestamp').sort_index()

    # Resample to 1H bars for feature engineering.
    df_1h = df_ohlcv.resample('1h').agg({
        'Open': 'first',
        'High': 'max',
        'Low': 'min',
        'Close': 'last',
        'Volume': 'sum'
    }).dropna()

    # Rename to lowercase for features.py.
    df_1h = df_1h.rename(columns={
        'Open': 'open', 'High': 'high', 'Low': 'low', 'Close': 'close', 'Volume': 'volume'
    })

    print(f"OHLCV 1H shape: {df_1h.shape}")

    # 2. Fetch On-Chain (already lagged by 1 day inside the helper).
    client = CryptoQuantClient()
    df_onchain = fetch_onchain_data(client, asset=config.ASSET, days_back=args.days)
    print(f"On-Chain shape: {df_onchain.shape}")

    # 3. Merge: left join on the OHLCV index so price bars drive the timeline.
    df_merged = df_1h.join(df_onchain, how='left')

    # Forward fill on-chain data (it updates more slowly than price).
    df_merged = df_merged.ffill()

    # Drop rows where we still have NaNs (start of data, before on-chain begins).
    df_merged = df_merged.dropna()
    print(f"Merged shape: {df_merged.shape}")

    # 4. Create Features
    print("Engineering features...")
    df_features = create_features(df_merged)
    df_features = df_features.dropna()

    # 5. Save
    print(f"Saving features to {config.FEATURES_PATH}...")
    df_features.to_csv(config.FEATURES_PATH)
    print("Done.")
|
|
|
|
# Script entry point.
if __name__ == "__main__":
    main()
|