#!/usr/bin/env python3
"""
Visual test for timeframe aggregation utilities.

This script loads BTC minute data and aggregates it to different timeframes,
then plots candlestick charts to visually verify the aggregation correctness.
"""

import os
import sys
import traceback
from datetime import datetime, timedelta
from typing import Optional

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.patches import Rectangle

# Add the project root to Python path so the project package can be imported
# when this file is executed directly as a script.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

from IncrementalTrader.utils import aggregate_minute_data_to_timeframe, parse_timeframe_to_minutes

# Canonical OHLCV column names, in the order the keywords are matched.
_OHLCV_COLUMNS = ('open', 'high', 'low', 'close', 'volume')

# Exact column names tried first when looking for a timestamp column.
_TIMESTAMP_CANDIDATES = ('timestamp', 'time', 'datetime', 'date')

# Dates with at least this many rows count as "full" trading days
# (a complete day of minute bars has 1440 rows).
_MIN_ROWS_FOR_GOOD_DATE = 1000


def _find_timestamp_column(columns):
    """Return the name of the column that most likely holds timestamps, or None."""
    for candidate in _TIMESTAMP_CANDIDATES:
        if candidate in columns:
            return candidate

    # Fall back to any column whose name merely mentions time or date.
    for col in columns:
        lowered = str(col).lower()
        if 'time' in lowered or 'date' in lowered:
            return col
    return None


def _build_ohlcv_column_mapping(columns):
    """Map source column names to canonical OHLCV names, each target used at most once."""
    mapping = {}
    claimed = set()

    # Pass 1: exact (case-insensitive) names win, so e.g. "Close" is preferred
    # over "Close Time" regardless of column order.
    for col in columns:
        lowered = str(col).lower()
        if lowered in _OHLCV_COLUMNS and lowered not in claimed:
            mapping[col] = lowered
            claimed.add(lowered)

    # Pass 2: substring matches for targets that are still unclaimed. A column
    # is only ever considered for the first keyword it contains.
    for col in columns:
        if col in mapping:
            continue
        lowered = str(col).lower()
        for target in _OHLCV_COLUMNS:
            if target in lowered:
                if target not in claimed:
                    mapping[col] = target
                    claimed.add(target)
                break

    return mapping


def load_btc_data(file_path: str, date_filter: Optional[str] = None,
                  max_rows: Optional[int] = None) -> Optional[pd.DataFrame]:
    """
    Load BTC minute data from CSV file.

    The CSV is normalised to lowercase ``open``/``high``/``low``/``close``/
    ``volume`` columns plus a ``timestamp`` datetime column. Rows with
    non-positive prices are dropped, the data is restricted to a single day
    (``date_filter`` or an auto-selected day), sorted by time and truncated
    to ``max_rows``.

    Args:
        file_path: Path to the CSV file
        date_filter: Date to filter (e.g., "2024-01-01"). When None, the most
            recent date with at least 1000 rows is selected automatically;
            if no such date exists the data is left unfiltered.
        max_rows: Maximum number of rows to keep (earliest rows are kept)

    Returns:
        DataFrame with OHLCV data sorted by timestamp, or None when the file
        cannot be read, required columns are missing, or the requested date
        has no rows.
    """
    print(f"📊 Loading BTC data from {file_path}")

    try:
        # Load the CSV file
        df = pd.read_csv(file_path)
        print(f" 📈 Loaded {len(df)} rows")
        print(f" 📋 Columns: {list(df.columns)}")

        # Show the first few rows to help understand the format
        print(" 🔍 First few rows:")
        print(df.head())

        # Handle Unix timestamp format
        if 'Timestamp' in df.columns:
            print(" 🕐 Converting Unix timestamps...")
            df['timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')
            print(f" ✅ Converted timestamps from {df['timestamp'].min()} to {df['timestamp'].max()}")
        else:
            timestamp_col = _find_timestamp_column(df.columns)
            if timestamp_col is None:
                print(" ❌ Could not find timestamp column")
                return None

            print(f" 🕐 Using timestamp column: {timestamp_col}")
            df['timestamp'] = pd.to_datetime(df[timestamp_col])

        # Standardize column names
        df = df.rename(columns=_build_ohlcv_column_mapping(df.columns))

        # Ensure we have required columns
        missing_cols = [col for col in _OHLCV_COLUMNS if col not in df.columns]
        if missing_cols:
            print(f" ❌ Missing required columns: {missing_cols}")
            return None

        # Remove rows with zero or invalid prices
        initial_len = len(df)
        df = df[(df['open'] > 0) & (df['high'] > 0) & (df['low'] > 0) & (df['close'] > 0)]
        if len(df) < initial_len:
            print(f" 🧹 Removed {initial_len - len(df)} rows with invalid prices")

        row_dates = df['timestamp'].dt.date

        if date_filter:
            # Filter by the requested date
            target_date = pd.to_datetime(date_filter).date()
            filtered = df[row_dates == target_date]
            print(f" 📅 Filtered to {date_filter}: {len(filtered)} rows")

            if len(filtered) == 0:
                print(f" ⚠️ No data found for {date_filter}")
                # Report dates from the UNFILTERED data; the filtered frame is
                # empty here and would never have anything to show.
                available_dates = sorted(row_dates.unique())
                print(f" 📅 Available dates (sample): {available_dates[:10]}")
                return None

            df = filtered
        elif date_filter is None:
            # No date filter: pick a date with lots of data
            print(" 📅 Finding a good date with active trading...")

            # Group by date and count rows (groupby sorts the dates ascending)
            date_counts = df.groupby(row_dates).size()

            # Find dates with close to 1440 minutes (full day)
            good_dates = date_counts[date_counts >= _MIN_ROWS_FOR_GOOD_DATE].index

            if len(good_dates) > 0:
                selected_date = good_dates[-1]  # Most recent good date
                df = df[row_dates == selected_date]
                print(f" ✅ Auto-selected date {selected_date} with {len(df)} data points")
            else:
                print(" ⚠️ No dates with sufficient data found")

        # Sort BEFORE truncating so that max_rows keeps the earliest rows even
        # when the CSV itself is not in chronological order.
        df = df.sort_values('timestamp')

        # Limit rows if specified
        if max_rows and len(df) > max_rows:
            df = df.head(max_rows)
            print(f" ✂️ Limited to {max_rows} rows")

        print(f" ✅ Final dataset: {len(df)} rows from {df['timestamp'].min()} to {df['timestamp'].max()}")

        return df

    except Exception as e:
        # Script-level boundary: report the problem and signal failure with
        # None instead of crashing the visual test.
        print(f" ❌ Error loading data: {e}")
        traceback.print_exc()
        return None


def convert_df_to_minute_data(df: pd.DataFrame) -> list:
    """
    Convert DataFrame to list of dictionaries for aggregation.

    Args:
        df: DataFrame with ``timestamp``, ``open``, ``high``, ``low``,
            ``close`` and ``volume`` columns

    Returns:
        One dict per row, in row order, with the timestamp passed through
        unchanged and the five OHLCV values converted to ``float``.
    """
    columns = ['timestamp', *_OHLCV_COLUMNS]
    return [
        {
            'timestamp': timestamp,
            'open': float(open_price),
            'high': float(high_price),
            'low': float(low_price),
            'close': float(close_price),
            'volume': float(volume),
        }
        for timestamp, open_price, high_price, low_price, close_price, volume
        in df[columns].itertuples(index=False, name=None)
    ]


def plot_candlesticks(ax, data, timeframe, color='blue', alpha=0.7, width_factor=0.8):
    """
    Plot candlestick chart on given axes.

    Each bar's timestamp is treated as the END of its period, so a candle is
    drawn from ``timestamp - timeframe`` to ``timestamp`` with the wick at the
    centre of that span.

    Args:
        ax: Matplotlib axes
        data: List of OHLCV dictionaries (``timestamp``, ``open``, ``high``,
            ``low``, ``close``); timestamps are assumed to support
            subtraction of a ``pd.Timedelta``
        timeframe: Timeframe string, parsed with ``parse_timeframe_to_minutes``
        color: Color for the candlesticks. The default ``'blue'`` means
            "colour by direction" (green when close >= open, red otherwise);
            any other value is used for every candle.
        alpha: Transparency
        width_factor: Currently has no effect; accepted for backward
            compatibility. Candle bodies always span the full time period.
    """
    if not data:
        return

    period = pd.Timedelta(minutes=parse_timeframe_to_minutes(timeframe))
    color_by_direction = color == 'blue'

    for bar in data:
        open_price = bar['open']
        high_price = bar['high']
        low_price = bar['low']
        close_price = bar['close']

        # For "end" timestamp mode, the bar represents data from
        # (timestamp - timeframe) to timestamp
        bar_end = bar['timestamp']
        bar_start = bar_end - period

        if color_by_direction:
            candle_color = 'green' if close_price >= open_price else 'red'
        else:
            candle_color = color

        # Draw the wick (high-low line) at the center of the time period
        bar_center = bar_start + period / 2
        ax.plot([bar_center, bar_center], [low_price, high_price],
                color=candle_color, linewidth=1, alpha=alpha)

        # Draw the body (open-close rectangle) spanning the time period
        body_height = abs(close_price - open_price)
        if body_height > 0:
            body_bottom = min(open_price, close_price)
            ax.add_patch(Rectangle((bar_start, body_bottom), period, body_height,
                                   facecolor=candle_color, edgecolor=candle_color,
                                   alpha=alpha, linewidth=0.5))
        else:
            # Doji (open == close) - draw a horizontal line
            ax.plot([bar_start, bar_end], [open_price, close_price],
                    color=candle_color, linewidth=2, alpha=alpha)


def _aggregate_timeframes(minute_data, timeframes):
    """Aggregate minute data to every requested timeframe using "end" timestamps."""
    aggregated = {}
    for tf in timeframes:
        print(f" 🔄 Aggregating to {tf}...")
        aggregated[tf] = aggregate_minute_data_to_timeframe(minute_data, tf, "end")
        print(f" ✅ {len(aggregated[tf])} bars")
    return aggregated


def _format_time_axis(ax, time_range):
    """Choose x-axis tick format/locator for the covered time range and rotate labels."""
    if time_range.total_seconds() <= 24 * 3600:  # Up to 24 hours
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
        ax.xaxis.set_major_locator(mdates.HourLocator(interval=2))
    else:
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%m-%d %H:%M'))
        ax.xaxis.set_major_locator(mdates.DayLocator())

    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45)


def create_comparison_plot(minute_data, timeframes, title="Timeframe Aggregation Comparison"):
    """
    Create a comparison plot showing different timeframes.

    One subplot is drawn per timeframe, stacked vertically.

    Args:
        minute_data: List of minute OHLCV data
        timeframes: List of timeframes to compare (must not be empty)
        title: Plot title

    Returns:
        The created matplotlib Figure.

    Raises:
        ValueError: If ``timeframes`` is empty.
    """
    if not timeframes:
        raise ValueError("timeframes must contain at least one timeframe")

    print(f"\n📊 Creating comparison plot for timeframes: {timeframes}")

    # Aggregate data for each timeframe
    aggregated_data = _aggregate_timeframes(minute_data, timeframes)

    # squeeze=False always returns a 2-D array, so a single timeframe needs no
    # special casing.
    fig, axes = plt.subplots(len(timeframes), 1, figsize=(15, 4 * len(timeframes)),
                             squeeze=False)
    fig.suptitle(title, fontsize=16, fontweight='bold')

    # Colors for different timeframes
    colors = ['blue', 'orange', 'green', 'red', 'purple', 'brown']

    for i, (ax, tf) in enumerate(zip(axes[:, 0], timeframes)):
        data = aggregated_data[tf]

        if not data:
            ax.text(0.5, 0.5, f"No data for {tf}", transform=ax.transAxes,
                    ha='center', va='center', fontsize=14)
            continue

        # Plot candlesticks
        plot_candlesticks(ax, data, tf, color=colors[i % len(colors)])

        # Set title and labels
        ax.set_title(f"{tf} Timeframe ({len(data)} bars)", fontweight='bold')
        ax.set_ylabel('Price (USD)')

        first_bar = data[0]
        last_bar = data[-1]

        # Format x-axis based on data range
        _format_time_axis(ax, last_bar['timestamp'] - first_bar['timestamp'])

        # Add grid
        ax.grid(True, alpha=0.3)

        # Add statistics
        price_change = last_bar['close'] - first_bar['open']
        price_change_pct = (price_change / first_bar['open']) * 100

        stats_text = f"Open: ${first_bar['open']:.2f} | Close: ${last_bar['close']:.2f} | Change: {price_change_pct:+.2f}%"
        ax.text(0.02, 0.98, stats_text, transform=ax.transAxes,
                verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

    plt.tight_layout()
    return fig


def create_overlay_plot(minute_data, timeframes, title="Timeframe Overlay Comparison"):
    """
    Create an overlay plot showing multiple timeframes on the same chart.

    Timeframes are drawn from largest to smallest so the smaller ones end up
    in the foreground. Colour and alpha are chosen by each timeframe's
    position in ``timeframes``, which keeps the legend consistent with the
    candles.

    Args:
        minute_data: List of minute OHLCV data
        timeframes: List of timeframes to overlay
        title: Plot title

    Returns:
        The created matplotlib Figure.
    """
    print(f"\n📊 Creating overlay plot for timeframes: {timeframes}")

    # Aggregate data for each timeframe
    aggregated_data = _aggregate_timeframes(minute_data, timeframes)

    # Create single plot
    fig, ax = plt.subplots(1, 1, figsize=(15, 8))
    fig.suptitle(title, fontsize=16, fontweight='bold')

    # Colors and alphas per timeframe position (both cycle if there are more
    # timeframes than entries)
    colors = ['lightcoral', 'lightgreen', 'orange', 'lightblue']
    alphas = [0.9, 0.7, 0.5, 0.3]

    # Plot timeframes from largest to smallest (background to foreground)
    for tf in sorted(timeframes, key=parse_timeframe_to_minutes, reverse=True):
        data = aggregated_data[tf]
        if data:
            style_idx = timeframes.index(tf)
            plot_candlesticks(ax, data, tf,
                              color=colors[style_idx % len(colors)],
                              alpha=alphas[style_idx % len(alphas)])

    # Set labels and formatting
    ax.set_ylabel('Price (USD)')
    ax.set_xlabel('Time')

    # Format x-axis based on data range
    if minute_data:
        _format_time_axis(ax, minute_data[-1]['timestamp'] - minute_data[0]['timestamp'])

    # Add grid
    ax.grid(True, alpha=0.3)

    # Add legend (proxy rectangles, one per timeframe that has data)
    legend_elements = [
        Rectangle((0, 0), 1, 1,
                  facecolor=colors[i % len(colors)],
                  alpha=alphas[i % len(alphas)],
                  label=f"{tf} ({len(aggregated_data[tf])} bars)")
        for i, tf in enumerate(timeframes)
        if aggregated_data[tf]
    ]
    ax.legend(handles=legend_elements, loc='upper left')

    # Add explanation text
    explanation = ("Smaller timeframes should be contained within larger timeframes.\n"
                   "Each bar spans its full time period (not just a point in time).")
    ax.text(0.02, 0.02, explanation, transform=ax.transAxes,
            verticalalignment='bottom', fontsize=10,
            bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.8))

    plt.tight_layout()
    return fig


def main():
    """
    Main function to run the visual test.

    Returns:
        True when the plots were created and shown, False when the data file
        is missing, no data could be loaded, or plotting raised an error.
    """
    print("🚀 Visual Test for Timeframe Aggregation")
    print("=" * 50)

    # Configuration
    data_file = "./data/btcusd_1-min_data.csv"
    test_date = None  # Let the script auto-select a good date
    max_rows = 1440  # 24 hours of minute data
    timeframes = ["5min", "15min", "30min", "1h"]

    # Check if data file exists
    if not os.path.exists(data_file):
        print(f"❌ Data file not found: {data_file}")
        print("Please ensure the BTC data file exists in the ./data/ directory")
        return False

    # Load data
    df = load_btc_data(data_file, date_filter=test_date, max_rows=max_rows)
    if df is None or len(df) == 0:
        print("❌ Failed to load data or no data available")
        return False

    # Convert to minute data format
    minute_data = convert_df_to_minute_data(df)
    if not minute_data:
        # Defensive: the plot titles below need the first timestamp.
        print("❌ Failed to load data or no data available")
        return False

    print(f"\n📈 Converted to {len(minute_data)} minute data points")

    # Show data range
    start_time = minute_data[0]['timestamp']
    end_time = minute_data[-1]['timestamp']
    print(f"📅 Data range: {start_time} to {end_time}")

    # Show sample data
    print("📊 Sample data point:")
    sample = minute_data[0]
    print(f" Timestamp: {sample['timestamp']}")
    print(f" OHLCV: O={sample['open']:.2f}, H={sample['high']:.2f}, L={sample['low']:.2f}, C={sample['close']:.2f}, V={sample['volume']:.0f}")

    # Create comparison plots
    try:
        # Individual timeframe plots
        create_comparison_plot(minute_data, timeframes,
                               f"BTC Timeframe Comparison - {start_time.date()}")

        # Overlay plot
        create_overlay_plot(minute_data, timeframes,
                            f"BTC Timeframe Overlay - {start_time.date()}")

        # Show plots (pyplot keeps track of both figures)
        plt.show()

        print("\n✅ Visual test completed successfully!")
        print("📊 Check the plots to verify:")
        print(" 1. Higher timeframes contain lower timeframes")
        print(" 2. OHLCV values are correctly aggregated")
        print(" 3. Timestamps represent bar end times")
        print(" 4. No future data leakage")

        return True

    except Exception as e:
        # Script-level boundary: report and turn the failure into an exit code.
        print(f"❌ Error creating plots: {e}")
        traceback.print_exc()
        return False


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)