TimeFrame aggregator with correct logic

2025-05-28 18:26:51 +08:00
parent 78ccb15fda
commit 1861c336f9
20 changed files with 5031 additions and 99 deletions
--- a/IncrementalTrader/utils/__init__.py
+++ b/IncrementalTrader/utils/__init__.py
@@ -0,0 +1,23 @@
+"""
+Utility modules for the IncrementalTrader framework.
+
+This package contains utility functions and classes that support the core
+trading functionality, including timeframe aggregation, data management,
+and helper utilities.
+"""
+
+from .timeframe_utils import (
+    aggregate_minute_data_to_timeframe,
+    parse_timeframe_to_minutes,
+    get_latest_complete_bar,
+    MinuteDataBuffer,
+    TimeframeError
+)
+
+# Names exported by `from <package> import *`; each one is imported from
+# .timeframe_utils by the import statement above.
+__all__ = [
+    'aggregate_minute_data_to_timeframe',
+    'parse_timeframe_to_minutes', 
+    'get_latest_complete_bar',
+    'MinuteDataBuffer',
+    'TimeframeError'
+] 
--- a/IncrementalTrader/utils/timeframe_utils.py
+++ b/IncrementalTrader/utils/timeframe_utils.py
@@ -0,0 +1,455 @@
+"""
+Timeframe aggregation utilities for the IncrementalTrader framework.
+
+This module provides utilities for aggregating minute-level OHLCV data to higher
+timeframes with mathematical correctness and proper timestamp handling.
+
+Key Features:
+- Uses pandas resampling for mathematical correctness
+- Supports bar end timestamps (default) to prevent future data leakage
+- Proper OHLCV aggregation rules (first/max/min/last/sum)
+- MinuteDataBuffer for efficient real-time data management
+- Comprehensive error handling and validation
+
+Critical Fixes:
+1. Bar timestamps represent END of period (no future data leakage)
+2. Correct OHLCV aggregation matching pandas resampling
+3. Proper handling of incomplete bars and edge cases
+"""
+
+import pandas as pd
+import numpy as np
+from typing import Dict, List, Optional, Union, Any
+from collections import deque
+import logging
+import re
+
+logger = logging.getLogger(__name__)
+
+
+class TimeframeError(Exception):
+    """
+    Error type for timeframe problems.
+
+    Raised in this module when a timeframe string cannot be parsed and when
+    aggregating data to a timeframe fails.
+    """
+
+
+def parse_timeframe_to_minutes(timeframe: str) -> int:
+    """
+    Convert a timeframe string into its length in minutes.
+
+    The input is lower-cased and stripped of surrounding whitespace before
+    matching, so " 4H " is accepted and treated as "4h".
+
+    Args:
+        timeframe: Timeframe string in one of the forms Nmin, Nh, Nd or Nw
+            (e.g., "1min", "15min", "1h", "4h", "1d", "1w")
+
+    Returns:
+        Number of minutes in the timeframe
+
+    Raises:
+        TimeframeError: If timeframe is not a string, does not match a
+            supported format, or evaluates to zero minutes
+
+    Examples:
+        >>> parse_timeframe_to_minutes("15min")
+        15
+        >>> parse_timeframe_to_minutes("1h")
+        60
+        >>> parse_timeframe_to_minutes("1d")
+        1440
+    """
+    if not isinstance(timeframe, str):
+        raise TimeframeError(f"Timeframe must be a string, got {type(timeframe)}")
+
+    timeframe = timeframe.lower().strip()
+
+    # (pattern, minutes per unit) pairs, tried in order; first match wins
+    unit_table = (
+        (r'^(\d+)min$', 1),
+        (r'^(\d+)h$', 60),
+        (r'^(\d+)d$', 1440),
+        (r'^(\d+)w$', 10080),  # 7 * 24 * 60
+    )
+
+    for unit_pattern, minutes_per_unit in unit_table:
+        found = re.match(unit_pattern, timeframe)
+        if found is None:
+            continue
+
+        minutes = int(found.group(1)) * minutes_per_unit
+        if minutes <= 0:
+            # Reached for inputs such as "0min"
+            raise TimeframeError(f"Timeframe must be positive, got {minutes} minutes")
+        return minutes
+
+    raise TimeframeError(f"Invalid timeframe format: {timeframe}. "
+                        f"Supported formats: Nmin, Nh, Nd, Nw (e.g., 15min, 1h, 1d)")
+
+
+def aggregate_minute_data_to_timeframe(
+    minute_data: List[Dict[str, Union[float, pd.Timestamp]]], 
+    timeframe: str,
+    timestamp_mode: str = "end"
+) -> List[Dict[str, Union[float, pd.Timestamp]]]:
+    """
+    Aggregate minute-level OHLCV data to specified timeframe using pandas resampling.
+
+    Bars are grouped with label='left', closed='left', so a 5min bar starting
+    at 09:00 covers the minutes 09:00-09:04. Periods that contain no minute
+    data are omitted from the result. A period that contains only some of its
+    minutes is still returned as a bar.
+
+    Args:
+        minute_data: List of minute OHLCV dictionaries with 'timestamp', 'open',
+            'high', 'low', 'close' and 'volume' fields
+        timeframe: Target timeframe ("1min", "5min", "15min", "1h", "4h", "1d")
+        timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start
+
+    Returns:
+        List of aggregated OHLCV dictionaries sorted by timestamp, or an empty
+        list when minute_data is empty. For "1min" the input points are
+        returned in their original order (shifted by one minute in "end" mode).
+
+    Raises:
+        TimeframeError: If timeframe format is invalid or the resampling fails
+        ValueError: If minute_data is not a list, timestamp_mode is unknown, or
+            a data point is not a dictionary, misses a required field or has
+            an unparseable timestamp
+
+    Note:
+        Timestamps that are not already pd.Timestamp are converted in place on
+        the dictionaries passed in, so the caller's data is normalized as a
+        side effect.
+
+    Examples:
+        >>> minute_data = [
+        ...     {'timestamp': pd.Timestamp('2024-01-01 09:00'), 'open': 100, 'high': 102, 'low': 99, 'close': 101, 'volume': 1000},
+        ...     {'timestamp': pd.Timestamp('2024-01-01 09:01'), 'open': 101, 'high': 103, 'low': 100, 'close': 102, 'volume': 1200},
+        ... ]
+        >>> result = aggregate_minute_data_to_timeframe(minute_data, "15min")
+        >>> len(result)
+        1
+        >>> result[0]['timestamp']  # Bar end timestamp
+        Timestamp('2024-01-01 09:15:00')
+    """
+    if not minute_data:
+        return []
+
+    if not isinstance(minute_data, list):
+        raise ValueError("minute_data must be a list of dictionaries")
+
+    if timestamp_mode not in ("end", "start"):
+        raise ValueError("timestamp_mode must be 'end' or 'start'")
+
+    # Validate timeframe
+    timeframe_minutes = parse_timeframe_to_minutes(timeframe)
+
+    # Validate data structure before any fast path so that "1min" requests get
+    # the same checks and timestamp coercion as every other timeframe
+    required_fields = ('timestamp', 'open', 'high', 'low', 'close', 'volume')
+    for i, data_point in enumerate(minute_data):
+        if not isinstance(data_point, dict):
+            raise ValueError(f"Data point {i} must be a dictionary")
+
+        for field in required_fields:
+            if field not in data_point:
+                raise ValueError(f"Data point {i} missing required field: {field}")
+
+        # Coerce timestamp in place (documented side effect, see Note above)
+        if not isinstance(data_point['timestamp'], pd.Timestamp):
+            try:
+                data_point['timestamp'] = pd.Timestamp(data_point['timestamp'])
+            except Exception as e:
+                raise ValueError(f"Invalid timestamp in data point {i}: {e}") from e
+
+    # If requesting 1min data, return as-is (with timestamp mode adjustment)
+    if timeframe_minutes == 1:
+        if timestamp_mode == "start":
+            return minute_data.copy()
+
+        # Adjust timestamps to represent bar end (add 1 minute) on copies
+        one_minute = pd.Timedelta(minutes=1)
+        return [
+            {**data_point, 'timestamp': data_point['timestamp'] + one_minute}
+            for data_point in minute_data
+        ]
+
+    try:
+        # Convert to DataFrame for pandas resampling, sorted by timestamp
+        df = pd.DataFrame(minute_data).set_index('timestamp').sort_index()
+
+        freq_str = f'{timeframe_minutes}min'
+
+        # label='left', closed='left': a 5min bar starting at 09:00 includes
+        # minutes 09:00-09:04 and is labelled with its start time
+        resampled = df.resample(freq_str, label='left', closed='left').agg({
+            'open': 'first',    # First open in the period
+            'high': 'max',      # Maximum high in the period
+            'low': 'min',       # Minimum low in the period
+            'close': 'last',    # Last close in the period
+            'volume': 'sum'     # Sum of volume in the period
+        })
+
+        # Periods without any minute data have NaN prices; drop them
+        resampled = resampled.dropna()
+
+        # Resampled index holds bar START times; shift to bar END if requested
+        if timestamp_mode == "end":
+            label_shift = pd.Timedelta(minutes=timeframe_minutes)
+        else:
+            label_shift = pd.Timedelta(0)
+
+        result = []
+        for bar_start, row in resampled.iterrows():
+            result.append({
+                'timestamp': bar_start + label_shift,
+                'open': float(row['open']),
+                'high': float(row['high']),
+                'low': float(row['low']),
+                'close': float(row['close']),
+                'volume': float(row['volume'])
+            })
+
+        return result
+
+    except Exception as e:
+        # Keep the original error attached as the explicit cause
+        raise TimeframeError(f"Failed to aggregate data to {timeframe}: {e}") from e
+
+
+def get_latest_complete_bar(
+    minute_data: List[Dict[str, Union[float, pd.Timestamp]]], 
+    timeframe: str,
+    timestamp_mode: str = "end"
+) -> Optional[Dict[str, Union[float, pd.Timestamp]]]:
+    """
+    Get the latest complete bar from minute data for the specified timeframe.
+
+    A bar counts as complete when its end time is not later than one minute
+    after the newest timestamp in minute_data, i.e. when the last minute that
+    belongs to the bar is already present in the data.
+
+    Args:
+        minute_data: List of minute OHLCV dictionaries with 'timestamp' field
+        timeframe: Target timeframe ("1min", "5min", "15min", "1h", "4h", "1d")
+        timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start
+
+    Returns:
+        Latest complete bar dictionary, or None if no complete bars available
+
+    Raises:
+        TimeframeError: Propagated from the aggregation if timeframe is invalid
+            or the aggregation fails
+        ValueError: Propagated from the aggregation if the input is malformed
+
+    Examples:
+        >>> minute_data = [...]  # 30 minutes of data
+        >>> latest_15m = get_latest_complete_bar(minute_data, "15min")
+        >>> latest_15m['timestamp']  # End of the newest fully covered 15min bar
+    """
+    if not minute_data:
+        return None
+
+    # Get all aggregated bars
+    aggregated_bars = aggregate_minute_data_to_timeframe(minute_data, timeframe, timestamp_mode)
+
+    if not aggregated_bars:
+        return None
+
+    # Coerce explicitly instead of relying on the aggregation step having
+    # converted the timestamps as a side effect
+    latest_minute_timestamp = max(pd.Timestamp(data['timestamp']) for data in minute_data)
+
+    # Latest bar end that the available minutes can fully cover
+    completeness_cutoff = latest_minute_timestamp + pd.Timedelta(minutes=1)
+
+    # Offset from a bar's reported timestamp to its end time; computed once
+    # because it is the same for every bar
+    if timestamp_mode == "end":
+        to_bar_end = pd.Timedelta(0)
+    else:
+        to_bar_end = pd.Timedelta(minutes=parse_timeframe_to_minutes(timeframe))
+
+    # Scan from the newest bar backwards and stop at the first complete one
+    for bar in reversed(aggregated_bars):
+        if pd.Timestamp(bar['timestamp']) + to_bar_end <= completeness_cutoff:
+            return bar
+
+    return None
+
+
+class MinuteDataBuffer:
+    """
+    Helper class for managing minute data buffers in real-time strategies.
+
+    Keeps the most recent `max_size` minute data points in a bounded deque and
+    offers helpers to aggregate them to higher timeframes.
+
+    Features:
+    - Automatic buffer size management with configurable limits
+    - Data access and aggregation methods
+    - Memory-bounded operation (oldest points are dropped once full)
+    - Validation of incoming data points
+
+    Note:
+        The class performs no locking. Callers that share one buffer between
+        threads must synchronize access themselves.
+
+    Example:
+        >>> buffer = MinuteDataBuffer(max_size=1440)  # 24 hours
+        >>> buffer.add(timestamp, {'open': 100, 'high': 102, 'low': 99, 'close': 101, 'volume': 1000})
+        >>> bars_15m = buffer.aggregate_to_timeframe("15min", lookback_bars=4)
+        >>> latest_bar = buffer.get_latest_complete_bar("15min")
+    """
+
+    def __init__(self, max_size: int = 1440):
+        """
+        Initialize minute data buffer.
+
+        Args:
+            max_size: Maximum number of minute data points to keep (default: 1440 = 24 hours)
+
+        Raises:
+            ValueError: If max_size is not positive
+        """
+        if max_size <= 0:
+            raise ValueError("max_size must be positive")
+
+        self.max_size = max_size
+        # deque(maxlen=...) discards the oldest entry automatically when full
+        self._buffer = deque(maxlen=max_size)
+        # Timestamp of the most recently added data point (None when empty)
+        self._last_timestamp = None
+
+        logger.debug("Initialized MinuteDataBuffer with max_size=%s", max_size)
+
+    def add(self, timestamp: pd.Timestamp, ohlcv_data: Dict[str, float]) -> None:
+        """
+        Add new minute data point to the buffer.
+
+        A point whose timestamp equals the previously added one is treated as
+        an update and replaces that entry instead of being stored twice. A
+        point older than the previously added one is still appended, and a
+        warning is logged.
+
+        Args:
+            timestamp: Timestamp of the data point (anything pd.Timestamp accepts)
+            ohlcv_data: OHLCV data dictionary (open, high, low, close, volume)
+
+        Raises:
+            ValueError: If the timestamp cannot be converted, a required field
+                is missing, or a field value is not numeric
+        """
+        if not isinstance(timestamp, pd.Timestamp):
+            try:
+                timestamp = pd.Timestamp(timestamp)
+            except Exception as e:
+                raise ValueError(f"Invalid timestamp: {e}") from e
+
+        # Validate OHLCV data; numpy scalars are accepted alongside int/float
+        numeric_types = (int, float, np.integer, np.floating)
+        for field in ('open', 'high', 'low', 'close', 'volume'):
+            if field not in ohlcv_data:
+                raise ValueError(f"Missing required field: {field}")
+            if not isinstance(ohlcv_data[field], numeric_types):
+                raise ValueError(f"Field {field} must be numeric, got {type(ohlcv_data[field])}")
+
+        # Compare against the previous point: older -> warn, equal -> update
+        is_update = False
+        if self._last_timestamp is not None:
+            if timestamp < self._last_timestamp:
+                logger.warning(f"Out-of-order timestamp: {timestamp} < {self._last_timestamp}")
+            elif timestamp == self._last_timestamp:
+                is_update = True
+
+        # Create data point (copy so the caller's dictionary is left untouched)
+        data_point = ohlcv_data.copy()
+        data_point['timestamp'] = timestamp
+
+        if is_update:
+            # Overwrite instead of appending so the minute is not counted twice
+            self._buffer[-1] = data_point
+        else:
+            self._buffer.append(data_point)
+        self._last_timestamp = timestamp
+
+        logger.debug("Added data point at %s, buffer size: %s", timestamp, len(self._buffer))
+
+    def get_data(self, lookback_minutes: Optional[int] = None) -> List[Dict[str, Union[float, pd.Timestamp]]]:
+        """
+        Get data from buffer.
+
+        Args:
+            lookback_minutes: Number of most recent data points to return
+                (None for all data). The count is in buffer entries; gaps in
+                the minute series are not taken into account.
+
+        Returns:
+            List of minute data dictionaries in insertion order
+
+        Raises:
+            ValueError: If lookback_minutes is not positive and the buffer is
+                not empty
+        """
+        if not self._buffer:
+            return []
+
+        if lookback_minutes is None:
+            return list(self._buffer)
+
+        if lookback_minutes <= 0:
+            raise ValueError("lookback_minutes must be positive")
+
+        # A negative slice start larger than the length yields the whole list
+        return list(self._buffer)[-lookback_minutes:]
+
+    def aggregate_to_timeframe(
+        self, 
+        timeframe: str, 
+        lookback_bars: Optional[int] = None,
+        timestamp_mode: str = "end"
+    ) -> List[Dict[str, Union[float, pd.Timestamp]]]:
+        """
+        Aggregate buffer data to specified timeframe.
+
+        Args:
+            timeframe: Target timeframe ("5min", "15min", "1h", etc.)
+            lookback_bars: Number of most recent bars to return. None, zero or
+                a negative value returns all available bars.
+            timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start
+
+        Returns:
+            List of aggregated OHLCV bars
+        """
+        if not self._buffer:
+            return []
+
+        # Aggregate the whole buffer, then trim to the requested number of bars
+        aggregated_bars = aggregate_minute_data_to_timeframe(
+            list(self._buffer), timeframe, timestamp_mode
+        )
+
+        if lookback_bars is not None and lookback_bars > 0:
+            aggregated_bars = aggregated_bars[-lookback_bars:]
+
+        return aggregated_bars
+
+    def get_latest_complete_bar(
+        self, 
+        timeframe: str,
+        timestamp_mode: str = "end"
+    ) -> Optional[Dict[str, Union[float, pd.Timestamp]]]:
+        """
+        Get the latest complete bar for the specified timeframe.
+
+        Args:
+            timeframe: Target timeframe ("5min", "15min", "1h", etc.)
+            timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start
+
+        Returns:
+            Latest complete bar dictionary, or None if no complete bars available
+        """
+        if not self._buffer:
+            return None
+
+        # Delegates to the module-level function of the same name
+        return get_latest_complete_bar(list(self._buffer), timeframe, timestamp_mode)
+
+    def size(self) -> int:
+        """Get current buffer size."""
+        return len(self._buffer)
+
+    def is_full(self) -> bool:
+        """Check if buffer is at maximum capacity."""
+        return len(self._buffer) >= self.max_size
+
+    def clear(self) -> None:
+        """Clear all data from buffer."""
+        self._buffer.clear()
+        self._last_timestamp = None
+        logger.debug("Buffer cleared")
+
+    def get_time_range(self) -> Optional[tuple]:
+        """
+        Get the time range of data in the buffer.
+
+        Returns:
+            Tuple of (start_time, end_time) or None if buffer is empty
+        """
+        if not self._buffer:
+            return None
+
+        # min/max rather than first/last because out-of-order points are allowed
+        timestamps = [data['timestamp'] for data in self._buffer]
+        return (min(timestamps), max(timestamps))
+
+    def __len__(self) -> int:
+        """Get buffer size."""
+        return len(self._buffer)
+
+    def __repr__(self) -> str:
+        """String representation of buffer."""
+        time_range = self.get_time_range()
+        if time_range is None:
+            return "MinuteDataBuffer(size=0, empty)"
+
+        start, end = time_range
+        return f"MinuteDataBuffer(size={len(self._buffer)}, range={start} to {end})"