Cycles/IncrementalTrader/utils/timeframe_utils.py

"""
Timeframe aggregation utilities for the IncrementalTrader framework.
This module provides utilities for aggregating minute-level OHLCV data to higher
timeframes with mathematical correctness and proper timestamp handling.
Key Features:
- Uses pandas resampling for mathematical correctness
- Supports bar end timestamps (default) to prevent future data leakage
- Proper OHLCV aggregation rules (first/max/min/last/sum)
- MinuteDataBuffer for efficient real-time data management
- Comprehensive error handling and validation
Critical Fixes:
1. Bar timestamps represent END of period (no future data leakage)
2. Correct OHLCV aggregation matching pandas resampling
3. Proper handling of incomplete bars and edge cases
"""
import pandas as pd
import numpy as np
from typing import Dict, List, Optional, Union, Any
from collections import deque
import logging
import re
logger = logging.getLogger(__name__)
class TimeframeError(Exception):
"""Exception raised for timeframe-related errors."""
pass
def parse_timeframe_to_minutes(timeframe: str) -> int:
"""
Parse timeframe string to minutes.
Args:
timeframe: Timeframe string (e.g., "1min", "5min", "15min", "1h", "4h", "1d")
Returns:
Number of minutes in the timeframe
Raises:
TimeframeError: If timeframe format is invalid
Examples:
>>> parse_timeframe_to_minutes("15min")
15
>>> parse_timeframe_to_minutes("1h")
60
>>> parse_timeframe_to_minutes("1d")
1440
"""
if not isinstance(timeframe, str):
raise TimeframeError(f"Timeframe must be a string, got {type(timeframe)}")
timeframe = timeframe.lower().strip()
# Handle common timeframe formats
patterns = {
r'^(\d+)min$': lambda m: int(m.group(1)),
r'^(\d+)h$': lambda m: int(m.group(1)) * 60,
r'^(\d+)d$': lambda m: int(m.group(1)) * 1440,
r'^(\d+)w$': lambda m: int(m.group(1)) * 10080, # 7 * 24 * 60
}
for pattern, converter in patterns.items():
match = re.match(pattern, timeframe)
if match:
minutes = converter(match)
if minutes <= 0:
raise TimeframeError(f"Timeframe must be positive, got {minutes} minutes")
return minutes
raise TimeframeError(f"Invalid timeframe format: {timeframe}. "
f"Supported formats: Nmin, Nh, Nd, Nw (e.g., 15min, 1h, 1d)")
def aggregate_minute_data_to_timeframe(
minute_data: List[Dict[str, Union[float, pd.Timestamp]]],
timeframe: str,
timestamp_mode: str = "end"
) -> List[Dict[str, Union[float, pd.Timestamp]]]:
"""
Aggregate minute-level OHLCV data to specified timeframe using pandas resampling.
This function provides mathematically correct aggregation that matches pandas
resampling behavior, with proper timestamp handling to prevent future data leakage.
Args:
minute_data: List of minute OHLCV dictionaries with 'timestamp' field
timeframe: Target timeframe ("1min", "5min", "15min", "1h", "4h", "1d")
timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start
Returns:
List of aggregated OHLCV dictionaries with proper timestamps
Raises:
TimeframeError: If timeframe format is invalid or data is malformed
ValueError: If minute_data is empty or contains invalid data
Examples:
>>> minute_data = [
... {'timestamp': pd.Timestamp('2024-01-01 09:00'), 'open': 100, 'high': 102, 'low': 99, 'close': 101, 'volume': 1000},
... {'timestamp': pd.Timestamp('2024-01-01 09:01'), 'open': 101, 'high': 103, 'low': 100, 'close': 102, 'volume': 1200},
... ]
>>> result = aggregate_minute_data_to_timeframe(minute_data, "15min")
>>> len(result)
1
>>> result[0]['timestamp'] # Bar end timestamp
Timestamp('2024-01-01 09:15:00')
"""
if not minute_data:
return []
if not isinstance(minute_data, list):
raise ValueError("minute_data must be a list of dictionaries")
if timestamp_mode not in ["end", "start"]:
raise ValueError("timestamp_mode must be 'end' or 'start'")
# Validate timeframe
timeframe_minutes = parse_timeframe_to_minutes(timeframe)
# If requesting 1min data, return as-is (with timestamp mode adjustment)
if timeframe_minutes == 1:
if timestamp_mode == "end":
# Adjust timestamps to represent bar end (add 1 minute)
result = []
for data_point in minute_data:
adjusted_point = data_point.copy()
adjusted_point['timestamp'] = data_point['timestamp'] + pd.Timedelta(minutes=1)
result.append(adjusted_point)
return result
else:
return minute_data.copy()
# Validate data structure
required_fields = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
for i, data_point in enumerate(minute_data):
if not isinstance(data_point, dict):
raise ValueError(f"Data point {i} must be a dictionary")
for field in required_fields:
if field not in data_point:
raise ValueError(f"Data point {i} missing required field: {field}")
# Validate timestamp
if not isinstance(data_point['timestamp'], pd.Timestamp):
try:
data_point['timestamp'] = pd.Timestamp(data_point['timestamp'])
except Exception as e:
raise ValueError(f"Invalid timestamp in data point {i}: {e}")
try:
# Convert to DataFrame for pandas resampling
df = pd.DataFrame(minute_data)
df = df.set_index('timestamp')
# Sort by timestamp to ensure proper ordering
df = df.sort_index()
# Use pandas resampling for mathematical correctness
freq_str = f'{timeframe_minutes}min'
# Use trading industry standard grouping: label='left', closed='left'
# This means 5min bar starting at 09:00 includes minutes 09:00-09:04
resampled = df.resample(freq_str, label='left', closed='left').agg({
'open': 'first', # First open in the period
'high': 'max', # Maximum high in the period
'low': 'min', # Minimum low in the period
'close': 'last', # Last close in the period
'volume': 'sum' # Sum of volume in the period
})
# Drop empty bins (periods with no source data); a trailing, partially filled
# period is still emitted here - use get_latest_complete_bar() when only
# complete bars are wanted
resampled = resampled.dropna()
# Convert back to list of dictionaries
result = []
for timestamp, row in resampled.iterrows():
# Adjust timestamp based on mode
if timestamp_mode == "end":
# Convert bar start timestamp to bar end timestamp
bar_end_timestamp = timestamp + pd.Timedelta(minutes=timeframe_minutes)
final_timestamp = bar_end_timestamp
else:
# Keep bar start timestamp
final_timestamp = timestamp
result.append({
'timestamp': final_timestamp,
'open': float(row['open']),
'high': float(row['high']),
'low': float(row['low']),
'close': float(row['close']),
'volume': float(row['volume'])
})
return result
except Exception as e:
raise TimeframeError(f"Failed to aggregate data to {timeframe}: {e}")
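# --- Illustrative sketch (not part of the original module) -------------------
# A hedged usage example for aggregate_minute_data_to_timeframe(): build 30
# synthetic 1-minute bars and roll them up into 15-minute bars with end-of-bar
# timestamps. The helper name _example_aggregate_to_15min is hypothetical.
def _example_aggregate_to_15min() -> None:
    """Aggregate 30 synthetic minutes into two 15-minute bars (illustrative only)."""
    start = pd.Timestamp("2024-01-01 09:00")
    minute_data = [
        {
            'timestamp': start + pd.Timedelta(minutes=i),
            'open': 100.0 + i,
            'high': 101.0 + i,
            'low': 99.0 + i,
            'close': 100.5 + i,
            'volume': 1000.0,
        }
        for i in range(30)
    ]
    bars = aggregate_minute_data_to_timeframe(minute_data, "15min", timestamp_mode="end")
    assert len(bars) == 2
    # First bar covers 09:00-09:14 and is stamped with its end time, 09:15
    assert bars[0]['timestamp'] == pd.Timestamp("2024-01-01 09:15")
    assert bars[0]['open'] == 100.0 and bars[0]['close'] == 114.5
    assert bars[0]['volume'] == 15000.0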
def get_latest_complete_bar(
minute_data: List[Dict[str, Union[float, pd.Timestamp]]],
timeframe: str,
timestamp_mode: str = "end"
) -> Optional[Dict[str, Union[float, pd.Timestamp]]]:
"""
Get the latest complete bar from minute data for the specified timeframe.
This function is useful for real-time processing where you only want to
process complete bars and avoid using incomplete/future data.
Args:
minute_data: List of minute OHLCV dictionaries with 'timestamp' field
timeframe: Target timeframe ("1min", "5min", "15min", "1h", "4h", "1d")
timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start
Returns:
Latest complete bar dictionary, or None if no complete bars available
Examples:
>>> minute_data = [...] # 30 minutes of data
>>> latest_15m = get_latest_complete_bar(minute_data, "15min")
>>> latest_15m['timestamp']  # End of the most recent fully completed 15-minute period
"""
if not minute_data:
return None
# Get all aggregated bars
aggregated_bars = aggregate_minute_data_to_timeframe(minute_data, timeframe, timestamp_mode)
if not aggregated_bars:
return None
# For real-time processing, ensure each bar is truly complete: its end time
# must not extend past the point up to which minute data is known
latest_minute_timestamp = max(data['timestamp'] for data in minute_data)
# Filter out incomplete bars
complete_bars = []
for bar in aggregated_bars:
if timestamp_mode == "end":
# Bar timestamp is the end time, so it should be <= latest minute + 1 minute
if bar['timestamp'] <= latest_minute_timestamp + pd.Timedelta(minutes=1):
complete_bars.append(bar)
else:
# Bar timestamp is the start time, check if enough time has passed
timeframe_minutes = parse_timeframe_to_minutes(timeframe)
bar_end_time = bar['timestamp'] + pd.Timedelta(minutes=timeframe_minutes)
if bar_end_time <= latest_minute_timestamp + pd.Timedelta(minutes=1):
complete_bars.append(bar)
return complete_bars[-1] if complete_bars else None
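# --- Illustrative sketch (not part of the original module) -------------------
# A hedged example of get_latest_complete_bar(): with 20 minutes of data the
# second 15-minute period (09:15-09:29) is only partially filled, so the
# 09:00-09:14 bar is the latest complete one. The helper name is hypothetical.
def _example_latest_complete_bar() -> None:
    """Show that a partially filled trailing bar is skipped (illustrative only)."""
    start = pd.Timestamp("2024-01-01 09:00")
    minute_data = [
        {
            'timestamp': start + pd.Timedelta(minutes=i),
            'open': 100.0, 'high': 101.0, 'low': 99.0, 'close': 100.0, 'volume': 1.0,
        }
        for i in range(20)  # 09:00 .. 09:19 -> second 15min period incomplete
    ]
    bar = get_latest_complete_bar(minute_data, "15min", timestamp_mode="end")
    assert bar is not None
    # Only 09:00-09:14 is fully covered; its end-stamped timestamp is 09:15
    assert bar['timestamp'] == pd.Timestamp("2024-01-01 09:15")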
class MinuteDataBuffer:
"""
Helper class for managing minute data buffers in real-time strategies.
This class provides efficient buffer management for minute-level data with
automatic aggregation capabilities. It's designed for use in incremental
strategies that need to maintain a rolling window of minute data.
Features:
- Automatic buffer size management with configurable limits
- Efficient data access and aggregation methods
- Memory-bounded operation (doesn't grow indefinitely)
- Atomic deque appends (no internal locking; add external synchronization for multi-threaded use)
- Comprehensive validation and error handling
Example:
>>> buffer = MinuteDataBuffer(max_size=1440) # 24 hours
>>> buffer.add(timestamp, {'open': 100, 'high': 102, 'low': 99, 'close': 101, 'volume': 1000})
>>> bars_15m = buffer.aggregate_to_timeframe("15min", lookback_bars=4)
>>> latest_bar = buffer.get_latest_complete_bar("15min")
"""
def __init__(self, max_size: int = 1440):
"""
Initialize minute data buffer.
Args:
max_size: Maximum number of minute data points to keep (default: 1440 = 24 hours)
"""
if max_size <= 0:
raise ValueError("max_size must be positive")
self.max_size = max_size
self._buffer = deque(maxlen=max_size)
self._last_timestamp = None
logger.debug(f"Initialized MinuteDataBuffer with max_size={max_size}")
def add(self, timestamp: pd.Timestamp, ohlcv_data: Dict[str, float]) -> None:
"""
Add new minute data point to the buffer.
Args:
timestamp: Timestamp of the data point
ohlcv_data: OHLCV data dictionary (open, high, low, close, volume)
Raises:
ValueError: If data is invalid or timestamp is out of order
"""
if not isinstance(timestamp, pd.Timestamp):
try:
timestamp = pd.Timestamp(timestamp)
except Exception as e:
raise ValueError(f"Invalid timestamp: {e}")
# Validate OHLCV data
required_fields = ['open', 'high', 'low', 'close', 'volume']
for field in required_fields:
if field not in ohlcv_data:
raise ValueError(f"Missing required field: {field}")
# Accept both Python numeric types and numpy numeric types
if not isinstance(ohlcv_data[field], (int, float, np.number)):
raise ValueError(f"Field {field} must be numeric, got {type(ohlcv_data[field])}")
# Convert numpy types to Python types to ensure compatibility
if isinstance(ohlcv_data[field], np.number):
ohlcv_data[field] = float(ohlcv_data[field])
# Check timestamp ordering (allow equal timestamps for updates)
if self._last_timestamp is not None and timestamp < self._last_timestamp:
logger.warning(f"Out-of-order timestamp: {timestamp} < {self._last_timestamp}")
# Create data point
data_point = ohlcv_data.copy()
data_point['timestamp'] = timestamp
# Add to buffer
self._buffer.append(data_point)
self._last_timestamp = timestamp
logger.debug(f"Added data point at {timestamp}, buffer size: {len(self._buffer)}")
def get_data(self, lookback_minutes: Optional[int] = None) -> List[Dict[str, Union[float, pd.Timestamp]]]:
"""
Get data from buffer.
Args:
lookback_minutes: Number of minutes to look back (None for all data)
Returns:
List of minute data dictionaries
"""
if not self._buffer:
return []
if lookback_minutes is None:
return list(self._buffer)
if lookback_minutes <= 0:
raise ValueError("lookback_minutes must be positive")
# Return the most recent N points; the buffer holds one point per minute,
# so slicing by count is equivalent to slicing by minutes here
return list(self._buffer)[-lookback_minutes:]
def aggregate_to_timeframe(
self,
timeframe: str,
lookback_bars: Optional[int] = None,
timestamp_mode: str = "end"
) -> List[Dict[str, Union[float, pd.Timestamp]]]:
"""
Aggregate buffer data to specified timeframe.
Args:
timeframe: Target timeframe ("5min", "15min", "1h", etc.)
lookback_bars: Number of bars to return (None for all available)
timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start
Returns:
List of aggregated OHLCV bars
"""
if not self._buffer:
return []
# Get all buffer data
minute_data = list(self._buffer)
# Aggregate to timeframe
aggregated_bars = aggregate_minute_data_to_timeframe(minute_data, timeframe, timestamp_mode)
# Apply lookback limit
if lookback_bars is not None and lookback_bars > 0:
aggregated_bars = aggregated_bars[-lookback_bars:]
return aggregated_bars
def get_latest_complete_bar(
self,
timeframe: str,
timestamp_mode: str = "end"
) -> Optional[Dict[str, Union[float, pd.Timestamp]]]:
"""
Get the latest complete bar for the specified timeframe.
Args:
timeframe: Target timeframe ("5min", "15min", "1h", etc.)
timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start
Returns:
Latest complete bar dictionary, or None if no complete bars available
"""
if not self._buffer:
return None
minute_data = list(self._buffer)
return get_latest_complete_bar(minute_data, timeframe, timestamp_mode)
def size(self) -> int:
"""Get current buffer size."""
return len(self._buffer)
def is_full(self) -> bool:
"""Check if buffer is at maximum capacity."""
return len(self._buffer) >= self.max_size
def clear(self) -> None:
"""Clear all data from buffer."""
self._buffer.clear()
self._last_timestamp = None
logger.debug("Buffer cleared")
def get_time_range(self) -> Optional[tuple]:
"""
Get the time range of data in the buffer.
Returns:
Tuple of (start_time, end_time) or None if buffer is empty
"""
if not self._buffer:
return None
timestamps = [data['timestamp'] for data in self._buffer]
return (min(timestamps), max(timestamps))
def __len__(self) -> int:
"""Get buffer size."""
return len(self._buffer)
def __repr__(self) -> str:
"""String representation of buffer."""
time_range = self.get_time_range()
if time_range:
start, end = time_range
return f"MinuteDataBuffer(size={len(self._buffer)}, range={start} to {end})"
else:
return f"MinuteDataBuffer(size=0, empty)"