# Changelog:
# - Added error handling in DataLoader to attempt reading CSV files with a
#   fallback to the Python engine if the default engine fails.
# - Converted numpy float32 columns to Python float for compatibility in DataLoader.
# - Updated MinuteDataBuffer to accept both Python and numpy numeric types,
#   ensuring consistent data validation and conversion.
"""
Timeframe aggregation utilities for the IncrementalTrader framework.

This module provides utilities for aggregating minute-level OHLCV data to higher
timeframes with mathematical correctness and proper timestamp handling.

Key Features:
- Uses pandas resampling for mathematical correctness
- Supports bar end timestamps (default) to prevent future data leakage
- Proper OHLCV aggregation rules (first/max/min/last/sum)
- MinuteDataBuffer for efficient real-time data management
- Comprehensive error handling and validation

Critical Fixes:
1. Bar timestamps represent END of period (no future data leakage)
2. Correct OHLCV aggregation matching pandas resampling
3. Proper handling of incomplete bars and edge cases
"""
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
from typing import Dict, List, Optional, Union, Any
|
|
from collections import deque
|
|
import logging
|
|
import re
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TimeframeError(Exception):
    """Raised when a timeframe string is invalid or aggregation fails."""
|
|
|
|
|
|
def parse_timeframe_to_minutes(timeframe: str) -> int:
    """
    Parse a timeframe string into its length in minutes.

    Args:
        timeframe: Timeframe string (e.g., "1min", "5min", "15min", "1h", "4h", "1d")

    Returns:
        Number of minutes in the timeframe

    Raises:
        TimeframeError: If the timeframe format is invalid or non-positive

    Examples:
        >>> parse_timeframe_to_minutes("15min")
        15
        >>> parse_timeframe_to_minutes("1h")
        60
        >>> parse_timeframe_to_minutes("1d")
        1440
    """
    if not isinstance(timeframe, str):
        raise TimeframeError(f"Timeframe must be a string, got {type(timeframe)}")

    normalized = timeframe.lower().strip()

    # Minutes represented by each supported unit suffix.
    unit_minutes = {
        'min': 1,
        'h': 60,
        'd': 1440,
        'w': 10080,  # 7 * 24 * 60
    }

    match = re.match(r'^(\d+)(min|h|d|w)$', normalized)
    if match is None:
        raise TimeframeError(f"Invalid timeframe format: {normalized}. "
                             f"Supported formats: Nmin, Nh, Nd, Nw (e.g., 15min, 1h, 1d)")

    minutes = int(match.group(1)) * unit_minutes[match.group(2)]
    if minutes <= 0:
        raise TimeframeError(f"Timeframe must be positive, got {minutes} minutes")
    return minutes
|
|
|
|
|
|
def aggregate_minute_data_to_timeframe(
    minute_data: List[Dict[str, Union[float, pd.Timestamp]]],
    timeframe: str,
    timestamp_mode: str = "end"
) -> List[Dict[str, Union[float, pd.Timestamp]]]:
    """
    Aggregate minute-level OHLCV data to specified timeframe using pandas resampling.

    This function provides mathematically correct aggregation that matches pandas
    resampling behavior, with proper timestamp handling to prevent future data
    leakage. The caller's input dictionaries are never mutated.

    Args:
        minute_data: List of minute OHLCV dictionaries with 'timestamp' field
        timeframe: Target timeframe ("1min", "5min", "15min", "1h", "4h", "1d")
        timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start

    Returns:
        List of aggregated OHLCV dictionaries with proper timestamps

    Raises:
        TimeframeError: If timeframe format is invalid or aggregation fails
        ValueError: If minute_data is malformed or timestamp_mode is invalid

    Examples:
        >>> minute_data = [
        ...     {'timestamp': pd.Timestamp('2024-01-01 09:00'), 'open': 100, 'high': 102, 'low': 99, 'close': 101, 'volume': 1000},
        ...     {'timestamp': pd.Timestamp('2024-01-01 09:01'), 'open': 101, 'high': 103, 'low': 100, 'close': 102, 'volume': 1200},
        ... ]
        >>> result = aggregate_minute_data_to_timeframe(minute_data, "15min")
        >>> len(result)
        1
        >>> result[0]['timestamp']  # Bar end timestamp
        Timestamp('2024-01-01 09:15:00')
    """
    if not minute_data:
        return []

    if not isinstance(minute_data, list):
        raise ValueError("minute_data must be a list of dictionaries")

    if timestamp_mode not in ["end", "start"]:
        raise ValueError("timestamp_mode must be 'end' or 'start'")

    # Validate timeframe (raises TimeframeError on bad format).
    timeframe_minutes = parse_timeframe_to_minutes(timeframe)

    # Fast path: 1min data needs no resampling, only timestamp-mode adjustment.
    if timeframe_minutes == 1:
        if timestamp_mode == "end":
            # Shift each timestamp forward one minute so it marks the bar END.
            # Coerce to Timestamp first so string/datetime inputs also work
            # (the validation loop below never runs on this fast path).
            result = []
            for data_point in minute_data:
                adjusted_point = data_point.copy()
                adjusted_point['timestamp'] = pd.Timestamp(data_point['timestamp']) + pd.Timedelta(minutes=1)
                result.append(adjusted_point)
            return result
        return minute_data.copy()

    # Validate structure and normalize timestamps on COPIES so the caller's
    # dictionaries are never mutated (previously the coerced Timestamp was
    # written back into the input dicts as a side effect).
    required_fields = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
    normalized = []
    for i, data_point in enumerate(minute_data):
        if not isinstance(data_point, dict):
            raise ValueError(f"Data point {i} must be a dictionary")

        for field in required_fields:
            if field not in data_point:
                raise ValueError(f"Data point {i} missing required field: {field}")

        point = data_point.copy()
        if not isinstance(point['timestamp'], pd.Timestamp):
            try:
                point['timestamp'] = pd.Timestamp(point['timestamp'])
            except Exception as e:
                raise ValueError(f"Invalid timestamp in data point {i}: {e}") from e
        normalized.append(point)

    try:
        # Sort by index so resampling sees monotonically increasing timestamps.
        df = pd.DataFrame(normalized).set_index('timestamp').sort_index()

        # Trading industry standard grouping: label='left', closed='left'.
        # A 5min bar labeled 09:00 covers minutes 09:00-09:04.
        freq_str = f'{timeframe_minutes}min'
        resampled = df.resample(freq_str, label='left', closed='left').agg({
            'open': 'first',   # First open in the period
            'high': 'max',     # Maximum high in the period
            'low': 'min',      # Minimum low in the period
            'close': 'last',   # Last close in the period
            'volume': 'sum'    # Sum of volume in the period
        })

        # Drop empty periods: resample emits NaN rows for gaps in the data.
        resampled = resampled.dropna()

        result = []
        for timestamp, row in resampled.iterrows():
            if timestamp_mode == "end":
                # Convert the bar-start label to a bar-end timestamp.
                final_timestamp = timestamp + pd.Timedelta(minutes=timeframe_minutes)
            else:
                final_timestamp = timestamp

            result.append({
                'timestamp': final_timestamp,
                'open': float(row['open']),
                'high': float(row['high']),
                'low': float(row['low']),
                'close': float(row['close']),
                'volume': float(row['volume'])
            })

        return result

    except Exception as e:
        # Chain the cause so the original traceback is preserved for debugging.
        raise TimeframeError(f"Failed to aggregate data to {timeframe}: {e}") from e
|
|
|
|
|
|
def get_latest_complete_bar(
    minute_data: List[Dict[str, Union[float, pd.Timestamp]]],
    timeframe: str,
    timestamp_mode: str = "end"
) -> Optional[Dict[str, Union[float, pd.Timestamp]]]:
    """
    Get the latest complete bar from minute data for the specified timeframe.

    Useful for real-time processing where only fully-closed bars should be
    acted on, avoiding incomplete/future data.

    Args:
        minute_data: List of minute OHLCV dictionaries with 'timestamp' field
        timeframe: Target timeframe ("1min", "5min", "15min", "1h", "4h", "1d")
        timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start

    Returns:
        Latest complete bar dictionary, or None if no complete bars available

    Examples:
        >>> minute_data = [...]  # 30 minutes of data
        >>> latest_15m = get_latest_complete_bar(minute_data, "15min")
        >>> latest_15m['timestamp']  # Will be 15 minutes ago (complete bar)
    """
    if not minute_data:
        return None

    bars = aggregate_minute_data_to_timeframe(minute_data, timeframe, timestamp_mode)
    if not bars:
        return None

    # A bar is complete when its END time has been reached by the data feed:
    # end <= latest observed minute + 1 minute.
    cutoff = max(point['timestamp'] for point in minute_data) + pd.Timedelta(minutes=1)

    # In "end" mode the stored timestamp already IS the bar end; in "start"
    # mode the end is the stored timestamp plus one full timeframe.
    if timestamp_mode == "end":
        end_offset = pd.Timedelta(0)
    else:
        end_offset = pd.Timedelta(minutes=parse_timeframe_to_minutes(timeframe))

    complete = [bar for bar in bars if bar['timestamp'] + end_offset <= cutoff]
    return complete[-1] if complete else None
|
|
|
|
|
|
class MinuteDataBuffer:
    """
    Helper class for managing minute data buffers in real-time strategies.

    This class provides efficient buffer management for minute-level data with
    automatic aggregation capabilities. It's designed for use in incremental
    strategies that need to maintain a rolling window of minute data.

    Features:
    - Automatic buffer size management with configurable limits
    - Efficient data access and aggregation methods
    - Memory-bounded operation (doesn't grow indefinitely)
    - Comprehensive validation and error handling

    NOTE(review): no locking is performed here; if an instance is shared
    across threads, callers must provide their own synchronization.

    Example:
        >>> buffer = MinuteDataBuffer(max_size=1440)  # 24 hours
        >>> buffer.add(timestamp, {'open': 100, 'high': 102, 'low': 99, 'close': 101, 'volume': 1000})
        >>> bars_15m = buffer.aggregate_to_timeframe("15min", lookback_bars=4)
        >>> latest_bar = buffer.get_latest_complete_bar("15min")
    """

    def __init__(self, max_size: int = 1440):
        """
        Initialize minute data buffer.

        Args:
            max_size: Maximum number of minute data points to keep (default: 1440 = 24 hours)

        Raises:
            ValueError: If max_size is not positive
        """
        if max_size <= 0:
            raise ValueError("max_size must be positive")

        self.max_size = max_size
        # deque(maxlen=...) silently evicts the oldest point once full,
        # giving bounded memory without manual trimming.
        self._buffer: deque = deque(maxlen=max_size)
        self._last_timestamp: Optional[pd.Timestamp] = None

        logger.debug(f"Initialized MinuteDataBuffer with max_size={max_size}")

    def add(self, timestamp: pd.Timestamp, ohlcv_data: Dict[str, float]) -> None:
        """
        Add new minute data point to the buffer.

        The caller's dictionary is never mutated: validation and numpy-to-float
        conversion happen on an internal copy.

        Args:
            timestamp: Timestamp of the data point
            ohlcv_data: OHLCV data dictionary (open, high, low, close, volume)

        Raises:
            ValueError: If data is invalid or the timestamp cannot be parsed
        """
        if not isinstance(timestamp, pd.Timestamp):
            try:
                timestamp = pd.Timestamp(timestamp)
            except Exception as e:
                raise ValueError(f"Invalid timestamp: {e}") from e

        # Copy FIRST so numpy conversion below never mutates the caller's dict
        # (previously the conversion wrote back into ohlcv_data in place).
        data_point = ohlcv_data.copy()

        # Validate OHLCV data
        required_fields = ['open', 'high', 'low', 'close', 'volume']
        for field in required_fields:
            if field not in data_point:
                raise ValueError(f"Missing required field: {field}")

            # Accept both Python numeric types and numpy numeric types
            if not isinstance(data_point[field], (int, float, np.number)):
                raise ValueError(f"Field {field} must be numeric, got {type(data_point[field])}")

            # Convert numpy scalars to Python floats for downstream compatibility
            if isinstance(data_point[field], np.number):
                data_point[field] = float(data_point[field])

        # Out-of-order data is tolerated but flagged; equal timestamps are
        # allowed so updates/corrections can be appended.
        if self._last_timestamp is not None and timestamp < self._last_timestamp:
            logger.warning(f"Out-of-order timestamp: {timestamp} < {self._last_timestamp}")

        data_point['timestamp'] = timestamp

        self._buffer.append(data_point)
        self._last_timestamp = timestamp

        logger.debug(f"Added data point at {timestamp}, buffer size: {len(self._buffer)}")

    def get_data(self, lookback_minutes: Optional[int] = None) -> List[Dict[str, Union[float, pd.Timestamp]]]:
        """
        Get data from buffer.

        Args:
            lookback_minutes: Number of most recent data points to return (None for all data)

        Returns:
            List of minute data dictionaries, oldest first

        Raises:
            ValueError: If lookback_minutes is not positive
        """
        if not self._buffer:
            return []

        if lookback_minutes is None:
            return list(self._buffer)

        if lookback_minutes <= 0:
            raise ValueError("lookback_minutes must be positive")

        # Negative slicing already handles the "fewer points than requested"
        # case, so no separate length check is needed.
        return list(self._buffer)[-lookback_minutes:]

    def aggregate_to_timeframe(
        self,
        timeframe: str,
        lookback_bars: Optional[int] = None,
        timestamp_mode: str = "end"
    ) -> List[Dict[str, Union[float, pd.Timestamp]]]:
        """
        Aggregate buffer data to specified timeframe.

        Args:
            timeframe: Target timeframe ("5min", "15min", "1h", etc.)
            lookback_bars: Number of bars to return (None for all available)
            timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start

        Returns:
            List of aggregated OHLCV bars
        """
        if not self._buffer:
            return []

        minute_data = list(self._buffer)
        aggregated_bars = aggregate_minute_data_to_timeframe(minute_data, timeframe, timestamp_mode)

        # Keep only the most recent N bars when a lookback limit is given.
        if lookback_bars is not None and lookback_bars > 0:
            aggregated_bars = aggregated_bars[-lookback_bars:]

        return aggregated_bars

    def get_latest_complete_bar(
        self,
        timeframe: str,
        timestamp_mode: str = "end"
    ) -> Optional[Dict[str, Union[float, pd.Timestamp]]]:
        """
        Get the latest complete bar for the specified timeframe.

        Args:
            timeframe: Target timeframe ("5min", "15min", "1h", etc.)
            timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start

        Returns:
            Latest complete bar dictionary, or None if no complete bars available
        """
        if not self._buffer:
            return None

        # Delegate to the module-level helper on a snapshot of the buffer.
        return get_latest_complete_bar(list(self._buffer), timeframe, timestamp_mode)

    def size(self) -> int:
        """Get current buffer size."""
        return len(self._buffer)

    def is_full(self) -> bool:
        """Check if buffer is at maximum capacity."""
        return len(self._buffer) >= self.max_size

    def clear(self) -> None:
        """Clear all data from buffer."""
        self._buffer.clear()
        self._last_timestamp = None
        logger.debug("Buffer cleared")

    def get_time_range(self) -> Optional[tuple]:
        """
        Get the time range of data in the buffer.

        Returns:
            Tuple of (start_time, end_time) or None if buffer is empty
        """
        if not self._buffer:
            return None

        timestamps = [data['timestamp'] for data in self._buffer]
        return (min(timestamps), max(timestamps))

    def __len__(self) -> int:
        """Get buffer size."""
        return len(self._buffer)

    def __repr__(self) -> str:
        """String representation of buffer."""
        time_range = self.get_time_range()
        if time_range:
            start, end = time_range
            return f"MinuteDataBuffer(size={len(self._buffer)}, range={start} to {end})"
        # Plain literal: no interpolation needed for the empty case.
        return "MinuteDataBuffer(size=0, empty)"