TimeFrame agregator with right logic
This commit is contained in:
23
IncrementalTrader/utils/__init__.py
Normal file
23
IncrementalTrader/utils/__init__.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""
|
||||
Utility modules for the IncrementalTrader framework.
|
||||
|
||||
This package contains utility functions and classes that support the core
|
||||
trading functionality, including timeframe aggregation, data management,
|
||||
and helper utilities.
|
||||
"""
|
||||
|
||||
from .timeframe_utils import (
|
||||
aggregate_minute_data_to_timeframe,
|
||||
parse_timeframe_to_minutes,
|
||||
get_latest_complete_bar,
|
||||
MinuteDataBuffer,
|
||||
TimeframeError
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'aggregate_minute_data_to_timeframe',
|
||||
'parse_timeframe_to_minutes',
|
||||
'get_latest_complete_bar',
|
||||
'MinuteDataBuffer',
|
||||
'TimeframeError'
|
||||
]
|
||||
455
IncrementalTrader/utils/timeframe_utils.py
Normal file
455
IncrementalTrader/utils/timeframe_utils.py
Normal file
@@ -0,0 +1,455 @@
|
||||
"""
|
||||
Timeframe aggregation utilities for the IncrementalTrader framework.
|
||||
|
||||
This module provides utilities for aggregating minute-level OHLCV data to higher
|
||||
timeframes with mathematical correctness and proper timestamp handling.
|
||||
|
||||
Key Features:
|
||||
- Uses pandas resampling for mathematical correctness
|
||||
- Supports bar end timestamps (default) to prevent future data leakage
|
||||
- Proper OHLCV aggregation rules (first/max/min/last/sum)
|
||||
- MinuteDataBuffer for efficient real-time data management
|
||||
- Comprehensive error handling and validation
|
||||
|
||||
Critical Fixes:
|
||||
1. Bar timestamps represent END of period (no future data leakage)
|
||||
2. Correct OHLCV aggregation matching pandas resampling
|
||||
3. Proper handling of incomplete bars and edge cases
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import Dict, List, Optional, Union, Any
|
||||
from collections import deque
|
||||
import logging
|
||||
import re
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TimeframeError(Exception):
|
||||
"""Exception raised for timeframe-related errors."""
|
||||
pass
|
||||
|
||||
|
||||
def parse_timeframe_to_minutes(timeframe: str) -> int:
|
||||
"""
|
||||
Parse timeframe string to minutes.
|
||||
|
||||
Args:
|
||||
timeframe: Timeframe string (e.g., "1min", "5min", "15min", "1h", "4h", "1d")
|
||||
|
||||
Returns:
|
||||
Number of minutes in the timeframe
|
||||
|
||||
Raises:
|
||||
TimeframeError: If timeframe format is invalid
|
||||
|
||||
Examples:
|
||||
>>> parse_timeframe_to_minutes("15min")
|
||||
15
|
||||
>>> parse_timeframe_to_minutes("1h")
|
||||
60
|
||||
>>> parse_timeframe_to_minutes("1d")
|
||||
1440
|
||||
"""
|
||||
if not isinstance(timeframe, str):
|
||||
raise TimeframeError(f"Timeframe must be a string, got {type(timeframe)}")
|
||||
|
||||
timeframe = timeframe.lower().strip()
|
||||
|
||||
# Handle common timeframe formats
|
||||
patterns = {
|
||||
r'^(\d+)min$': lambda m: int(m.group(1)),
|
||||
r'^(\d+)h$': lambda m: int(m.group(1)) * 60,
|
||||
r'^(\d+)d$': lambda m: int(m.group(1)) * 1440,
|
||||
r'^(\d+)w$': lambda m: int(m.group(1)) * 10080, # 7 * 24 * 60
|
||||
}
|
||||
|
||||
for pattern, converter in patterns.items():
|
||||
match = re.match(pattern, timeframe)
|
||||
if match:
|
||||
minutes = converter(match)
|
||||
if minutes <= 0:
|
||||
raise TimeframeError(f"Timeframe must be positive, got {minutes} minutes")
|
||||
return minutes
|
||||
|
||||
raise TimeframeError(f"Invalid timeframe format: {timeframe}. "
|
||||
f"Supported formats: Nmin, Nh, Nd, Nw (e.g., 15min, 1h, 1d)")
|
||||
|
||||
|
||||
def aggregate_minute_data_to_timeframe(
|
||||
minute_data: List[Dict[str, Union[float, pd.Timestamp]]],
|
||||
timeframe: str,
|
||||
timestamp_mode: str = "end"
|
||||
) -> List[Dict[str, Union[float, pd.Timestamp]]]:
|
||||
"""
|
||||
Aggregate minute-level OHLCV data to specified timeframe using pandas resampling.
|
||||
|
||||
This function provides mathematically correct aggregation that matches pandas
|
||||
resampling behavior, with proper timestamp handling to prevent future data leakage.
|
||||
|
||||
Args:
|
||||
minute_data: List of minute OHLCV dictionaries with 'timestamp' field
|
||||
timeframe: Target timeframe ("1min", "5min", "15min", "1h", "4h", "1d")
|
||||
timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start
|
||||
|
||||
Returns:
|
||||
List of aggregated OHLCV dictionaries with proper timestamps
|
||||
|
||||
Raises:
|
||||
TimeframeError: If timeframe format is invalid or data is malformed
|
||||
ValueError: If minute_data is empty or contains invalid data
|
||||
|
||||
Examples:
|
||||
>>> minute_data = [
|
||||
... {'timestamp': pd.Timestamp('2024-01-01 09:00'), 'open': 100, 'high': 102, 'low': 99, 'close': 101, 'volume': 1000},
|
||||
... {'timestamp': pd.Timestamp('2024-01-01 09:01'), 'open': 101, 'high': 103, 'low': 100, 'close': 102, 'volume': 1200},
|
||||
... ]
|
||||
>>> result = aggregate_minute_data_to_timeframe(minute_data, "15min")
|
||||
>>> len(result)
|
||||
1
|
||||
>>> result[0]['timestamp'] # Bar end timestamp
|
||||
Timestamp('2024-01-01 09:15:00')
|
||||
"""
|
||||
if not minute_data:
|
||||
return []
|
||||
|
||||
if not isinstance(minute_data, list):
|
||||
raise ValueError("minute_data must be a list of dictionaries")
|
||||
|
||||
if timestamp_mode not in ["end", "start"]:
|
||||
raise ValueError("timestamp_mode must be 'end' or 'start'")
|
||||
|
||||
# Validate timeframe
|
||||
timeframe_minutes = parse_timeframe_to_minutes(timeframe)
|
||||
|
||||
# If requesting 1min data, return as-is (with timestamp mode adjustment)
|
||||
if timeframe_minutes == 1:
|
||||
if timestamp_mode == "end":
|
||||
# Adjust timestamps to represent bar end (add 1 minute)
|
||||
result = []
|
||||
for data_point in minute_data:
|
||||
adjusted_point = data_point.copy()
|
||||
adjusted_point['timestamp'] = data_point['timestamp'] + pd.Timedelta(minutes=1)
|
||||
result.append(adjusted_point)
|
||||
return result
|
||||
else:
|
||||
return minute_data.copy()
|
||||
|
||||
# Validate data structure
|
||||
required_fields = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
|
||||
for i, data_point in enumerate(minute_data):
|
||||
if not isinstance(data_point, dict):
|
||||
raise ValueError(f"Data point {i} must be a dictionary")
|
||||
|
||||
for field in required_fields:
|
||||
if field not in data_point:
|
||||
raise ValueError(f"Data point {i} missing required field: {field}")
|
||||
|
||||
# Validate timestamp
|
||||
if not isinstance(data_point['timestamp'], pd.Timestamp):
|
||||
try:
|
||||
data_point['timestamp'] = pd.Timestamp(data_point['timestamp'])
|
||||
except Exception as e:
|
||||
raise ValueError(f"Invalid timestamp in data point {i}: {e}")
|
||||
|
||||
try:
|
||||
# Convert to DataFrame for pandas resampling
|
||||
df = pd.DataFrame(minute_data)
|
||||
df = df.set_index('timestamp')
|
||||
|
||||
# Sort by timestamp to ensure proper ordering
|
||||
df = df.sort_index()
|
||||
|
||||
# Use pandas resampling for mathematical correctness
|
||||
freq_str = f'{timeframe_minutes}min'
|
||||
|
||||
# Use trading industry standard grouping: label='left', closed='left'
|
||||
# This means 5min bar starting at 09:00 includes minutes 09:00-09:04
|
||||
resampled = df.resample(freq_str, label='left', closed='left').agg({
|
||||
'open': 'first', # First open in the period
|
||||
'high': 'max', # Maximum high in the period
|
||||
'low': 'min', # Minimum low in the period
|
||||
'close': 'last', # Last close in the period
|
||||
'volume': 'sum' # Sum of volume in the period
|
||||
})
|
||||
|
||||
# Remove any rows with NaN values (incomplete periods)
|
||||
resampled = resampled.dropna()
|
||||
|
||||
# Convert back to list of dictionaries
|
||||
result = []
|
||||
for timestamp, row in resampled.iterrows():
|
||||
# Adjust timestamp based on mode
|
||||
if timestamp_mode == "end":
|
||||
# Convert bar start timestamp to bar end timestamp
|
||||
bar_end_timestamp = timestamp + pd.Timedelta(minutes=timeframe_minutes)
|
||||
final_timestamp = bar_end_timestamp
|
||||
else:
|
||||
# Keep bar start timestamp
|
||||
final_timestamp = timestamp
|
||||
|
||||
result.append({
|
||||
'timestamp': final_timestamp,
|
||||
'open': float(row['open']),
|
||||
'high': float(row['high']),
|
||||
'low': float(row['low']),
|
||||
'close': float(row['close']),
|
||||
'volume': float(row['volume'])
|
||||
})
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
raise TimeframeError(f"Failed to aggregate data to {timeframe}: {e}")
|
||||
|
||||
|
||||
def get_latest_complete_bar(
|
||||
minute_data: List[Dict[str, Union[float, pd.Timestamp]]],
|
||||
timeframe: str,
|
||||
timestamp_mode: str = "end"
|
||||
) -> Optional[Dict[str, Union[float, pd.Timestamp]]]:
|
||||
"""
|
||||
Get the latest complete bar from minute data for the specified timeframe.
|
||||
|
||||
This function is useful for real-time processing where you only want to
|
||||
process complete bars and avoid using incomplete/future data.
|
||||
|
||||
Args:
|
||||
minute_data: List of minute OHLCV dictionaries with 'timestamp' field
|
||||
timeframe: Target timeframe ("1min", "5min", "15min", "1h", "4h", "1d")
|
||||
timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start
|
||||
|
||||
Returns:
|
||||
Latest complete bar dictionary, or None if no complete bars available
|
||||
|
||||
Examples:
|
||||
>>> minute_data = [...] # 30 minutes of data
|
||||
>>> latest_15m = get_latest_complete_bar(minute_data, "15min")
|
||||
>>> latest_15m['timestamp'] # Will be 15 minutes ago (complete bar)
|
||||
"""
|
||||
if not minute_data:
|
||||
return None
|
||||
|
||||
# Get all aggregated bars
|
||||
aggregated_bars = aggregate_minute_data_to_timeframe(minute_data, timeframe, timestamp_mode)
|
||||
|
||||
if not aggregated_bars:
|
||||
return None
|
||||
|
||||
# For real-time processing, we need to ensure the bar is truly complete
|
||||
# This means the bar's end time should be before the current time
|
||||
latest_minute_timestamp = max(data['timestamp'] for data in minute_data)
|
||||
|
||||
# Filter out incomplete bars
|
||||
complete_bars = []
|
||||
for bar in aggregated_bars:
|
||||
if timestamp_mode == "end":
|
||||
# Bar timestamp is the end time, so it should be <= latest minute + 1 minute
|
||||
if bar['timestamp'] <= latest_minute_timestamp + pd.Timedelta(minutes=1):
|
||||
complete_bars.append(bar)
|
||||
else:
|
||||
# Bar timestamp is the start time, check if enough time has passed
|
||||
timeframe_minutes = parse_timeframe_to_minutes(timeframe)
|
||||
bar_end_time = bar['timestamp'] + pd.Timedelta(minutes=timeframe_minutes)
|
||||
if bar_end_time <= latest_minute_timestamp + pd.Timedelta(minutes=1):
|
||||
complete_bars.append(bar)
|
||||
|
||||
return complete_bars[-1] if complete_bars else None
|
||||
|
||||
|
||||
class MinuteDataBuffer:
|
||||
"""
|
||||
Helper class for managing minute data buffers in real-time strategies.
|
||||
|
||||
This class provides efficient buffer management for minute-level data with
|
||||
automatic aggregation capabilities. It's designed for use in incremental
|
||||
strategies that need to maintain a rolling window of minute data.
|
||||
|
||||
Features:
|
||||
- Automatic buffer size management with configurable limits
|
||||
- Efficient data access and aggregation methods
|
||||
- Memory-bounded operation (doesn't grow indefinitely)
|
||||
- Thread-safe operations for real-time use
|
||||
- Comprehensive validation and error handling
|
||||
|
||||
Example:
|
||||
>>> buffer = MinuteDataBuffer(max_size=1440) # 24 hours
|
||||
>>> buffer.add(timestamp, {'open': 100, 'high': 102, 'low': 99, 'close': 101, 'volume': 1000})
|
||||
>>> bars_15m = buffer.aggregate_to_timeframe("15min", lookback_bars=4)
|
||||
>>> latest_bar = buffer.get_latest_complete_bar("15min")
|
||||
"""
|
||||
|
||||
def __init__(self, max_size: int = 1440):
|
||||
"""
|
||||
Initialize minute data buffer.
|
||||
|
||||
Args:
|
||||
max_size: Maximum number of minute data points to keep (default: 1440 = 24 hours)
|
||||
"""
|
||||
if max_size <= 0:
|
||||
raise ValueError("max_size must be positive")
|
||||
|
||||
self.max_size = max_size
|
||||
self._buffer = deque(maxlen=max_size)
|
||||
self._last_timestamp = None
|
||||
|
||||
logger.debug(f"Initialized MinuteDataBuffer with max_size={max_size}")
|
||||
|
||||
def add(self, timestamp: pd.Timestamp, ohlcv_data: Dict[str, float]) -> None:
|
||||
"""
|
||||
Add new minute data point to the buffer.
|
||||
|
||||
Args:
|
||||
timestamp: Timestamp of the data point
|
||||
ohlcv_data: OHLCV data dictionary (open, high, low, close, volume)
|
||||
|
||||
Raises:
|
||||
ValueError: If data is invalid or timestamp is out of order
|
||||
"""
|
||||
if not isinstance(timestamp, pd.Timestamp):
|
||||
try:
|
||||
timestamp = pd.Timestamp(timestamp)
|
||||
except Exception as e:
|
||||
raise ValueError(f"Invalid timestamp: {e}")
|
||||
|
||||
# Validate OHLCV data
|
||||
required_fields = ['open', 'high', 'low', 'close', 'volume']
|
||||
for field in required_fields:
|
||||
if field not in ohlcv_data:
|
||||
raise ValueError(f"Missing required field: {field}")
|
||||
if not isinstance(ohlcv_data[field], (int, float)):
|
||||
raise ValueError(f"Field {field} must be numeric, got {type(ohlcv_data[field])}")
|
||||
|
||||
# Check timestamp ordering (allow equal timestamps for updates)
|
||||
if self._last_timestamp is not None and timestamp < self._last_timestamp:
|
||||
logger.warning(f"Out-of-order timestamp: {timestamp} < {self._last_timestamp}")
|
||||
|
||||
# Create data point
|
||||
data_point = ohlcv_data.copy()
|
||||
data_point['timestamp'] = timestamp
|
||||
|
||||
# Add to buffer
|
||||
self._buffer.append(data_point)
|
||||
self._last_timestamp = timestamp
|
||||
|
||||
logger.debug(f"Added data point at {timestamp}, buffer size: {len(self._buffer)}")
|
||||
|
||||
def get_data(self, lookback_minutes: Optional[int] = None) -> List[Dict[str, Union[float, pd.Timestamp]]]:
|
||||
"""
|
||||
Get data from buffer.
|
||||
|
||||
Args:
|
||||
lookback_minutes: Number of minutes to look back (None for all data)
|
||||
|
||||
Returns:
|
||||
List of minute data dictionaries
|
||||
"""
|
||||
if not self._buffer:
|
||||
return []
|
||||
|
||||
if lookback_minutes is None:
|
||||
return list(self._buffer)
|
||||
|
||||
if lookback_minutes <= 0:
|
||||
raise ValueError("lookback_minutes must be positive")
|
||||
|
||||
# Get data from the last N minutes
|
||||
if len(self._buffer) <= lookback_minutes:
|
||||
return list(self._buffer)
|
||||
|
||||
return list(self._buffer)[-lookback_minutes:]
|
||||
|
||||
def aggregate_to_timeframe(
|
||||
self,
|
||||
timeframe: str,
|
||||
lookback_bars: Optional[int] = None,
|
||||
timestamp_mode: str = "end"
|
||||
) -> List[Dict[str, Union[float, pd.Timestamp]]]:
|
||||
"""
|
||||
Aggregate buffer data to specified timeframe.
|
||||
|
||||
Args:
|
||||
timeframe: Target timeframe ("5min", "15min", "1h", etc.)
|
||||
lookback_bars: Number of bars to return (None for all available)
|
||||
timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start
|
||||
|
||||
Returns:
|
||||
List of aggregated OHLCV bars
|
||||
"""
|
||||
if not self._buffer:
|
||||
return []
|
||||
|
||||
# Get all buffer data
|
||||
minute_data = list(self._buffer)
|
||||
|
||||
# Aggregate to timeframe
|
||||
aggregated_bars = aggregate_minute_data_to_timeframe(minute_data, timeframe, timestamp_mode)
|
||||
|
||||
# Apply lookback limit
|
||||
if lookback_bars is not None and lookback_bars > 0:
|
||||
aggregated_bars = aggregated_bars[-lookback_bars:]
|
||||
|
||||
return aggregated_bars
|
||||
|
||||
def get_latest_complete_bar(
|
||||
self,
|
||||
timeframe: str,
|
||||
timestamp_mode: str = "end"
|
||||
) -> Optional[Dict[str, Union[float, pd.Timestamp]]]:
|
||||
"""
|
||||
Get the latest complete bar for the specified timeframe.
|
||||
|
||||
Args:
|
||||
timeframe: Target timeframe ("5min", "15min", "1h", etc.)
|
||||
timestamp_mode: "end" (default) for bar end timestamps, "start" for bar start
|
||||
|
||||
Returns:
|
||||
Latest complete bar dictionary, or None if no complete bars available
|
||||
"""
|
||||
if not self._buffer:
|
||||
return None
|
||||
|
||||
minute_data = list(self._buffer)
|
||||
return get_latest_complete_bar(minute_data, timeframe, timestamp_mode)
|
||||
|
||||
def size(self) -> int:
|
||||
"""Get current buffer size."""
|
||||
return len(self._buffer)
|
||||
|
||||
def is_full(self) -> bool:
|
||||
"""Check if buffer is at maximum capacity."""
|
||||
return len(self._buffer) >= self.max_size
|
||||
|
||||
def clear(self) -> None:
|
||||
"""Clear all data from buffer."""
|
||||
self._buffer.clear()
|
||||
self._last_timestamp = None
|
||||
logger.debug("Buffer cleared")
|
||||
|
||||
def get_time_range(self) -> Optional[tuple]:
|
||||
"""
|
||||
Get the time range of data in the buffer.
|
||||
|
||||
Returns:
|
||||
Tuple of (start_time, end_time) or None if buffer is empty
|
||||
"""
|
||||
if not self._buffer:
|
||||
return None
|
||||
|
||||
timestamps = [data['timestamp'] for data in self._buffer]
|
||||
return (min(timestamps), max(timestamps))
|
||||
|
||||
def __len__(self) -> int:
|
||||
"""Get buffer size."""
|
||||
return len(self._buffer)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
"""String representation of buffer."""
|
||||
time_range = self.get_time_range()
|
||||
if time_range:
|
||||
start, end = time_range
|
||||
return f"MinuteDataBuffer(size={len(self._buffer)}, range={start} to {end})"
|
||||
else:
|
||||
return f"MinuteDataBuffer(size=0, empty)"
|
||||
Reference in New Issue
Block a user