Vasily.onl e7ede7f329 Refactor aggregation module and enhance structure
- Split the `aggregation.py` file into a dedicated sub-package, improving modularity and maintainability (rough layout sketched after this message).
- Moved `TimeframeBucket`, `RealTimeCandleProcessor`, and `BatchCandleProcessor` classes into their respective files within the new `aggregation` sub-package.
- Introduced utility functions for trade aggregation and validation, enhancing code organization.
- Updated import paths throughout the codebase to reflect the new structure, ensuring compatibility.
- Added safety net tests for the aggregation package to verify core functionality and prevent regressions during refactoring.

These changes enhance the overall architecture of the aggregation module, making it more scalable and easier to manage.
2025-06-07 01:17:22 +08:00
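
The sub-package layout implied by this commit, sketched for orientation. Only `bucket` and the sibling `data_types` module are confirmed by the imports in the file below; the parent package name and the other file names are assumptions.

# <parent_package>/                 (name unknown)
#     data_types.py                 StandardizedTrade, OHLCVCandle, ProcessingStats
#     aggregation/
#         __init__.py
#         bucket.py                 TimeframeBucket
#         realtime_processor.py     RealTimeCandleProcessor  (file name assumed)
#         batch_processor.py        BatchCandleProcessor     (file name assumed; shown below)
#         utils.py                  trade aggregation/validation helpers (file name assumed)
#
# Callers that previously imported from the single aggregation.py module would now
# import from the sub-package, e.g. (paths assumed):
# from <parent_package>.aggregation.bucket import TimeframeBucket
# from <parent_package>.aggregation.batch_processor import BatchCandleProcessor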


"""
Batch candle processor for historical trade data.
This module provides the BatchCandleProcessor class for building OHLCV candles
from historical trade data in batch mode.
"""
from datetime import datetime
from typing import Dict, List, Any, Iterator
from collections import defaultdict

from ..data_types import StandardizedTrade, OHLCVCandle, ProcessingStats
from .bucket import TimeframeBucket


class BatchCandleProcessor:
"""
Batch candle processor for historical trade data.
This class processes trades in batch mode, building candles for multiple
timeframes simultaneously. It's optimized for processing large amounts
of historical trade data efficiently.
"""
def __init__(self,
symbol: str,
exchange: str,
timeframes: List[str],
component_name: str = "batch_candle_processor",
logger = None):
"""
Initialize batch candle processor.
Args:
symbol: Trading symbol (e.g., 'BTC-USDT')
exchange: Exchange name
timeframes: List of timeframes to process (e.g., ['1m', '5m'])
component_name: Name for logging/stats
logger: Optional logger instance
"""
self.symbol = symbol
self.exchange = exchange
self.timeframes = timeframes
self.component_name = component_name
self.logger = logger
# Stats tracking
self.stats = ProcessingStats()
    def process_trades_to_candles(self, trades: Iterator[StandardizedTrade]) -> List[OHLCVCandle]:
        """
        Process trades in batch and return completed candles.

        Args:
            trades: Iterator of trades to process

        Returns:
            List of completed candles for all timeframes
        """
        # Track buckets for each timeframe
        buckets: Dict[str, Dict[datetime, TimeframeBucket]] = defaultdict(dict)
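        # Shape (per the annotation above): buckets[timeframe][bucket_start_time] -> TimeframeBucket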
        # Process all trades
        for trade in trades:
            self.stats.trades_processed += 1

            # Process trade for each timeframe
            for timeframe in self.timeframes:
                # Get bucket for this trade's timestamp
                bucket_start = self._get_bucket_start_time(trade.timestamp, timeframe)

                # Create bucket if it doesn't exist
                if bucket_start not in buckets[timeframe]:
                    buckets[timeframe][bucket_start] = TimeframeBucket(
                        symbol=self.symbol,
                        timeframe=timeframe,
                        start_time=bucket_start,
                        exchange=self.exchange
                    )

                # Add trade to bucket
                buckets[timeframe][bucket_start].add_trade(trade)

        # Convert all buckets to candles
        candles = []
        for timeframe_buckets in buckets.values():
            for bucket in timeframe_buckets.values():
                candle = bucket.to_candle(is_complete=True)
                candles.append(candle)
                self.stats.candles_emitted += 1

        return sorted(candles, key=lambda x: (x.timeframe, x.end_time))
    def _get_bucket_start_time(self, timestamp: datetime, timeframe: str) -> datetime:
        """
        Calculate the start time of the bucket that this timestamp belongs to.

        IMPORTANT: Emitted candles use RIGHT-ALIGNED (end-time) timestamps; this
        helper returns the bucket's calendar-aligned START time (its left edge).
        - For the 5m timeframe, buckets start at 00:00, 00:05, 00:10, etc.
        - A trade at 09:03:45 belongs to the 09:00-09:05 bucket
        - A trade at 09:07:30 belongs to the 09:05-09:10 bucket

        Args:
            timestamp: Trade timestamp
            timeframe: Time period (e.g., '1m', '5m', '1h')

        Returns:
            Start time of the appropriate bucket
        """
        if timeframe == '1s':
            return timestamp.replace(microsecond=0)
        elif timeframe == '5s':
            seconds = (timestamp.second // 5) * 5
            return timestamp.replace(second=seconds, microsecond=0)
        elif timeframe == '10s':
            seconds = (timestamp.second // 10) * 10
            return timestamp.replace(second=seconds, microsecond=0)
        elif timeframe == '15s':
            seconds = (timestamp.second // 15) * 15
            return timestamp.replace(second=seconds, microsecond=0)
        elif timeframe == '30s':
            seconds = (timestamp.second // 30) * 30
            return timestamp.replace(second=seconds, microsecond=0)
        elif timeframe == '1m':
            return timestamp.replace(second=0, microsecond=0)
        elif timeframe == '5m':
            minutes = (timestamp.minute // 5) * 5
            return timestamp.replace(minute=minutes, second=0, microsecond=0)
        elif timeframe == '15m':
            minutes = (timestamp.minute // 15) * 15
            return timestamp.replace(minute=minutes, second=0, microsecond=0)
        elif timeframe == '30m':
            minutes = (timestamp.minute // 30) * 30
            return timestamp.replace(minute=minutes, second=0, microsecond=0)
        elif timeframe == '1h':
            return timestamp.replace(minute=0, second=0, microsecond=0)
        elif timeframe == '4h':
            hours = (timestamp.hour // 4) * 4
            return timestamp.replace(hour=hours, minute=0, second=0, microsecond=0)
        elif timeframe == '1d':
            return timestamp.replace(hour=0, minute=0, second=0, microsecond=0)
        else:
            raise ValueError(f"Unsupported timeframe: {timeframe}")
    def get_stats(self) -> Dict[str, Any]:
        """Get processing statistics."""
        return {
            "component": self.component_name,
            "stats": self.stats.to_dict()
        }


__all__ = ['BatchCandleProcessor']
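
For orientation, a minimal usage sketch of BatchCandleProcessor, roughly the flow the commit's safety-net tests would exercise. The import path, the load_trades helper, and the exchange value are assumptions for illustration; the constructor arguments and the candle fields (timeframe, end_time) come from the code above.

# Import path assumed; adjust to wherever the aggregation sub-package lives in this repo.
from aggregation.batch_processor import BatchCandleProcessor


def load_trades():
    """Hypothetical trade source; would yield StandardizedTrade objects in practice."""
    return iter([])  # empty here so the sketch runs standalone


processor = BatchCandleProcessor(
    symbol="BTC-USDT",        # example symbol from the docstring
    exchange="example",       # placeholder exchange name
    timeframes=["1m", "5m"],  # example timeframes from the docstring
)

candles = processor.process_trades_to_candles(load_trades())

# Candles come back sorted by (timeframe, end_time); counters are exposed via get_stats().
for candle in candles:
    print(candle.timeframe, candle.end_time)

print(processor.get_stats())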