2025-05-31 21:58:47 +08:00
"""
Common aggregation utilities for all exchanges .
This module provides shared functionality for building OHLCV candles
from trade data , regardless of the source exchange .
AGGREGATION STRATEGY :
- Uses RIGHT - ALIGNED timestamps ( industry standard )
- Candle timestamp = end time of the interval ( close time )
- 5 - minute candle with timestamp 09 : 05 : 00 represents data from 09 : 00 : 01 to 09 : 05 : 00
- Prevents future leakage by only completing candles when time boundary is crossed
- Aligns with major exchanges ( Binance , OKX , Coinbase )
PROCESS FLOW :
1. Trade arrives with timestamp T
2. Calculate which time bucket this trade belongs to
3. If bucket doesn ' t exist or time boundary crossed, complete previous bucket
4. Add trade to current bucket
5. Only emit completed candles ( never future data )
"""
from datetime import datetime , timezone , timedelta
from decimal import Decimal
from typing import Dict , List , Optional , Any , Iterator , Callable
from collections import defaultdict
from . data_types import (
StandardizedTrade ,
OHLCVCandle ,
CandleProcessingConfig ,
ProcessingStats
)
class TimeframeBucket:
    """
    Accumulator that builds one OHLCV candle from the trades of one interval.

    Trades are folded in one at a time through ``add_trade``; open, high,
    low, close, volume and trade count are maintained incrementally.

    The interval is half-open: a trade belongs to the bucket when
    ``start_time <= trade.timestamp < end_time``.

    IMPORTANT: Uses RIGHT-ALIGNED timestamps
    - start_time: Beginning of the interval (inclusive)
    - end_time: End of the interval (exclusive) - this becomes the candle timestamp
    - Example: 09:00:00 - 09:05:00 bucket -> candle timestamp = 09:05:00
    """

    # Duration of every supported timeframe; the single source of truth
    # used to derive ``end_time`` from ``start_time``.
    _TIMEFRAME_DURATIONS = {
        '1m': timedelta(minutes=1),
        '5m': timedelta(minutes=5),
        '15m': timedelta(minutes=15),
        '30m': timedelta(minutes=30),
        '1h': timedelta(hours=1),
        '4h': timedelta(hours=4),
        '1d': timedelta(days=1),
    }

    def __init__(self, symbol: str, timeframe: str, start_time: datetime, exchange: str = "unknown"):
        """
        Initialize time bucket for candle aggregation.

        Args:
            symbol: Trading symbol (e.g., 'BTC-USDT')
            timeframe: Time period (e.g., '1m', '5m', '1h')
            start_time: Start time for this bucket (inclusive)
            exchange: Exchange name

        Raises:
            ValueError: If ``timeframe`` is not one of the supported values.
        """
        self.symbol = symbol
        self.timeframe = timeframe
        self.start_time = start_time
        self.end_time = self._calculate_end_time(start_time, timeframe)
        self.exchange = exchange

        # OHLCV data (prices stay None until the first trade is added)
        self.open: Optional[Decimal] = None
        self.high: Optional[Decimal] = None
        self.low: Optional[Decimal] = None
        self.close: Optional[Decimal] = None
        self.volume: Decimal = Decimal('0')
        self.trade_count: int = 0

        # Tracking
        self.first_trade_time: Optional[datetime] = None
        self.last_trade_time: Optional[datetime] = None
        self.trades: List[StandardizedTrade] = []

    def add_trade(self, trade: StandardizedTrade) -> bool:
        """
        Add trade to this bucket if it belongs to this time period.

        Args:
            trade: Standardized trade data; its ``timestamp``, ``price``
                and ``size`` attributes are read.

        Returns:
            True if trade was added, False if outside time range
        """
        # Half-open interval check: start_time <= trade.timestamp < end_time
        if not (self.start_time <= trade.timestamp < self.end_time):
            return False

        # The first accepted trade seeds open/high/low.
        if self.open is None:
            self.open = trade.price
            self.high = trade.price
            self.low = trade.price
            self.first_trade_time = trade.timestamp

        # Update OHLCV
        self.high = max(self.high, trade.price)
        self.low = min(self.low, trade.price)
        self.close = trade.price  # Most recently added trade sets close
        self.volume += trade.size
        self.trade_count += 1
        self.last_trade_time = trade.timestamp

        # Store trade for detailed analysis if needed
        self.trades.append(trade)
        return True

    def to_candle(self, is_complete: bool = True) -> OHLCVCandle:
        """
        Convert bucket to OHLCV candle.

        Prices that were never set (no trades added) are reported as
        ``Decimal('0')``.

        Args:
            is_complete: Value forwarded to the candle's ``is_complete`` field.

        Returns:
            An ``OHLCVCandle`` carrying this bucket's start/end times and
            accumulated OHLCV values.
        """
        return OHLCVCandle(
            symbol=self.symbol,
            timeframe=self.timeframe,
            start_time=self.start_time,
            end_time=self.end_time,
            open=self.open or Decimal('0'),
            high=self.high or Decimal('0'),
            low=self.low or Decimal('0'),
            close=self.close or Decimal('0'),
            volume=self.volume,
            trade_count=self.trade_count,
            exchange=self.exchange,
            is_complete=is_complete,
            first_trade_time=self.first_trade_time,
            last_trade_time=self.last_trade_time
        )

    def _calculate_end_time(self, start_time: datetime, timeframe: str) -> datetime:
        """
        Calculate end time for this timeframe (right-aligned timestamp).

        Args:
            start_time: Inclusive start of the interval.
            timeframe: One of the keys of ``_TIMEFRAME_DURATIONS``.

        Returns:
            ``start_time`` plus the duration of ``timeframe``.

        Raises:
            ValueError: If ``timeframe`` is not supported.
        """
        try:
            duration = self._TIMEFRAME_DURATIONS[timeframe]
        except (KeyError, TypeError):
            # TypeError covers unhashable input so the error type stays ValueError.
            raise ValueError(f"Unsupported timeframe: {timeframe}") from None
        return start_time + duration
class RealTimeCandleProcessor:
    """
    Real-time candle processor for live trade data.

    Trades are processed one at a time; for every configured timeframe a
    current bucket is maintained, and a completed candle is emitted when a
    trade falls into a different bucket than the current one.

    AGGREGATION PROCESS (NO FUTURE LEAKAGE):
    1. Trade arrives with timestamp T
    2. For each configured timeframe (1m, 5m, etc.):
       a. Calculate which time bucket this trade belongs to
       b. Get current bucket for this timeframe
       c. Check if trade timestamp crosses time boundary
       d. If boundary crossed: complete and emit previous bucket, create new bucket
       e. Add trade to current bucket (updates OHLCV)
    3. Candles are only emitted by ``process_trade`` when the bucket changes
    4. Incomplete candles are only exposed via ``get_current_candles`` and
       ``force_complete_all_candles``

    TIMESTAMP ALIGNMENT:
    - Uses RIGHT-ALIGNED timestamps
    - 1-minute candle covering 09:00:00 - 09:01:00 gets timestamp 09:01:00
    - 5-minute candle covering 09:00:00 - 09:05:00 gets timestamp 09:05:00
    - Timezone-aware trade timestamps are aligned on UTC boundaries
    """

    def __init__(self,
                 symbol: str,
                 exchange: str,
                 config: Optional[CandleProcessingConfig] = None,
                 component_name: str = "realtime_candle_processor",
                 logger=None):
        """
        Initialize real-time candle processor.

        Args:
            symbol: Trading symbol (e.g., 'BTC-USDT')
            exchange: Exchange name (e.g., 'okx', 'binance')
            config: Processing configuration; a default
                ``CandleProcessingConfig()`` is used when omitted
            component_name: Name used as a prefix in log messages
            logger: Optional logger; when falsy, nothing is logged
        """
        self.symbol = symbol
        self.exchange = exchange
        self.config = config or CandleProcessingConfig()
        self.component_name = component_name
        self.logger = logger

        # Current buckets for each timeframe
        self.current_buckets: Dict[str, TimeframeBucket] = {}

        # Callback functions for completed candles
        self.candle_callbacks: List[Callable[[OHLCVCandle], None]] = []

        # Statistics
        self.stats = ProcessingStats(active_timeframes=len(self.config.timeframes))

        if self.logger:
            self.logger.info(f"{self.component_name}: Initialized real-time candle processor for {symbol} on {exchange} with timeframes: {self.config.timeframes}")

    def add_candle_callback(self, callback: Callable[[OHLCVCandle], None]) -> None:
        """Add callback function to receive completed candles."""
        self.candle_callbacks.append(callback)
        if self.logger:
            self.logger.debug(f"{self.component_name}: Added candle callback: {callback.__name__ if hasattr(callback, '__name__') else str(callback)}")

    def process_trade(self, trade: StandardizedTrade) -> List[OHLCVCandle]:
        """
        Process single trade - main entry point for real-time processing.

        The trade is applied to every configured timeframe. Candles that
        were completed as a result are passed to the registered callbacks
        and returned.

        Args:
            trade: Standardized trade data

        Returns:
            List of completed candles (if any time boundaries were crossed).
            An empty list is returned, and ``stats.errors_count`` is
            incremented, if an exception is raised while processing.
        """
        try:
            completed_candles = []

            # Process trade for each timeframe
            for timeframe in self.config.timeframes:
                candle = self._process_trade_for_timeframe(trade, timeframe)
                if candle:
                    completed_candles.append(candle)

            # Update statistics
            self.stats.trades_processed += 1
            self.stats.last_trade_time = trade.timestamp

            # Emit completed candles to callbacks
            for candle in completed_candles:
                self._emit_candle(candle)

            return completed_candles
        except Exception as e:
            if self.logger:
                self.logger.error(f"{self.component_name}: Error processing trade for {self.symbol}: {e}")
            self.stats.errors_count += 1
            return []

    def _process_trade_for_timeframe(self, trade: StandardizedTrade, timeframe: str) -> Optional[OHLCVCandle]:
        """
        Process trade for specific timeframe.

        Steps:
        1. Calculate which bucket this trade belongs to
        2. If no bucket exists yet, create one
        3. If the trade's bucket differs from the current one, complete the
           current bucket (when it has trades) and start a new bucket
        4. Add the trade to the (possibly new) current bucket

        NOTE: the bucket comparison is an inequality, so ANY trade whose
        bucket start differs from the current bucket's - including an
        older, late-arriving trade - closes the current bucket.

        Returns:
            The candle completed by this trade, or None when no bucket was
            completed or an exception occurred (which is counted in
            ``stats.errors_count``).
        """
        try:
            # Calculate which bucket this trade belongs to
            trade_bucket_start = self._get_bucket_start_time(trade.timestamp, timeframe)

            # Check if we have a current bucket for this timeframe
            current_bucket = self.current_buckets.get(timeframe)
            completed_candle = None

            if current_bucket is None:
                # First bucket for this timeframe
                current_bucket = TimeframeBucket(self.symbol, timeframe, trade_bucket_start, self.exchange)
                self.current_buckets[timeframe] = current_bucket
            elif current_bucket.start_time != trade_bucket_start:
                # Time boundary crossed - complete previous bucket
                if current_bucket.trade_count > 0:  # Only complete if it has trades
                    completed_candle = current_bucket.to_candle(is_complete=True)
                    self.stats.candles_emitted += 1
                    self.stats.last_candle_time = completed_candle.end_time

                # Create new bucket for current time period
                current_bucket = TimeframeBucket(self.symbol, timeframe, trade_bucket_start, self.exchange)
                self.current_buckets[timeframe] = current_bucket

            # Add trade to current bucket
            if not current_bucket.add_trade(trade):
                # The bucket was derived from this trade's timestamp, so a
                # rejection here indicates an alignment inconsistency.
                if self.logger:
                    self.logger.warning(f"{self.component_name}: Trade {trade.timestamp} could not be added to bucket {current_bucket.start_time} - {current_bucket.end_time}")

            return completed_candle
        except Exception as e:
            if self.logger:
                self.logger.error(f"{self.component_name}: Error processing trade for timeframe {timeframe}: {e}")
            self.stats.errors_count += 1
            return None

    def _get_bucket_start_time(self, timestamp: datetime, timeframe: str) -> datetime:
        """
        Calculate bucket start time for given timestamp and timeframe.

        The start time is the LEFT boundary of the interval containing
        ``timestamp``. Timezone-aware timestamps are converted to UTC first
        so that boundaries (in particular the '1d' midnight) are UTC
        boundaries; naive timestamps are used as given.

        EXAMPLES:
        - Trade at 09:03:45 for 5m timeframe -> bucket start = 09:00:00
        - Trade at 09:07:23 for 5m timeframe -> bucket start = 09:05:00
        - Trade at 14:00:00 for 1h timeframe -> bucket start = 14:00:00

        Args:
            timestamp: Trade timestamp
            timeframe: Target timeframe

        Returns:
            Bucket start time (left boundary)

        Raises:
            ValueError: If ``timeframe`` is not supported.
        """
        # Normalize aware timestamps to UTC; naive ones carry no offset to convert.
        if timestamp.utcoffset() is not None:
            timestamp = timestamp.astimezone(timezone.utc)

        # Drop seconds and microseconds for clean minute boundaries
        dt = timestamp.replace(second=0, microsecond=0)

        if timeframe == '1m':
            # 1-minute buckets align to minute boundaries
            return dt
        elif timeframe == '5m':
            # 5-minute buckets: 00:00, 00:05, 00:10, etc.
            return dt.replace(minute=(dt.minute // 5) * 5)
        elif timeframe == '15m':
            # 15-minute buckets: 00:00, 00:15, 00:30, 00:45
            return dt.replace(minute=(dt.minute // 15) * 15)
        elif timeframe == '30m':
            # 30-minute buckets: 00:00, 00:30
            return dt.replace(minute=(dt.minute // 30) * 30)
        elif timeframe == '1h':
            # 1-hour buckets align to hour boundaries
            return dt.replace(minute=0)
        elif timeframe == '4h':
            # 4-hour buckets: 00:00, 04:00, 08:00, 12:00, 16:00, 20:00
            return dt.replace(minute=0, hour=(dt.hour // 4) * 4)
        elif timeframe == '1d':
            # 1-day buckets align to day boundaries (midnight)
            return dt.replace(minute=0, hour=0)
        else:
            raise ValueError(f"Unsupported timeframe: {timeframe}")

    def _emit_candle(self, candle: OHLCVCandle) -> None:
        """
        Emit completed candle to all callbacks.

        Each callback is invoked in its own try block, so a callback that
        raises is logged and counted in ``stats.errors_count`` without
        preventing the remaining callbacks from receiving the candle.
        """
        for callback in self.candle_callbacks:
            try:
                callback(candle)
            except Exception as e:
                if self.logger:
                    self.logger.error(f"{self.component_name}: Error in candle callback: {e}")
                self.stats.errors_count += 1

    def get_current_candles(self, incomplete: bool = True) -> List[OHLCVCandle]:
        """
        Get current incomplete candles for all timeframes.

        WARNING: These are incomplete candles and should NOT be used for trading decisions.
        They are useful for monitoring/debugging only.

        Args:
            incomplete: Not read by this method; kept so existing callers
                that pass it continue to work.

        Returns:
            One candle (marked ``is_complete=False``) per current bucket
            that has at least one trade.
        """
        return [
            bucket.to_candle(is_complete=False)
            for bucket in self.current_buckets.values()
            if bucket.trade_count > 0  # Only return buckets with trades
        ]

    def force_complete_all_candles(self) -> List[OHLCVCandle]:
        """
        Force completion of all current candles (useful for shutdown/batch processing).

        Every current bucket with at least one trade is converted to a
        candle marked complete, counted in the statistics, and passed to
        the callbacks. All buckets are then discarded.

        WARNING: This should only be used during shutdown or batch processing,
        not during live trading as it forces incomplete candles to be marked complete.

        Returns:
            The candles that were force-completed.
        """
        completed_candles = []
        for bucket in self.current_buckets.values():
            if bucket.trade_count > 0:
                candle = bucket.to_candle(is_complete=True)
                completed_candles.append(candle)
                # Keep statistics in step with the regular completion path.
                self.stats.candles_emitted += 1
                self.stats.last_candle_time = candle.end_time
                self._emit_candle(candle)

        # Clear buckets
        self.current_buckets.clear()
        return completed_candles

    def get_stats(self) -> Dict[str, Any]:
        """
        Get processing statistics.

        Returns:
            ``self.stats.to_dict()`` extended with a ``'current_buckets'``
            entry mapping each timeframe to its bucket's trade count.
        """
        stats_dict = self.stats.to_dict()
        stats_dict['current_buckets'] = {
            tf: bucket.trade_count for tf, bucket in self.current_buckets.items()
        }
        return stats_dict
class BatchCandleProcessor:
    """
    Batch candle processor for historical data processing.

    Each call to ``process_trades_to_candles`` feeds an iterator of trades
    through a temporary ``RealTimeCandleProcessor`` and collects the
    candles it produces for all configured timeframes.
    """

    def __init__(self,
                 symbol: str,
                 exchange: str,
                 timeframes: List[str],
                 component_name: str = "batch_candle_processor",
                 logger=None):
        """
        Initialize batch candle processor.

        Args:
            symbol: Trading symbol
            exchange: Exchange name
            timeframes: List of timeframes to process
            component_name: Name used as a prefix in log messages
            logger: Optional logger; when falsy, nothing is logged
        """
        self.symbol = symbol
        self.exchange = exchange
        self.timeframes = timeframes
        self.component_name = component_name
        self.logger = logger

        # Statistics
        self.stats = ProcessingStats(active_timeframes=len(timeframes))

        if self.logger:
            self.logger.info(f"{self.component_name}: Initialized batch candle processor for {symbol} on {exchange}")

    def process_trades_to_candles(self, trades: Iterator[StandardizedTrade]) -> List[OHLCVCandle]:
        """
        Process trade iterator to candles.

        Trades are consumed in iteration order. After the iterator is
        exhausted, any buckets still open are force-completed so the final
        (possibly partial) intervals are included in the result.

        Statistics accumulate across calls: ``trades_processed`` and
        ``candles_emitted`` are both incremented by this batch's counts,
        and errors counted by the temporary processor are added to
        ``errors_count``.

        Args:
            trades: Iterator of standardized trades

        Returns:
            List of candles produced from this batch, or an empty list if
            an exception was raised (which is counted in ``errors_count``).
        """
        try:
            # Create temporary processor for this batch
            config = CandleProcessingConfig(timeframes=self.timeframes, auto_save_candles=False)
            processor = RealTimeCandleProcessor(
                self.symbol, self.exchange, config,
                f"batch_processor_{self.symbol}_{self.exchange}"
            )

            all_candles = []
            batch_trade_count = 0

            # Process all trades
            for trade in trades:
                all_candles.extend(processor.process_trade(trade))
                self.stats.trades_processed += 1
                batch_trade_count += 1

            # Force complete any remaining candles
            all_candles.extend(processor.force_complete_all_candles())

            # Update stats (accumulated, like trades_processed)
            self.stats.candles_emitted += len(all_candles)
            if all_candles:
                self.stats.last_candle_time = max(candle.end_time for candle in all_candles)

            # The temporary processor swallows per-trade errors; surface them here.
            self.stats.errors_count += processor.stats.errors_count

            if self.logger:
                self.logger.info(f"{self.component_name}: Batch processed {batch_trade_count} trades to {len(all_candles)} candles")

            return all_candles
        except Exception as e:
            if self.logger:
                self.logger.error(f"{self.component_name}: Error in batch processing trades to candles: {e}")
            self.stats.errors_count += 1
            return []

    def get_stats(self) -> Dict[str, Any]:
        """Get processing statistics as returned by ``self.stats.to_dict()``."""
        return self.stats.to_dict()
# Utility functions for common aggregation operations
def aggregate_trades_to_candles(trades: List[StandardizedTrade],
                                timeframes: List[str],
                                symbol: str,
                                exchange: str) -> List[OHLCVCandle]:
    """
    Aggregate a list of trades into candles in one call.

    Convenience wrapper that builds a ``BatchCandleProcessor`` for the
    given symbol/exchange/timeframes and runs the trades through it.

    Args:
        trades: List of standardized trades
        timeframes: List of timeframes to generate
        symbol: Trading symbol
        exchange: Exchange name

    Returns:
        The candles returned by
        ``BatchCandleProcessor.process_trades_to_candles``.
    """
    trade_stream = iter(trades)
    return BatchCandleProcessor(symbol, exchange, timeframes).process_trades_to_candles(trade_stream)
def validate_timeframe(timeframe: str) -> bool:
    """
    Validate if timeframe is supported.

    The comparison is exact: no whitespace stripping or case folding is
    applied to ``timeframe``.

    Args:
        timeframe: Timeframe string (e.g., '1m', '5m', '1h')

    Returns:
        True if supported, False otherwise
    """
    supported = ('1m', '5m', '15m', '30m', '1h', '4h', '1d')
    return timeframe in supported
def parse_timeframe(timeframe: str) -> tuple[int, str]:
    """
    Parse timeframe string into number and unit.

    The string is lower-cased before matching, so the unit is always
    returned in lower case. The whole string must match; surrounding
    whitespace or a trailing newline is rejected.

    Args:
        timeframe: Timeframe string (e.g., '5m', '1h')

    Returns:
        Tuple of (number, unit) where unit is 'm', 'h' or 'd'

    Raises:
        ValueError: If the string is not ``<digits><m|h|d>`` or the
            number is zero.

    Examples:
        '5m' -> (5, 'm')
        '1h' -> (1, 'h')
        '1d' -> (1, 'd')
    """
    import re

    # fullmatch (rather than match with '$') so that '5m\n' is not accepted.
    match = re.fullmatch(r'(\d+)([mhd])', timeframe.lower())
    if not match:
        raise ValueError(f"Invalid timeframe format: {timeframe}")

    number = int(match.group(1))
    if number == 0:
        # A zero-length interval is not a usable timeframe.
        raise ValueError(f"Invalid timeframe format: {timeframe}")

    unit = match.group(2)
    return number, unit
# Public API of this module: the names exported by a star-import.
# Entries must match the defined names exactly (no surrounding whitespace).
__all__ = [
    'TimeframeBucket',
    'RealTimeCandleProcessor',
    'BatchCandleProcessor',
    'aggregate_trades_to_candles',
    'validate_timeframe',
    'parse_timeframe',
]