Add common data processing framework for OKX exchange
- Introduced a modular architecture for data processing, including common utilities for validation, transformation, and aggregation. - Implemented `StandardizedTrade`, `OHLCVCandle`, and `TimeframeBucket` classes for unified data handling across exchanges. - Developed `OKXDataProcessor` for OKX-specific data validation and processing, leveraging the new common framework. - Enhanced `OKXCollector` to utilize the common data processing utilities, improving modularity and maintainability. - Updated documentation to reflect the new architecture and provide guidance on the data processing framework. - Created comprehensive tests for the new data processing components to ensure reliability and functionality.
This commit is contained in:
52
data/common/__init__.py
Normal file
52
data/common/__init__.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
Common data processing utilities for all exchanges.
|
||||
|
||||
This package contains shared components for data validation, transformation,
|
||||
and aggregation that can be used across different exchange implementations.
|
||||
"""
|
||||
|
||||
from .data_types import (
|
||||
StandardizedTrade,
|
||||
OHLCVCandle,
|
||||
MarketDataPoint,
|
||||
DataValidationResult
|
||||
)
|
||||
|
||||
from .aggregation import (
|
||||
TimeframeBucket,
|
||||
RealTimeCandleProcessor,
|
||||
CandleProcessingConfig
|
||||
)
|
||||
|
||||
from .transformation import (
|
||||
BaseDataTransformer,
|
||||
UnifiedDataTransformer,
|
||||
create_standardized_trade
|
||||
)
|
||||
|
||||
from .validation import (
|
||||
BaseDataValidator,
|
||||
ValidationResult
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Data types
|
||||
'StandardizedTrade',
|
||||
'OHLCVCandle',
|
||||
'MarketDataPoint',
|
||||
'DataValidationResult',
|
||||
|
||||
# Aggregation
|
||||
'TimeframeBucket',
|
||||
'RealTimeCandleProcessor',
|
||||
'CandleProcessingConfig',
|
||||
|
||||
# Transformation
|
||||
'BaseDataTransformer',
|
||||
'UnifiedDataTransformer',
|
||||
'create_standardized_trade',
|
||||
|
||||
# Validation
|
||||
'BaseDataValidator',
|
||||
'ValidationResult'
|
||||
]
|
||||
553
data/common/aggregation.py
Normal file
553
data/common/aggregation.py
Normal file
@@ -0,0 +1,553 @@
|
||||
"""
|
||||
Common aggregation utilities for all exchanges.
|
||||
|
||||
This module provides shared functionality for building OHLCV candles
|
||||
from trade data, regardless of the source exchange.
|
||||
|
||||
AGGREGATION STRATEGY:
|
||||
- Uses RIGHT-ALIGNED timestamps (industry standard)
|
||||
- Candle timestamp = end time of the interval (close time)
|
||||
- 5-minute candle with timestamp 09:05:00 represents data from 09:00:01 to 09:05:00
|
||||
- Prevents future leakage by only completing candles when time boundary is crossed
|
||||
- Aligns with major exchanges (Binance, OKX, Coinbase)
|
||||
|
||||
PROCESS FLOW:
|
||||
1. Trade arrives with timestamp T
|
||||
2. Calculate which time bucket this trade belongs to
|
||||
3. If bucket doesn't exist or time boundary crossed, complete previous bucket
|
||||
4. Add trade to current bucket
|
||||
5. Only emit completed candles (never future data)
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from decimal import Decimal
|
||||
from typing import Dict, List, Optional, Any, Iterator, Callable
|
||||
from collections import defaultdict
|
||||
|
||||
from .data_types import (
|
||||
StandardizedTrade,
|
||||
OHLCVCandle,
|
||||
CandleProcessingConfig,
|
||||
ProcessingStats
|
||||
)
|
||||
from utils.logger import get_logger
|
||||
|
||||
|
||||
class TimeframeBucket:
    """
    Accumulates trades for one (symbol, timeframe) interval and derives OHLCV.

    Timestamps are RIGHT-ALIGNED (industry standard):
    - start_time is the inclusive left boundary of the interval
    - end_time is the exclusive right boundary and becomes the candle timestamp
    - e.g. the 09:00:00 - 09:05:00 bucket produces a candle stamped 09:05:00
    """

    def __init__(self, symbol: str, timeframe: str, start_time: datetime, exchange: str = "unknown"):
        """
        Create an empty bucket.

        Args:
            symbol: Trading symbol (e.g., 'BTC-USDT')
            timeframe: Interval label (e.g., '1m', '5m', '1h')
            start_time: Inclusive start of this bucket's interval
            exchange: Exchange name
        """
        self.symbol = symbol
        self.timeframe = timeframe
        self.start_time = start_time
        self.end_time = self._calculate_end_time(start_time, timeframe)
        self.exchange = exchange

        # OHLCV accumulators; open/high/low/close stay None until a trade arrives
        self.open: Optional[Decimal] = None
        self.high: Optional[Decimal] = None
        self.low: Optional[Decimal] = None
        self.close: Optional[Decimal] = None
        self.volume: Decimal = Decimal('0')
        self.trade_count: int = 0

        # Bookkeeping used for candle metadata and diagnostics
        self.first_trade_time: Optional[datetime] = None
        self.last_trade_time: Optional[datetime] = None
        self.trades: List[StandardizedTrade] = []

    def add_trade(self, trade: StandardizedTrade) -> bool:
        """
        Fold a trade into this bucket if its timestamp lies in the interval.

        Args:
            trade: Standardized trade data

        Returns:
            True when the trade was absorbed, False when its timestamp falls
            outside [start_time, end_time).
        """
        if not (self.start_time <= trade.timestamp < self.end_time):
            return False

        if self.open is None:
            # The very first trade seeds open/high/low and the first-trade marker.
            self.open = self.high = self.low = trade.price
            self.first_trade_time = trade.timestamp

        self.high = max(self.high, trade.price)
        self.low = min(self.low, trade.price)
        self.close = trade.price  # most recent trade defines the close
        self.volume += trade.size
        self.trade_count += 1
        self.last_trade_time = trade.timestamp

        # Kept so callers can inspect the raw trades behind a candle.
        self.trades.append(trade)

        return True

    def to_candle(self, is_complete: bool = True) -> OHLCVCandle:
        """
        Materialize this bucket as an OHLCVCandle.

        IMPORTANT: the candle timestamp is end_time (right-aligned, industry
        standard). Buckets with no trades serialize with zero prices.
        """
        zero = Decimal('0')
        return OHLCVCandle(
            symbol=self.symbol,
            timeframe=self.timeframe,
            start_time=self.start_time,
            end_time=self.end_time,
            open=self.open or zero,
            high=self.high or zero,
            low=self.low or zero,
            close=self.close or zero,
            volume=self.volume,
            trade_count=self.trade_count,
            exchange=self.exchange,
            is_complete=is_complete,
            first_trade_time=self.first_trade_time,
            last_trade_time=self.last_trade_time
        )

    def _calculate_end_time(self, start_time: datetime, timeframe: str) -> datetime:
        """Return the right-aligned end of the interval beginning at start_time."""
        durations = {
            '1m': timedelta(minutes=1),
            '5m': timedelta(minutes=5),
            '15m': timedelta(minutes=15),
            '30m': timedelta(minutes=30),
            '1h': timedelta(hours=1),
            '4h': timedelta(hours=4),
            '1d': timedelta(days=1),
        }
        try:
            return start_time + durations[timeframe]
        except KeyError:
            raise ValueError(f"Unsupported timeframe: {timeframe}") from None
|
||||
|
||||
|
||||
class RealTimeCandleProcessor:
    """
    Real-time candle processor for live trade data.

    Processes trades immediately as they arrive (e.g. from WebSocket),
    building candles incrementally per configured timeframe and emitting a
    candle only once its time boundary has been crossed.

    AGGREGATION PROCESS (NO FUTURE LEAKAGE):
    1. Trade arrives with timestamp T
    2. For each configured timeframe (1m, 5m, ...):
       a. Calculate which time bucket the trade belongs to
       b. If the current bucket's period differs, the boundary was crossed:
          complete and emit the previous bucket, then start a new one
       c. Add the trade to the current bucket (updates OHLCV)
    3. Candles are emitted only when a boundary is definitively crossed;
       incomplete/future candles are never emitted during live processing.

    TIMESTAMP ALIGNMENT:
    - Uses RIGHT-ALIGNED timestamps (industry standard)
    - 1-minute candle covering 09:00:00-09:01:00 gets timestamp 09:01:00
    - 5-minute candle covering 09:00:00-09:05:00 gets timestamp 09:05:00
    - A candle always represents PAST data, never future
    """

    def __init__(self,
                 symbol: str,
                 exchange: str,
                 config: Optional[CandleProcessingConfig] = None,
                 component_name: str = "realtime_candle_processor"):
        """
        Initialize real-time candle processor.

        Args:
            symbol: Trading symbol (e.g., 'BTC-USDT')
            exchange: Exchange name (e.g., 'okx', 'binance')
            config: Processing configuration (defaults to CandleProcessingConfig())
            component_name: Name for logging
        """
        self.symbol = symbol
        self.exchange = exchange
        self.config = config or CandleProcessingConfig()
        self.component_name = component_name
        self.logger = get_logger(self.component_name)

        # One in-progress bucket per timeframe, keyed by timeframe label
        self.current_buckets: Dict[str, TimeframeBucket] = {}

        # Subscribers notified of every completed candle
        self.candle_callbacks: List[Callable[[OHLCVCandle], None]] = []

        # Statistics
        self.stats = ProcessingStats(active_timeframes=len(self.config.timeframes))

        self.logger.info(f"Initialized real-time candle processor for {symbol} on {exchange} with timeframes: {self.config.timeframes}")

    def add_candle_callback(self, callback: Callable[[OHLCVCandle], None]) -> None:
        """Add callback function to receive completed candles."""
        self.candle_callbacks.append(callback)
        self.logger.debug(f"Added candle callback: {callback.__name__ if hasattr(callback, '__name__') else str(callback)}")

    def process_trade(self, trade: StandardizedTrade) -> List[OHLCVCandle]:
        """
        Process single trade - main entry point for real-time processing.

        Called for each trade as it arrives from the feed.

        CRITICAL: Only returns completed candles (time boundary crossed).
        Never returns incomplete/future candles, preventing leakage.

        Args:
            trade: Standardized trade data

        Returns:
            List of completed candles - one per timeframe whose boundary was
            crossed by this trade (usually empty).
        """
        try:
            completed_candles = []

            # Each timeframe advances independently
            for timeframe in self.config.timeframes:
                candle = self._process_trade_for_timeframe(trade, timeframe)
                if candle:
                    completed_candles.append(candle)

            # Update statistics
            self.stats.trades_processed += 1
            self.stats.last_trade_time = trade.timestamp

            # Notify subscribers of every candle we just completed
            for candle in completed_candles:
                self._emit_candle(candle)

            return completed_candles

        except Exception as e:
            self.logger.error(f"Error processing trade for {self.symbol}: {e}")
            self.stats.errors_count += 1
            return []

    def _process_trade_for_timeframe(self, trade: StandardizedTrade, timeframe: str) -> Optional[OHLCVCandle]:
        """
        Process trade for a specific timeframe.

        CRITICAL LOGIC FOR PREVENTING FUTURE LEAKAGE:
        1. Calculate which bucket this trade belongs to
        2. If no current bucket exists, start one
        3. If the bucket's period differs (time boundary crossed), complete
           the old bucket (only when it holds trades) and start a new one
        4. Add the trade to the current bucket
        5. Return the completed candle, if any - never an in-progress one
        """
        try:
            # Calculate which bucket this trade belongs to
            trade_bucket_start = self._get_bucket_start_time(trade.timestamp, timeframe)

            current_bucket = self.current_buckets.get(timeframe)
            completed_candle = None

            if current_bucket is None:
                # First bucket for this timeframe
                current_bucket = TimeframeBucket(self.symbol, timeframe, trade_bucket_start, self.exchange)
                self.current_buckets[timeframe] = current_bucket
            elif current_bucket.start_time != trade_bucket_start:
                # Time boundary crossed - complete previous bucket
                if current_bucket.trade_count > 0:  # Only complete if it has trades
                    completed_candle = current_bucket.to_candle(is_complete=True)
                    self.stats.candles_emitted += 1
                    self.stats.last_candle_time = completed_candle.end_time

                # Create new bucket for current time period
                current_bucket = TimeframeBucket(self.symbol, timeframe, trade_bucket_start, self.exchange)
                self.current_buckets[timeframe] = current_bucket

            # Add trade to current bucket
            if not current_bucket.add_trade(trade):
                # Should be unreachable: the bucket was derived from this trade's timestamp
                self.logger.warning(f"Trade {trade.timestamp} could not be added to bucket {current_bucket.start_time}-{current_bucket.end_time}")

            return completed_candle

        except Exception as e:
            self.logger.error(f"Error processing trade for timeframe {timeframe}: {e}")
            self.stats.errors_count += 1
            return None

    def _get_bucket_start_time(self, timestamp: datetime, timeframe: str) -> datetime:
        """
        Calculate the bucket start (left boundary) for a timestamp/timeframe.

        EXAMPLES:
        - Trade at 09:03:45 for 5m timeframe -> bucket start = 09:00:00
        - Trade at 09:07:23 for 5m timeframe -> bucket start = 09:05:00
        - Trade at 14:00:00 for 1h timeframe -> bucket start = 14:00:00

        Args:
            timestamp: Trade timestamp
            timeframe: Target timeframe

        Returns:
            Bucket start time (left boundary)

        Raises:
            ValueError: for unsupported timeframes
        """
        # Truncate seconds/microseconds for clean minute-level boundaries
        dt = timestamp.replace(second=0, microsecond=0)

        if timeframe == '1m':
            # 1-minute buckets align to minute boundaries
            return dt
        elif timeframe == '5m':
            # 5-minute buckets: 00:00, 00:05, 00:10, etc.
            return dt.replace(minute=(dt.minute // 5) * 5)
        elif timeframe == '15m':
            # 15-minute buckets: 00:00, 00:15, 00:30, 00:45
            return dt.replace(minute=(dt.minute // 15) * 15)
        elif timeframe == '30m':
            # 30-minute buckets: 00:00, 00:30
            return dt.replace(minute=(dt.minute // 30) * 30)
        elif timeframe == '1h':
            # 1-hour buckets align to hour boundaries
            return dt.replace(minute=0)
        elif timeframe == '4h':
            # 4-hour buckets: 00:00, 04:00, 08:00, 12:00, 16:00, 20:00
            return dt.replace(minute=0, hour=(dt.hour // 4) * 4)
        elif timeframe == '1d':
            # 1-day buckets align to day boundaries (midnight)
            return dt.replace(minute=0, hour=0)
        else:
            raise ValueError(f"Unsupported timeframe: {timeframe}")

    def _emit_candle(self, candle: OHLCVCandle) -> None:
        """
        Emit completed candle to all callbacks.

        FIX: each callback is guarded individually so that one failing
        subscriber no longer prevents the remaining callbacks from receiving
        the candle (previously a single try wrapped the whole loop, aborting
        delivery on the first error).
        """
        for callback in self.candle_callbacks:
            try:
                callback(candle)
            except Exception as e:
                self.logger.error(f"Error in candle callback: {e}")
                self.stats.errors_count += 1

    def get_current_candles(self, incomplete: bool = True) -> List[OHLCVCandle]:
        """
        Get current incomplete candles for all timeframes.

        WARNING: These are incomplete candles and should NOT be used for
        trading decisions. They are useful for monitoring/debugging only.

        NOTE: `incomplete` is currently unused and kept only for backward
        compatibility; every returned candle is marked is_complete=False.
        """
        candles = []
        for bucket in self.current_buckets.values():
            if bucket.trade_count > 0:  # Only return buckets with trades
                candles.append(bucket.to_candle(is_complete=False))
        return candles

    def force_complete_all_candles(self) -> List[OHLCVCandle]:
        """
        Force completion of all current candles (useful for shutdown/batch processing).

        WARNING: This should only be used during shutdown or batch processing,
        not during live trading, as it forces incomplete candles to be marked
        complete.
        """
        completed_candles = []
        for bucket in self.current_buckets.values():
            if bucket.trade_count > 0:
                candle = bucket.to_candle(is_complete=True)
                completed_candles.append(candle)
                self._emit_candle(candle)

        # Clear buckets
        self.current_buckets.clear()
        return completed_candles

    def get_stats(self) -> Dict[str, Any]:
        """Get processing statistics, plus per-timeframe open-bucket trade counts."""
        stats_dict = self.stats.to_dict()
        stats_dict['current_buckets'] = {
            tf: bucket.trade_count for tf, bucket in self.current_buckets.items()
        }
        return stats_dict
|
||||
|
||||
|
||||
class BatchCandleProcessor:
    """
    Batch candle processor for historical data processing.

    Consumes large batches of historical trades and builds candles for
    multiple timeframes in a single pass.
    """

    def __init__(self,
                 symbol: str,
                 exchange: str,
                 timeframes: List[str],
                 component_name: str = "batch_candle_processor"):
        """
        Initialize batch candle processor.

        Args:
            symbol: Trading symbol
            exchange: Exchange name
            timeframes: List of timeframes to process
            component_name: Name for logging
        """
        self.symbol = symbol
        self.exchange = exchange
        self.timeframes = timeframes
        self.component_name = component_name
        self.logger = get_logger(self.component_name)

        # Statistics
        self.stats = ProcessingStats(active_timeframes=len(timeframes))

        self.logger.info(f"Initialized batch candle processor for {symbol} on {exchange}")

    def process_trades_to_candles(self, trades: Iterator[StandardizedTrade]) -> List[OHLCVCandle]:
        """
        Drain a trade iterator into completed candles.

        Handles every scenario the same way:
        - Historical: batch trade iterators
        - Backfill: API trade iterators
        - Real-time batch: multiple trades at once

        Args:
            trades: Iterator of standardized trades

        Returns:
            List of completed candles
        """
        try:
            # Delegate the actual bucketing to a throwaway real-time processor.
            batch_config = CandleProcessingConfig(timeframes=self.timeframes, auto_save_candles=False)
            worker = RealTimeCandleProcessor(
                self.symbol, self.exchange, batch_config,
                f"batch_processor_{self.symbol}_{self.exchange}"
            )

            all_candles: List[OHLCVCandle] = []
            for trade in trades:
                all_candles.extend(worker.process_trade(trade))
                self.stats.trades_processed += 1

            # Flush whatever buckets are still open at end-of-batch.
            all_candles.extend(worker.force_complete_all_candles())

            self.stats.candles_emitted = len(all_candles)
            if all_candles:
                self.stats.last_candle_time = max(candle.end_time for candle in all_candles)

            self.logger.info(f"Batch processed {self.stats.trades_processed} trades to {len(all_candles)} candles")
            return all_candles

        except Exception as e:
            self.logger.error(f"Error in batch processing trades to candles: {e}")
            self.stats.errors_count += 1
            return []

    def get_stats(self) -> Dict[str, Any]:
        """Get processing statistics."""
        return self.stats.to_dict()
|
||||
|
||||
|
||||
# Utility functions for common aggregation operations
|
||||
|
||||
def aggregate_trades_to_candles(trades: List[StandardizedTrade],
                                timeframes: List[str],
                                symbol: str,
                                exchange: str) -> List[OHLCVCandle]:
    """
    One-shot helper: aggregate a list of trades into completed candles.

    Args:
        trades: List of standardized trades
        timeframes: List of timeframes to generate
        symbol: Trading symbol
        exchange: Exchange name

    Returns:
        List of completed candles
    """
    # A throwaway batch processor does all the work.
    return BatchCandleProcessor(symbol, exchange, timeframes).process_trades_to_candles(iter(trades))
|
||||
|
||||
|
||||
def validate_timeframe(timeframe: str) -> bool:
    """
    Check whether a timeframe label is supported.

    Args:
        timeframe: Timeframe string (e.g., '1m', '5m', '1h')

    Returns:
        True if supported, False otherwise
    """
    return timeframe in ('1m', '5m', '15m', '30m', '1h', '4h', '1d')
|
||||
|
||||
|
||||
def parse_timeframe(timeframe: str) -> tuple[int, str]:
    """
    Split a timeframe label into its numeric count and unit letter.

    Args:
        timeframe: Timeframe string (e.g., '5m', '1h')

    Returns:
        Tuple of (number, unit)

    Raises:
        ValueError: when the label is not <digits><m|h|d>

    Examples:
        '5m' -> (5, 'm')
        '1h' -> (1, 'h')
        '1d' -> (1, 'd')
    """
    import re
    if (parsed := re.match(r'^(\d+)([mhd])$', timeframe.lower())) is None:
        raise ValueError(f"Invalid timeframe format: {timeframe}")
    return int(parsed.group(1)), parsed.group(2)
|
||||
|
||||
|
||||
__all__ = [
|
||||
'TimeframeBucket',
|
||||
'RealTimeCandleProcessor',
|
||||
'BatchCandleProcessor',
|
||||
'aggregate_trades_to_candles',
|
||||
'validate_timeframe',
|
||||
'parse_timeframe'
|
||||
]
|
||||
182
data/common/data_types.py
Normal file
182
data/common/data_types.py
Normal file
@@ -0,0 +1,182 @@
|
||||
"""
|
||||
Common data types for all exchange implementations.
|
||||
|
||||
These data structures provide a unified interface for market data
|
||||
regardless of the source exchange.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from decimal import Decimal
|
||||
from typing import Dict, List, Optional, Any
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
|
||||
from ..base_collector import DataType, MarketDataPoint # Import from base
|
||||
|
||||
|
||||
@dataclass
class DataValidationResult:
    """Result of data validation - common across all exchanges."""
    # True when the raw payload passed all validation checks.
    is_valid: bool
    # Hard failures that make the payload unusable.
    errors: List[str]
    # Non-fatal issues worth surfacing but not rejecting over.
    warnings: List[str]
    # Cleaned-up copy of the payload, when the validator produced one.
    sanitized_data: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
@dataclass
class StandardizedTrade:
    """
    Standardized trade format for unified processing across all exchanges.

    Works for both real-time and historical processing so every data source
    feeds the same downstream pipeline.
    """
    symbol: str
    trade_id: str
    price: Decimal
    size: Decimal
    side: str  # 'buy' or 'sell'
    timestamp: datetime
    exchange: str
    raw_data: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        """Normalize timestamp/side and reject malformed sides."""
        # Attach UTC to naive timestamps so aware/naive comparisons never mix.
        if self.timestamp.tzinfo is None:
            self.timestamp = self.timestamp.replace(tzinfo=timezone.utc)

        # Canonical lowercase side; anything but buy/sell is an error.
        self.side = self.side.lower()
        if self.side not in ('buy', 'sell'):
            raise ValueError(f"Invalid trade side: {self.side}")
|
||||
|
||||
|
||||
@dataclass
class OHLCVCandle:
    """
    OHLCV candle for one timeframe interval.

    Built by aggregating the trades that fell inside [start_time, end_time).
    """
    symbol: str
    timeframe: str
    start_time: datetime
    end_time: datetime
    open: Decimal
    high: Decimal
    low: Decimal
    close: Decimal
    volume: Decimal
    trade_count: int
    exchange: str = "unknown"
    is_complete: bool = False
    first_trade_time: Optional[datetime] = None
    last_trade_time: Optional[datetime] = None

    def __post_init__(self):
        """Normalize timestamps to UTC-aware and sanity-check OHLCV values."""
        # Naive timestamps are assumed UTC and made explicit.
        if self.start_time.tzinfo is None:
            self.start_time = self.start_time.replace(tzinfo=timezone.utc)
        if self.end_time.tzinfo is None:
            self.end_time = self.end_time.replace(tzinfo=timezone.utc)

        # OHLC sanity checks
        if self.high < self.low:
            raise ValueError("High price cannot be less than low price")
        if min(self.open, self.high, self.low, self.close) < 0:
            raise ValueError("Prices cannot be negative")
        if self.volume < 0:
            raise ValueError("Volume cannot be negative")
        if self.trade_count < 0:
            raise ValueError("Trade count cannot be negative")

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for storage: Decimals become str, datetimes ISO strings."""
        def iso_or_none(dt: Optional[datetime]) -> Optional[str]:
            return dt.isoformat() if dt else None

        return {
            'symbol': self.symbol,
            'timeframe': self.timeframe,
            'start_time': self.start_time.isoformat(),
            'end_time': self.end_time.isoformat(),
            'open': str(self.open),
            'high': str(self.high),
            'low': str(self.low),
            'close': str(self.close),
            'volume': str(self.volume),
            'trade_count': self.trade_count,
            'exchange': self.exchange,
            'is_complete': self.is_complete,
            'first_trade_time': iso_or_none(self.first_trade_time),
            'last_trade_time': iso_or_none(self.last_trade_time),
        }
|
||||
|
||||
|
||||
@dataclass
class CandleProcessingConfig:
    """Configuration for candle processing - shared across exchanges."""
    timeframes: List[str] = field(default_factory=lambda: ['1m', '5m', '15m', '1h'])
    auto_save_candles: bool = True
    emit_incomplete_candles: bool = False
    max_trades_per_candle: int = 100000  # Safety limit

    def __post_init__(self):
        """Reject any configured timeframe outside the supported set."""
        allowed = {'1m', '5m', '15m', '30m', '1h', '4h', '1d'}
        bad = next((tf for tf in self.timeframes if tf not in allowed), None)
        if bad is not None:
            raise ValueError(f"Unsupported timeframe: {bad}")
|
||||
|
||||
|
||||
class TradeSide(Enum):
    """Standardized trade side enumeration."""
    # Values match the lowercase side strings StandardizedTrade accepts.
    BUY = "buy"
    SELL = "sell"
|
||||
|
||||
|
||||
class TimeframeUnit(Enum):
    """Time units for candle timeframes."""
    # Single-letter suffixes as used in timeframe labels like '5m', '1h', '1d'.
    MINUTE = "m"
    HOUR = "h"
    DAY = "d"
|
||||
|
||||
|
||||
@dataclass
class ProcessingStats:
    """Common processing statistics structure."""
    trades_processed: int = 0
    candles_emitted: int = 0
    errors_count: int = 0
    warnings_count: int = 0
    last_trade_time: Optional[datetime] = None
    last_candle_time: Optional[datetime] = None
    active_timeframes: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Render stats as a plain dict; datetimes become ISO strings or None."""
        def iso(dt: Optional[datetime]) -> Optional[str]:
            return dt.isoformat() if dt else None

        return {
            'trades_processed': self.trades_processed,
            'candles_emitted': self.candles_emitted,
            'errors_count': self.errors_count,
            'warnings_count': self.warnings_count,
            'last_trade_time': iso(self.last_trade_time),
            'last_candle_time': iso(self.last_candle_time),
            'active_timeframes': self.active_timeframes,
        }
|
||||
|
||||
|
||||
# Re-export from base_collector for convenience
|
||||
__all__ = [
|
||||
'DataType',
|
||||
'MarketDataPoint',
|
||||
'DataValidationResult',
|
||||
'StandardizedTrade',
|
||||
'OHLCVCandle',
|
||||
'CandleProcessingConfig',
|
||||
'TradeSide',
|
||||
'TimeframeUnit',
|
||||
'ProcessingStats'
|
||||
]
|
||||
471
data/common/transformation.py
Normal file
471
data/common/transformation.py
Normal file
@@ -0,0 +1,471 @@
|
||||
"""
|
||||
Base transformation utilities for all exchanges.
|
||||
|
||||
This module provides common transformation patterns and base classes
|
||||
for converting exchange-specific data to standardized formats.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from decimal import Decimal
|
||||
from typing import Dict, List, Optional, Any, Iterator
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from .data_types import StandardizedTrade, OHLCVCandle, DataValidationResult
|
||||
from .aggregation import BatchCandleProcessor
|
||||
from utils.logger import get_logger
|
||||
|
||||
|
||||
class BaseDataTransformer(ABC):
|
||||
"""
|
||||
Abstract base class for exchange data transformers.
|
||||
|
||||
This class provides common transformation patterns that can be
|
||||
extended by exchange-specific implementations.
|
||||
"""
|
||||
|
||||
    def __init__(self,
                 exchange_name: str,
                 component_name: str = "base_data_transformer"):
        """
        Initialize base data transformer.

        Args:
            exchange_name: Name of the exchange (e.g., 'okx', 'binance')
            component_name: Name for logging
        """
        self.exchange_name = exchange_name
        self.component_name = component_name
        # Logger is keyed by the configurable component_name.
        self.logger = get_logger(self.component_name)

        self.logger.info(f"Initialized base data transformer for {exchange_name}")
|
||||
|
||||
# Abstract methods that must be implemented by subclasses
|
||||
|
||||
    @abstractmethod
    def transform_trade_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[StandardizedTrade]:
        """Transform exchange-specific trade data to standardized format.

        Per the Optional return annotation, implementations may return None
        for payloads they cannot transform.
        """
        pass
|
||||
|
||||
    @abstractmethod
    def transform_orderbook_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[Dict[str, Any]]:
        """Transform exchange-specific orderbook data to standardized format.

        Per the Optional return annotation, implementations may return None
        for payloads they cannot transform.
        """
        pass
|
||||
|
||||
    @abstractmethod
    def transform_ticker_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[Dict[str, Any]]:
        """Transform exchange-specific ticker data to standardized format.

        Per the Optional return annotation, implementations may return None
        for payloads they cannot transform.
        """
        pass
|
||||
|
||||
# Common transformation utilities available to all subclasses
|
||||
|
||||
def timestamp_to_datetime(self, timestamp: Any, is_milliseconds: bool = True) -> datetime:
|
||||
"""
|
||||
Convert various timestamp formats to timezone-aware datetime.
|
||||
|
||||
Args:
|
||||
timestamp: Timestamp in various formats
|
||||
is_milliseconds: True if timestamp is in milliseconds
|
||||
|
||||
Returns:
|
||||
Timezone-aware datetime object
|
||||
"""
|
||||
try:
|
||||
# Convert to int/float
|
||||
if isinstance(timestamp, str):
|
||||
timestamp_num = float(timestamp)
|
||||
elif isinstance(timestamp, (int, float)):
|
||||
timestamp_num = float(timestamp)
|
||||
else:
|
||||
raise ValueError(f"Invalid timestamp type: {type(timestamp)}")
|
||||
|
||||
# Convert to seconds if needed
|
||||
if is_milliseconds:
|
||||
timestamp_num = timestamp_num / 1000
|
||||
|
||||
# Create timezone-aware datetime
|
||||
dt = datetime.fromtimestamp(timestamp_num, tz=timezone.utc)
|
||||
return dt
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error converting timestamp {timestamp}: {e}")
|
||||
# Return current time as fallback
|
||||
return datetime.now(timezone.utc)
|
||||
|
||||
def safe_decimal_conversion(self, value: Any, field_name: str = "value") -> Optional[Decimal]:
    """
    Convert an arbitrary value to Decimal, returning None on failure.

    Args:
        value: Value to convert (None and "" are treated as "no value")
        field_name: Field name used in the warning log on failure

    Returns:
        Decimal value, or None if the input is empty or unconvertible
    """
    try:
        # The empty-value check stays inside the try so that exotic
        # __eq__ implementations raising on `value == ""` are also caught.
        return None if value is None or value == "" else Decimal(str(value))
    except Exception as e:
        self.logger.warning(f"Failed to convert {field_name} '{value}' to Decimal: {e}")
        return None
|
||||
|
||||
def normalize_trade_side(self, side: str) -> str:
    """
    Normalize a raw trade-side string to 'buy' or 'sell'.

    Recognizes common aliases ('bid'/'b'/'1' -> buy, 'ask'/'s'/'0' -> sell).
    Anything unrecognized is logged and treated as 'buy'.

    Args:
        side: Raw trade side string

    Returns:
        Normalized side ('buy' or 'sell')
    """
    key = side.strip().lower()
    if key in ('sell', 'ask', 's', '0'):
        return 'sell'
    if key in ('buy', 'bid', 'b', '1'):
        return 'buy'
    self.logger.warning(f"Unknown trade side: {side}, defaulting to 'buy'")
    return 'buy'
|
||||
|
||||
def validate_symbol_format(self, symbol: str) -> str:
    """
    Validate and normalize a symbol string (uppercase, trimmed).

    Args:
        symbol: Raw symbol string

    Returns:
        Normalized symbol string

    Raises:
        ValueError: If the symbol is missing, not a string, or
            whitespace-only after normalization
    """
    if not isinstance(symbol, str) or not symbol:
        raise ValueError(f"Invalid symbol: {symbol}")

    cleaned = symbol.strip().upper()
    if not cleaned:
        raise ValueError("Empty symbol after normalization")
    return cleaned
|
||||
|
||||
def transform_database_record(self, record: Any) -> Optional[StandardizedTrade]:
    """
    Transform database record to standardized format.

    Default implementation only logs a warning and returns None; subclasses
    that support historical reprocessing should override this to handle
    their specific database schema.

    Args:
        record: Database record

    Returns:
        StandardizedTrade or None if transformation failed (always None
        in this base implementation)
    """
    self.logger.warning("transform_database_record not implemented for this exchange")
    return None
|
||||
|
||||
def get_transformer_info(self) -> Dict[str, Any]:
    """Describe this transformer: exchange, component, and capabilities."""
    capabilities = {
        'trade_transformation': True,
        'orderbook_transformation': True,
        'ticker_transformation': True,
        'database_transformation': hasattr(self, 'transform_database_record')
    }
    return {
        'exchange': self.exchange_name,
        'component': self.component_name,
        'capabilities': capabilities
    }
|
||||
|
||||
|
||||
class UnifiedDataTransformer:
    """
    Unified data transformation system for all scenarios.

    Wraps an exchange-specific BaseDataTransformer and exposes a single
    interface for turning raw data from any source (real-time, historical,
    backfill) into standardized formats.

    Pipeline: raw input -> validation/transformation (exchange-specific)
    -> StandardizedTrade -> optional candle aggregation -> consistent output.
    """

    def __init__(self,
                 exchange_transformer: BaseDataTransformer,
                 component_name: str = "unified_data_transformer"):
        """
        Initialize unified data transformer.

        Args:
            exchange_transformer: Exchange-specific transformer instance
            component_name: Name for logging
        """
        self.exchange_transformer = exchange_transformer
        self.component_name = component_name
        self.logger = get_logger(self.component_name)
        self.logger.info(f"Initialized unified data transformer with {exchange_transformer.exchange_name} transformer")

    def transform_trade_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[StandardizedTrade]:
        """
        Transform trade data via the exchange-specific transformer.

        Args:
            raw_data: Raw trade data from exchange
            symbol: Trading symbol

        Returns:
            StandardizedTrade, or None if transformation failed
        """
        try:
            return self.exchange_transformer.transform_trade_data(raw_data, symbol)
        except Exception as e:
            self.logger.error(f"Error in trade transformation: {e}")
            return None

    def transform_orderbook_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[Dict[str, Any]]:
        """
        Transform orderbook data via the exchange-specific transformer.

        Args:
            raw_data: Raw orderbook data from exchange
            symbol: Trading symbol

        Returns:
            Standardized orderbook dict, or None if transformation failed
        """
        try:
            return self.exchange_transformer.transform_orderbook_data(raw_data, symbol)
        except Exception as e:
            self.logger.error(f"Error in orderbook transformation: {e}")
            return None

    def transform_ticker_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[Dict[str, Any]]:
        """
        Transform ticker data via the exchange-specific transformer.

        Args:
            raw_data: Raw ticker data from exchange
            symbol: Trading symbol

        Returns:
            Standardized ticker dict, or None if transformation failed
        """
        try:
            return self.exchange_transformer.transform_ticker_data(raw_data, symbol)
        except Exception as e:
            self.logger.error(f"Error in ticker transformation: {e}")
            return None

    def process_trades_to_candles(self,
                                  trades: Iterator[StandardizedTrade],
                                  timeframes: List[str],
                                  symbol: str) -> List[OHLCVCandle]:
        """
        Aggregate any iterator of standardized trades into candles.

        Handles every scenario through one batch processor: real-time
        single-trade iterators, historical batch iterators, and backfill
        API iterators.

        Args:
            trades: Iterator of standardized trades
            timeframes: List of timeframes to generate
            symbol: Trading symbol

        Returns:
            List of completed candles (empty list on error)
        """
        try:
            batch_processor = BatchCandleProcessor(
                symbol,
                self.exchange_transformer.exchange_name,
                timeframes,
                f"unified_batch_processor_{symbol}"
            )
            candles = batch_processor.process_trades_to_candles(trades)
            self.logger.info(f"Processed {batch_processor.get_stats()['trades_processed']} trades to {len(candles)} candles")
            return candles
        except Exception as e:
            self.logger.error(f"Error processing trades to candles: {e}")
            return []

    def batch_transform_trades(self,
                               raw_trades: List[Dict[str, Any]],
                               symbol: str) -> List[StandardizedTrade]:
        """
        Transform multiple raw trade records, skipping failures.

        Args:
            raw_trades: List of raw trade data
            symbol: Trading symbol

        Returns:
            List of successfully transformed trades
        """
        transformed_trades: List[StandardizedTrade] = []
        errors = 0

        for raw_trade in raw_trades:
            try:
                trade = self.transform_trade_data(raw_trade, symbol)
            except Exception as e:
                self.logger.error(f"Error transforming trade: {e}")
                errors += 1
                continue
            if trade:
                transformed_trades.append(trade)
            else:
                errors += 1

        self.logger.info(f"Batch transformed {len(transformed_trades)} trades successfully, {errors} errors")
        return transformed_trades

    def get_transformer_info(self) -> Dict[str, Any]:
        """Get comprehensive transformer information."""
        info = self.exchange_transformer.get_transformer_info()
        info.update({
            'unified_component': self.component_name,
            'batch_processing': True,
            'candle_aggregation': True
        })
        return info
|
||||
|
||||
|
||||
# Utility functions for common transformation patterns
|
||||
|
||||
def create_standardized_trade(symbol: str,
                              trade_id: str,
                              price: Any,
                              size: Any,
                              side: str,
                              timestamp: Any,
                              exchange: str,
                              raw_data: Optional[Dict[str, Any]] = None,
                              is_milliseconds: bool = True) -> StandardizedTrade:
    """
    Build a validated StandardizedTrade from loose inputs.

    Args:
        symbol: Trading symbol
        trade_id: Trade identifier
        price: Trade price (any numeric type)
        size: Trade size (any numeric type)
        side: Trade side ('buy' or 'sell', case-insensitive)
        timestamp: Trade timestamp (epoch number/string, or datetime)
        exchange: Exchange name
        raw_data: Original raw data
        is_milliseconds: True if a numeric timestamp is in milliseconds

    Returns:
        StandardizedTrade object

    Raises:
        ValueError: If any field is invalid
    """
    # Timestamp: accept an aware/naive datetime directly, or an epoch value.
    if isinstance(timestamp, datetime):
        dt = timestamp if timestamp.tzinfo is not None else timestamp.replace(tzinfo=timezone.utc)
    elif isinstance(timestamp, (int, float, str)):
        epoch_seconds = float(timestamp)
        if is_milliseconds:
            epoch_seconds /= 1000
        dt = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
    else:
        raise ValueError(f"Invalid timestamp type: {type(timestamp)}")

    # Price and size must both be representable as Decimal.
    try:
        decimal_price = Decimal(str(price))
        decimal_size = Decimal(str(size))
    except Exception as e:
        raise ValueError(f"Invalid price or size: {e}")

    # Side must normalize to exactly 'buy' or 'sell'.
    normalized_side = side.lower().strip()
    if normalized_side not in ('buy', 'sell'):
        raise ValueError(f"Invalid trade side: {side}")

    return StandardizedTrade(
        symbol=symbol.upper().strip(),
        trade_id=str(trade_id),
        price=decimal_price,
        size=decimal_size,
        side=normalized_side,
        timestamp=dt,
        exchange=exchange.lower(),
        raw_data=raw_data
    )
|
||||
|
||||
|
||||
def batch_create_standardized_trades(raw_trades: List[Dict[str, Any]],
                                     symbol: str,
                                     exchange: str,
                                     field_mapping: Dict[str, str],
                                     is_milliseconds: bool = True) -> List[StandardizedTrade]:
    """
    Batch create standardized trades from raw data, skipping bad records.

    Args:
        raw_trades: List of raw trade dictionaries
        symbol: Trading symbol
        exchange: Exchange name
        field_mapping: Mapping of StandardizedTrade fields to raw data
            fields, e.g.::

                {
                    'trade_id': 'id',
                    'price': 'px',
                    'size': 'sz',
                    'side': 'side',
                    'timestamp': 'ts'
                }

        is_milliseconds: True if timestamps are in milliseconds

    Returns:
        List of successfully created StandardizedTrade objects
    """
    trades: List[StandardizedTrade] = []

    for raw_trade in raw_trades:
        try:
            trades.append(create_standardized_trade(
                symbol=symbol,
                trade_id=raw_trade[field_mapping['trade_id']],
                price=raw_trade[field_mapping['price']],
                size=raw_trade[field_mapping['size']],
                side=raw_trade[field_mapping['side']],
                timestamp=raw_trade[field_mapping['timestamp']],
                exchange=exchange,
                raw_data=raw_trade,
                is_milliseconds=is_milliseconds
            ))
        except Exception as e:
            # Log and skip so one bad record does not stop the batch
            get_logger("batch_transform").warning(f"Failed to transform trade: {e}")

    return trades
|
||||
|
||||
|
||||
# Public API of this module.
__all__ = [
    'BaseDataTransformer',
    'UnifiedDataTransformer',
    'create_standardized_trade',
    'batch_create_standardized_trades'
]
|
||||
484
data/common/validation.py
Normal file
484
data/common/validation.py
Normal file
@@ -0,0 +1,484 @@
|
||||
"""
|
||||
Base validation utilities for all exchanges.
|
||||
|
||||
This module provides common validation patterns and base classes
|
||||
that can be extended by exchange-specific validators.
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import Dict, List, Optional, Any, Union, Pattern
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from .data_types import DataValidationResult, StandardizedTrade, TradeSide
|
||||
from utils.logger import get_logger
|
||||
|
||||
|
||||
class ValidationResult:
    """Lightweight result of validating a single field.

    Attributes are plain and mutable:
    - is_valid: True when no errors were found
    - errors: messages that make the value unacceptable
    - warnings: non-fatal observations about the value
    - sanitized_data: cleaned/converted value, when validation produced one
    """

    def __init__(self, is_valid: bool, errors: List[str] = None, warnings: List[str] = None, sanitized_data: Any = None):
        self.is_valid = is_valid
        self.sanitized_data = sanitized_data
        # `or []` gives each instance its own fresh list when no (or an
        # empty) list is supplied — matches the original semantics exactly.
        self.errors = errors or []
        self.warnings = warnings or []
|
||||
|
||||
|
||||
class BaseDataValidator(ABC):
    """
    Abstract base class for exchange data validators.

    Provides common, exchange-agnostic validation for prices, sizes,
    volumes, trade sides, timestamps, trade IDs, and orderbook levels.
    Exchange-specific checks (symbol format, WebSocket message layout)
    are abstract and must be supplied by subclasses.
    """

    def __init__(self,
                 exchange_name: str,
                 component_name: str = "base_data_validator"):
        """
        Initialize base data validator.

        Args:
            exchange_name: Name of the exchange (e.g., 'okx', 'binance')
            component_name: Name for logging
        """
        self.exchange_name = exchange_name
        self.component_name = component_name
        self.logger = get_logger(self.component_name)

        # Common validation patterns
        self._numeric_pattern = re.compile(r'^-?\d*\.?\d+$')
        self._trade_id_pattern = re.compile(r'^[a-zA-Z0-9_-]+$')  # Flexible pattern

        # Valid trade sides
        self._valid_trade_sides = {'buy', 'sell'}

        # Common price and size limits (can be overridden by subclasses)
        self._min_price = Decimal('0.00000001')  # 1 satoshi equivalent
        self._max_price = Decimal('10000000')  # 10 million
        self._min_size = Decimal('0.00000001')  # Minimum trade size
        self._max_size = Decimal('1000000000')  # 1 billion max size

        # Timestamp validation bounds (milliseconds since epoch)
        self._min_timestamp = 1000000000000  # 2001-09-09 (reasonable minimum)
        self._max_timestamp = 9999999999999  # 2286-11-20 (reasonable maximum)

        self.logger.debug(f"Initialized base data validator for {exchange_name}")

    # Abstract methods that must be implemented by subclasses

    @abstractmethod
    def validate_symbol_format(self, symbol: str) -> ValidationResult:
        """Validate exchange-specific symbol format."""
        pass

    @abstractmethod
    def validate_websocket_message(self, message: Dict[str, Any]) -> DataValidationResult:
        """Validate complete WebSocket message structure."""
        pass

    # Common validation methods available to all subclasses

    def validate_price(self, price: Union[str, int, float, Decimal]) -> ValidationResult:
        """
        Validate price value with common rules.

        Non-positive or unparseable prices produce errors; out-of-bounds
        or over-precise prices produce warnings only.

        Args:
            price: Price value to validate

        Returns:
            ValidationResult with sanitized decimal price
        """
        errors = []
        warnings = []
        sanitized_data = None

        try:
            # Convert to Decimal for precise validation
            if isinstance(price, str) and price.strip() == "":
                errors.append("Empty price string")
                return ValidationResult(False, errors, warnings)

            decimal_price = Decimal(str(price))
            sanitized_data = decimal_price

            # Check for negative prices
            if decimal_price <= 0:
                errors.append(f"Price must be positive, got {decimal_price}")

            # Check price bounds (heuristic limits -> warnings only)
            if decimal_price < self._min_price:
                warnings.append(f"Price {decimal_price} below minimum {self._min_price}")
            elif decimal_price > self._max_price:
                warnings.append(f"Price {decimal_price} above maximum {self._max_price}")

            # Check for excessive decimal places (warn only)
            if decimal_price.as_tuple().exponent < -12:
                warnings.append(f"Price has excessive decimal precision: {decimal_price}")

        except (InvalidOperation, ValueError, TypeError) as e:
            errors.append(f"Invalid price value: {price} - {str(e)}")

        return ValidationResult(len(errors) == 0, errors, warnings, sanitized_data)

    def validate_size(self, size: Union[str, int, float, Decimal]) -> ValidationResult:
        """
        Validate size/quantity value with common rules.

        Args:
            size: Size value to validate

        Returns:
            ValidationResult with sanitized decimal size
        """
        errors = []
        warnings = []
        sanitized_data = None

        try:
            # Convert to Decimal for precise validation
            if isinstance(size, str) and size.strip() == "":
                errors.append("Empty size string")
                return ValidationResult(False, errors, warnings)

            decimal_size = Decimal(str(size))
            sanitized_data = decimal_size

            # Check for negative or zero sizes
            if decimal_size <= 0:
                errors.append(f"Size must be positive, got {decimal_size}")

            # Check size bounds (heuristic limits -> warnings only)
            if decimal_size < self._min_size:
                warnings.append(f"Size {decimal_size} below minimum {self._min_size}")
            elif decimal_size > self._max_size:
                warnings.append(f"Size {decimal_size} above maximum {self._max_size}")

        except (InvalidOperation, ValueError, TypeError) as e:
            errors.append(f"Invalid size value: {size} - {str(e)}")

        return ValidationResult(len(errors) == 0, errors, warnings, sanitized_data)

    def validate_volume(self, volume: Union[str, int, float, Decimal]) -> ValidationResult:
        """
        Validate volume value with common rules.

        Unlike price/size, zero volume is valid (no trades in period).

        Args:
            volume: Volume value to validate

        Returns:
            ValidationResult with sanitized decimal volume (consistent
            with validate_price/validate_size)
        """
        errors = []
        warnings = []
        sanitized_data = None

        try:
            decimal_volume = Decimal(str(volume))
            sanitized_data = decimal_volume

            # Volume can be zero (no trades in period)
            if decimal_volume < 0:
                errors.append(f"Volume cannot be negative, got {decimal_volume}")

        except (InvalidOperation, ValueError, TypeError) as e:
            errors.append(f"Invalid volume value: {volume} - {str(e)}")

        return ValidationResult(len(errors) == 0, errors, warnings, sanitized_data)

    def validate_trade_side(self, side: str) -> ValidationResult:
        """
        Validate trade side with common rules.

        Note: matching is case-insensitive but the value is NOT stripped,
        so ' buy ' is rejected.

        Args:
            side: Trade side string

        Returns:
            ValidationResult
        """
        errors = []
        warnings = []

        if not isinstance(side, str):
            errors.append(f"Trade side must be string, got {type(side)}")
            return ValidationResult(False, errors, warnings)

        normalized_side = side.lower()
        if normalized_side not in self._valid_trade_sides:
            errors.append(f"Invalid trade side: {side}. Must be 'buy' or 'sell'")

        return ValidationResult(len(errors) == 0, errors, warnings)

    def validate_timestamp(self, timestamp: Union[str, int], is_milliseconds: bool = True) -> ValidationResult:
        """
        Validate timestamp value with common rules.

        Hard bounds (outside [2001, 2286]) are errors; timestamps more
        than one year from "now" are warnings only.

        Args:
            timestamp: Timestamp value to validate
            is_milliseconds: True if timestamp is in milliseconds, False for seconds

        Returns:
            ValidationResult
        """
        errors = []
        warnings = []

        try:
            # Convert to int; only digit-strings are accepted
            if isinstance(timestamp, str):
                if not timestamp.isdigit():
                    errors.append(f"Invalid timestamp format: {timestamp}")
                    return ValidationResult(False, errors, warnings)
                timestamp_int = int(timestamp)
            elif isinstance(timestamp, int):
                timestamp_int = timestamp
            else:
                errors.append(f"Timestamp must be string or int, got {type(timestamp)}")
                return ValidationResult(False, errors, warnings)

            # Normalize to milliseconds before bounds checks
            if not is_milliseconds:
                timestamp_int = timestamp_int * 1000

            # Hard bounds -> errors
            if timestamp_int < self._min_timestamp:
                errors.append(f"Timestamp {timestamp_int} too old")
            elif timestamp_int > self._max_timestamp:
                errors.append(f"Timestamp {timestamp_int} too far in future")

            # Soft bounds (within +/- 1 year of now) -> warnings
            current_time_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
            one_year_ms = 365 * 24 * 60 * 60 * 1000

            if timestamp_int < (current_time_ms - one_year_ms):
                warnings.append(f"Timestamp {timestamp_int} is older than 1 year")
            elif timestamp_int > (current_time_ms + one_year_ms):
                warnings.append(f"Timestamp {timestamp_int} is more than 1 year in future")

        except (ValueError, TypeError) as e:
            errors.append(f"Invalid timestamp: {timestamp} - {str(e)}")

        return ValidationResult(len(errors) == 0, errors, warnings)

    def validate_trade_id(self, trade_id: Union[str, int]) -> ValidationResult:
        """
        Validate trade ID with flexible rules.

        Unusual (non-alphanumeric/underscore/hyphen) formats are warnings,
        not errors, because exchanges vary widely.

        Args:
            trade_id: Trade ID to validate

        Returns:
            ValidationResult
        """
        errors = []
        warnings = []

        if isinstance(trade_id, int):
            trade_id = str(trade_id)

        if not isinstance(trade_id, str):
            errors.append(f"Trade ID must be string or int, got {type(trade_id)}")
            return ValidationResult(False, errors, warnings)

        if not trade_id.strip():
            errors.append("Trade ID cannot be empty")
            return ValidationResult(False, errors, warnings)

        # Flexible validation - allow alphanumeric, underscore, hyphen
        if not self._trade_id_pattern.match(trade_id):
            warnings.append(f"Trade ID has unusual format: {trade_id}")

        return ValidationResult(len(errors) == 0, errors, warnings)

    def validate_symbol_match(self, symbol: str, expected_symbol: Optional[str] = None) -> ValidationResult:
        """
        Validate symbol matches expected value.

        A mismatch is only a warning, so callers can decide how strictly
        to treat cross-channel symbol disagreements.

        Args:
            symbol: Symbol to validate
            expected_symbol: Expected symbol value

        Returns:
            ValidationResult
        """
        errors = []
        warnings = []

        if not isinstance(symbol, str):
            errors.append(f"Symbol must be string, got {type(symbol)}")
            return ValidationResult(False, errors, warnings)

        if expected_symbol and symbol != expected_symbol:
            warnings.append(f"Symbol mismatch: expected {expected_symbol}, got {symbol}")

        return ValidationResult(len(errors) == 0, errors, warnings)

    def validate_orderbook_side(self, side_data: List[List[str]], side_name: str) -> ValidationResult:
        """
        Validate orderbook side (asks or bids) with common rules.

        Each level must be a list of at least [price, size]; extra fields
        are passed through unchanged in the sanitized output.

        Args:
            side_data: List of price/size pairs
            side_name: Name of side for error messages

        Returns:
            ValidationResult with sanitized data
        """
        errors = []
        warnings = []
        sanitized_data = []

        if not isinstance(side_data, list):
            errors.append(f"{side_name} must be a list")
            return ValidationResult(False, errors, warnings)

        for i, level in enumerate(side_data):
            if not isinstance(level, list) or len(level) < 2:
                errors.append(f"{side_name}[{i}] must be a list with at least 2 elements")
                continue

            # Validate price and size
            price_result = self.validate_price(level[0])
            size_result = self.validate_size(level[1])

            if not price_result.is_valid:
                errors.extend([f"{side_name}[{i}] price: {error}" for error in price_result.errors])
            if not size_result.is_valid:
                errors.extend([f"{side_name}[{i}] size: {error}" for error in size_result.errors])

            # Only valid levels make it into the sanitized output
            if price_result.is_valid and size_result.is_valid:
                sanitized_level = [str(price_result.sanitized_data), str(size_result.sanitized_data)]
                # Include additional fields if present
                if len(level) > 2:
                    sanitized_level.extend(level[2:])
                sanitized_data.append(sanitized_level)

        return ValidationResult(len(errors) == 0, errors, warnings, sanitized_data)

    def validate_standardized_trade(self, trade: StandardizedTrade) -> DataValidationResult:
        """
        Validate a standardized trade object field by field.

        Aggregates per-field errors/warnings with a "field: message"
        prefix into a single DataValidationResult.

        Args:
            trade: StandardizedTrade object to validate

        Returns:
            DataValidationResult
        """
        errors = []
        warnings = []

        try:
            # Validate price
            price_result = self.validate_price(trade.price)
            if not price_result.is_valid:
                errors.extend([f"price: {error}" for error in price_result.errors])
            warnings.extend([f"price: {warning}" for warning in price_result.warnings])

            # Validate size
            size_result = self.validate_size(trade.size)
            if not size_result.is_valid:
                errors.extend([f"size: {error}" for error in size_result.errors])
            warnings.extend([f"size: {warning}" for warning in size_result.warnings])

            # Validate side
            side_result = self.validate_trade_side(trade.side)
            if not side_result.is_valid:
                errors.extend([f"side: {error}" for error in side_result.errors])

            # Validate trade ID
            trade_id_result = self.validate_trade_id(trade.trade_id)
            if not trade_id_result.is_valid:
                errors.extend([f"trade_id: {error}" for error in trade_id_result.errors])
            warnings.extend([f"trade_id: {warning}" for warning in trade_id_result.warnings])

            # Validate symbol format (exchange-specific)
            symbol_result = self.validate_symbol_format(trade.symbol)
            if not symbol_result.is_valid:
                errors.extend([f"symbol: {error}" for error in symbol_result.errors])
            warnings.extend([f"symbol: {warning}" for warning in symbol_result.warnings])

            # Validate timestamp (converted back to epoch milliseconds)
            timestamp_ms = int(trade.timestamp.timestamp() * 1000)
            timestamp_result = self.validate_timestamp(timestamp_ms, is_milliseconds=True)
            if not timestamp_result.is_valid:
                errors.extend([f"timestamp: {error}" for error in timestamp_result.errors])
            warnings.extend([f"timestamp: {warning}" for warning in timestamp_result.warnings])

            return DataValidationResult(len(errors) == 0, errors, warnings)

        except Exception as e:
            errors.append(f"Exception during trade validation: {str(e)}")
            return DataValidationResult(False, errors, warnings)

    def get_validator_info(self) -> Dict[str, Any]:
        """Get validator configuration information."""
        return {
            'exchange': self.exchange_name,
            'component': self.component_name,
            'limits': {
                'min_price': str(self._min_price),
                'max_price': str(self._max_price),
                'min_size': str(self._min_size),
                'max_size': str(self._max_size),
                'min_timestamp': self._min_timestamp,
                'max_timestamp': self._max_timestamp
            },
            'patterns': {
                'numeric': self._numeric_pattern.pattern,
                'trade_id': self._trade_id_pattern.pattern
            }
        }
|
||||
|
||||
|
||||
# Utility functions for common validation patterns
|
||||
|
||||
def is_valid_decimal(value: Any) -> bool:
    """Return True if str(value) parses as a Decimal, else False.

    Note: this accepts anything Decimal accepts, including 'NaN' and
    'Infinity' strings.
    """
    try:
        Decimal(str(value))
    except (InvalidOperation, ValueError, TypeError):
        return False
    return True
|
||||
|
||||
|
||||
def normalize_symbol(symbol: str, exchange: str) -> str:
    """
    Normalize symbol format for an exchange.

    Currently applies only basic normalization (trim + uppercase); the
    `exchange` parameter is reserved for per-exchange rules later.

    Args:
        symbol: Raw symbol string
        exchange: Exchange name (unused for now)

    Returns:
        Normalized symbol string
    """
    return symbol.strip().upper()
|
||||
|
||||
|
||||
def validate_required_fields(data: Dict[str, Any], required_fields: List[str]) -> List[str]:
    """
    Validate that all required fields are present (and not None) in data.

    Args:
        data: Data dictionary to check
        required_fields: List of required field names

    Returns:
        List of missing field names, in the order given
    """
    # dict.get returns None both for absent keys and for explicit None
    # values — both count as "missing" here.
    return [field for field in required_fields if data.get(field) is None]
|
||||
|
||||
|
||||
# Public API of this module.
__all__ = [
    'ValidationResult',
    'BaseDataValidator',
    'is_valid_decimal',
    'normalize_symbol',
    'validate_required_fields'
]
|
||||
Reference in New Issue
Block a user