- Introduced a modular architecture for data processing, including common utilities for validation, transformation, and aggregation. - Implemented `StandardizedTrade`, `OHLCVCandle`, and `TimeframeBucket` classes for unified data handling across exchanges. - Developed `OKXDataProcessor` for OKX-specific data validation and processing, leveraging the new common framework. - Enhanced `OKXCollector` to utilize the common data processing utilities, improving modularity and maintainability. - Updated documentation to reflect the new architecture and provide guidance on the data processing framework. - Created comprehensive tests for the new data processing components to ensure reliability and functionality.
471 lines
16 KiB
Python
471 lines
16 KiB
Python
"""
|
|
Base transformation utilities for all exchanges.
|
|
|
|
This module provides common transformation patterns and base classes
|
|
for converting exchange-specific data to standardized formats.
|
|
"""
|
|
|
|
from datetime import datetime, timezone
|
|
from decimal import Decimal
|
|
from typing import Dict, List, Optional, Any, Iterator
|
|
from abc import ABC, abstractmethod
|
|
|
|
from .data_types import StandardizedTrade, OHLCVCandle, DataValidationResult
|
|
from .aggregation import BatchCandleProcessor
|
|
from utils.logger import get_logger
|
|
|
|
|
|
class BaseDataTransformer(ABC):
    """
    Abstract base class for exchange data transformers.

    Subclasses implement the three ``transform_*_data`` hooks for their
    exchange; the remaining methods are shared helpers (timestamp parsing,
    Decimal conversion, side/symbol normalization) that those
    implementations can build on.
    """

    # Accepted spellings for each trade side, compared after lower-casing
    # and stripping the input.
    _BUY_ALIASES = frozenset({'buy', 'bid', 'b', '1'})
    _SELL_ALIASES = frozenset({'sell', 'ask', 's', '0'})

    def __init__(self,
                 exchange_name: str,
                 component_name: str = "base_data_transformer"):
        """
        Initialize base data transformer.

        Args:
            exchange_name: Name of the exchange (e.g., 'okx', 'binance')
            component_name: Name for logging
        """
        self.exchange_name = exchange_name
        self.component_name = component_name
        self.logger = get_logger(self.component_name)

        self.logger.info(f"Initialized base data transformer for {exchange_name}")

    # Abstract methods that must be implemented by subclasses

    @abstractmethod
    def transform_trade_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[StandardizedTrade]:
        """Transform exchange-specific trade data to standardized format."""
        pass

    @abstractmethod
    def transform_orderbook_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[Dict[str, Any]]:
        """Transform exchange-specific orderbook data to standardized format."""
        pass

    @abstractmethod
    def transform_ticker_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[Dict[str, Any]]:
        """Transform exchange-specific ticker data to standardized format."""
        pass

    # Common transformation utilities available to all subclasses

    def timestamp_to_datetime(self, timestamp: Any, is_milliseconds: bool = True) -> datetime:
        """
        Convert various timestamp formats to timezone-aware datetime.

        Args:
            timestamp: Epoch timestamp as int, float or numeric string, or an
                existing datetime (naive datetimes are tagged as UTC, aware
                ones are returned unchanged)
            is_milliseconds: True if a numeric timestamp is in milliseconds

        Returns:
            Timezone-aware datetime object. NOTE: on any conversion failure
            the error is logged and the *current* UTC time is returned
            instead of raising, so callers cannot distinguish a fallback
            from a real value.
        """
        try:
            if isinstance(timestamp, datetime):
                # Already a datetime: only make sure it carries a timezone.
                if timestamp.tzinfo is None:
                    return timestamp.replace(tzinfo=timezone.utc)
                return timestamp

            if not isinstance(timestamp, (int, float, str)):
                raise ValueError(f"Invalid timestamp type: {type(timestamp)}")

            timestamp_num = float(timestamp)

            # Convert to seconds if needed
            if is_milliseconds:
                timestamp_num = timestamp_num / 1000

            return datetime.fromtimestamp(timestamp_num, tz=timezone.utc)

        except Exception as e:
            self.logger.error(f"Error converting timestamp {timestamp}: {e}")
            # Return current time as fallback
            return datetime.now(timezone.utc)

    def safe_decimal_conversion(self, value: Any, field_name: str = "value") -> Optional[Decimal]:
        """
        Safely convert value to Decimal with error handling.

        Args:
            value: Value to convert
            field_name: Name of field for error logging

        Returns:
            Decimal value, or None if the value is None/empty, cannot be
            parsed, or is not a finite number (NaN / Infinity)
        """
        try:
            if value is None or value == "":
                return None
            converted = Decimal(str(value))
        except Exception as e:
            self.logger.warning(f"Failed to convert {field_name} '{value}' to Decimal: {e}")
            return None

        # Decimal happily parses 'nan' and 'inf'; those are not usable
        # prices or sizes, so treat them as a failed conversion.
        if not converted.is_finite():
            self.logger.warning(
                f"Failed to convert {field_name} '{value}' to Decimal: non-finite value"
            )
            return None
        return converted

    def normalize_trade_side(self, side: str) -> str:
        """
        Normalize trade side to standard format.

        Args:
            side: Raw trade side. Non-string values (e.g. the integers 1/0)
                are converted with ``str()`` before matching.

        Returns:
            Normalized side ('buy' or 'sell'). Unknown values are logged
            and default to 'buy'.
        """
        raw_side = side if isinstance(side, str) else str(side)
        normalized = raw_side.lower().strip()

        # Handle common variations
        if normalized in self._BUY_ALIASES:
            return 'buy'
        if normalized in self._SELL_ALIASES:
            return 'sell'

        self.logger.warning(f"Unknown trade side: {side}, defaulting to 'buy'")
        return 'buy'

    def validate_symbol_format(self, symbol: str) -> str:
        """
        Validate and normalize symbol format.

        Args:
            symbol: Raw symbol string

        Returns:
            Normalized symbol string (upper-cased, surrounding whitespace removed)

        Raises:
            ValueError: If symbol is not a string, is empty, or is
                whitespace only
        """
        # Check the type first so truthiness is only evaluated on strings.
        if not isinstance(symbol, str) or not symbol:
            raise ValueError(f"Invalid symbol: {symbol}")

        # Basic normalization
        normalized = symbol.upper().strip()

        if not normalized:
            raise ValueError("Empty symbol after normalization")

        return normalized

    def transform_database_record(self, record: Any) -> Optional[StandardizedTrade]:
        """
        Transform database record to standardized format.

        This method should be overridden by subclasses to handle
        their specific database schema. The base implementation only
        logs a warning.

        Args:
            record: Database record

        Returns:
            StandardizedTrade or None if transformation failed
        """
        self.logger.warning("transform_database_record not implemented for this exchange")
        return None

    def get_transformer_info(self) -> Dict[str, Any]:
        """
        Get transformer information.

        Returns:
            Dict with the exchange name, the component name and a
            ``capabilities`` mapping. ``database_transformation`` is True
            only when the concrete class overrides
            ``transform_database_record``.
        """
        # hasattr() would always be True here because this base class
        # defines the method; compare against the base implementation.
        overrides_database_hook = (
            type(self).transform_database_record
            is not BaseDataTransformer.transform_database_record
        )
        return {
            'exchange': self.exchange_name,
            'component': self.component_name,
            'capabilities': {
                'trade_transformation': True,
                'orderbook_transformation': True,
                'ticker_transformation': True,
                'database_transformation': overrides_database_hook
            }
        }
|
|
|
|
|
|
class UnifiedDataTransformer:
    """
    Unified data transformation system for all scenarios.

    This class provides a common interface for transforming data from
    various sources (real-time, historical, backfill) into standardized
    formats for further processing. It delegates the exchange-specific
    work to the wrapped transformer and converts any exception raised
    there into a logged error plus a ``None`` / empty result.

    TRANSFORMATION PROCESS:

    1. Raw Data Input (exchange format, database records, etc.)
    2. Validation (using exchange-specific validators)
    3. Transformation to StandardizedTrade format
    4. Optional aggregation to candles
    5. Output in consistent format
    """

    def __init__(self,
                 exchange_transformer: BaseDataTransformer,
                 component_name: str = "unified_data_transformer"):
        """
        Initialize unified data transformer.

        Args:
            exchange_transformer: Exchange-specific transformer instance
            component_name: Name for logging
        """
        self.exchange_transformer = exchange_transformer
        self.component_name = component_name
        self.logger = get_logger(self.component_name)

        self.logger.info(f"Initialized unified data transformer with {exchange_transformer.exchange_name} transformer")

    def transform_trade_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[StandardizedTrade]:
        """
        Transform trade data using exchange-specific transformer.

        Args:
            raw_data: Raw trade data from exchange
            symbol: Trading symbol

        Returns:
            Standardized trade or None if transformation failed
        """
        try:
            return self.exchange_transformer.transform_trade_data(raw_data, symbol)
        except Exception as e:
            self.logger.error(f"Error in trade transformation: {e}")
            return None

    def transform_orderbook_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[Dict[str, Any]]:
        """
        Transform orderbook data using exchange-specific transformer.

        Args:
            raw_data: Raw orderbook data from exchange
            symbol: Trading symbol

        Returns:
            Standardized orderbook data or None if transformation failed
        """
        try:
            return self.exchange_transformer.transform_orderbook_data(raw_data, symbol)
        except Exception as e:
            self.logger.error(f"Error in orderbook transformation: {e}")
            return None

    def transform_ticker_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[Dict[str, Any]]:
        """
        Transform ticker data using exchange-specific transformer.

        Args:
            raw_data: Raw ticker data from exchange
            symbol: Trading symbol

        Returns:
            Standardized ticker data or None if transformation failed
        """
        try:
            return self.exchange_transformer.transform_ticker_data(raw_data, symbol)
        except Exception as e:
            self.logger.error(f"Error in ticker transformation: {e}")
            return None

    def process_trades_to_candles(self,
                                  trades: Iterator[StandardizedTrade],
                                  timeframes: List[str],
                                  symbol: str) -> List[OHLCVCandle]:
        """
        Process any trade iterator to candles - unified processing function.

        This function handles ALL scenarios:
        - Real-time: Single trade iterators
        - Historical: Batch trade iterators
        - Backfill: API trade iterators

        Args:
            trades: Iterator of standardized trades
            timeframes: List of timeframes to generate
            symbol: Trading symbol

        Returns:
            List of completed candles, or an empty list if processing
            raised (the error is logged)
        """
        try:
            # A fresh processor is created per call.
            processor = BatchCandleProcessor(
                symbol,
                self.exchange_transformer.exchange_name,
                timeframes,
                f"unified_batch_processor_{symbol}"
            )

            candles = processor.process_trades_to_candles(trades)

            self.logger.info(f"Processed {processor.get_stats()['trades_processed']} trades to {len(candles)} candles")
            return candles

        except Exception as e:
            self.logger.error(f"Error processing trades to candles: {e}")
            return []

    def batch_transform_trades(self,
                               raw_trades: List[Dict[str, Any]],
                               symbol: str) -> List[StandardizedTrade]:
        """
        Transform multiple trade records in batch.

        Records that fail to transform are counted and skipped; they never
        abort the batch.

        Args:
            raw_trades: List of raw trade data
            symbol: Trading symbol

        Returns:
            List of successfully transformed trades
        """
        transformed_trades = []
        errors = 0

        for raw_trade in raw_trades:
            try:
                trade = self.transform_trade_data(raw_trade, symbol)
            except Exception as e:
                # transform_trade_data already guards itself; this only
                # matters if a subclass overrides it without that guard.
                self.logger.error(f"Error transforming trade: {e}")
                errors += 1
                continue

            # None is the failure signal; do not rely on truthiness of the
            # trade object itself.
            if trade is not None:
                transformed_trades.append(trade)
            else:
                errors += 1

        self.logger.info(f"Batch transformed {len(transformed_trades)} trades successfully, {errors} errors")
        return transformed_trades

    def get_transformer_info(self) -> Dict[str, Any]:
        """
        Get comprehensive transformer information.

        Returns:
            A new dict containing the exchange transformer's info plus the
            unified component name and the batch/candle capability flags.
        """
        # Copy first so the dict owned by the exchange transformer is not
        # mutated by the update below.
        info = dict(self.exchange_transformer.get_transformer_info())
        info.update({
            'unified_component': self.component_name,
            'batch_processing': True,
            'candle_aggregation': True
        })
        return info
|
|
|
|
|
|
# Utility functions for common transformation patterns
|
|
|
|
def create_standardized_trade(symbol: str,
                              trade_id: str,
                              price: Any,
                              size: Any,
                              side: str,
                              timestamp: Any,
                              exchange: str,
                              raw_data: Optional[Dict[str, Any]] = None,
                              is_milliseconds: bool = True) -> StandardizedTrade:
    """
    Utility function to create StandardizedTrade with proper validation.

    All input problems are reported as ValueError so callers only need to
    handle one exception type.

    Args:
        symbol: Trading symbol (upper-cased and stripped in the result)
        trade_id: Trade identifier (converted with ``str()``)
        price: Trade price (any numeric type)
        size: Trade size (any numeric type)
        side: Trade side ('buy' or 'sell', case-insensitive)
        timestamp: Trade timestamp: epoch number / numeric string, or a
            datetime (naive datetimes are tagged as UTC)
        exchange: Exchange name (lower-cased in the result)
        raw_data: Original raw data
        is_milliseconds: True if a numeric timestamp is in milliseconds

    Returns:
        StandardizedTrade object

    Raises:
        ValueError: If data is invalid
    """
    # Validate the plain string fields up front so a wrong type surfaces
    # as ValueError rather than AttributeError from .upper() / .lower().
    if not isinstance(symbol, str):
        raise ValueError(f"Invalid symbol: {symbol}")
    if not isinstance(exchange, str):
        raise ValueError(f"Invalid exchange: {exchange}")

    # Convert timestamp
    if isinstance(timestamp, (int, float, str)):
        try:
            timestamp_num = float(timestamp)
            if is_milliseconds:
                timestamp_num = timestamp_num / 1000
            dt = datetime.fromtimestamp(timestamp_num, tz=timezone.utc)
        except (ValueError, OverflowError, OSError) as e:
            # fromtimestamp raises OverflowError / OSError for values out
            # of range (e.g. infinity, or a unit mismatch).
            raise ValueError(f"Invalid timestamp {timestamp!r}: {e}") from e
    elif isinstance(timestamp, datetime):
        dt = timestamp
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
    else:
        raise ValueError(f"Invalid timestamp type: {type(timestamp)}")

    # Convert price and size to Decimal
    try:
        decimal_price = Decimal(str(price))
        decimal_size = Decimal(str(size))
    except Exception as e:
        raise ValueError(f"Invalid price or size: {e}") from e

    # Decimal parses 'nan' / 'inf' without error; reject them explicitly.
    if not (decimal_price.is_finite() and decimal_size.is_finite()):
        raise ValueError(
            f"Invalid price or size: non-finite value (price={price!r}, size={size!r})"
        )

    # Normalize side
    if not isinstance(side, str):
        raise ValueError(f"Invalid trade side: {side}")
    normalized_side = side.lower().strip()
    if normalized_side not in ('buy', 'sell'):
        raise ValueError(f"Invalid trade side: {side}")

    return StandardizedTrade(
        symbol=symbol.upper().strip(),
        trade_id=str(trade_id),
        price=decimal_price,
        size=decimal_size,
        side=normalized_side,
        timestamp=dt,
        exchange=exchange.lower(),
        raw_data=raw_data
    )
|
|
|
|
|
|
def batch_create_standardized_trades(raw_trades: List[Dict[str, Any]],
                                     symbol: str,
                                     exchange: str,
                                     field_mapping: Dict[str, str],
                                     is_milliseconds: bool = True) -> List[StandardizedTrade]:
    """
    Batch create standardized trades from raw data.

    Each record is converted independently with
    ``create_standardized_trade``; a record that fails (missing field,
    invalid value, ...) is logged as a warning and skipped.

    Args:
        raw_trades: List of raw trade dictionaries
        symbol: Trading symbol
        exchange: Exchange name
        field_mapping: Mapping of StandardizedTrade fields to raw data fields
        is_milliseconds: True if timestamps are in milliseconds

    Returns:
        List of successfully created StandardizedTrade objects

    Example field_mapping:
        {
            'trade_id': 'id',
            'price': 'px',
            'size': 'sz',
            'side': 'side',
            'timestamp': 'ts'
        }
    """
    trades = []
    # Fetched lazily on the first failure and then reused, instead of
    # calling get_logger again for every bad record.
    logger = None

    for raw_trade in raw_trades:
        try:
            trade = create_standardized_trade(
                symbol=symbol,
                trade_id=raw_trade[field_mapping['trade_id']],
                price=raw_trade[field_mapping['price']],
                size=raw_trade[field_mapping['size']],
                side=raw_trade[field_mapping['side']],
                timestamp=raw_trade[field_mapping['timestamp']],
                exchange=exchange,
                raw_data=raw_trade,
                is_milliseconds=is_milliseconds
            )
        except Exception as e:
            # Log error but continue processing
            if logger is None:
                logger = get_logger("batch_transform")
            logger.warning(f"Failed to transform trade: {e}")
        else:
            trades.append(trade)

    return trades
|
|
|
|
|
|
# Names exported by ``from <module> import *``.
__all__ = [
    # Transformer classes
    'BaseDataTransformer',
    'UnifiedDataTransformer',
    # Stand-alone helper functions
    'create_standardized_trade',
    'batch_create_standardized_trades',
]