Add common data processing framework for OKX exchange
- Introduced a modular architecture for data processing, including common utilities for validation, transformation, and aggregation. - Implemented `StandardizedTrade`, `OHLCVCandle`, and `TimeframeBucket` classes for unified data handling across exchanges. - Developed `OKXDataProcessor` for OKX-specific data validation and processing, leveraging the new common framework. - Enhanced `OKXCollector` to utilize the common data processing utilities, improving modularity and maintainability. - Updated documentation to reflect the new architecture and provide guidance on the data processing framework. - Created comprehensive tests for the new data processing components to ensure reliability and functionality.
This commit is contained in:
52
data/common/__init__.py
Normal file
52
data/common/__init__.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""
|
||||
Common data processing utilities for all exchanges.
|
||||
|
||||
This package contains shared components for data validation, transformation,
|
||||
and aggregation that can be used across different exchange implementations.
|
||||
"""
|
||||
|
||||
from .data_types import (
|
||||
StandardizedTrade,
|
||||
OHLCVCandle,
|
||||
MarketDataPoint,
|
||||
DataValidationResult
|
||||
)
|
||||
|
||||
from .aggregation import (
|
||||
TimeframeBucket,
|
||||
RealTimeCandleProcessor,
|
||||
CandleProcessingConfig
|
||||
)
|
||||
|
||||
from .transformation import (
|
||||
BaseDataTransformer,
|
||||
UnifiedDataTransformer,
|
||||
create_standardized_trade
|
||||
)
|
||||
|
||||
from .validation import (
|
||||
BaseDataValidator,
|
||||
ValidationResult
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Data types
|
||||
'StandardizedTrade',
|
||||
'OHLCVCandle',
|
||||
'MarketDataPoint',
|
||||
'DataValidationResult',
|
||||
|
||||
# Aggregation
|
||||
'TimeframeBucket',
|
||||
'RealTimeCandleProcessor',
|
||||
'CandleProcessingConfig',
|
||||
|
||||
# Transformation
|
||||
'BaseDataTransformer',
|
||||
'UnifiedDataTransformer',
|
||||
'create_standardized_trade',
|
||||
|
||||
# Validation
|
||||
'BaseDataValidator',
|
||||
'ValidationResult'
|
||||
]
|
||||
553
data/common/aggregation.py
Normal file
553
data/common/aggregation.py
Normal file
@@ -0,0 +1,553 @@
|
||||
"""
|
||||
Common aggregation utilities for all exchanges.
|
||||
|
||||
This module provides shared functionality for building OHLCV candles
|
||||
from trade data, regardless of the source exchange.
|
||||
|
||||
AGGREGATION STRATEGY:
|
||||
- Uses RIGHT-ALIGNED timestamps (industry standard)
|
||||
- Candle timestamp = end time of the interval (close time)
|
||||
- 5-minute candle with timestamp 09:05:00 represents data from 09:00:01 to 09:05:00
|
||||
- Prevents future leakage by only completing candles when time boundary is crossed
|
||||
- Aligns with major exchanges (Binance, OKX, Coinbase)
|
||||
|
||||
PROCESS FLOW:
|
||||
1. Trade arrives with timestamp T
|
||||
2. Calculate which time bucket this trade belongs to
|
||||
3. If bucket doesn't exist or time boundary crossed, complete previous bucket
|
||||
4. Add trade to current bucket
|
||||
5. Only emit completed candles (never future data)
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from decimal import Decimal
|
||||
from typing import Dict, List, Optional, Any, Iterator, Callable
|
||||
from collections import defaultdict
|
||||
|
||||
from .data_types import (
|
||||
StandardizedTrade,
|
||||
OHLCVCandle,
|
||||
CandleProcessingConfig,
|
||||
ProcessingStats
|
||||
)
|
||||
from utils.logger import get_logger
|
||||
|
||||
|
||||
class TimeframeBucket:
    """
    Accumulates trades for one (symbol, timeframe) interval and derives OHLCV.

    Timestamps are RIGHT-ALIGNED (industry standard):
    - start_time is the inclusive left boundary of the interval
    - end_time is the exclusive right boundary and becomes the candle timestamp
    - e.g. the 09:00:00 - 09:05:00 bucket produces a candle stamped 09:05:00
    """

    def __init__(self, symbol: str, timeframe: str, start_time: datetime, exchange: str = "unknown"):
        """
        Create an empty bucket.

        Args:
            symbol: Trading symbol (e.g., 'BTC-USDT')
            timeframe: Interval label (e.g., '1m', '5m', '1h')
            start_time: Inclusive start of this bucket's interval
            exchange: Exchange name
        """
        self.symbol = symbol
        self.timeframe = timeframe
        self.start_time = start_time
        self.end_time = self._calculate_end_time(start_time, timeframe)
        self.exchange = exchange

        # OHLCV accumulators; open/high/low/close stay None until a trade arrives
        self.open: Optional[Decimal] = None
        self.high: Optional[Decimal] = None
        self.low: Optional[Decimal] = None
        self.close: Optional[Decimal] = None
        self.volume: Decimal = Decimal('0')
        self.trade_count: int = 0

        # Bookkeeping used for candle metadata and diagnostics
        self.first_trade_time: Optional[datetime] = None
        self.last_trade_time: Optional[datetime] = None
        self.trades: List[StandardizedTrade] = []

    def add_trade(self, trade: StandardizedTrade) -> bool:
        """
        Fold a trade into this bucket if its timestamp lies in the interval.

        Args:
            trade: Standardized trade data

        Returns:
            True when the trade was absorbed, False when its timestamp falls
            outside [start_time, end_time).
        """
        if not (self.start_time <= trade.timestamp < self.end_time):
            return False

        if self.open is None:
            # The very first trade seeds open/high/low and the first-trade marker.
            self.open = self.high = self.low = trade.price
            self.first_trade_time = trade.timestamp

        self.high = max(self.high, trade.price)
        self.low = min(self.low, trade.price)
        self.close = trade.price  # most recent trade defines the close
        self.volume += trade.size
        self.trade_count += 1
        self.last_trade_time = trade.timestamp

        # Kept so callers can inspect the raw trades behind a candle.
        self.trades.append(trade)

        return True

    def to_candle(self, is_complete: bool = True) -> OHLCVCandle:
        """
        Materialize this bucket as an OHLCVCandle.

        IMPORTANT: the candle timestamp is end_time (right-aligned, industry
        standard). Buckets with no trades serialize with zero prices.
        """
        zero = Decimal('0')
        return OHLCVCandle(
            symbol=self.symbol,
            timeframe=self.timeframe,
            start_time=self.start_time,
            end_time=self.end_time,
            open=self.open or zero,
            high=self.high or zero,
            low=self.low or zero,
            close=self.close or zero,
            volume=self.volume,
            trade_count=self.trade_count,
            exchange=self.exchange,
            is_complete=is_complete,
            first_trade_time=self.first_trade_time,
            last_trade_time=self.last_trade_time
        )

    def _calculate_end_time(self, start_time: datetime, timeframe: str) -> datetime:
        """Return the right-aligned end of the interval beginning at start_time."""
        durations = {
            '1m': timedelta(minutes=1),
            '5m': timedelta(minutes=5),
            '15m': timedelta(minutes=15),
            '30m': timedelta(minutes=30),
            '1h': timedelta(hours=1),
            '4h': timedelta(hours=4),
            '1d': timedelta(days=1),
        }
        try:
            return start_time + durations[timeframe]
        except KeyError:
            raise ValueError(f"Unsupported timeframe: {timeframe}") from None
|
||||
|
||||
|
||||
class RealTimeCandleProcessor:
    """
    Real-time candle processor for live trade data.

    Processes trades immediately as they arrive (e.g. from WebSocket),
    building candles incrementally per configured timeframe and emitting a
    candle only once its time boundary has been crossed.

    AGGREGATION PROCESS (NO FUTURE LEAKAGE):
    1. Trade arrives with timestamp T
    2. For each configured timeframe (1m, 5m, ...):
       a. Calculate which time bucket the trade belongs to
       b. If the current bucket's period differs, the boundary was crossed:
          complete and emit the previous bucket, then start a new one
       c. Add the trade to the current bucket (updates OHLCV)
    3. Candles are emitted only when a boundary is definitively crossed;
       incomplete/future candles are never emitted during live processing.

    TIMESTAMP ALIGNMENT:
    - Uses RIGHT-ALIGNED timestamps (industry standard)
    - 1-minute candle covering 09:00:00-09:01:00 gets timestamp 09:01:00
    - 5-minute candle covering 09:00:00-09:05:00 gets timestamp 09:05:00
    - A candle always represents PAST data, never future
    """

    def __init__(self,
                 symbol: str,
                 exchange: str,
                 config: Optional[CandleProcessingConfig] = None,
                 component_name: str = "realtime_candle_processor"):
        """
        Initialize real-time candle processor.

        Args:
            symbol: Trading symbol (e.g., 'BTC-USDT')
            exchange: Exchange name (e.g., 'okx', 'binance')
            config: Processing configuration (defaults to CandleProcessingConfig())
            component_name: Name for logging
        """
        self.symbol = symbol
        self.exchange = exchange
        self.config = config or CandleProcessingConfig()
        self.component_name = component_name
        self.logger = get_logger(self.component_name)

        # One in-progress bucket per timeframe, keyed by timeframe label
        self.current_buckets: Dict[str, TimeframeBucket] = {}

        # Subscribers notified of every completed candle
        self.candle_callbacks: List[Callable[[OHLCVCandle], None]] = []

        # Statistics
        self.stats = ProcessingStats(active_timeframes=len(self.config.timeframes))

        self.logger.info(f"Initialized real-time candle processor for {symbol} on {exchange} with timeframes: {self.config.timeframes}")

    def add_candle_callback(self, callback: Callable[[OHLCVCandle], None]) -> None:
        """Add callback function to receive completed candles."""
        self.candle_callbacks.append(callback)
        self.logger.debug(f"Added candle callback: {callback.__name__ if hasattr(callback, '__name__') else str(callback)}")

    def process_trade(self, trade: StandardizedTrade) -> List[OHLCVCandle]:
        """
        Process single trade - main entry point for real-time processing.

        Called for each trade as it arrives from the feed.

        CRITICAL: Only returns completed candles (time boundary crossed).
        Never returns incomplete/future candles, preventing leakage.

        Args:
            trade: Standardized trade data

        Returns:
            List of completed candles - one per timeframe whose boundary was
            crossed by this trade (usually empty).
        """
        try:
            completed_candles = []

            # Each timeframe advances independently
            for timeframe in self.config.timeframes:
                candle = self._process_trade_for_timeframe(trade, timeframe)
                if candle:
                    completed_candles.append(candle)

            # Update statistics
            self.stats.trades_processed += 1
            self.stats.last_trade_time = trade.timestamp

            # Notify subscribers of every candle we just completed
            for candle in completed_candles:
                self._emit_candle(candle)

            return completed_candles

        except Exception as e:
            self.logger.error(f"Error processing trade for {self.symbol}: {e}")
            self.stats.errors_count += 1
            return []

    def _process_trade_for_timeframe(self, trade: StandardizedTrade, timeframe: str) -> Optional[OHLCVCandle]:
        """
        Process trade for a specific timeframe.

        CRITICAL LOGIC FOR PREVENTING FUTURE LEAKAGE:
        1. Calculate which bucket this trade belongs to
        2. If no current bucket exists, start one
        3. If the bucket's period differs (time boundary crossed), complete
           the old bucket (only when it holds trades) and start a new one
        4. Add the trade to the current bucket
        5. Return the completed candle, if any - never an in-progress one
        """
        try:
            # Calculate which bucket this trade belongs to
            trade_bucket_start = self._get_bucket_start_time(trade.timestamp, timeframe)

            current_bucket = self.current_buckets.get(timeframe)
            completed_candle = None

            if current_bucket is None:
                # First bucket for this timeframe
                current_bucket = TimeframeBucket(self.symbol, timeframe, trade_bucket_start, self.exchange)
                self.current_buckets[timeframe] = current_bucket
            elif current_bucket.start_time != trade_bucket_start:
                # Time boundary crossed - complete previous bucket
                if current_bucket.trade_count > 0:  # Only complete if it has trades
                    completed_candle = current_bucket.to_candle(is_complete=True)
                    self.stats.candles_emitted += 1
                    self.stats.last_candle_time = completed_candle.end_time

                # Create new bucket for current time period
                current_bucket = TimeframeBucket(self.symbol, timeframe, trade_bucket_start, self.exchange)
                self.current_buckets[timeframe] = current_bucket

            # Add trade to current bucket
            if not current_bucket.add_trade(trade):
                # Should be unreachable: the bucket was derived from this trade's timestamp
                self.logger.warning(f"Trade {trade.timestamp} could not be added to bucket {current_bucket.start_time}-{current_bucket.end_time}")

            return completed_candle

        except Exception as e:
            self.logger.error(f"Error processing trade for timeframe {timeframe}: {e}")
            self.stats.errors_count += 1
            return None

    def _get_bucket_start_time(self, timestamp: datetime, timeframe: str) -> datetime:
        """
        Calculate the bucket start (left boundary) for a timestamp/timeframe.

        EXAMPLES:
        - Trade at 09:03:45 for 5m timeframe -> bucket start = 09:00:00
        - Trade at 09:07:23 for 5m timeframe -> bucket start = 09:05:00
        - Trade at 14:00:00 for 1h timeframe -> bucket start = 14:00:00

        Args:
            timestamp: Trade timestamp
            timeframe: Target timeframe

        Returns:
            Bucket start time (left boundary)

        Raises:
            ValueError: for unsupported timeframes
        """
        # Truncate seconds/microseconds for clean minute-level boundaries
        dt = timestamp.replace(second=0, microsecond=0)

        if timeframe == '1m':
            # 1-minute buckets align to minute boundaries
            return dt
        elif timeframe == '5m':
            # 5-minute buckets: 00:00, 00:05, 00:10, etc.
            return dt.replace(minute=(dt.minute // 5) * 5)
        elif timeframe == '15m':
            # 15-minute buckets: 00:00, 00:15, 00:30, 00:45
            return dt.replace(minute=(dt.minute // 15) * 15)
        elif timeframe == '30m':
            # 30-minute buckets: 00:00, 00:30
            return dt.replace(minute=(dt.minute // 30) * 30)
        elif timeframe == '1h':
            # 1-hour buckets align to hour boundaries
            return dt.replace(minute=0)
        elif timeframe == '4h':
            # 4-hour buckets: 00:00, 04:00, 08:00, 12:00, 16:00, 20:00
            return dt.replace(minute=0, hour=(dt.hour // 4) * 4)
        elif timeframe == '1d':
            # 1-day buckets align to day boundaries (midnight)
            return dt.replace(minute=0, hour=0)
        else:
            raise ValueError(f"Unsupported timeframe: {timeframe}")

    def _emit_candle(self, candle: OHLCVCandle) -> None:
        """
        Emit completed candle to all callbacks.

        FIX: each callback is guarded individually so that one failing
        subscriber no longer prevents the remaining callbacks from receiving
        the candle (previously a single try wrapped the whole loop, aborting
        delivery on the first error).
        """
        for callback in self.candle_callbacks:
            try:
                callback(candle)
            except Exception as e:
                self.logger.error(f"Error in candle callback: {e}")
                self.stats.errors_count += 1

    def get_current_candles(self, incomplete: bool = True) -> List[OHLCVCandle]:
        """
        Get current incomplete candles for all timeframes.

        WARNING: These are incomplete candles and should NOT be used for
        trading decisions. They are useful for monitoring/debugging only.

        NOTE: `incomplete` is currently unused and kept only for backward
        compatibility; every returned candle is marked is_complete=False.
        """
        candles = []
        for bucket in self.current_buckets.values():
            if bucket.trade_count > 0:  # Only return buckets with trades
                candles.append(bucket.to_candle(is_complete=False))
        return candles

    def force_complete_all_candles(self) -> List[OHLCVCandle]:
        """
        Force completion of all current candles (useful for shutdown/batch processing).

        WARNING: This should only be used during shutdown or batch processing,
        not during live trading, as it forces incomplete candles to be marked
        complete.
        """
        completed_candles = []
        for bucket in self.current_buckets.values():
            if bucket.trade_count > 0:
                candle = bucket.to_candle(is_complete=True)
                completed_candles.append(candle)
                self._emit_candle(candle)

        # Clear buckets
        self.current_buckets.clear()
        return completed_candles

    def get_stats(self) -> Dict[str, Any]:
        """Get processing statistics, plus per-timeframe open-bucket trade counts."""
        stats_dict = self.stats.to_dict()
        stats_dict['current_buckets'] = {
            tf: bucket.trade_count for tf, bucket in self.current_buckets.items()
        }
        return stats_dict
|
||||
|
||||
|
||||
class BatchCandleProcessor:
    """
    Batch candle processor for historical data processing.

    Consumes large batches of historical trades and builds candles for
    multiple timeframes in a single pass.
    """

    def __init__(self,
                 symbol: str,
                 exchange: str,
                 timeframes: List[str],
                 component_name: str = "batch_candle_processor"):
        """
        Initialize batch candle processor.

        Args:
            symbol: Trading symbol
            exchange: Exchange name
            timeframes: List of timeframes to process
            component_name: Name for logging
        """
        self.symbol = symbol
        self.exchange = exchange
        self.timeframes = timeframes
        self.component_name = component_name
        self.logger = get_logger(self.component_name)

        # Statistics
        self.stats = ProcessingStats(active_timeframes=len(timeframes))

        self.logger.info(f"Initialized batch candle processor for {symbol} on {exchange}")

    def process_trades_to_candles(self, trades: Iterator[StandardizedTrade]) -> List[OHLCVCandle]:
        """
        Drain a trade iterator into completed candles.

        Handles every scenario the same way:
        - Historical: batch trade iterators
        - Backfill: API trade iterators
        - Real-time batch: multiple trades at once

        Args:
            trades: Iterator of standardized trades

        Returns:
            List of completed candles
        """
        try:
            # Delegate the actual bucketing to a throwaway real-time processor.
            batch_config = CandleProcessingConfig(timeframes=self.timeframes, auto_save_candles=False)
            worker = RealTimeCandleProcessor(
                self.symbol, self.exchange, batch_config,
                f"batch_processor_{self.symbol}_{self.exchange}"
            )

            all_candles: List[OHLCVCandle] = []
            for trade in trades:
                all_candles.extend(worker.process_trade(trade))
                self.stats.trades_processed += 1

            # Flush whatever buckets are still open at end-of-batch.
            all_candles.extend(worker.force_complete_all_candles())

            self.stats.candles_emitted = len(all_candles)
            if all_candles:
                self.stats.last_candle_time = max(candle.end_time for candle in all_candles)

            self.logger.info(f"Batch processed {self.stats.trades_processed} trades to {len(all_candles)} candles")
            return all_candles

        except Exception as e:
            self.logger.error(f"Error in batch processing trades to candles: {e}")
            self.stats.errors_count += 1
            return []

    def get_stats(self) -> Dict[str, Any]:
        """Get processing statistics."""
        return self.stats.to_dict()
|
||||
|
||||
|
||||
# Utility functions for common aggregation operations
|
||||
|
||||
def aggregate_trades_to_candles(trades: List[StandardizedTrade],
                                timeframes: List[str],
                                symbol: str,
                                exchange: str) -> List[OHLCVCandle]:
    """
    One-shot helper: aggregate a list of trades into completed candles.

    Args:
        trades: List of standardized trades
        timeframes: List of timeframes to generate
        symbol: Trading symbol
        exchange: Exchange name

    Returns:
        List of completed candles
    """
    # A throwaway batch processor does all the work.
    return BatchCandleProcessor(symbol, exchange, timeframes).process_trades_to_candles(iter(trades))
|
||||
|
||||
|
||||
def validate_timeframe(timeframe: str) -> bool:
    """
    Check whether a timeframe label is supported.

    Args:
        timeframe: Timeframe string (e.g., '1m', '5m', '1h')

    Returns:
        True if supported, False otherwise
    """
    return timeframe in ('1m', '5m', '15m', '30m', '1h', '4h', '1d')
|
||||
|
||||
|
||||
def parse_timeframe(timeframe: str) -> tuple[int, str]:
    """
    Split a timeframe label into its numeric count and unit letter.

    Args:
        timeframe: Timeframe string (e.g., '5m', '1h')

    Returns:
        Tuple of (number, unit)

    Raises:
        ValueError: when the label is not <digits><m|h|d>

    Examples:
        '5m' -> (5, 'm')
        '1h' -> (1, 'h')
        '1d' -> (1, 'd')
    """
    import re
    if (parsed := re.match(r'^(\d+)([mhd])$', timeframe.lower())) is None:
        raise ValueError(f"Invalid timeframe format: {timeframe}")
    return int(parsed.group(1)), parsed.group(2)
|
||||
|
||||
|
||||
__all__ = [
|
||||
'TimeframeBucket',
|
||||
'RealTimeCandleProcessor',
|
||||
'BatchCandleProcessor',
|
||||
'aggregate_trades_to_candles',
|
||||
'validate_timeframe',
|
||||
'parse_timeframe'
|
||||
]
|
||||
182
data/common/data_types.py
Normal file
182
data/common/data_types.py
Normal file
@@ -0,0 +1,182 @@
|
||||
"""
|
||||
Common data types for all exchange implementations.
|
||||
|
||||
These data structures provide a unified interface for market data
|
||||
regardless of the source exchange.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from decimal import Decimal
|
||||
from typing import Dict, List, Optional, Any
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
|
||||
from ..base_collector import DataType, MarketDataPoint # Import from base
|
||||
|
||||
|
||||
@dataclass
class DataValidationResult:
    """Result of data validation - common across all exchanges."""
    # True when the raw payload passed all validation checks.
    is_valid: bool
    # Hard failures that make the payload unusable.
    errors: List[str]
    # Non-fatal issues worth surfacing but not rejecting over.
    warnings: List[str]
    # Cleaned-up copy of the payload, when the validator produced one.
    sanitized_data: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
@dataclass
class StandardizedTrade:
    """
    Standardized trade format for unified processing across all exchanges.

    Works for both real-time and historical processing so every data source
    feeds the same downstream pipeline.
    """
    symbol: str
    trade_id: str
    price: Decimal
    size: Decimal
    side: str  # 'buy' or 'sell'
    timestamp: datetime
    exchange: str
    raw_data: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        """Normalize timestamp/side and reject malformed sides."""
        # Attach UTC to naive timestamps so aware/naive comparisons never mix.
        if self.timestamp.tzinfo is None:
            self.timestamp = self.timestamp.replace(tzinfo=timezone.utc)

        # Canonical lowercase side; anything but buy/sell is an error.
        self.side = self.side.lower()
        if self.side not in ('buy', 'sell'):
            raise ValueError(f"Invalid trade side: {self.side}")
|
||||
|
||||
|
||||
@dataclass
class OHLCVCandle:
    """
    OHLCV candle for one timeframe interval.

    Built by aggregating the trades that fell inside [start_time, end_time).
    """
    symbol: str
    timeframe: str
    start_time: datetime
    end_time: datetime
    open: Decimal
    high: Decimal
    low: Decimal
    close: Decimal
    volume: Decimal
    trade_count: int
    exchange: str = "unknown"
    is_complete: bool = False
    first_trade_time: Optional[datetime] = None
    last_trade_time: Optional[datetime] = None

    def __post_init__(self):
        """Normalize timestamps to UTC-aware and sanity-check OHLCV values."""
        # Naive timestamps are assumed UTC and made explicit.
        if self.start_time.tzinfo is None:
            self.start_time = self.start_time.replace(tzinfo=timezone.utc)
        if self.end_time.tzinfo is None:
            self.end_time = self.end_time.replace(tzinfo=timezone.utc)

        # OHLC sanity checks
        if self.high < self.low:
            raise ValueError("High price cannot be less than low price")
        if min(self.open, self.high, self.low, self.close) < 0:
            raise ValueError("Prices cannot be negative")
        if self.volume < 0:
            raise ValueError("Volume cannot be negative")
        if self.trade_count < 0:
            raise ValueError("Trade count cannot be negative")

    def to_dict(self) -> Dict[str, Any]:
        """Serialize for storage: Decimals become str, datetimes ISO strings."""
        def iso_or_none(dt: Optional[datetime]) -> Optional[str]:
            return dt.isoformat() if dt else None

        return {
            'symbol': self.symbol,
            'timeframe': self.timeframe,
            'start_time': self.start_time.isoformat(),
            'end_time': self.end_time.isoformat(),
            'open': str(self.open),
            'high': str(self.high),
            'low': str(self.low),
            'close': str(self.close),
            'volume': str(self.volume),
            'trade_count': self.trade_count,
            'exchange': self.exchange,
            'is_complete': self.is_complete,
            'first_trade_time': iso_or_none(self.first_trade_time),
            'last_trade_time': iso_or_none(self.last_trade_time),
        }
|
||||
|
||||
|
||||
@dataclass
class CandleProcessingConfig:
    """Configuration for candle processing - shared across exchanges."""
    timeframes: List[str] = field(default_factory=lambda: ['1m', '5m', '15m', '1h'])
    auto_save_candles: bool = True
    emit_incomplete_candles: bool = False
    max_trades_per_candle: int = 100000  # Safety limit

    def __post_init__(self):
        """Reject any configured timeframe outside the supported set."""
        allowed = {'1m', '5m', '15m', '30m', '1h', '4h', '1d'}
        bad = next((tf for tf in self.timeframes if tf not in allowed), None)
        if bad is not None:
            raise ValueError(f"Unsupported timeframe: {bad}")
|
||||
|
||||
|
||||
class TradeSide(Enum):
    """Standardized trade side enumeration."""
    # Values match the lowercase side strings StandardizedTrade accepts.
    BUY = "buy"
    SELL = "sell"
|
||||
|
||||
|
||||
class TimeframeUnit(Enum):
    """Time units for candle timeframes."""
    # Single-letter suffixes as used in timeframe labels like '5m', '1h', '1d'.
    MINUTE = "m"
    HOUR = "h"
    DAY = "d"
|
||||
|
||||
|
||||
@dataclass
class ProcessingStats:
    """Common processing statistics structure."""
    trades_processed: int = 0
    candles_emitted: int = 0
    errors_count: int = 0
    warnings_count: int = 0
    last_trade_time: Optional[datetime] = None
    last_candle_time: Optional[datetime] = None
    active_timeframes: int = 0

    def to_dict(self) -> Dict[str, Any]:
        """Render stats as a plain dict; datetimes become ISO strings or None."""
        def iso(dt: Optional[datetime]) -> Optional[str]:
            return dt.isoformat() if dt else None

        return {
            'trades_processed': self.trades_processed,
            'candles_emitted': self.candles_emitted,
            'errors_count': self.errors_count,
            'warnings_count': self.warnings_count,
            'last_trade_time': iso(self.last_trade_time),
            'last_candle_time': iso(self.last_candle_time),
            'active_timeframes': self.active_timeframes,
        }
|
||||
|
||||
|
||||
# Re-export from base_collector for convenience
|
||||
__all__ = [
|
||||
'DataType',
|
||||
'MarketDataPoint',
|
||||
'DataValidationResult',
|
||||
'StandardizedTrade',
|
||||
'OHLCVCandle',
|
||||
'CandleProcessingConfig',
|
||||
'TradeSide',
|
||||
'TimeframeUnit',
|
||||
'ProcessingStats'
|
||||
]
|
||||
471
data/common/transformation.py
Normal file
471
data/common/transformation.py
Normal file
@@ -0,0 +1,471 @@
|
||||
"""
|
||||
Base transformation utilities for all exchanges.
|
||||
|
||||
This module provides common transformation patterns and base classes
|
||||
for converting exchange-specific data to standardized formats.
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from decimal import Decimal
|
||||
from typing import Dict, List, Optional, Any, Iterator
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from .data_types import StandardizedTrade, OHLCVCandle, DataValidationResult
|
||||
from .aggregation import BatchCandleProcessor
|
||||
from utils.logger import get_logger
|
||||
|
||||
|
||||
class BaseDataTransformer(ABC):
|
||||
"""
|
||||
Abstract base class for exchange data transformers.
|
||||
|
||||
This class provides common transformation patterns that can be
|
||||
extended by exchange-specific implementations.
|
||||
"""
|
||||
|
||||
    def __init__(self,
                 exchange_name: str,
                 component_name: str = "base_data_transformer"):
        """
        Initialize base data transformer.

        Args:
            exchange_name: Name of the exchange (e.g., 'okx', 'binance')
            component_name: Name for logging
        """
        self.exchange_name = exchange_name
        self.component_name = component_name
        # Logger is keyed by the configurable component_name.
        self.logger = get_logger(self.component_name)

        self.logger.info(f"Initialized base data transformer for {exchange_name}")
|
||||
|
||||
# Abstract methods that must be implemented by subclasses
|
||||
|
||||
    @abstractmethod
    def transform_trade_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[StandardizedTrade]:
        """Transform exchange-specific trade data to standardized format.

        Per the Optional return annotation, implementations may return None
        for payloads they cannot transform.
        """
        pass
|
||||
|
||||
    @abstractmethod
    def transform_orderbook_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[Dict[str, Any]]:
        """Transform exchange-specific orderbook data to standardized format.

        Per the Optional return annotation, implementations may return None
        for payloads they cannot transform.
        """
        pass
|
||||
|
||||
    @abstractmethod
    def transform_ticker_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[Dict[str, Any]]:
        """Transform exchange-specific ticker data to standardized format.

        Per the Optional return annotation, implementations may return None
        for payloads they cannot transform.
        """
        pass
|
||||
|
||||
# Common transformation utilities available to all subclasses
|
||||
|
||||
def timestamp_to_datetime(self, timestamp: Any, is_milliseconds: bool = True) -> datetime:
|
||||
"""
|
||||
Convert various timestamp formats to timezone-aware datetime.
|
||||
|
||||
Args:
|
||||
timestamp: Timestamp in various formats
|
||||
is_milliseconds: True if timestamp is in milliseconds
|
||||
|
||||
Returns:
|
||||
Timezone-aware datetime object
|
||||
"""
|
||||
try:
|
||||
# Convert to int/float
|
||||
if isinstance(timestamp, str):
|
||||
timestamp_num = float(timestamp)
|
||||
elif isinstance(timestamp, (int, float)):
|
||||
timestamp_num = float(timestamp)
|
||||
else:
|
||||
raise ValueError(f"Invalid timestamp type: {type(timestamp)}")
|
||||
|
||||
# Convert to seconds if needed
|
||||
if is_milliseconds:
|
||||
timestamp_num = timestamp_num / 1000
|
||||
|
||||
# Create timezone-aware datetime
|
||||
dt = datetime.fromtimestamp(timestamp_num, tz=timezone.utc)
|
||||
return dt
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error converting timestamp {timestamp}: {e}")
|
||||
# Return current time as fallback
|
||||
return datetime.now(timezone.utc)
|
||||
|
||||
def safe_decimal_conversion(self, value: Any, field_name: str = "value") -> Optional[Decimal]:
    """
    Convert an arbitrary value to Decimal, returning None on failure.

    Args:
        value: Value to convert (None and "" are treated as "no value")
        field_name: Field name used in the warning log on failure

    Returns:
        Decimal value, or None if the input is empty or unconvertible
    """
    try:
        # The empty-value check stays inside the try so that exotic
        # __eq__ implementations raising on `value == ""` are also caught.
        return None if value is None or value == "" else Decimal(str(value))
    except Exception as e:
        self.logger.warning(f"Failed to convert {field_name} '{value}' to Decimal: {e}")
        return None
|
||||
|
||||
def normalize_trade_side(self, side: str) -> str:
    """
    Normalize a raw trade-side string to 'buy' or 'sell'.

    Recognizes common aliases ('bid'/'b'/'1' -> buy, 'ask'/'s'/'0' -> sell).
    Anything unrecognized is logged and treated as 'buy'.

    Args:
        side: Raw trade side string

    Returns:
        Normalized side ('buy' or 'sell')
    """
    key = side.strip().lower()
    if key in ('sell', 'ask', 's', '0'):
        return 'sell'
    if key in ('buy', 'bid', 'b', '1'):
        return 'buy'
    self.logger.warning(f"Unknown trade side: {side}, defaulting to 'buy'")
    return 'buy'
|
||||
|
||||
def validate_symbol_format(self, symbol: str) -> str:
    """
    Validate and normalize a symbol string (uppercase, trimmed).

    Args:
        symbol: Raw symbol string

    Returns:
        Normalized symbol string

    Raises:
        ValueError: If the symbol is missing, not a string, or
            whitespace-only after normalization
    """
    if not isinstance(symbol, str) or not symbol:
        raise ValueError(f"Invalid symbol: {symbol}")

    cleaned = symbol.strip().upper()
    if not cleaned:
        raise ValueError("Empty symbol after normalization")
    return cleaned
|
||||
|
||||
def transform_database_record(self, record: Any) -> Optional[StandardizedTrade]:
    """
    Transform database record to standardized format.

    Default implementation only logs a warning and returns None; subclasses
    that support historical reprocessing should override this to handle
    their specific database schema.

    Args:
        record: Database record

    Returns:
        StandardizedTrade or None if transformation failed (always None
        in this base implementation)
    """
    self.logger.warning("transform_database_record not implemented for this exchange")
    return None
|
||||
|
||||
def get_transformer_info(self) -> Dict[str, Any]:
    """Describe this transformer: exchange, component, and capabilities."""
    capabilities = {
        'trade_transformation': True,
        'orderbook_transformation': True,
        'ticker_transformation': True,
        'database_transformation': hasattr(self, 'transform_database_record')
    }
    return {
        'exchange': self.exchange_name,
        'component': self.component_name,
        'capabilities': capabilities
    }
|
||||
|
||||
|
||||
class UnifiedDataTransformer:
    """
    Unified data transformation system for all scenarios.

    Wraps an exchange-specific BaseDataTransformer and exposes a single
    interface for turning raw data from any source (real-time, historical,
    backfill) into standardized formats.

    Pipeline: raw input -> validation/transformation (exchange-specific)
    -> StandardizedTrade -> optional candle aggregation -> consistent output.
    """

    def __init__(self,
                 exchange_transformer: BaseDataTransformer,
                 component_name: str = "unified_data_transformer"):
        """
        Initialize unified data transformer.

        Args:
            exchange_transformer: Exchange-specific transformer instance
            component_name: Name for logging
        """
        self.exchange_transformer = exchange_transformer
        self.component_name = component_name
        self.logger = get_logger(self.component_name)
        self.logger.info(f"Initialized unified data transformer with {exchange_transformer.exchange_name} transformer")

    def transform_trade_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[StandardizedTrade]:
        """
        Transform trade data via the exchange-specific transformer.

        Args:
            raw_data: Raw trade data from exchange
            symbol: Trading symbol

        Returns:
            StandardizedTrade, or None if transformation failed
        """
        try:
            return self.exchange_transformer.transform_trade_data(raw_data, symbol)
        except Exception as e:
            self.logger.error(f"Error in trade transformation: {e}")
            return None

    def transform_orderbook_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[Dict[str, Any]]:
        """
        Transform orderbook data via the exchange-specific transformer.

        Args:
            raw_data: Raw orderbook data from exchange
            symbol: Trading symbol

        Returns:
            Standardized orderbook dict, or None if transformation failed
        """
        try:
            return self.exchange_transformer.transform_orderbook_data(raw_data, symbol)
        except Exception as e:
            self.logger.error(f"Error in orderbook transformation: {e}")
            return None

    def transform_ticker_data(self, raw_data: Dict[str, Any], symbol: str) -> Optional[Dict[str, Any]]:
        """
        Transform ticker data via the exchange-specific transformer.

        Args:
            raw_data: Raw ticker data from exchange
            symbol: Trading symbol

        Returns:
            Standardized ticker dict, or None if transformation failed
        """
        try:
            return self.exchange_transformer.transform_ticker_data(raw_data, symbol)
        except Exception as e:
            self.logger.error(f"Error in ticker transformation: {e}")
            return None

    def process_trades_to_candles(self,
                                  trades: Iterator[StandardizedTrade],
                                  timeframes: List[str],
                                  symbol: str) -> List[OHLCVCandle]:
        """
        Aggregate any iterator of standardized trades into candles.

        Handles every scenario through one batch processor: real-time
        single-trade iterators, historical batch iterators, and backfill
        API iterators.

        Args:
            trades: Iterator of standardized trades
            timeframes: List of timeframes to generate
            symbol: Trading symbol

        Returns:
            List of completed candles (empty list on error)
        """
        try:
            batch_processor = BatchCandleProcessor(
                symbol,
                self.exchange_transformer.exchange_name,
                timeframes,
                f"unified_batch_processor_{symbol}"
            )
            candles = batch_processor.process_trades_to_candles(trades)
            self.logger.info(f"Processed {batch_processor.get_stats()['trades_processed']} trades to {len(candles)} candles")
            return candles
        except Exception as e:
            self.logger.error(f"Error processing trades to candles: {e}")
            return []

    def batch_transform_trades(self,
                               raw_trades: List[Dict[str, Any]],
                               symbol: str) -> List[StandardizedTrade]:
        """
        Transform multiple raw trade records, skipping failures.

        Args:
            raw_trades: List of raw trade data
            symbol: Trading symbol

        Returns:
            List of successfully transformed trades
        """
        transformed_trades: List[StandardizedTrade] = []
        errors = 0

        for raw_trade in raw_trades:
            try:
                trade = self.transform_trade_data(raw_trade, symbol)
            except Exception as e:
                self.logger.error(f"Error transforming trade: {e}")
                errors += 1
                continue
            if trade:
                transformed_trades.append(trade)
            else:
                errors += 1

        self.logger.info(f"Batch transformed {len(transformed_trades)} trades successfully, {errors} errors")
        return transformed_trades

    def get_transformer_info(self) -> Dict[str, Any]:
        """Get comprehensive transformer information."""
        info = self.exchange_transformer.get_transformer_info()
        info.update({
            'unified_component': self.component_name,
            'batch_processing': True,
            'candle_aggregation': True
        })
        return info
|
||||
|
||||
|
||||
# Utility functions for common transformation patterns
|
||||
|
||||
def create_standardized_trade(symbol: str,
                              trade_id: str,
                              price: Any,
                              size: Any,
                              side: str,
                              timestamp: Any,
                              exchange: str,
                              raw_data: Optional[Dict[str, Any]] = None,
                              is_milliseconds: bool = True) -> StandardizedTrade:
    """
    Build a validated StandardizedTrade from loose inputs.

    Args:
        symbol: Trading symbol
        trade_id: Trade identifier
        price: Trade price (any numeric type)
        size: Trade size (any numeric type)
        side: Trade side ('buy' or 'sell', case-insensitive)
        timestamp: Trade timestamp (epoch number/string, or datetime)
        exchange: Exchange name
        raw_data: Original raw data
        is_milliseconds: True if a numeric timestamp is in milliseconds

    Returns:
        StandardizedTrade object

    Raises:
        ValueError: If any field is invalid
    """
    # Timestamp: accept an aware/naive datetime directly, or an epoch value.
    if isinstance(timestamp, datetime):
        dt = timestamp if timestamp.tzinfo is not None else timestamp.replace(tzinfo=timezone.utc)
    elif isinstance(timestamp, (int, float, str)):
        epoch_seconds = float(timestamp)
        if is_milliseconds:
            epoch_seconds /= 1000
        dt = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)
    else:
        raise ValueError(f"Invalid timestamp type: {type(timestamp)}")

    # Price and size must both be representable as Decimal.
    try:
        decimal_price = Decimal(str(price))
        decimal_size = Decimal(str(size))
    except Exception as e:
        raise ValueError(f"Invalid price or size: {e}")

    # Side must normalize to exactly 'buy' or 'sell'.
    normalized_side = side.lower().strip()
    if normalized_side not in ('buy', 'sell'):
        raise ValueError(f"Invalid trade side: {side}")

    return StandardizedTrade(
        symbol=symbol.upper().strip(),
        trade_id=str(trade_id),
        price=decimal_price,
        size=decimal_size,
        side=normalized_side,
        timestamp=dt,
        exchange=exchange.lower(),
        raw_data=raw_data
    )
|
||||
|
||||
|
||||
def batch_create_standardized_trades(raw_trades: List[Dict[str, Any]],
                                     symbol: str,
                                     exchange: str,
                                     field_mapping: Dict[str, str],
                                     is_milliseconds: bool = True) -> List[StandardizedTrade]:
    """
    Batch create standardized trades from raw data, skipping bad records.

    Args:
        raw_trades: List of raw trade dictionaries
        symbol: Trading symbol
        exchange: Exchange name
        field_mapping: Mapping of StandardizedTrade fields to raw data
            fields, e.g.::

                {
                    'trade_id': 'id',
                    'price': 'px',
                    'size': 'sz',
                    'side': 'side',
                    'timestamp': 'ts'
                }

        is_milliseconds: True if timestamps are in milliseconds

    Returns:
        List of successfully created StandardizedTrade objects
    """
    trades: List[StandardizedTrade] = []

    for raw_trade in raw_trades:
        try:
            trades.append(create_standardized_trade(
                symbol=symbol,
                trade_id=raw_trade[field_mapping['trade_id']],
                price=raw_trade[field_mapping['price']],
                size=raw_trade[field_mapping['size']],
                side=raw_trade[field_mapping['side']],
                timestamp=raw_trade[field_mapping['timestamp']],
                exchange=exchange,
                raw_data=raw_trade,
                is_milliseconds=is_milliseconds
            ))
        except Exception as e:
            # Log and skip so one bad record does not stop the batch
            get_logger("batch_transform").warning(f"Failed to transform trade: {e}")

    return trades
|
||||
|
||||
|
||||
# Public API of this module.
__all__ = [
    'BaseDataTransformer',
    'UnifiedDataTransformer',
    'create_standardized_trade',
    'batch_create_standardized_trades'
]
|
||||
484
data/common/validation.py
Normal file
484
data/common/validation.py
Normal file
@@ -0,0 +1,484 @@
|
||||
"""
|
||||
Base validation utilities for all exchanges.
|
||||
|
||||
This module provides common validation patterns and base classes
|
||||
that can be extended by exchange-specific validators.
|
||||
"""
|
||||
|
||||
import re
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from typing import Dict, List, Optional, Any, Union, Pattern
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from .data_types import DataValidationResult, StandardizedTrade, TradeSide
|
||||
from utils.logger import get_logger
|
||||
|
||||
|
||||
class ValidationResult:
    """Lightweight result of validating a single field.

    Attributes are plain and mutable:
    - is_valid: True when no errors were found
    - errors: messages that make the value unacceptable
    - warnings: non-fatal observations about the value
    - sanitized_data: cleaned/converted value, when validation produced one
    """

    def __init__(self, is_valid: bool, errors: List[str] = None, warnings: List[str] = None, sanitized_data: Any = None):
        self.is_valid = is_valid
        self.sanitized_data = sanitized_data
        # `or []` gives each instance its own fresh list when no (or an
        # empty) list is supplied — matches the original semantics exactly.
        self.errors = errors or []
        self.warnings = warnings or []
|
||||
|
||||
|
||||
class BaseDataValidator(ABC):
    """
    Abstract base class for exchange data validators.

    Provides common, exchange-agnostic validation for prices, sizes,
    volumes, trade sides, timestamps, trade IDs, and orderbook levels.
    Exchange-specific checks (symbol format, WebSocket message layout)
    are abstract and must be supplied by subclasses.
    """

    def __init__(self,
                 exchange_name: str,
                 component_name: str = "base_data_validator"):
        """
        Initialize base data validator.

        Args:
            exchange_name: Name of the exchange (e.g., 'okx', 'binance')
            component_name: Name for logging
        """
        self.exchange_name = exchange_name
        self.component_name = component_name
        self.logger = get_logger(self.component_name)

        # Common validation patterns
        self._numeric_pattern = re.compile(r'^-?\d*\.?\d+$')
        self._trade_id_pattern = re.compile(r'^[a-zA-Z0-9_-]+$')  # Flexible pattern

        # Valid trade sides
        self._valid_trade_sides = {'buy', 'sell'}

        # Common price and size limits (can be overridden by subclasses)
        self._min_price = Decimal('0.00000001')  # 1 satoshi equivalent
        self._max_price = Decimal('10000000')  # 10 million
        self._min_size = Decimal('0.00000001')  # Minimum trade size
        self._max_size = Decimal('1000000000')  # 1 billion max size

        # Timestamp validation bounds (milliseconds since epoch)
        self._min_timestamp = 1000000000000  # 2001-09-09 (reasonable minimum)
        self._max_timestamp = 9999999999999  # 2286-11-20 (reasonable maximum)

        self.logger.debug(f"Initialized base data validator for {exchange_name}")

    # Abstract methods that must be implemented by subclasses

    @abstractmethod
    def validate_symbol_format(self, symbol: str) -> ValidationResult:
        """Validate exchange-specific symbol format."""
        pass

    @abstractmethod
    def validate_websocket_message(self, message: Dict[str, Any]) -> DataValidationResult:
        """Validate complete WebSocket message structure."""
        pass

    # Common validation methods available to all subclasses

    def validate_price(self, price: Union[str, int, float, Decimal]) -> ValidationResult:
        """
        Validate price value with common rules.

        Non-positive or unparseable prices produce errors; out-of-bounds
        or over-precise prices produce warnings only.

        Args:
            price: Price value to validate

        Returns:
            ValidationResult with sanitized decimal price
        """
        errors = []
        warnings = []
        sanitized_data = None

        try:
            # Convert to Decimal for precise validation
            if isinstance(price, str) and price.strip() == "":
                errors.append("Empty price string")
                return ValidationResult(False, errors, warnings)

            decimal_price = Decimal(str(price))
            sanitized_data = decimal_price

            # Check for negative prices
            if decimal_price <= 0:
                errors.append(f"Price must be positive, got {decimal_price}")

            # Check price bounds (heuristic limits -> warnings only)
            if decimal_price < self._min_price:
                warnings.append(f"Price {decimal_price} below minimum {self._min_price}")
            elif decimal_price > self._max_price:
                warnings.append(f"Price {decimal_price} above maximum {self._max_price}")

            # Check for excessive decimal places (warn only)
            if decimal_price.as_tuple().exponent < -12:
                warnings.append(f"Price has excessive decimal precision: {decimal_price}")

        except (InvalidOperation, ValueError, TypeError) as e:
            errors.append(f"Invalid price value: {price} - {str(e)}")

        return ValidationResult(len(errors) == 0, errors, warnings, sanitized_data)

    def validate_size(self, size: Union[str, int, float, Decimal]) -> ValidationResult:
        """
        Validate size/quantity value with common rules.

        Args:
            size: Size value to validate

        Returns:
            ValidationResult with sanitized decimal size
        """
        errors = []
        warnings = []
        sanitized_data = None

        try:
            # Convert to Decimal for precise validation
            if isinstance(size, str) and size.strip() == "":
                errors.append("Empty size string")
                return ValidationResult(False, errors, warnings)

            decimal_size = Decimal(str(size))
            sanitized_data = decimal_size

            # Check for negative or zero sizes
            if decimal_size <= 0:
                errors.append(f"Size must be positive, got {decimal_size}")

            # Check size bounds (heuristic limits -> warnings only)
            if decimal_size < self._min_size:
                warnings.append(f"Size {decimal_size} below minimum {self._min_size}")
            elif decimal_size > self._max_size:
                warnings.append(f"Size {decimal_size} above maximum {self._max_size}")

        except (InvalidOperation, ValueError, TypeError) as e:
            errors.append(f"Invalid size value: {size} - {str(e)}")

        return ValidationResult(len(errors) == 0, errors, warnings, sanitized_data)

    def validate_volume(self, volume: Union[str, int, float, Decimal]) -> ValidationResult:
        """
        Validate volume value with common rules.

        Unlike price/size, zero volume is valid (no trades in period).

        Args:
            volume: Volume value to validate

        Returns:
            ValidationResult with sanitized decimal volume (consistent
            with validate_price/validate_size)
        """
        errors = []
        warnings = []
        sanitized_data = None

        try:
            decimal_volume = Decimal(str(volume))
            sanitized_data = decimal_volume

            # Volume can be zero (no trades in period)
            if decimal_volume < 0:
                errors.append(f"Volume cannot be negative, got {decimal_volume}")

        except (InvalidOperation, ValueError, TypeError) as e:
            errors.append(f"Invalid volume value: {volume} - {str(e)}")

        return ValidationResult(len(errors) == 0, errors, warnings, sanitized_data)

    def validate_trade_side(self, side: str) -> ValidationResult:
        """
        Validate trade side with common rules.

        Note: matching is case-insensitive but the value is NOT stripped,
        so ' buy ' is rejected.

        Args:
            side: Trade side string

        Returns:
            ValidationResult
        """
        errors = []
        warnings = []

        if not isinstance(side, str):
            errors.append(f"Trade side must be string, got {type(side)}")
            return ValidationResult(False, errors, warnings)

        normalized_side = side.lower()
        if normalized_side not in self._valid_trade_sides:
            errors.append(f"Invalid trade side: {side}. Must be 'buy' or 'sell'")

        return ValidationResult(len(errors) == 0, errors, warnings)

    def validate_timestamp(self, timestamp: Union[str, int], is_milliseconds: bool = True) -> ValidationResult:
        """
        Validate timestamp value with common rules.

        Hard bounds (outside [2001, 2286]) are errors; timestamps more
        than one year from "now" are warnings only.

        Args:
            timestamp: Timestamp value to validate
            is_milliseconds: True if timestamp is in milliseconds, False for seconds

        Returns:
            ValidationResult
        """
        errors = []
        warnings = []

        try:
            # Convert to int; only digit-strings are accepted
            if isinstance(timestamp, str):
                if not timestamp.isdigit():
                    errors.append(f"Invalid timestamp format: {timestamp}")
                    return ValidationResult(False, errors, warnings)
                timestamp_int = int(timestamp)
            elif isinstance(timestamp, int):
                timestamp_int = timestamp
            else:
                errors.append(f"Timestamp must be string or int, got {type(timestamp)}")
                return ValidationResult(False, errors, warnings)

            # Normalize to milliseconds before bounds checks
            if not is_milliseconds:
                timestamp_int = timestamp_int * 1000

            # Hard bounds -> errors
            if timestamp_int < self._min_timestamp:
                errors.append(f"Timestamp {timestamp_int} too old")
            elif timestamp_int > self._max_timestamp:
                errors.append(f"Timestamp {timestamp_int} too far in future")

            # Soft bounds (within +/- 1 year of now) -> warnings
            current_time_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
            one_year_ms = 365 * 24 * 60 * 60 * 1000

            if timestamp_int < (current_time_ms - one_year_ms):
                warnings.append(f"Timestamp {timestamp_int} is older than 1 year")
            elif timestamp_int > (current_time_ms + one_year_ms):
                warnings.append(f"Timestamp {timestamp_int} is more than 1 year in future")

        except (ValueError, TypeError) as e:
            errors.append(f"Invalid timestamp: {timestamp} - {str(e)}")

        return ValidationResult(len(errors) == 0, errors, warnings)

    def validate_trade_id(self, trade_id: Union[str, int]) -> ValidationResult:
        """
        Validate trade ID with flexible rules.

        Unusual (non-alphanumeric/underscore/hyphen) formats are warnings,
        not errors, because exchanges vary widely.

        Args:
            trade_id: Trade ID to validate

        Returns:
            ValidationResult
        """
        errors = []
        warnings = []

        if isinstance(trade_id, int):
            trade_id = str(trade_id)

        if not isinstance(trade_id, str):
            errors.append(f"Trade ID must be string or int, got {type(trade_id)}")
            return ValidationResult(False, errors, warnings)

        if not trade_id.strip():
            errors.append("Trade ID cannot be empty")
            return ValidationResult(False, errors, warnings)

        # Flexible validation - allow alphanumeric, underscore, hyphen
        if not self._trade_id_pattern.match(trade_id):
            warnings.append(f"Trade ID has unusual format: {trade_id}")

        return ValidationResult(len(errors) == 0, errors, warnings)

    def validate_symbol_match(self, symbol: str, expected_symbol: Optional[str] = None) -> ValidationResult:
        """
        Validate symbol matches expected value.

        A mismatch is only a warning, so callers can decide how strictly
        to treat cross-channel symbol disagreements.

        Args:
            symbol: Symbol to validate
            expected_symbol: Expected symbol value

        Returns:
            ValidationResult
        """
        errors = []
        warnings = []

        if not isinstance(symbol, str):
            errors.append(f"Symbol must be string, got {type(symbol)}")
            return ValidationResult(False, errors, warnings)

        if expected_symbol and symbol != expected_symbol:
            warnings.append(f"Symbol mismatch: expected {expected_symbol}, got {symbol}")

        return ValidationResult(len(errors) == 0, errors, warnings)

    def validate_orderbook_side(self, side_data: List[List[str]], side_name: str) -> ValidationResult:
        """
        Validate orderbook side (asks or bids) with common rules.

        Each level must be a list of at least [price, size]; extra fields
        are passed through unchanged in the sanitized output.

        Args:
            side_data: List of price/size pairs
            side_name: Name of side for error messages

        Returns:
            ValidationResult with sanitized data
        """
        errors = []
        warnings = []
        sanitized_data = []

        if not isinstance(side_data, list):
            errors.append(f"{side_name} must be a list")
            return ValidationResult(False, errors, warnings)

        for i, level in enumerate(side_data):
            if not isinstance(level, list) or len(level) < 2:
                errors.append(f"{side_name}[{i}] must be a list with at least 2 elements")
                continue

            # Validate price and size
            price_result = self.validate_price(level[0])
            size_result = self.validate_size(level[1])

            if not price_result.is_valid:
                errors.extend([f"{side_name}[{i}] price: {error}" for error in price_result.errors])
            if not size_result.is_valid:
                errors.extend([f"{side_name}[{i}] size: {error}" for error in size_result.errors])

            # Only valid levels make it into the sanitized output
            if price_result.is_valid and size_result.is_valid:
                sanitized_level = [str(price_result.sanitized_data), str(size_result.sanitized_data)]
                # Include additional fields if present
                if len(level) > 2:
                    sanitized_level.extend(level[2:])
                sanitized_data.append(sanitized_level)

        return ValidationResult(len(errors) == 0, errors, warnings, sanitized_data)

    def validate_standardized_trade(self, trade: StandardizedTrade) -> DataValidationResult:
        """
        Validate a standardized trade object field by field.

        Aggregates per-field errors/warnings with a "field: message"
        prefix into a single DataValidationResult.

        Args:
            trade: StandardizedTrade object to validate

        Returns:
            DataValidationResult
        """
        errors = []
        warnings = []

        try:
            # Validate price
            price_result = self.validate_price(trade.price)
            if not price_result.is_valid:
                errors.extend([f"price: {error}" for error in price_result.errors])
            warnings.extend([f"price: {warning}" for warning in price_result.warnings])

            # Validate size
            size_result = self.validate_size(trade.size)
            if not size_result.is_valid:
                errors.extend([f"size: {error}" for error in size_result.errors])
            warnings.extend([f"size: {warning}" for warning in size_result.warnings])

            # Validate side
            side_result = self.validate_trade_side(trade.side)
            if not side_result.is_valid:
                errors.extend([f"side: {error}" for error in side_result.errors])

            # Validate trade ID
            trade_id_result = self.validate_trade_id(trade.trade_id)
            if not trade_id_result.is_valid:
                errors.extend([f"trade_id: {error}" for error in trade_id_result.errors])
            warnings.extend([f"trade_id: {warning}" for warning in trade_id_result.warnings])

            # Validate symbol format (exchange-specific)
            symbol_result = self.validate_symbol_format(trade.symbol)
            if not symbol_result.is_valid:
                errors.extend([f"symbol: {error}" for error in symbol_result.errors])
            warnings.extend([f"symbol: {warning}" for warning in symbol_result.warnings])

            # Validate timestamp (converted back to epoch milliseconds)
            timestamp_ms = int(trade.timestamp.timestamp() * 1000)
            timestamp_result = self.validate_timestamp(timestamp_ms, is_milliseconds=True)
            if not timestamp_result.is_valid:
                errors.extend([f"timestamp: {error}" for error in timestamp_result.errors])
            warnings.extend([f"timestamp: {warning}" for warning in timestamp_result.warnings])

            return DataValidationResult(len(errors) == 0, errors, warnings)

        except Exception as e:
            errors.append(f"Exception during trade validation: {str(e)}")
            return DataValidationResult(False, errors, warnings)

    def get_validator_info(self) -> Dict[str, Any]:
        """Get validator configuration information."""
        return {
            'exchange': self.exchange_name,
            'component': self.component_name,
            'limits': {
                'min_price': str(self._min_price),
                'max_price': str(self._max_price),
                'min_size': str(self._min_size),
                'max_size': str(self._max_size),
                'min_timestamp': self._min_timestamp,
                'max_timestamp': self._max_timestamp
            },
            'patterns': {
                'numeric': self._numeric_pattern.pattern,
                'trade_id': self._trade_id_pattern.pattern
            }
        }
|
||||
|
||||
|
||||
# Utility functions for common validation patterns
|
||||
|
||||
def is_valid_decimal(value: Any) -> bool:
    """Return True if str(value) parses as a Decimal, else False.

    Note: this accepts anything Decimal accepts, including 'NaN' and
    'Infinity' strings.
    """
    try:
        Decimal(str(value))
    except (InvalidOperation, ValueError, TypeError):
        return False
    return True
|
||||
|
||||
|
||||
def normalize_symbol(symbol: str, exchange: str) -> str:
    """
    Normalize symbol format for an exchange.

    Currently applies only basic normalization (trim + uppercase); the
    `exchange` parameter is reserved for per-exchange rules later.

    Args:
        symbol: Raw symbol string
        exchange: Exchange name (unused for now)

    Returns:
        Normalized symbol string
    """
    return symbol.strip().upper()
|
||||
|
||||
|
||||
def validate_required_fields(data: Dict[str, Any], required_fields: List[str]) -> List[str]:
    """
    Validate that all required fields are present (and not None) in data.

    Args:
        data: Data dictionary to check
        required_fields: List of required field names

    Returns:
        List of missing field names, in the order given
    """
    # dict.get returns None both for absent keys and for explicit None
    # values — both count as "missing" here.
    return [field for field in required_fields if data.get(field) is None]
|
||||
|
||||
|
||||
# Public API of this module.
__all__ = [
    'ValidationResult',
    'BaseDataValidator',
    'is_valid_decimal',
    'normalize_symbol',
    'validate_required_fields'
]
|
||||
Reference in New Issue
Block a user