- Introduced a modular architecture for data processing, including common utilities for validation, transformation, and aggregation.
- Implemented `StandardizedTrade`, `OHLCVCandle`, and `TimeframeBucket` classes for unified data handling across exchanges.
- Developed `OKXDataProcessor` for OKX-specific data validation and processing, leveraging the new common framework.
- Enhanced `OKXCollector` to utilize the common data processing utilities, improving modularity and maintainability.
- Updated documentation to reflect the new architecture and provide guidance on the data processing framework.
- Created comprehensive tests for the new data processing components to ensure reliability and functionality.
484 lines
18 KiB
Python
484 lines
18 KiB
Python
"""
|
|
Base validation utilities for all exchanges.
|
|
|
|
This module provides common validation patterns and base classes
|
|
that can be extended by exchange-specific validators.
|
|
"""
|
|
|
|
import re
|
|
from datetime import datetime, timezone, timedelta
|
|
from decimal import Decimal, InvalidOperation
|
|
from typing import Dict, List, Optional, Any, Union, Pattern
|
|
from abc import ABC, abstractmethod
|
|
|
|
from .data_types import DataValidationResult, StandardizedTrade, TradeSide
|
|
from utils.logger import get_logger
|
|
|
|
|
|
class ValidationResult:
    """Simple validation result for individual field validation."""

    def __init__(
        self,
        is_valid: bool,
        errors: Optional[List[str]] = None,
        warnings: Optional[List[str]] = None,
        sanitized_data: Any = None,
    ):
        """
        Store the outcome of a single validation.

        Args:
            is_valid: Whether the validated value passed.
            errors: Error messages; a falsy value (None or an empty list)
                is replaced by a fresh empty list.
            warnings: Warning messages; handled the same way as ``errors``.
            sanitized_data: Optional cleaned-up form of the validated value.
        """
        self.is_valid = is_valid
        # ``or []`` gives every instance its own list when nothing was passed,
        # so instances never share a mutable default.
        self.errors = errors or []
        self.warnings = warnings or []
        self.sanitized_data = sanitized_data

    def __repr__(self) -> str:
        """Return a readable representation showing all four fields."""
        return (
            f"{type(self).__name__}(is_valid={self.is_valid!r}, "
            f"errors={self.errors!r}, warnings={self.warnings!r}, "
            f"sanitized_data={self.sanitized_data!r})"
        )
|
|
|
|
|
|
class BaseDataValidator(ABC):
    """
    Abstract base class for exchange data validators.

    This class provides common validation patterns and utilities
    that can be reused across different exchange implementations.
    Subclasses must implement ``validate_symbol_format`` and
    ``validate_websocket_message``.
    """

    def __init__(self,
                 exchange_name: str,
                 component_name: str = "base_data_validator"):
        """
        Initialize base data validator.

        Args:
            exchange_name: Name of the exchange (e.g., 'okx', 'binance')
            component_name: Name for logging
        """
        self.exchange_name = exchange_name
        self.component_name = component_name
        self.logger = get_logger(self.component_name)

        # Common validation patterns
        self._numeric_pattern = re.compile(r'^-?\d*\.?\d+$')
        self._trade_id_pattern = re.compile(r'^[a-zA-Z0-9_-]+$')  # Flexible pattern

        # Valid trade sides
        self._valid_trade_sides = {'buy', 'sell'}

        # Common price and size limits (can be overridden by subclasses)
        self._min_price = Decimal('0.00000001')  # 1 satoshi equivalent
        self._max_price = Decimal('10000000')  # 10 million
        self._min_size = Decimal('0.00000001')  # Minimum trade size
        self._max_size = Decimal('1000000000')  # 1 billion max size

        # Timestamp validation (milliseconds since epoch)
        self._min_timestamp = 1000000000000  # 2001-09-09 (reasonable minimum)
        self._max_timestamp = 9999999999999  # 2286-11-20 (reasonable maximum)

        self.logger.debug(f"Initialized base data validator for {exchange_name}")

    # Abstract methods that must be implemented by subclasses

    @abstractmethod
    def validate_symbol_format(self, symbol: str) -> ValidationResult:
        """Validate exchange-specific symbol format."""
        pass

    @abstractmethod
    def validate_websocket_message(self, message: Dict[str, Any]) -> DataValidationResult:
        """Validate complete WebSocket message structure."""
        pass

    # Common validation methods available to all subclasses

    def validate_price(self, price: Union[str, int, float, Decimal]) -> ValidationResult:
        """
        Validate price value with common rules.

        A price is invalid when it is an empty string, cannot be parsed as a
        Decimal, is not finite (NaN / Infinity), or is not strictly positive.
        Falling outside the configured min/max bounds, or carrying more than
        12 decimal places, only produces warnings.

        Args:
            price: Price value to validate

        Returns:
            ValidationResult with sanitized decimal price
        """
        errors: List[str] = []
        warnings: List[str] = []
        sanitized_data: Optional[Decimal] = None

        try:
            # Convert to Decimal for precise validation
            if isinstance(price, str) and price.strip() == "":
                errors.append("Empty price string")
                return ValidationResult(False, errors, warnings)

            decimal_price = Decimal(str(price))

            # Decimal happily parses "NaN" and "Infinity". Reject them
            # explicitly: ordering comparisons on NaN raise, and the exponent
            # of a non-finite value is not an integer.
            if not decimal_price.is_finite():
                errors.append(f"Price must be a finite number, got {decimal_price}")
                return ValidationResult(False, errors, warnings)

            sanitized_data = decimal_price

            # Check for negative prices
            if decimal_price <= 0:
                errors.append(f"Price must be positive, got {decimal_price}")

            # Check price bounds
            if decimal_price < self._min_price:
                warnings.append(f"Price {decimal_price} below minimum {self._min_price}")
            elif decimal_price > self._max_price:
                warnings.append(f"Price {decimal_price} above maximum {self._max_price}")

            # Check for excessive decimal places (warn only)
            if decimal_price.as_tuple().exponent < -12:
                warnings.append(f"Price has excessive decimal precision: {decimal_price}")

        except (InvalidOperation, ValueError, TypeError) as e:
            errors.append(f"Invalid price value: {price} - {str(e)}")

        return ValidationResult(len(errors) == 0, errors, warnings, sanitized_data)

    def validate_size(self, size: Union[str, int, float, Decimal]) -> ValidationResult:
        """
        Validate size/quantity value with common rules.

        A size is invalid when it is an empty string, cannot be parsed as a
        Decimal, is not finite (NaN / Infinity), or is not strictly positive.
        Falling outside the configured min/max bounds only produces warnings.

        Args:
            size: Size value to validate

        Returns:
            ValidationResult with sanitized decimal size
        """
        errors: List[str] = []
        warnings: List[str] = []
        sanitized_data: Optional[Decimal] = None

        try:
            # Convert to Decimal for precise validation
            if isinstance(size, str) and size.strip() == "":
                errors.append("Empty size string")
                return ValidationResult(False, errors, warnings)

            decimal_size = Decimal(str(size))

            # Without this check "Infinity" would pass the positivity test
            # and be reported as valid with only an above-maximum warning.
            if not decimal_size.is_finite():
                errors.append(f"Size must be a finite number, got {decimal_size}")
                return ValidationResult(False, errors, warnings)

            sanitized_data = decimal_size

            # Check for negative or zero sizes
            if decimal_size <= 0:
                errors.append(f"Size must be positive, got {decimal_size}")

            # Check size bounds
            if decimal_size < self._min_size:
                warnings.append(f"Size {decimal_size} below minimum {self._min_size}")
            elif decimal_size > self._max_size:
                warnings.append(f"Size {decimal_size} above maximum {self._max_size}")

        except (InvalidOperation, ValueError, TypeError) as e:
            errors.append(f"Invalid size value: {size} - {str(e)}")

        return ValidationResult(len(errors) == 0, errors, warnings, sanitized_data)

    def validate_volume(self, volume: Union[str, int, float, Decimal]) -> ValidationResult:
        """
        Validate volume value with common rules.

        Unlike price and size, a volume of zero is accepted. Negative,
        unparseable and non-finite (NaN / Infinity) volumes are errors.

        Args:
            volume: Volume value to validate

        Returns:
            ValidationResult with sanitized decimal volume
        """
        errors: List[str] = []
        warnings: List[str] = []
        sanitized_data: Optional[Decimal] = None

        try:
            decimal_volume = Decimal(str(volume))

            # "Infinity" is not negative, so it must be rejected explicitly.
            if not decimal_volume.is_finite():
                errors.append(f"Volume must be a finite number, got {decimal_volume}")
                return ValidationResult(False, errors, warnings)

            sanitized_data = decimal_volume

            # Volume can be zero (no trades in period)
            if decimal_volume < 0:
                errors.append(f"Volume cannot be negative, got {decimal_volume}")

        except (InvalidOperation, ValueError, TypeError) as e:
            errors.append(f"Invalid volume value: {volume} - {str(e)}")

        return ValidationResult(len(errors) == 0, errors, warnings, sanitized_data)

    def validate_trade_side(self, side: str) -> ValidationResult:
        """
        Validate trade side with common rules.

        The comparison is case-insensitive; surrounding whitespace is not
        stripped.

        Args:
            side: Trade side string

        Returns:
            ValidationResult
        """
        errors: List[str] = []
        warnings: List[str] = []

        if not isinstance(side, str):
            errors.append(f"Trade side must be string, got {type(side)}")
            return ValidationResult(False, errors, warnings)

        normalized_side = side.lower()
        if normalized_side not in self._valid_trade_sides:
            errors.append(f"Invalid trade side: {side}. Must be 'buy' or 'sell'")

        return ValidationResult(len(errors) == 0, errors, warnings)

    def validate_timestamp(self, timestamp: Union[str, int], is_milliseconds: bool = True) -> ValidationResult:
        """
        Validate timestamp value with common rules.

        Timestamps outside the configured absolute bounds are errors;
        timestamps more than one year away from the current UTC time only
        produce warnings.

        Args:
            timestamp: Timestamp value to validate
            is_milliseconds: True if timestamp is in milliseconds, False for seconds

        Returns:
            ValidationResult
        """
        errors: List[str] = []
        warnings: List[str] = []

        try:
            # Convert to int
            if isinstance(timestamp, str):
                if not timestamp.isdigit():
                    errors.append(f"Invalid timestamp format: {timestamp}")
                    return ValidationResult(False, errors, warnings)
                timestamp_int = int(timestamp)
            elif isinstance(timestamp, int):
                timestamp_int = timestamp
            else:
                errors.append(f"Timestamp must be string or int, got {type(timestamp)}")
                return ValidationResult(False, errors, warnings)

            # Convert to milliseconds if needed
            if not is_milliseconds:
                timestamp_int = timestamp_int * 1000

            # Check timestamp bounds
            if timestamp_int < self._min_timestamp:
                errors.append(f"Timestamp {timestamp_int} too old")
            elif timestamp_int > self._max_timestamp:
                errors.append(f"Timestamp {timestamp_int} too far in future")

            # Check if timestamp is reasonable (within last year to next year)
            current_time_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
            one_year_ms = 365 * 24 * 60 * 60 * 1000

            if timestamp_int < (current_time_ms - one_year_ms):
                warnings.append(f"Timestamp {timestamp_int} is older than 1 year")
            elif timestamp_int > (current_time_ms + one_year_ms):
                warnings.append(f"Timestamp {timestamp_int} is more than 1 year in future")

        except (ValueError, TypeError) as e:
            errors.append(f"Invalid timestamp: {timestamp} - {str(e)}")

        return ValidationResult(len(errors) == 0, errors, warnings)

    def validate_trade_id(self, trade_id: Union[str, int]) -> ValidationResult:
        """
        Validate trade ID with flexible rules.

        Integer IDs are converted to strings first. Empty or whitespace-only
        IDs are errors; IDs containing characters other than letters,
        digits, underscore or hyphen only produce a warning.

        Args:
            trade_id: Trade ID to validate

        Returns:
            ValidationResult
        """
        errors: List[str] = []
        warnings: List[str] = []

        if isinstance(trade_id, int):
            trade_id = str(trade_id)

        if not isinstance(trade_id, str):
            errors.append(f"Trade ID must be string or int, got {type(trade_id)}")
            return ValidationResult(False, errors, warnings)

        if not trade_id.strip():
            errors.append("Trade ID cannot be empty")
            return ValidationResult(False, errors, warnings)

        # Flexible validation - allow alphanumeric, underscore, hyphen
        if not self._trade_id_pattern.match(trade_id):
            warnings.append(f"Trade ID has unusual format: {trade_id}")

        return ValidationResult(len(errors) == 0, errors, warnings)

    def validate_symbol_match(self, symbol: str, expected_symbol: Optional[str] = None) -> ValidationResult:
        """
        Validate symbol matches expected value.

        A mismatch is reported as a warning, not an error. No comparison is
        made when ``expected_symbol`` is None or empty.

        Args:
            symbol: Symbol to validate
            expected_symbol: Expected symbol value

        Returns:
            ValidationResult
        """
        errors: List[str] = []
        warnings: List[str] = []

        if not isinstance(symbol, str):
            errors.append(f"Symbol must be string, got {type(symbol)}")
            return ValidationResult(False, errors, warnings)

        if expected_symbol and symbol != expected_symbol:
            warnings.append(f"Symbol mismatch: expected {expected_symbol}, got {symbol}")

        return ValidationResult(len(errors) == 0, errors, warnings)

    def validate_orderbook_side(self, side_data: List[List[str]], side_name: str) -> ValidationResult:
        """
        Validate orderbook side (asks or bids) with common rules.

        Each level must be a list whose first two elements are a valid price
        and a valid size. Levels that pass are collected, with price and size
        converted to strings and any extra elements appended unchanged.
        Levels that fail contribute errors and are left out of the sanitized
        data.

        Args:
            side_data: List of price/size pairs
            side_name: Name of side for error messages

        Returns:
            ValidationResult with sanitized data
        """
        errors: List[str] = []
        warnings: List[str] = []
        sanitized_data: List[List[Any]] = []

        if not isinstance(side_data, list):
            errors.append(f"{side_name} must be a list")
            return ValidationResult(False, errors, warnings)

        for i, level in enumerate(side_data):
            if not isinstance(level, list) or len(level) < 2:
                errors.append(f"{side_name}[{i}] must be a list with at least 2 elements")
                continue

            # Validate price and size
            price_result = self.validate_price(level[0])
            size_result = self.validate_size(level[1])

            if not price_result.is_valid:
                errors.extend([f"{side_name}[{i}] price: {error}" for error in price_result.errors])
            if not size_result.is_valid:
                errors.extend([f"{side_name}[{i}] size: {error}" for error in size_result.errors])

            # Add sanitized level
            if price_result.is_valid and size_result.is_valid:
                sanitized_level = [str(price_result.sanitized_data), str(size_result.sanitized_data)]
                # Include additional fields if present
                if len(level) > 2:
                    sanitized_level.extend(level[2:])
                sanitized_data.append(sanitized_level)

        return ValidationResult(len(errors) == 0, errors, warnings, sanitized_data)

    def validate_standardized_trade(self, trade: StandardizedTrade) -> DataValidationResult:
        """
        Validate a standardized trade object.

        Runs the price, size, side, trade ID, symbol-format and timestamp
        validators and merges their messages, each prefixed with the field
        name. Any exception raised along the way is captured as an error in
        the returned result rather than propagated.

        Args:
            trade: StandardizedTrade object to validate

        Returns:
            DataValidationResult
        """
        errors: List[str] = []
        warnings: List[str] = []

        try:
            # Validate price
            price_result = self.validate_price(trade.price)
            if not price_result.is_valid:
                errors.extend([f"price: {error}" for error in price_result.errors])
            warnings.extend([f"price: {warning}" for warning in price_result.warnings])

            # Validate size
            size_result = self.validate_size(trade.size)
            if not size_result.is_valid:
                errors.extend([f"size: {error}" for error in size_result.errors])
            warnings.extend([f"size: {warning}" for warning in size_result.warnings])

            # Validate side
            side_result = self.validate_trade_side(trade.side)
            if not side_result.is_valid:
                errors.extend([f"side: {error}" for error in side_result.errors])

            # Validate trade ID
            trade_id_result = self.validate_trade_id(trade.trade_id)
            if not trade_id_result.is_valid:
                errors.extend([f"trade_id: {error}" for error in trade_id_result.errors])
            warnings.extend([f"trade_id: {warning}" for warning in trade_id_result.warnings])

            # Validate symbol format (exchange-specific)
            symbol_result = self.validate_symbol_format(trade.symbol)
            if not symbol_result.is_valid:
                errors.extend([f"symbol: {error}" for error in symbol_result.errors])
            warnings.extend([f"symbol: {warning}" for warning in symbol_result.warnings])

            # Validate timestamp (converted to integer milliseconds since epoch)
            timestamp_ms = int(trade.timestamp.timestamp() * 1000)
            timestamp_result = self.validate_timestamp(timestamp_ms, is_milliseconds=True)
            if not timestamp_result.is_valid:
                errors.extend([f"timestamp: {error}" for error in timestamp_result.errors])
            warnings.extend([f"timestamp: {warning}" for warning in timestamp_result.warnings])

            return DataValidationResult(len(errors) == 0, errors, warnings)

        except Exception as e:
            # Deliberately broad: a malformed trade object must yield a
            # failed result, not crash the caller.
            errors.append(f"Exception during trade validation: {str(e)}")
            return DataValidationResult(False, errors, warnings)

    def get_validator_info(self) -> Dict[str, Any]:
        """Get validator configuration information."""
        return {
            'exchange': self.exchange_name,
            'component': self.component_name,
            'limits': {
                'min_price': str(self._min_price),
                'max_price': str(self._max_price),
                'min_size': str(self._min_size),
                'max_size': str(self._max_size),
                'min_timestamp': self._min_timestamp,
                'max_timestamp': self._max_timestamp
            },
            'patterns': {
                'numeric': self._numeric_pattern.pattern,
                'trade_id': self._trade_id_pattern.pattern
            }
        }
|
|
|
|
|
|
# Utility functions for common validation patterns
|
|
|
|
def is_valid_decimal(value: Any) -> bool:
    """Report whether ``Decimal(str(value))`` can be constructed."""
    try:
        Decimal(str(value))
    except (InvalidOperation, ValueError, TypeError):
        convertible = False
    else:
        convertible = True
    return convertible
|
|
|
|
|
|
def normalize_symbol(symbol: str, exchange: str) -> str:
    """
    Return the symbol upper-cased with surrounding whitespace removed.

    Args:
        symbol: Raw symbol string
        exchange: Exchange name (accepted but not consulted by this
            basic implementation)

    Returns:
        Normalized symbol string
    """
    # Same transformation for every exchange; per-exchange rules can hook in here.
    uppercased = symbol.upper()
    return uppercased.strip()
|
|
|
|
|
|
def validate_required_fields(data: Dict[str, Any], required_fields: List[str]) -> List[str]:
    """
    Collect the required fields that are absent from ``data`` or set to None.

    Args:
        data: Data dictionary to check
        required_fields: List of required field names

    Returns:
        List of missing field names, in the order given by ``required_fields``
    """
    return [
        name
        for name in required_fields
        if not (name in data and data[name] is not None)
    ]
|
|
|
|
|
|
# Names exported by a star-import of this module.
__all__ = [
    "ValidationResult",
    "BaseDataValidator",
    "is_valid_decimal",
    "normalize_symbol",
    "validate_required_fields",
]