#!/usr/bin/env python3 """ Comprehensive Unit Tests for Data Collection and Aggregation Logic This module provides comprehensive unit tests for the data collection and aggregation functionality, covering: - OKX data collection and processing - Real-time candle aggregation - Data validation and transformation - Error handling and edge cases - Performance and reliability testing This completes task 2.9 of phase 2. """ import pytest import asyncio import json from datetime import datetime, timezone, timedelta from decimal import Decimal from typing import Dict, List, Any, Optional from unittest.mock import Mock, AsyncMock, patch from collections import defaultdict # Import modules under test from data.base_collector import BaseDataCollector, DataType, MarketDataPoint, CollectorStatus from data.collector_manager import CollectorManager, CollectorConfig from data.collection_service import DataCollectionService from data.exchanges.okx.collector import OKXCollector from data.exchanges.okx.data_processor import OKXDataProcessor, OKXDataValidator, OKXDataTransformer from data.exchanges.okx.websocket import OKXWebSocketClient, OKXSubscription, OKXChannelType from data.common.data_types import ( StandardizedTrade, OHLCVCandle, CandleProcessingConfig, DataValidationResult ) from data.common.aggregation.realtime import RealTimeCandleProcessor from data.common.validation import BaseDataValidator, ValidationResult from data.common.transformation import BaseDataTransformer from utils.logger import get_logger @pytest.fixture def logger(): """Create test logger.""" return get_logger("test_data_collection", log_level="DEBUG") @pytest.fixture def sample_trade_data(): """Sample OKX trade data for testing.""" return { "instId": "BTC-USDT", "tradeId": "123456789", "px": "50000.50", "sz": "0.1", "side": "buy", "ts": "1640995200000" # 2022-01-01 00:00:00 UTC } @pytest.fixture def sample_orderbook_data(): """Sample OKX orderbook data for testing.""" return { "instId": "BTC-USDT", "asks": [["50001.00", "0.5", "0", "2"]], "bids": [["49999.00", "0.3", "0", "1"]], "ts": "1640995200000", "seqId": "12345" } @pytest.fixture def sample_ticker_data(): """Sample OKX ticker data for testing.""" return { "instId": "BTC-USDT", "last": "50000.50", "lastSz": "0.1", "askPx": "50001.00", "askSz": "0.5", "bidPx": "49999.00", "bidSz": "0.3", "open24h": "49500.00", "high24h": "50500.00", "low24h": "49000.00", "vol24h": "1000.5", "volCcy24h": "50000000.00", "ts": "1640995200000" } @pytest.fixture def candle_config(): """Sample candle processing configuration.""" return CandleProcessingConfig( timeframes=['1s', '5s', '1m', '5m'], auto_save_candles=False, emit_incomplete_candles=False ) class TestDataCollectionAndAggregation: """Comprehensive test suite for data collection and aggregation logic.""" def test_basic_imports(self): """Test that all required modules can be imported.""" # This test ensures all imports are working assert StandardizedTrade is not None assert OHLCVCandle is not None assert CandleProcessingConfig is not None assert DataValidationResult is not None assert RealTimeCandleProcessor is not None assert BaseDataValidator is not None assert ValidationResult is not None class TestOKXDataValidation: """Test OKX-specific data validation.""" @pytest.fixture def validator(self, logger): """Create OKX data validator.""" return OKXDataValidator("test_validator", logger) def test_symbol_format_validation(self, validator): """Test OKX symbol format validation.""" # Valid symbols valid_symbols = ["BTC-USDT", "ETH-USDC", "SOL-USD", "DOGE-USDT"] for symbol in valid_symbols: result = validator.validate_symbol_format(symbol) assert result.is_valid, f"Symbol {symbol} should be valid" assert len(result.errors) == 0 # Invalid symbols invalid_symbols = ["BTCUSDT", "BTC/USDT", "btc-usdt", "BTC-", "-USDT", ""] for symbol in invalid_symbols: result = validator.validate_symbol_format(symbol) assert not result.is_valid, f"Symbol {symbol} should be invalid" assert len(result.errors) > 0 def test_trade_data_validation(self, validator, sample_trade_data): """Test trade data validation.""" # Valid trade data result = validator.validate_trade_data(sample_trade_data) assert result.is_valid assert len(result.errors) == 0 assert result.sanitized_data is not None # Missing required field incomplete_data = sample_trade_data.copy() del incomplete_data['px'] result = validator.validate_trade_data(incomplete_data) assert not result.is_valid assert any("Missing required trade field: px" in error for error in result.errors) # Invalid price invalid_price_data = sample_trade_data.copy() invalid_price_data['px'] = "invalid_price" result = validator.validate_trade_data(invalid_price_data) assert not result.is_valid assert any("price" in error.lower() for error in result.errors) def test_orderbook_data_validation(self, validator, sample_orderbook_data): """Test orderbook data validation.""" # Valid orderbook data result = validator.validate_orderbook_data(sample_orderbook_data) assert result.is_valid assert len(result.errors) == 0 # Missing asks/bids incomplete_data = sample_orderbook_data.copy() del incomplete_data['asks'] result = validator.validate_orderbook_data(incomplete_data) assert not result.is_valid assert any("asks" in error.lower() for error in result.errors) def test_ticker_data_validation(self, validator, sample_ticker_data): """Test ticker data validation.""" # Valid ticker data result = validator.validate_ticker_data(sample_ticker_data) assert result.is_valid assert len(result.errors) == 0 # Missing required field incomplete_data = sample_ticker_data.copy() del incomplete_data['last'] result = validator.validate_ticker_data(incomplete_data) assert not result.is_valid assert any("last" in error.lower() for error in result.errors) class TestOKXDataTransformation: """Test OKX-specific data transformation.""" @pytest.fixture def transformer(self, logger): """Create OKX data transformer.""" return OKXDataTransformer("test_transformer", logger) def test_trade_data_transformation(self, transformer, sample_trade_data): """Test trade data transformation to StandardizedTrade.""" result = transformer.transform_trade_data(sample_trade_data, "BTC-USDT") assert result is not None assert isinstance(result, StandardizedTrade) assert result.symbol == "BTC-USDT" assert result.trade_id == "123456789" assert result.price == Decimal("50000.50") assert result.size == Decimal("0.1") assert result.side == "buy" assert result.exchange == "okx" assert result.timestamp.year == 2022 def test_orderbook_data_transformation(self, transformer, sample_orderbook_data): """Test orderbook data transformation.""" result = transformer.transform_orderbook_data(sample_orderbook_data, "BTC-USDT") assert result is not None assert result['symbol'] == "BTC-USDT" assert result['exchange'] == "okx" assert 'asks' in result assert 'bids' in result assert len(result['asks']) > 0 assert len(result['bids']) > 0 def test_ticker_data_transformation(self, transformer, sample_ticker_data): """Test ticker data transformation.""" result = transformer.transform_ticker_data(sample_ticker_data, "BTC-USDT") assert result is not None assert result['symbol'] == "BTC-USDT" assert result['exchange'] == "okx" assert result['last'] == Decimal("50000.50") assert result['bid'] == Decimal("49999.00") assert result['ask'] == Decimal("50001.00") class TestRealTimeCandleAggregation: """Test real-time candle aggregation logic.""" @pytest.fixture def processor(self, candle_config, logger): """Create real-time candle processor.""" return RealTimeCandleProcessor( symbol="BTC-USDT", exchange="okx", config=candle_config, component_name="test_processor", logger=logger ) def test_single_trade_processing(self, processor): """Test processing a single trade.""" trade = StandardizedTrade( symbol="BTC-USDT", trade_id="123", price=Decimal("50000"), size=Decimal("0.1"), side="buy", timestamp=datetime(2022, 1, 1, 12, 0, 0, tzinfo=timezone.utc), exchange="okx" ) completed_candles = processor.process_trade(trade) # First trade shouldn't complete any candles assert len(completed_candles) == 0 # Check that candles are being built stats = processor.get_stats() assert stats['trades_processed'] == 1 assert 'current_buckets' in stats assert len(stats['current_buckets']) > 0 # Should have active buckets def test_candle_completion_timing(self, processor): """Test that candles complete at the correct time boundaries.""" base_time = datetime(2022, 1, 1, 12, 0, 0, tzinfo=timezone.utc) completed_candles = [] def candle_callback(candle): completed_candles.append(candle) processor.add_candle_callback(candle_callback) # Add trades at different seconds to trigger candle completions for i in range(6): # 6 seconds of trades trade = StandardizedTrade( symbol="BTC-USDT", trade_id=str(i), price=Decimal("50000") + Decimal(str(i)), size=Decimal("0.1"), side="buy", timestamp=base_time + timedelta(seconds=i), exchange="okx" ) processor.process_trade(trade) # Should have completed some 1s and 5s candles assert len(completed_candles) > 0 # Check candle properties for candle in completed_candles: assert candle.symbol == "BTC-USDT" assert candle.exchange == "okx" assert candle.timeframe in ['1s', '5s'] assert candle.trade_count > 0 assert candle.volume > 0 def test_ohlcv_calculation_accuracy(self, processor): """Test OHLCV calculation accuracy.""" base_time = datetime(2022, 1, 1, 12, 0, 0, tzinfo=timezone.utc) completed_candles = [] def candle_callback(candle): completed_candles.append(candle) processor.add_candle_callback(candle_callback) # Add trades with known prices to test OHLCV calculation prices = [Decimal("50000"), Decimal("50100"), Decimal("49900"), Decimal("50050")] sizes = [Decimal("0.1"), Decimal("0.2"), Decimal("0.15"), Decimal("0.05")] for i, (price, size) in enumerate(zip(prices, sizes)): trade = StandardizedTrade( symbol="BTC-USDT", trade_id=str(i), price=price, size=size, side="buy", timestamp=base_time + timedelta(milliseconds=i * 100), exchange="okx" ) processor.process_trade(trade) # Force completion by adding trade in next second trade = StandardizedTrade( symbol="BTC-USDT", trade_id="final", price=Decimal("50000"), size=Decimal("0.1"), side="buy", timestamp=base_time + timedelta(seconds=1), exchange="okx" ) processor.process_trade(trade) # Find 1s candle candle_1s = next((c for c in completed_candles if c.timeframe == '1s'), None) assert candle_1s is not None # Verify OHLCV values assert candle_1s.open == Decimal("50000") # First trade price assert candle_1s.high == Decimal("50100") # Highest price assert candle_1s.low == Decimal("49900") # Lowest price assert candle_1s.close == Decimal("50050") # Last trade price assert candle_1s.volume == sum(sizes) # Total volume assert candle_1s.trade_count == 4 # Number of trades def test_multiple_timeframe_aggregation(self, processor): """Test aggregation across multiple timeframes.""" base_time = datetime(2022, 1, 1, 12, 0, 0, tzinfo=timezone.utc) completed_candles = [] def candle_callback(candle): completed_candles.append(candle) processor.add_candle_callback(candle_callback) # Add trades over 6 seconds to trigger multiple timeframe completions for second in range(6): for ms in range(0, 1000, 100): # 10 trades per second trade = StandardizedTrade( symbol="BTC-USDT", trade_id=f"{second}_{ms}", price=Decimal("50000") + Decimal(str(second)), size=Decimal("0.01"), side="buy", timestamp=base_time + timedelta(seconds=second, milliseconds=ms), exchange="okx" ) processor.process_trade(trade) # Check that we have candles for different timeframes timeframes_found = set(c.timeframe for c in completed_candles) assert '1s' in timeframes_found assert '5s' in timeframes_found # Verify candle relationships (5s candle should aggregate 5 1s candles) candles_1s = [c for c in completed_candles if c.timeframe == '1s'] candles_5s = [c for c in completed_candles if c.timeframe == '5s'] if candles_5s: # Check that 5s candle volume is sum of constituent 1s candles candle_5s = candles_5s[0] related_1s_candles = [ c for c in candles_1s if c.start_time >= candle_5s.start_time and c.end_time <= candle_5s.end_time ] if related_1s_candles: expected_volume = sum(c.volume for c in related_1s_candles) expected_trades = sum(c.trade_count for c in related_1s_candles) assert candle_5s.volume >= expected_volume # May include partial data assert candle_5s.trade_count >= expected_trades class TestOKXDataProcessor: """Test OKX data processor integration.""" @pytest.fixture def processor(self, candle_config, logger): """Create OKX data processor.""" return OKXDataProcessor( symbol="BTC-USDT", config=candle_config, component_name="test_okx_processor", logger=logger ) def test_websocket_message_processing(self, processor, sample_trade_data): """Test WebSocket message processing.""" # Create a valid OKX WebSocket message message = { "arg": { "channel": "trades", "instId": "BTC-USDT" }, "data": [sample_trade_data] } success, data_points, errors = processor.validate_and_process_message(message, "BTC-USDT") assert success assert len(data_points) == 1 assert len(errors) == 0 assert data_points[0].data_type == DataType.TRADE assert data_points[0].symbol == "BTC-USDT" def test_invalid_message_handling(self, processor): """Test handling of invalid messages.""" # Invalid message structure invalid_message = {"invalid": "message"} success, data_points, errors = processor.validate_and_process_message(invalid_message) assert not success assert len(data_points) == 0 assert len(errors) > 0 def test_trade_callback_execution(self, processor, sample_trade_data): """Test that trade callbacks are executed.""" callback_called = False received_trade = None def trade_callback(trade): nonlocal callback_called, received_trade callback_called = True received_trade = trade processor.add_trade_callback(trade_callback) # Process trade message message = { "arg": {"channel": "trades", "instId": "BTC-USDT"}, "data": [sample_trade_data] } processor.validate_and_process_message(message, "BTC-USDT") assert callback_called assert received_trade is not None assert isinstance(received_trade, StandardizedTrade) def test_candle_callback_execution(self, processor, sample_trade_data): """Test that candle callbacks are executed when candles complete.""" callback_called = False received_candle = None def candle_callback(candle): nonlocal callback_called, received_candle callback_called = True received_candle = candle processor.add_candle_callback(candle_callback) # Process multiple trades to complete a candle base_time = int(datetime(2022, 1, 1, 12, 0, 0, tzinfo=timezone.utc).timestamp() * 1000) for i in range(2): # Two trades in different seconds trade_data = sample_trade_data.copy() trade_data['ts'] = str(base_time + i * 1000) # 1 second apart trade_data['tradeId'] = str(i) message = { "arg": {"channel": "trades", "instId": "BTC-USDT"}, "data": [trade_data] } processor.validate_and_process_message(message, "BTC-USDT") # May need to wait for candle completion if callback_called: assert received_candle is not None assert isinstance(received_candle, OHLCVCandle) class TestDataCollectionService: """Test the data collection service integration.""" @pytest.fixture def service_config(self): """Create service configuration.""" return { 'exchanges': { 'okx': { 'enabled': True, 'symbols': ['BTC-USDT'], 'data_types': ['trade', 'ticker'], 'store_raw_data': False } }, 'candle_config': { 'timeframes': ['1s', '1m'], 'auto_save_candles': False } } @pytest.mark.asyncio async def test_service_initialization(self, service_config, logger): """Test data collection service initialization.""" # Create a temporary config file for testing import tempfile import json with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: # Convert our test config to match expected format test_config = { "exchange": "okx", "connection": { "public_ws_url": "wss://ws.okx.com:8443/ws/v5/public", "ping_interval": 25.0, "pong_timeout": 10.0, "max_reconnect_attempts": 5, "reconnect_delay": 5.0 }, "data_collection": { "store_raw_data": False, "health_check_interval": 120.0, "auto_restart": True, "buffer_size": 1000 }, "trading_pairs": [ { "symbol": "BTC-USDT", "enabled": True, "data_types": ["trade", "ticker"], "timeframes": ["1s", "1m"], "channels": { "trades": "trades", "ticker": "tickers" } } ] } json.dump(test_config, f) config_path = f.name try: service = DataCollectionService(config_path=config_path) assert service.config_path == config_path assert not service.running # Check that the service loaded configuration assert service.config is not None assert 'exchange' in service.config finally: # Clean up temporary file import os os.unlink(config_path) @pytest.mark.asyncio async def test_service_lifecycle(self, service_config, logger): """Test service start/stop lifecycle.""" # Create a temporary config file for testing import tempfile import json with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f: # Convert our test config to match expected format test_config = { "exchange": "okx", "connection": { "public_ws_url": "wss://ws.okx.com:8443/ws/v5/public", "ping_interval": 25.0, "pong_timeout": 10.0, "max_reconnect_attempts": 5, "reconnect_delay": 5.0 }, "data_collection": { "store_raw_data": False, "health_check_interval": 120.0, "auto_restart": True, "buffer_size": 1000 }, "trading_pairs": [ { "symbol": "BTC-USDT", "enabled": True, "data_types": ["trade", "ticker"], "timeframes": ["1s", "1m"], "channels": { "trades": "trades", "ticker": "tickers" } } ] } json.dump(test_config, f) config_path = f.name try: service = DataCollectionService(config_path=config_path) # Test initialization without actually starting collectors # (to avoid network dependencies in unit tests) assert not service.running # Test status retrieval status = service.get_status() assert 'running' in status assert 'collectors_total' in status finally: # Clean up temporary file import os os.unlink(config_path) class TestErrorHandlingAndEdgeCases: """Test error handling and edge cases in data collection.""" def test_malformed_trade_data(self, logger): """Test handling of malformed trade data.""" validator = OKXDataValidator("test", logger) malformed_data = { "instId": "BTC-USDT", "px": None, # Null price "sz": "invalid_size", "side": "invalid_side", "ts": "not_a_timestamp" } result = validator.validate_trade_data(malformed_data) assert not result.is_valid assert len(result.errors) > 0 def test_empty_aggregation_data(self, candle_config, logger): """Test aggregation with no trade data.""" processor = RealTimeCandleProcessor( symbol="BTC-USDT", exchange="okx", config=candle_config, logger=logger ) stats = processor.get_stats() assert stats['trades_processed'] == 0 assert 'current_buckets' in stats def test_out_of_order_trades(self, candle_config, logger): """Test handling of out-of-order trade timestamps.""" processor = RealTimeCandleProcessor( symbol="BTC-USDT", exchange="okx", config=candle_config, logger=logger ) base_time = datetime(2022, 1, 1, 12, 0, 0, tzinfo=timezone.utc) # Add trades in reverse chronological order for i in range(3, 0, -1): trade = StandardizedTrade( symbol="BTC-USDT", trade_id=str(i), price=Decimal("50000"), size=Decimal("0.1"), side="buy", timestamp=base_time + timedelta(seconds=i), exchange="okx" ) processor.process_trade(trade) # Should handle gracefully without crashing stats = processor.get_stats() assert stats['trades_processed'] == 3 def test_extreme_price_values(self, logger): """Test handling of extreme price values.""" validator = OKXDataValidator("test", logger) # Very large price large_price_data = { "instId": "BTC-USDT", "tradeId": "123", "px": "999999999999.99", "sz": "0.1", "side": "buy", "ts": "1640995200000" } result = validator.validate_trade_data(large_price_data) # Should handle large numbers gracefully assert result.is_valid or "price" in str(result.errors) # Very small price small_price_data = large_price_data.copy() small_price_data["px"] = "0.00000001" result = validator.validate_trade_data(small_price_data) assert result.is_valid or "price" in str(result.errors) class TestPerformanceAndReliability: """Test performance and reliability aspects.""" def test_high_frequency_trade_processing(self, candle_config, logger): """Test processing high frequency of trades.""" processor = RealTimeCandleProcessor( symbol="BTC-USDT", exchange="okx", config=candle_config, logger=logger ) base_time = datetime(2022, 1, 1, 12, 0, 0, tzinfo=timezone.utc) # Process 1000 trades rapidly for i in range(1000): trade = StandardizedTrade( symbol="BTC-USDT", trade_id=str(i), price=Decimal("50000") + Decimal(str(i % 100)), size=Decimal("0.001"), side="buy" if i % 2 == 0 else "sell", timestamp=base_time + timedelta(milliseconds=i), exchange="okx" ) processor.process_trade(trade) stats = processor.get_stats() assert stats['trades_processed'] == 1000 assert 'current_buckets' in stats def test_memory_usage_with_long_running_aggregation(self, candle_config, logger): """Test memory usage doesn't grow unbounded.""" processor = RealTimeCandleProcessor( symbol="BTC-USDT", exchange="okx", config=candle_config, logger=logger ) base_time = datetime(2022, 1, 1, 12, 0, 0, tzinfo=timezone.utc) # Process trades over a long time period for minute in range(10): # 10 minutes for second in range(60): # 60 seconds per minute trade = StandardizedTrade( symbol="BTC-USDT", trade_id=f"{minute}_{second}", price=Decimal("50000"), size=Decimal("0.1"), side="buy", timestamp=base_time + timedelta(minutes=minute, seconds=second), exchange="okx" ) processor.process_trade(trade) stats = processor.get_stats() # Should have processed many trades but not keep unlimited candles in memory assert stats['trades_processed'] == 600 # 10 minutes * 60 seconds # Check current buckets instead of non-existent active_candles assert 'current_buckets' in stats if __name__ == "__main__": pytest.main([__file__, "-v"])