""" Comprehensive unit tests for timeframe aggregation utilities. This test suite verifies: 1. Mathematical equivalence to pandas resampling 2. Bar timestamp correctness (end vs start mode) 3. OHLCV aggregation accuracy 4. Edge cases (empty data, single data point, gaps) 5. Performance benchmarks 6. MinuteDataBuffer functionality """ import pytest import pandas as pd import numpy as np from datetime import datetime, timedelta from typing import List, Dict, Union import time # Import the utilities to test from IncrementalTrader.utils import ( aggregate_minute_data_to_timeframe, parse_timeframe_to_minutes, get_latest_complete_bar, MinuteDataBuffer, TimeframeError ) class TestTimeframeParser: """Test timeframe string parsing functionality.""" def test_valid_timeframes(self): """Test parsing of valid timeframe strings.""" test_cases = [ ("1min", 1), ("5min", 5), ("15min", 15), ("30min", 30), ("1h", 60), ("2h", 120), ("4h", 240), ("1d", 1440), ("7d", 10080), ("1w", 10080), ] for timeframe_str, expected_minutes in test_cases: result = parse_timeframe_to_minutes(timeframe_str) assert result == expected_minutes, f"Failed for {timeframe_str}: expected {expected_minutes}, got {result}" def test_case_insensitive(self): """Test that parsing is case insensitive.""" assert parse_timeframe_to_minutes("15MIN") == 15 assert parse_timeframe_to_minutes("1H") == 60 assert parse_timeframe_to_minutes("1D") == 1440 def test_invalid_timeframes(self): """Test that invalid timeframes raise appropriate errors.""" invalid_cases = [ "", "invalid", "15", "min", "0min", "-5min", "1.5h", None, 123, ] for invalid_timeframe in invalid_cases: with pytest.raises(TimeframeError): parse_timeframe_to_minutes(invalid_timeframe) class TestAggregation: """Test core aggregation functionality.""" @pytest.fixture def sample_minute_data(self): """Create sample minute data for testing.""" start_time = pd.Timestamp('2024-01-01 09:00:00') data = [] for i in range(60): # 1 hour of minute data timestamp = start_time + pd.Timedelta(minutes=i) data.append({ 'timestamp': timestamp, 'open': 100.0 + i * 0.1, 'high': 100.5 + i * 0.1, 'low': 99.5 + i * 0.1, 'close': 100.2 + i * 0.1, 'volume': 1000 + i * 10 }) return data def test_empty_data(self): """Test aggregation with empty data.""" result = aggregate_minute_data_to_timeframe([], "15min") assert result == [] def test_single_data_point(self): """Test aggregation with single data point.""" data = [{ 'timestamp': pd.Timestamp('2024-01-01 09:00:00'), 'open': 100.0, 'high': 101.0, 'low': 99.0, 'close': 100.5, 'volume': 1000 }] # Should not produce any complete bars for 15min timeframe result = aggregate_minute_data_to_timeframe(data, "15min") assert len(result) == 0 def test_15min_aggregation_end_timestamps(self, sample_minute_data): """Test 15-minute aggregation with end timestamps.""" result = aggregate_minute_data_to_timeframe(sample_minute_data, "15min", "end") # Should have 4 complete 15-minute bars assert len(result) == 4 # Check timestamps are bar end times expected_timestamps = [ pd.Timestamp('2024-01-01 09:15:00'), pd.Timestamp('2024-01-01 09:30:00'), pd.Timestamp('2024-01-01 09:45:00'), pd.Timestamp('2024-01-01 10:00:00'), ] for i, expected_ts in enumerate(expected_timestamps): assert result[i]['timestamp'] == expected_ts def test_15min_aggregation_start_timestamps(self, sample_minute_data): """Test 15-minute aggregation with start timestamps.""" result = aggregate_minute_data_to_timeframe(sample_minute_data, "15min", "start") # Should have 4 complete 15-minute bars assert len(result) == 


class TestAggregation:
    """Test core aggregation functionality."""

    @pytest.fixture
    def sample_minute_data(self):
        """Create sample minute data for testing."""
        start_time = pd.Timestamp('2024-01-01 09:00:00')
        data = []
        for i in range(60):  # 1 hour of minute data
            timestamp = start_time + pd.Timedelta(minutes=i)
            data.append({
                'timestamp': timestamp,
                'open': 100.0 + i * 0.1,
                'high': 100.5 + i * 0.1,
                'low': 99.5 + i * 0.1,
                'close': 100.2 + i * 0.1,
                'volume': 1000 + i * 10
            })
        return data

    def test_empty_data(self):
        """Test aggregation with empty data."""
        result = aggregate_minute_data_to_timeframe([], "15min")
        assert result == []

    def test_single_data_point(self):
        """Test aggregation with single data point."""
        data = [{
            'timestamp': pd.Timestamp('2024-01-01 09:00:00'),
            'open': 100.0,
            'high': 101.0,
            'low': 99.0,
            'close': 100.5,
            'volume': 1000
        }]

        # Should not produce any complete bars for 15min timeframe
        result = aggregate_minute_data_to_timeframe(data, "15min")
        assert len(result) == 0

    def test_15min_aggregation_end_timestamps(self, sample_minute_data):
        """Test 15-minute aggregation with end timestamps."""
        result = aggregate_minute_data_to_timeframe(sample_minute_data, "15min", "end")

        # Should have 4 complete 15-minute bars
        assert len(result) == 4

        # Check timestamps are bar end times
        expected_timestamps = [
            pd.Timestamp('2024-01-01 09:15:00'),
            pd.Timestamp('2024-01-01 09:30:00'),
            pd.Timestamp('2024-01-01 09:45:00'),
            pd.Timestamp('2024-01-01 10:00:00'),
        ]

        for i, expected_ts in enumerate(expected_timestamps):
            assert result[i]['timestamp'] == expected_ts

    def test_15min_aggregation_start_timestamps(self, sample_minute_data):
        """Test 15-minute aggregation with start timestamps."""
        result = aggregate_minute_data_to_timeframe(sample_minute_data, "15min", "start")

        # Should have 4 complete 15-minute bars
        assert len(result) == 4

        # Check timestamps are bar start times
        expected_timestamps = [
            pd.Timestamp('2024-01-01 09:00:00'),
            pd.Timestamp('2024-01-01 09:15:00'),
            pd.Timestamp('2024-01-01 09:30:00'),
            pd.Timestamp('2024-01-01 09:45:00'),
        ]

        for i, expected_ts in enumerate(expected_timestamps):
            assert result[i]['timestamp'] == expected_ts

    def test_ohlcv_aggregation_correctness(self, sample_minute_data):
        """Test that OHLCV aggregation follows correct rules."""
        result = aggregate_minute_data_to_timeframe(sample_minute_data, "15min", "end")

        # Test first 15-minute bar (minutes 0-14)
        first_bar = result[0]

        # Open should be first open (minute 0)
        assert first_bar['open'] == 100.0

        # High should be maximum high in period
        expected_high = max(100.5 + i * 0.1 for i in range(15))
        assert first_bar['high'] == expected_high

        # Low should be minimum low in period
        expected_low = min(99.5 + i * 0.1 for i in range(15))
        assert first_bar['low'] == expected_low

        # Close should be last close (minute 14)
        assert first_bar['close'] == 100.2 + 14 * 0.1

        # Volume should be sum of all volumes
        expected_volume = sum(1000 + i * 10 for i in range(15))
        assert first_bar['volume'] == expected_volume

    def test_pandas_equivalence(self, sample_minute_data):
        """Test that aggregation matches pandas resampling exactly."""
        # Convert to DataFrame for pandas comparison
        df = pd.DataFrame(sample_minute_data)
        df = df.set_index('timestamp')

        # Pandas resampling
        pandas_result = df.resample('15min', label='right').agg({
            'open': 'first',
            'high': 'max',
            'low': 'min',
            'close': 'last',
            'volume': 'sum'
        }).dropna()

        # Our aggregation
        our_result = aggregate_minute_data_to_timeframe(sample_minute_data, "15min", "end")

        # Compare results
        assert len(our_result) == len(pandas_result)

        for i, (pandas_ts, pandas_row) in enumerate(pandas_result.iterrows()):
            our_bar = our_result[i]
            assert our_bar['timestamp'] == pandas_ts
            assert abs(our_bar['open'] - pandas_row['open']) < 1e-10
            assert abs(our_bar['high'] - pandas_row['high']) < 1e-10
            assert abs(our_bar['low'] - pandas_row['low']) < 1e-10
            assert abs(our_bar['close'] - pandas_row['close']) < 1e-10
            assert abs(our_bar['volume'] - pandas_row['volume']) < 1e-10

    def test_different_timeframes(self, sample_minute_data):
        """Test aggregation for different timeframes."""
        timeframes = ["5min", "15min", "30min", "1h"]
        expected_counts = [12, 4, 2, 1]

        for timeframe, expected_count in zip(timeframes, expected_counts):
            result = aggregate_minute_data_to_timeframe(sample_minute_data, timeframe)
            assert len(result) == expected_count, f"Failed for {timeframe}: expected {expected_count}, got {len(result)}"

    def test_invalid_data_validation(self):
        """Test validation of invalid input data."""
        # Test non-list input
        with pytest.raises(ValueError):
            aggregate_minute_data_to_timeframe("not a list", "15min")

        # Test missing required fields
        invalid_data = [{'timestamp': pd.Timestamp('2024-01-01 09:00:00'), 'open': 100}]  # Missing fields
        with pytest.raises(ValueError):
            aggregate_minute_data_to_timeframe(invalid_data, "15min")

        # Test invalid timestamp mode
        valid_data = [{
            'timestamp': pd.Timestamp('2024-01-01 09:00:00'),
            'open': 100,
            'high': 101,
            'low': 99,
            'close': 100.5,
            'volume': 1000
        }]
        with pytest.raises(ValueError):
            aggregate_minute_data_to_timeframe(valid_data, "15min", "invalid_mode")
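
# For reference, the bar labelling these tests assert can be sketched with plain
# pandas timestamp arithmetic (a minimal illustration, assuming bars are aligned
# to multiples of the timeframe; it is not the library's implementation):
def _example_bar_labels():
    """Hypothetical sketch of 'start' vs 'end' labels for a 15-minute bar."""
    ts = pd.Timestamp('2024-01-01 09:07:00')        # any minute inside the bar
    bar_start = ts.floor('15min')                   # 'start' label -> 09:00
    bar_end = bar_start + pd.Timedelta(minutes=15)  # 'end' label   -> 09:15
    assert bar_start == pd.Timestamp('2024-01-01 09:00:00')
    assert bar_end == pd.Timestamp('2024-01-01 09:15:00')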

class TestLatestCompleteBar:
    """Test latest complete bar functionality."""

    @pytest.fixture
    def sample_data_with_incomplete(self):
        """Create sample data with incomplete last bar."""
        start_time = pd.Timestamp('2024-01-01 09:00:00')
        data = []
        # 17 minutes of data (1 complete 15min bar + 2 minutes of incomplete bar)
        for i in range(17):
            timestamp = start_time + pd.Timedelta(minutes=i)
            data.append({
                'timestamp': timestamp,
                'open': 100.0 + i * 0.1,
                'high': 100.5 + i * 0.1,
                'low': 99.5 + i * 0.1,
                'close': 100.2 + i * 0.1,
                'volume': 1000 + i * 10
            })
        return data

    def test_latest_complete_bar_end_mode(self, sample_data_with_incomplete):
        """Test getting latest complete bar with end timestamps."""
        result = get_latest_complete_bar(sample_data_with_incomplete, "15min", "end")

        assert result is not None
        assert result['timestamp'] == pd.Timestamp('2024-01-01 09:15:00')

    def test_latest_complete_bar_start_mode(self, sample_data_with_incomplete):
        """Test getting latest complete bar with start timestamps."""
        result = get_latest_complete_bar(sample_data_with_incomplete, "15min", "start")

        assert result is not None
        assert result['timestamp'] == pd.Timestamp('2024-01-01 09:00:00')

    def test_no_complete_bars(self):
        """Test when no complete bars are available."""
        # Only 5 minutes of data for 15min timeframe
        data = []
        start_time = pd.Timestamp('2024-01-01 09:00:00')
        for i in range(5):
            timestamp = start_time + pd.Timedelta(minutes=i)
            data.append({
                'timestamp': timestamp,
                'open': 100.0,
                'high': 101.0,
                'low': 99.0,
                'close': 100.5,
                'volume': 1000
            })

        result = get_latest_complete_bar(data, "15min")
        assert result is None

    def test_empty_data(self):
        """Test with empty data."""
        result = get_latest_complete_bar([], "15min")
        assert result is None
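
# The eviction behaviour exercised in TestMinuteDataBuffer below (oldest rows are
# dropped once max_size is reached) is the policy a fixed-length deque provides
# for free. A minimal sketch of that idea, purely for illustration -- an assumed
# stand-in, not the actual MinuteDataBuffer implementation:
from collections import deque


class _BoundedBufferSketch:
    """Illustrative bounded buffer backed by deque(maxlen=...)."""

    def __init__(self, max_size: int):
        if max_size <= 0:
            raise ValueError("max_size must be positive")
        self.max_size = max_size
        self._rows = deque(maxlen=max_size)  # deque silently discards the oldest row

    def add(self, timestamp, ohlcv):
        self._rows.append({'timestamp': timestamp, **ohlcv})

    def size(self):
        return len(self._rows)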

class TestMinuteDataBuffer:
    """Test MinuteDataBuffer functionality."""

    def test_buffer_initialization(self):
        """Test buffer initialization."""
        buffer = MinuteDataBuffer(max_size=100)

        assert buffer.max_size == 100
        assert buffer.size() == 0
        assert not buffer.is_full()
        assert buffer.get_time_range() is None

    def test_invalid_initialization(self):
        """Test invalid buffer initialization."""
        with pytest.raises(ValueError):
            MinuteDataBuffer(max_size=0)

        with pytest.raises(ValueError):
            MinuteDataBuffer(max_size=-10)

    def test_add_data(self):
        """Test adding data to buffer."""
        buffer = MinuteDataBuffer(max_size=10)

        timestamp = pd.Timestamp('2024-01-01 09:00:00')
        ohlcv_data = {'open': 100, 'high': 101, 'low': 99, 'close': 100.5, 'volume': 1000}

        buffer.add(timestamp, ohlcv_data)

        assert buffer.size() == 1
        assert not buffer.is_full()

        time_range = buffer.get_time_range()
        assert time_range == (timestamp, timestamp)

    def test_buffer_overflow(self):
        """Test buffer behavior when max size is exceeded."""
        buffer = MinuteDataBuffer(max_size=3)

        # Add 5 data points
        for i in range(5):
            timestamp = pd.Timestamp('2024-01-01 09:00:00') + pd.Timedelta(minutes=i)
            ohlcv_data = {'open': 100, 'high': 101, 'low': 99, 'close': 100.5, 'volume': 1000}
            buffer.add(timestamp, ohlcv_data)

        # Should only keep last 3
        assert buffer.size() == 3
        assert buffer.is_full()

        # Should have data from minutes 2, 3, 4
        time_range = buffer.get_time_range()
        expected_start = pd.Timestamp('2024-01-01 09:02:00')
        expected_end = pd.Timestamp('2024-01-01 09:04:00')
        assert time_range == (expected_start, expected_end)

    def test_get_data_with_lookback(self):
        """Test getting data with lookback limit."""
        buffer = MinuteDataBuffer(max_size=10)

        # Add 5 data points
        for i in range(5):
            timestamp = pd.Timestamp('2024-01-01 09:00:00') + pd.Timedelta(minutes=i)
            ohlcv_data = {'open': 100 + i, 'high': 101 + i, 'low': 99 + i, 'close': 100.5 + i, 'volume': 1000}
            buffer.add(timestamp, ohlcv_data)

        # Get last 3 minutes
        data = buffer.get_data(lookback_minutes=3)
        assert len(data) == 3

        # Should be minutes 2, 3, 4
        assert data[0]['open'] == 102
        assert data[1]['open'] == 103
        assert data[2]['open'] == 104

        # Get all data
        all_data = buffer.get_data()
        assert len(all_data) == 5

    def test_aggregate_to_timeframe(self):
        """Test aggregating buffer data to timeframe."""
        buffer = MinuteDataBuffer(max_size=100)

        # Add 30 minutes of data
        for i in range(30):
            timestamp = pd.Timestamp('2024-01-01 09:00:00') + pd.Timedelta(minutes=i)
            ohlcv_data = {
                'open': 100.0 + i * 0.1,
                'high': 100.5 + i * 0.1,
                'low': 99.5 + i * 0.1,
                'close': 100.2 + i * 0.1,
                'volume': 1000 + i * 10
            }
            buffer.add(timestamp, ohlcv_data)

        # Aggregate to 15min
        bars_15m = buffer.aggregate_to_timeframe("15min")
        assert len(bars_15m) == 2  # 2 complete 15-minute bars

        # Test with lookback limit
        bars_15m_limited = buffer.aggregate_to_timeframe("15min", lookback_bars=1)
        assert len(bars_15m_limited) == 1

    def test_get_latest_complete_bar(self):
        """Test getting latest complete bar from buffer."""
        buffer = MinuteDataBuffer(max_size=100)

        # Add 17 minutes of data (1 complete 15min bar + 2 minutes)
        for i in range(17):
            timestamp = pd.Timestamp('2024-01-01 09:00:00') + pd.Timedelta(minutes=i)
            ohlcv_data = {
                'open': 100.0 + i * 0.1,
                'high': 100.5 + i * 0.1,
                'low': 99.5 + i * 0.1,
                'close': 100.2 + i * 0.1,
                'volume': 1000 + i * 10
            }
            buffer.add(timestamp, ohlcv_data)

        # Should get the complete 15-minute bar
        latest_bar = buffer.get_latest_complete_bar("15min")
        assert latest_bar is not None
        assert latest_bar['timestamp'] == pd.Timestamp('2024-01-01 09:15:00')

    def test_invalid_data_validation(self):
        """Test validation of invalid data."""
        buffer = MinuteDataBuffer(max_size=10)
        timestamp = pd.Timestamp('2024-01-01 09:00:00')

        # Missing required field
        with pytest.raises(ValueError):
            buffer.add(timestamp, {'open': 100, 'high': 101})  # Missing low, close, volume

        # Invalid data type
        with pytest.raises(ValueError):
            buffer.add(timestamp, {'open': 'invalid', 'high': 101, 'low': 99, 'close': 100.5, 'volume': 1000})

        # Invalid lookback
        buffer.add(timestamp, {'open': 100, 'high': 101, 'low': 99, 'close': 100.5, 'volume': 1000})
        with pytest.raises(ValueError):
            buffer.get_data(lookback_minutes=0)

    def test_clear_buffer(self):
        """Test clearing buffer."""
        buffer = MinuteDataBuffer(max_size=10)

        # Add some data
        timestamp = pd.Timestamp('2024-01-01 09:00:00')
        ohlcv_data = {'open': 100, 'high': 101, 'low': 99, 'close': 100.5, 'volume': 1000}
        buffer.add(timestamp, ohlcv_data)

        assert buffer.size() == 1

        # Clear buffer
        buffer.clear()
        assert buffer.size() == 0
        assert buffer.get_time_range() is None

    def test_buffer_repr(self):
        """Test buffer string representation."""
        buffer = MinuteDataBuffer(max_size=10)

        # Empty buffer
        repr_empty = repr(buffer)
        assert "size=0" in repr_empty
        assert "empty" in repr_empty

        # Add data
        timestamp = pd.Timestamp('2024-01-01 09:00:00')
        ohlcv_data = {'open': 100, 'high': 101, 'low': 99, 'close': 100.5, 'volume': 1000}
        buffer.add(timestamp, ohlcv_data)

        repr_with_data = repr(buffer)
        assert "size=1" in repr_with_data
        assert "2024-01-01 09:00:00" in repr_with_data
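
# The benchmarks below time code with time.time(); for short intervals a monotonic
# clock such as time.perf_counter() is generally more reliable. A small, assumed
# helper sketch (not used by the tests) showing that approach:
from contextlib import contextmanager


@contextmanager
def _timed(label):
    """Hypothetical timing helper based on time.perf_counter()."""
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start
        print(f"{label}: {elapsed:.4f}s")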

class TestPerformance:
    """Test performance characteristics of the utilities."""

    def test_aggregation_performance(self):
        """Test aggregation performance with large datasets."""
        # Create large dataset (1 week of minute data)
        start_time = pd.Timestamp('2024-01-01 00:00:00')
        large_data = []
        for i in range(7 * 24 * 60):  # 1 week of minutes
            timestamp = start_time + pd.Timedelta(minutes=i)
            large_data.append({
                'timestamp': timestamp,
                'open': 100.0 + np.random.randn() * 0.1,
                'high': 100.5 + np.random.randn() * 0.1,
                'low': 99.5 + np.random.randn() * 0.1,
                'close': 100.2 + np.random.randn() * 0.1,
                'volume': 1000 + np.random.randint(0, 500)
            })

        # Time the aggregation
        start_time = time.time()
        result = aggregate_minute_data_to_timeframe(large_data, "15min")
        end_time = time.time()

        aggregation_time = end_time - start_time

        # Should complete within reasonable time (< 1 second for 1 week of data)
        assert aggregation_time < 1.0, f"Aggregation took too long: {aggregation_time:.3f}s"

        # Verify result size
        expected_bars = 7 * 24 * 4  # 7 days * 24 hours * 4 15-min bars per hour
        assert len(result) == expected_bars

    def test_buffer_performance(self):
        """Test buffer performance with frequent updates."""
        buffer = MinuteDataBuffer(max_size=1440)  # 24 hours

        # Time adding 1 hour of data
        start_time = time.time()
        for i in range(60):
            timestamp = pd.Timestamp('2024-01-01 09:00:00') + pd.Timedelta(minutes=i)
            ohlcv_data = {
                'open': 100.0 + i * 0.1,
                'high': 100.5 + i * 0.1,
                'low': 99.5 + i * 0.1,
                'close': 100.2 + i * 0.1,
                'volume': 1000 + i * 10
            }
            buffer.add(timestamp, ohlcv_data)
        end_time = time.time()

        add_time = end_time - start_time

        # Should be very fast (< 0.1 seconds for 60 additions)
        assert add_time < 0.1, f"Buffer additions took too long: {add_time:.3f}s"

        # Time aggregation
        start_time = time.time()
        bars = buffer.aggregate_to_timeframe("15min")
        end_time = time.time()

        agg_time = end_time - start_time

        # Should be fast (< 0.01 seconds)
        assert agg_time < 0.01, f"Buffer aggregation took too long: {agg_time:.3f}s"


if __name__ == "__main__":
    # Run tests if script is executed directly
    pytest.main([__file__, "-v"])