"""
Comprehensive unit tests for timeframe aggregation utilities.
This test suite verifies:
1. Mathematical equivalence to pandas resampling
2. Bar timestamp correctness (end vs start mode)
3. OHLCV aggregation accuracy
4. Edge cases (empty data, single data point, gaps)
5. Performance benchmarks
6. MinuteDataBuffer functionality
"""
import pytest
import pandas as pd
import numpy as np
import time
# Import the utilities to test
from IncrementalTrader.utils import (
    aggregate_minute_data_to_timeframe,
    parse_timeframe_to_minutes,
    get_latest_complete_bar,
    MinuteDataBuffer,
    TimeframeError,
)
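

# --- Illustrative helper sketch (not part of the original suite) ------------
# The fixtures below all build synthetic minute bars with the same drifting
# OHLCV pattern. The hypothetical `_make_minute_bars` sketched here shows that
# pattern in one place; the tests below deliberately keep their own inline
# fixtures and do not call it.
def _make_minute_bars(start: pd.Timestamp, count: int) -> list:
    """Build `count` synthetic minute bars beginning at `start` (sketch only)."""
    return [
        {
            'timestamp': start + pd.Timedelta(minutes=i),
            'open': 100.0 + i * 0.1,
            'high': 100.5 + i * 0.1,
            'low': 99.5 + i * 0.1,
            'close': 100.2 + i * 0.1,
            'volume': 1000 + i * 10,
        }
        for i in range(count)
    ]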


class TestTimeframeParser:
    """Test timeframe string parsing functionality."""

    def test_valid_timeframes(self):
        """Test parsing of valid timeframe strings."""
        test_cases = [
            ("1min", 1),
            ("5min", 5),
            ("15min", 15),
            ("30min", 30),
            ("1h", 60),
            ("2h", 120),
            ("4h", 240),
            ("1d", 1440),
            ("7d", 10080),
            ("1w", 10080),
        ]
        for timeframe_str, expected_minutes in test_cases:
            result = parse_timeframe_to_minutes(timeframe_str)
            assert result == expected_minutes, f"Failed for {timeframe_str}: expected {expected_minutes}, got {result}"

    def test_case_insensitive(self):
        """Test that parsing is case insensitive."""
        assert parse_timeframe_to_minutes("15MIN") == 15
        assert parse_timeframe_to_minutes("1H") == 60
        assert parse_timeframe_to_minutes("1D") == 1440

    def test_invalid_timeframes(self):
        """Test that invalid timeframes raise appropriate errors."""
        invalid_cases = [
            "",
            "invalid",
            "15",
            "min",
            "0min",
            "-5min",
            "1.5h",
            None,
            123,
        ]
        for invalid_timeframe in invalid_cases:
            with pytest.raises(TimeframeError):
                parse_timeframe_to_minutes(invalid_timeframe)


class TestAggregation:
    """Test core aggregation functionality."""

    @pytest.fixture
    def sample_minute_data(self):
        """Create sample minute data for testing."""
        start_time = pd.Timestamp('2024-01-01 09:00:00')
        data = []
        for i in range(60):  # 1 hour of minute data
            timestamp = start_time + pd.Timedelta(minutes=i)
            data.append({
                'timestamp': timestamp,
                'open': 100.0 + i * 0.1,
                'high': 100.5 + i * 0.1,
                'low': 99.5 + i * 0.1,
                'close': 100.2 + i * 0.1,
                'volume': 1000 + i * 10
            })
        return data

    def test_empty_data(self):
        """Test aggregation with empty data."""
        result = aggregate_minute_data_to_timeframe([], "15min")
        assert result == []

    def test_single_data_point(self):
        """Test aggregation with a single data point."""
        data = [{
            'timestamp': pd.Timestamp('2024-01-01 09:00:00'),
            'open': 100.0,
            'high': 101.0,
            'low': 99.0,
            'close': 100.5,
            'volume': 1000
        }]
        # Should not produce any complete bars for a 15min timeframe
        result = aggregate_minute_data_to_timeframe(data, "15min")
        assert len(result) == 0

    def test_15min_aggregation_end_timestamps(self, sample_minute_data):
        """Test 15-minute aggregation with end timestamps."""
        result = aggregate_minute_data_to_timeframe(sample_minute_data, "15min", "end")
        # Should have 4 complete 15-minute bars
        assert len(result) == 4
        # Check that timestamps are bar end times
        expected_timestamps = [
            pd.Timestamp('2024-01-01 09:15:00'),
            pd.Timestamp('2024-01-01 09:30:00'),
            pd.Timestamp('2024-01-01 09:45:00'),
            pd.Timestamp('2024-01-01 10:00:00'),
        ]
        for i, expected_ts in enumerate(expected_timestamps):
            assert result[i]['timestamp'] == expected_ts

    def test_15min_aggregation_start_timestamps(self, sample_minute_data):
        """Test 15-minute aggregation with start timestamps."""
        result = aggregate_minute_data_to_timeframe(sample_minute_data, "15min", "start")
        # Should have 4 complete 15-minute bars
        assert len(result) == 4
        # Check that timestamps are bar start times
        expected_timestamps = [
            pd.Timestamp('2024-01-01 09:00:00'),
            pd.Timestamp('2024-01-01 09:15:00'),
            pd.Timestamp('2024-01-01 09:30:00'),
            pd.Timestamp('2024-01-01 09:45:00'),
        ]
        for i, expected_ts in enumerate(expected_timestamps):
            assert result[i]['timestamp'] == expected_ts

    def test_ohlcv_aggregation_correctness(self, sample_minute_data):
        """Test that OHLCV aggregation follows correct rules."""
        result = aggregate_minute_data_to_timeframe(sample_minute_data, "15min", "end")
        # Test first 15-minute bar (minutes 0-14)
        first_bar = result[0]
        # Open should be the first open (minute 0)
        assert first_bar['open'] == 100.0
        # High should be the maximum high in the period
        expected_high = max(100.5 + i * 0.1 for i in range(15))
        assert first_bar['high'] == expected_high
        # Low should be the minimum low in the period
        expected_low = min(99.5 + i * 0.1 for i in range(15))
        assert first_bar['low'] == expected_low
        # Close should be the last close (minute 14)
        assert first_bar['close'] == 100.2 + 14 * 0.1
        # Volume should be the sum of all volumes
        expected_volume = sum(1000 + i * 10 for i in range(15))
        assert first_bar['volume'] == expected_volume

    def test_pandas_equivalence(self, sample_minute_data):
        """Test that aggregation matches pandas resampling exactly."""
        # Convert to DataFrame for pandas comparison
        df = pd.DataFrame(sample_minute_data)
        df = df.set_index('timestamp')
        # Pandas resampling
        pandas_result = df.resample('15min', label='right').agg({
            'open': 'first',
            'high': 'max',
            'low': 'min',
            'close': 'last',
            'volume': 'sum'
        }).dropna()
        # Our aggregation
        our_result = aggregate_minute_data_to_timeframe(sample_minute_data, "15min", "end")
        # Compare results
        assert len(our_result) == len(pandas_result)
        for i, (pandas_ts, pandas_row) in enumerate(pandas_result.iterrows()):
            our_bar = our_result[i]
            assert our_bar['timestamp'] == pandas_ts
            assert abs(our_bar['open'] - pandas_row['open']) < 1e-10
            assert abs(our_bar['high'] - pandas_row['high']) < 1e-10
            assert abs(our_bar['low'] - pandas_row['low']) < 1e-10
            assert abs(our_bar['close'] - pandas_row['close']) < 1e-10
            assert abs(our_bar['volume'] - pandas_row['volume']) < 1e-10
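
    # Illustrative sketch (not part of the original suite): assuming "start"
    # timestamp mode mirrors pandas' label='left' resampling (as the
    # start-timestamp test above suggests), the same equivalence check should
    # hold. Treat this as an assumption about the intended semantics rather
    # than a documented contract.
    def test_pandas_equivalence_start_mode_sketch(self, sample_minute_data):
        """Sketch: 'start' timestamps should match pandas label='left' resampling."""
        df = pd.DataFrame(sample_minute_data).set_index('timestamp')
        pandas_result = df.resample('15min', label='left').agg({
            'open': 'first',
            'high': 'max',
            'low': 'min',
            'close': 'last',
            'volume': 'sum'
        }).dropna()
        our_result = aggregate_minute_data_to_timeframe(sample_minute_data, "15min", "start")
        assert len(our_result) == len(pandas_result)
        for our_bar, (pandas_ts, pandas_row) in zip(our_result, pandas_result.iterrows()):
            assert our_bar['timestamp'] == pandas_ts
            assert abs(our_bar['close'] - pandas_row['close']) < 1e-10
            assert abs(our_bar['volume'] - pandas_row['volume']) < 1e-10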

    def test_different_timeframes(self, sample_minute_data):
        """Test aggregation for different timeframes."""
        timeframes = ["5min", "15min", "30min", "1h"]
        expected_counts = [12, 4, 2, 1]
        for timeframe, expected_count in zip(timeframes, expected_counts):
            result = aggregate_minute_data_to_timeframe(sample_minute_data, timeframe)
            assert len(result) == expected_count, f"Failed for {timeframe}: expected {expected_count}, got {len(result)}"

    def test_invalid_data_validation(self):
        """Test validation of invalid input data."""
        # Test non-list input
        with pytest.raises(ValueError):
            aggregate_minute_data_to_timeframe("not a list", "15min")
        # Test missing required fields
        invalid_data = [{'timestamp': pd.Timestamp('2024-01-01 09:00:00'), 'open': 100}]  # Missing fields
        with pytest.raises(ValueError):
            aggregate_minute_data_to_timeframe(invalid_data, "15min")
        # Test invalid timestamp mode
        valid_data = [{
            'timestamp': pd.Timestamp('2024-01-01 09:00:00'),
            'open': 100, 'high': 101, 'low': 99, 'close': 100.5, 'volume': 1000
        }]
        with pytest.raises(ValueError):
            aggregate_minute_data_to_timeframe(valid_data, "15min", "invalid_mode")
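
    # Illustrative sketch (not part of the original suite): the module
    # docstring lists gaps in the minute data as an edge case, so this sketch
    # feeds data with a missing 10-minute stretch and asserts only
    # conservative invariants (a list comes back, bars carry the usual keys,
    # timestamps strictly increase). Exact gap semantics are left to the
    # implementation.
    def test_aggregation_with_gap_sketch(self):
        """Sketch: aggregation should cope with a gap in the minute data."""
        start_time = pd.Timestamp('2024-01-01 09:00:00')
        data = []
        for i in list(range(15)) + list(range(25, 45)):  # minutes 15-24 missing
            data.append({
                'timestamp': start_time + pd.Timedelta(minutes=i),
                'open': 100.0, 'high': 101.0, 'low': 99.0, 'close': 100.5,
                'volume': 1000
            })
        result = aggregate_minute_data_to_timeframe(data, "15min", "end")
        assert isinstance(result, list)
        required_keys = {'timestamp', 'open', 'high', 'low', 'close', 'volume'}
        for bar in result:
            assert required_keys <= set(bar.keys())
        timestamps = [bar['timestamp'] for bar in result]
        assert all(a < b for a, b in zip(timestamps, timestamps[1:]))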


class TestLatestCompleteBar:
    """Test latest complete bar functionality."""

    @pytest.fixture
    def sample_data_with_incomplete(self):
        """Create sample data with an incomplete last bar."""
        start_time = pd.Timestamp('2024-01-01 09:00:00')
        data = []
        # 17 minutes of data (1 complete 15min bar + 2 minutes of an incomplete bar)
        for i in range(17):
            timestamp = start_time + pd.Timedelta(minutes=i)
            data.append({
                'timestamp': timestamp,
                'open': 100.0 + i * 0.1,
                'high': 100.5 + i * 0.1,
                'low': 99.5 + i * 0.1,
                'close': 100.2 + i * 0.1,
                'volume': 1000 + i * 10
            })
        return data

    def test_latest_complete_bar_end_mode(self, sample_data_with_incomplete):
        """Test getting the latest complete bar with end timestamps."""
        result = get_latest_complete_bar(sample_data_with_incomplete, "15min", "end")
        assert result is not None
        assert result['timestamp'] == pd.Timestamp('2024-01-01 09:15:00')

    def test_latest_complete_bar_start_mode(self, sample_data_with_incomplete):
        """Test getting the latest complete bar with start timestamps."""
        result = get_latest_complete_bar(sample_data_with_incomplete, "15min", "start")
        assert result is not None
        assert result['timestamp'] == pd.Timestamp('2024-01-01 09:00:00')

    def test_no_complete_bars(self):
        """Test when no complete bars are available."""
        # Only 5 minutes of data for a 15min timeframe
        data = []
        start_time = pd.Timestamp('2024-01-01 09:00:00')
        for i in range(5):
            timestamp = start_time + pd.Timedelta(minutes=i)
            data.append({
                'timestamp': timestamp,
                'open': 100.0,
                'high': 101.0,
                'low': 99.0,
                'close': 100.5,
                'volume': 1000
            })
        result = get_latest_complete_bar(data, "15min")
        assert result is None

    def test_empty_data(self):
        """Test with empty data."""
        result = get_latest_complete_bar([], "15min")
        assert result is None


class TestMinuteDataBuffer:
    """Test MinuteDataBuffer functionality."""

    def test_buffer_initialization(self):
        """Test buffer initialization."""
        buffer = MinuteDataBuffer(max_size=100)
        assert buffer.max_size == 100
        assert buffer.size() == 0
        assert not buffer.is_full()
        assert buffer.get_time_range() is None

    def test_invalid_initialization(self):
        """Test invalid buffer initialization."""
        with pytest.raises(ValueError):
            MinuteDataBuffer(max_size=0)
        with pytest.raises(ValueError):
            MinuteDataBuffer(max_size=-10)

    def test_add_data(self):
        """Test adding data to the buffer."""
        buffer = MinuteDataBuffer(max_size=10)
        timestamp = pd.Timestamp('2024-01-01 09:00:00')
        ohlcv_data = {'open': 100, 'high': 101, 'low': 99, 'close': 100.5, 'volume': 1000}
        buffer.add(timestamp, ohlcv_data)
        assert buffer.size() == 1
        assert not buffer.is_full()
        time_range = buffer.get_time_range()
        assert time_range == (timestamp, timestamp)

    def test_buffer_overflow(self):
        """Test buffer behavior when max size is exceeded."""
        buffer = MinuteDataBuffer(max_size=3)
        # Add 5 data points
        for i in range(5):
            timestamp = pd.Timestamp('2024-01-01 09:00:00') + pd.Timedelta(minutes=i)
            ohlcv_data = {'open': 100, 'high': 101, 'low': 99, 'close': 100.5, 'volume': 1000}
            buffer.add(timestamp, ohlcv_data)
        # Should only keep the last 3
        assert buffer.size() == 3
        assert buffer.is_full()
        # Should have data from minutes 2, 3, 4
        time_range = buffer.get_time_range()
        expected_start = pd.Timestamp('2024-01-01 09:02:00')
        expected_end = pd.Timestamp('2024-01-01 09:04:00')
        assert time_range == (expected_start, expected_end)

    def test_get_data_with_lookback(self):
        """Test getting data with a lookback limit."""
        buffer = MinuteDataBuffer(max_size=10)
        # Add 5 data points
        for i in range(5):
            timestamp = pd.Timestamp('2024-01-01 09:00:00') + pd.Timedelta(minutes=i)
            ohlcv_data = {'open': 100 + i, 'high': 101 + i, 'low': 99 + i, 'close': 100.5 + i, 'volume': 1000}
            buffer.add(timestamp, ohlcv_data)
        # Get the last 3 minutes
        data = buffer.get_data(lookback_minutes=3)
        assert len(data) == 3
        # Should be minutes 2, 3, 4
        assert data[0]['open'] == 102
        assert data[1]['open'] == 103
        assert data[2]['open'] == 104
        # Get all data
        all_data = buffer.get_data()
        assert len(all_data) == 5

    def test_aggregate_to_timeframe(self):
        """Test aggregating buffer data to a timeframe."""
        buffer = MinuteDataBuffer(max_size=100)
        # Add 30 minutes of data
        for i in range(30):
            timestamp = pd.Timestamp('2024-01-01 09:00:00') + pd.Timedelta(minutes=i)
            ohlcv_data = {
                'open': 100.0 + i * 0.1,
                'high': 100.5 + i * 0.1,
                'low': 99.5 + i * 0.1,
                'close': 100.2 + i * 0.1,
                'volume': 1000 + i * 10
            }
            buffer.add(timestamp, ohlcv_data)
        # Aggregate to 15min
        bars_15m = buffer.aggregate_to_timeframe("15min")
        assert len(bars_15m) == 2  # 2 complete 15-minute bars
        # Test with a lookback limit
        bars_15m_limited = buffer.aggregate_to_timeframe("15min", lookback_bars=1)
        assert len(bars_15m_limited) == 1

    def test_get_latest_complete_bar(self):
        """Test getting the latest complete bar from the buffer."""
        buffer = MinuteDataBuffer(max_size=100)
        # Add 17 minutes of data (1 complete 15min bar + 2 minutes)
        for i in range(17):
            timestamp = pd.Timestamp('2024-01-01 09:00:00') + pd.Timedelta(minutes=i)
            ohlcv_data = {
                'open': 100.0 + i * 0.1,
                'high': 100.5 + i * 0.1,
                'low': 99.5 + i * 0.1,
                'close': 100.2 + i * 0.1,
                'volume': 1000 + i * 10
            }
            buffer.add(timestamp, ohlcv_data)
        # Should get the complete 15-minute bar
        latest_bar = buffer.get_latest_complete_bar("15min")
        assert latest_bar is not None
        assert latest_bar['timestamp'] == pd.Timestamp('2024-01-01 09:15:00')

    def test_invalid_data_validation(self):
        """Test validation of invalid data."""
        buffer = MinuteDataBuffer(max_size=10)
        timestamp = pd.Timestamp('2024-01-01 09:00:00')
        # Missing required fields
        with pytest.raises(ValueError):
            buffer.add(timestamp, {'open': 100, 'high': 101})  # Missing low, close, volume
        # Invalid data type
        with pytest.raises(ValueError):
            buffer.add(timestamp, {'open': 'invalid', 'high': 101, 'low': 99, 'close': 100.5, 'volume': 1000})
        # Invalid lookback
        buffer.add(timestamp, {'open': 100, 'high': 101, 'low': 99, 'close': 100.5, 'volume': 1000})
        with pytest.raises(ValueError):
            buffer.get_data(lookback_minutes=0)

    def test_clear_buffer(self):
        """Test clearing the buffer."""
        buffer = MinuteDataBuffer(max_size=10)
        # Add some data
        timestamp = pd.Timestamp('2024-01-01 09:00:00')
        ohlcv_data = {'open': 100, 'high': 101, 'low': 99, 'close': 100.5, 'volume': 1000}
        buffer.add(timestamp, ohlcv_data)
        assert buffer.size() == 1
        # Clear the buffer
        buffer.clear()
        assert buffer.size() == 0
        assert buffer.get_time_range() is None

    def test_buffer_repr(self):
        """Test the buffer string representation."""
        buffer = MinuteDataBuffer(max_size=10)
        # Empty buffer
        repr_empty = repr(buffer)
        assert "size=0" in repr_empty
        assert "empty" in repr_empty
        # Add data
        timestamp = pd.Timestamp('2024-01-01 09:00:00')
        ohlcv_data = {'open': 100, 'high': 101, 'low': 99, 'close': 100.5, 'volume': 1000}
        buffer.add(timestamp, ohlcv_data)
        repr_with_data = repr(buffer)
        assert "size=1" in repr_with_data
        assert "2024-01-01 09:00:00" in repr_with_data


class TestPerformance:
    """Test performance characteristics of the utilities."""

    def test_aggregation_performance(self):
        """Test aggregation performance with large datasets."""
        # Create a large dataset (1 week of minute data)
        start_time = pd.Timestamp('2024-01-01 00:00:00')
        large_data = []
        for i in range(7 * 24 * 60):  # 1 week of minutes
            timestamp = start_time + pd.Timedelta(minutes=i)
            large_data.append({
                'timestamp': timestamp,
                'open': 100.0 + np.random.randn() * 0.1,
                'high': 100.5 + np.random.randn() * 0.1,
                'low': 99.5 + np.random.randn() * 0.1,
                'close': 100.2 + np.random.randn() * 0.1,
                'volume': 1000 + np.random.randint(0, 500)
            })
        # Time the aggregation
        perf_start = time.time()
        result = aggregate_minute_data_to_timeframe(large_data, "15min")
        aggregation_time = time.time() - perf_start
        # Should complete within a reasonable time (< 1 second for 1 week of data)
        assert aggregation_time < 1.0, f"Aggregation took too long: {aggregation_time:.3f}s"
        # Verify the result size
        expected_bars = 7 * 24 * 4  # 7 days * 24 hours * 4 15-min bars per hour
        assert len(result) == expected_bars

    def test_buffer_performance(self):
        """Test buffer performance with frequent updates."""
        buffer = MinuteDataBuffer(max_size=1440)  # 24 hours
        # Time adding 1 hour of data
        perf_start = time.time()
        for i in range(60):
            timestamp = pd.Timestamp('2024-01-01 09:00:00') + pd.Timedelta(minutes=i)
            ohlcv_data = {
                'open': 100.0 + i * 0.1,
                'high': 100.5 + i * 0.1,
                'low': 99.5 + i * 0.1,
                'close': 100.2 + i * 0.1,
                'volume': 1000 + i * 10
            }
            buffer.add(timestamp, ohlcv_data)
        add_time = time.time() - perf_start
        # Should be very fast (< 0.1 seconds for 60 additions)
        assert add_time < 0.1, f"Buffer additions took too long: {add_time:.3f}s"
        # Time the aggregation
        perf_start = time.time()
        bars = buffer.aggregate_to_timeframe("15min")
        agg_time = time.time() - perf_start
        # Should be fast (< 0.01 seconds)
        assert agg_time < 0.01, f"Buffer aggregation took too long: {agg_time:.3f}s"


if __name__ == "__main__":
    # Run tests if the script is executed directly
    pytest.main([__file__, "-v"])