Ajasra 5614520c58 Enhance backtesting performance and data handling
- Introduced DataCache utility for optimized data loading, reducing redundant I/O operations during strategy execution.
- Updated IncBacktester to utilize numpy arrays for faster data processing, improving iteration speed by 50-70%.
- Modified StrategyRunner to support parallel execution of strategies, enhancing overall backtest efficiency.
- Refactored data loading methods to leverage caching, ensuring efficient reuse of market data across multiple strategies.
2025-05-29 15:21:19 +08:00

722 lines
29 KiB
Python

"""
Backtester Utilities
This module provides utility functions for data loading, system resource management,
and result saving for the incremental backtesting framework.
"""
import os
import json
import pandas as pd
import numpy as np
import psutil
import hashlib
from typing import Dict, List, Any, Optional
import logging
from datetime import datetime
logger = logging.getLogger(__name__)
class DataCache:
"""
Data caching utility for optimizing repeated data loading operations.
This class provides intelligent caching of loaded market data to eliminate
redundant I/O operations when running multiple strategies or parameter
optimizations with the same data requirements.
Features:
- Automatic cache key generation based on file path and date range
- Memory-efficient storage with DataFrame copying to prevent mutations
- Cache statistics tracking for performance monitoring
- File modification time tracking for cache invalidation
- Configurable memory limits to prevent excessive memory usage
Example:
cache = DataCache(max_cache_size=10)
data1 = cache.get_data("btc_data.csv", "2023-01-01", "2023-01-31", data_loader)
data2 = cache.get_data("btc_data.csv", "2023-01-01", "2023-01-31", data_loader) # Cache hit
print(cache.get_cache_stats()) # {'hits': 1, 'misses': 1, 'hit_ratio': 0.5}
"""
def __init__(self, max_cache_size: int = 20):
"""
Initialize data cache.
Args:
max_cache_size: Maximum number of datasets to cache (LRU eviction)
"""
self._cache: Dict[str, Dict[str, Any]] = {}
self._access_order: List[str] = [] # For LRU tracking
self._max_cache_size = max_cache_size
self._cache_stats = {
'hits': 0,
'misses': 0,
'evictions': 0,
'total_requests': 0
}
logger.info(f"DataCache initialized with max_cache_size={max_cache_size}")
def get_data(self, file_path: str, start_date: str, end_date: str,
data_loader: 'DataLoader') -> pd.DataFrame:
"""
Get data from cache or load if not cached.
Args:
file_path: Path to the data file (relative to data_dir)
start_date: Start date for filtering (YYYY-MM-DD format)
end_date: End date for filtering (YYYY-MM-DD format)
data_loader: DataLoader instance to use for loading data
Returns:
pd.DataFrame: Loaded OHLCV data with DatetimeIndex
"""
self._cache_stats['total_requests'] += 1
# Generate cache key
cache_key = self._generate_cache_key(file_path, start_date, end_date, data_loader.data_dir)
# Check if data is cached and still valid
if cache_key in self._cache:
cached_entry = self._cache[cache_key]
# Check if file has been modified since caching
if self._is_cache_valid(cached_entry, file_path, data_loader.data_dir):
self._cache_stats['hits'] += 1
self._update_access_order(cache_key)
logger.debug(f"Cache HIT for {file_path} [{start_date} to {end_date}]")
# Return a copy to prevent mutations affecting cached data
return cached_entry['data'].copy()
# Cache miss - load data
self._cache_stats['misses'] += 1
logger.debug(f"Cache MISS for {file_path} [{start_date} to {end_date}] - loading from disk")
# Load data using the provided data loader
data = data_loader.load_data(file_path, start_date, end_date)
# Cache the loaded data
self._store_in_cache(cache_key, data, file_path, data_loader.data_dir)
# Return a copy to prevent mutations affecting cached data
return data.copy()
def _generate_cache_key(self, file_path: str, start_date: str, end_date: str, data_dir: str) -> str:
"""Generate a unique cache key for the data request."""
# Include file path, date range, and data directory in the key
key_components = f"{data_dir}:{file_path}:{start_date}:{end_date}"
# Use hash for consistent key length and to handle special characters
cache_key = hashlib.md5(key_components.encode()).hexdigest()
return cache_key
def _is_cache_valid(self, cached_entry: Dict[str, Any], file_path: str, data_dir: str) -> bool:
"""Check if cached data is still valid (file not modified)."""
try:
full_path = os.path.join(data_dir, file_path)
current_mtime = os.path.getmtime(full_path)
cached_mtime = cached_entry['file_mtime']
return current_mtime == cached_mtime
except (OSError, KeyError):
# File not found or missing metadata - consider invalid
return False
def _store_in_cache(self, cache_key: str, data: pd.DataFrame, file_path: str, data_dir: str) -> None:
"""Store data in cache with metadata."""
# Enforce cache size limit using LRU eviction
if len(self._cache) >= self._max_cache_size:
self._evict_lru_entry()
# Get file modification time for cache validation
try:
full_path = os.path.join(data_dir, file_path)
file_mtime = os.path.getmtime(full_path)
except OSError:
file_mtime = 0 # Fallback if file not accessible
# Store cache entry
cache_entry = {
'data': data.copy(), # Store a copy to prevent external mutations
'file_path': file_path,
'file_mtime': file_mtime,
'cached_at': datetime.now(),
'data_shape': data.shape,
'memory_usage_mb': data.memory_usage(deep=True).sum() / 1024 / 1024
}
self._cache[cache_key] = cache_entry
self._update_access_order(cache_key)
logger.debug(f"Cached data for {file_path}: {data.shape[0]} rows, "
f"{cache_entry['memory_usage_mb']:.1f}MB")
def _update_access_order(self, cache_key: str) -> None:
"""Update LRU access order."""
if cache_key in self._access_order:
self._access_order.remove(cache_key)
self._access_order.append(cache_key)
def _evict_lru_entry(self) -> None:
"""Evict least recently used cache entry."""
if not self._access_order:
return
lru_key = self._access_order.pop(0)
evicted_entry = self._cache.pop(lru_key, None)
if evicted_entry:
self._cache_stats['evictions'] += 1
logger.debug(f"Evicted LRU cache entry: {evicted_entry['file_path']} "
f"({evicted_entry['memory_usage_mb']:.1f}MB)")
def get_cache_stats(self) -> Dict[str, Any]:
"""
Get cache performance statistics.
Returns:
Dict containing cache statistics including hit ratio and memory usage
"""
total_requests = self._cache_stats['total_requests']
hits = self._cache_stats['hits']
hit_ratio = hits / total_requests if total_requests > 0 else 0.0
# Calculate total memory usage
total_memory_mb = sum(
entry['memory_usage_mb'] for entry in self._cache.values()
)
stats = {
'hits': hits,
'misses': self._cache_stats['misses'],
'evictions': self._cache_stats['evictions'],
'total_requests': total_requests,
'hit_ratio': hit_ratio,
'cached_datasets': len(self._cache),
'max_cache_size': self._max_cache_size,
'total_memory_mb': total_memory_mb
}
return stats
def clear_cache(self) -> None:
"""Clear all cached data."""
cleared_count = len(self._cache)
cleared_memory_mb = sum(entry['memory_usage_mb'] for entry in self._cache.values())
self._cache.clear()
self._access_order.clear()
# Reset stats except totals (for historical tracking)
self._cache_stats['evictions'] += cleared_count
logger.info(f"Cache cleared: {cleared_count} datasets, {cleared_memory_mb:.1f}MB freed")
def get_cached_datasets_info(self) -> List[Dict[str, Any]]:
"""Get information about all cached datasets."""
datasets_info = []
for cache_key, entry in self._cache.items():
dataset_info = {
'cache_key': cache_key,
'file_path': entry['file_path'],
'cached_at': entry['cached_at'],
'data_shape': entry['data_shape'],
'memory_usage_mb': entry['memory_usage_mb']
}
datasets_info.append(dataset_info)
# Sort by access order (most recent first)
datasets_info.sort(
key=lambda x: self._access_order.index(x['cache_key']) if x['cache_key'] in self._access_order else -1,
reverse=True
)
return datasets_info
class DataLoader:
"""
Data loading utilities for backtesting.
This class handles loading and preprocessing of market data from various formats
including CSV and JSON files.
"""
def __init__(self, data_dir: str = "data"):
"""
Initialize data loader.
Args:
data_dir: Directory containing data files
"""
self.data_dir = data_dir
os.makedirs(self.data_dir, exist_ok=True)
def load_data(self, file_path: str, start_date: str, end_date: str) -> pd.DataFrame:
"""
Load data with optimized dtypes and filtering, supporting CSV and JSON input.
Args:
file_path: Path to the data file (relative to data_dir)
start_date: Start date for filtering (YYYY-MM-DD format)
end_date: End date for filtering (YYYY-MM-DD format)
Returns:
pd.DataFrame: Loaded OHLCV data with DatetimeIndex
"""
full_path = os.path.join(self.data_dir, file_path)
if not os.path.exists(full_path):
raise FileNotFoundError(f"Data file not found: {full_path}")
# Determine file type
_, ext = os.path.splitext(file_path)
ext = ext.lower()
try:
if ext == ".json":
return self._load_json_data(full_path, start_date, end_date)
else:
return self._load_csv_data(full_path, start_date, end_date)
except Exception as e:
logger.error(f"Error loading data from {file_path}: {e}")
# Return an empty DataFrame with a DatetimeIndex
return pd.DataFrame(index=pd.to_datetime([]))
def _load_json_data(self, file_path: str, start_date: str, end_date: str) -> pd.DataFrame:
"""Load data from JSON file."""
with open(file_path, 'r') as f:
raw = json.load(f)
data = pd.DataFrame(raw["Data"])
# Convert columns to lowercase
data.columns = data.columns.str.lower()
# Convert timestamp to datetime
data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")
# Filter by date range
data = data[(data["timestamp"] >= start_date) & (data["timestamp"] <= end_date)]
logger.info(f"JSON data loaded: {len(data)} rows for {start_date} to {end_date}")
return data.set_index("timestamp")
def _load_csv_data(self, file_path: str, start_date: str, end_date: str) -> pd.DataFrame:
"""Load data from CSV file."""
# Define optimized dtypes
dtypes = {
'Open': 'float32',
'High': 'float32',
'Low': 'float32',
'Close': 'float32',
'Volume': 'float32'
}
# Read data with original capitalized column names
try:
data = pd.read_csv(file_path, dtype=dtypes)
except Exception as e:
logger.warning(f"Failed to read CSV with default engine, trying python engine: {e}")
data = pd.read_csv(file_path, dtype=dtypes, engine='python')
# Handle timestamp column
if 'Timestamp' in data.columns:
data['Timestamp'] = pd.to_datetime(data['Timestamp'], unit='s')
# Filter by date range
data = data[(data['Timestamp'] >= start_date) & (data['Timestamp'] <= end_date)]
# Convert column names to lowercase
data.columns = data.columns.str.lower()
# Convert numpy float32 to Python float for compatibility
numeric_columns = ['open', 'high', 'low', 'close', 'volume']
for col in numeric_columns:
if col in data.columns:
data[col] = data[col].astype(float)
logger.info(f"CSV data loaded: {len(data)} rows for {start_date} to {end_date}")
return data.set_index('timestamp')
else:
# Attempt to use the first column if 'Timestamp' is not present
data.rename(columns={data.columns[0]: 'timestamp'}, inplace=True)
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')
data = data[(data['timestamp'] >= start_date) & (data['timestamp'] <= end_date)]
data.columns = data.columns.str.lower()
# Convert numpy float32 to Python float for compatibility
numeric_columns = ['open', 'high', 'low', 'close', 'volume']
for col in numeric_columns:
if col in data.columns:
data[col] = data[col].astype(float)
logger.info(f"CSV data loaded (first column as timestamp): {len(data)} rows for {start_date} to {end_date}")
return data.set_index('timestamp')
def validate_data(self, data: pd.DataFrame) -> bool:
"""
Validate loaded data for required columns and basic integrity.
Args:
data: DataFrame to validate
Returns:
bool: True if data is valid
"""
if data.empty:
logger.error("Data is empty")
return False
required_columns = ['open', 'high', 'low', 'close', 'volume']
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
logger.error(f"Missing required columns: {missing_columns}")
return False
# Check for NaN values
if data[required_columns].isnull().any().any():
logger.warning("Data contains NaN values")
# Check for negative prices
price_columns = ['open', 'high', 'low', 'close']
if (data[price_columns] <= 0).any().any():
logger.warning("Data contains non-positive prices")
# Check OHLC consistency
if not ((data['low'] <= data['open']) &
(data['low'] <= data['close']) &
(data['high'] >= data['open']) &
(data['high'] >= data['close'])).all():
logger.warning("Data contains OHLC inconsistencies")
return True
class SystemUtils:
"""
System resource management utilities.
This class provides methods for determining optimal system resource usage
for parallel processing and performance optimization.
"""
def __init__(self):
"""Initialize system utilities."""
pass
def get_optimal_workers(self) -> int:
"""
Determine optimal number of worker processes based on system resources.
Returns:
int: Optimal number of worker processes
"""
cpu_count = os.cpu_count() or 4
memory_gb = psutil.virtual_memory().total / (1024**3)
# Heuristic: Use 75% of cores, but cap based on available memory
# Assume each worker needs ~2GB for large datasets
workers_by_memory = max(1, int(memory_gb / 2))
workers_by_cpu = max(1, int(cpu_count * 0.75))
optimal_workers = min(workers_by_cpu, workers_by_memory)
logger.info(f"System resources: {cpu_count} CPUs, {memory_gb:.1f}GB RAM")
logger.info(f"Using {optimal_workers} workers for processing")
return optimal_workers
def get_system_info(self) -> Dict[str, Any]:
"""
Get comprehensive system information.
Returns:
Dict containing system information
"""
memory = psutil.virtual_memory()
return {
"cpu_count": os.cpu_count(),
"memory_total_gb": memory.total / (1024**3),
"memory_available_gb": memory.available / (1024**3),
"memory_percent": memory.percent,
"optimal_workers": self.get_optimal_workers()
}
class ResultsSaver:
"""
Results saving utilities for backtesting.
This class handles saving backtest results in various formats including
CSV, JSON, and comprehensive reports.
"""
def __init__(self, results_dir: str = "results"):
"""
Initialize results saver.
Args:
results_dir: Directory for saving results
"""
self.results_dir = results_dir
os.makedirs(self.results_dir, exist_ok=True)
def save_results_csv(self, results: List[Dict[str, Any]], filename: str) -> None:
"""
Save backtest results to CSV file.
Args:
results: List of backtest results
filename: Output filename
"""
try:
# Convert results to DataFrame for easy saving
df_data = []
for result in results:
if result.get("success", True):
row = {
"strategy_name": result.get("strategy_name", ""),
"profit_ratio": result.get("profit_ratio", 0),
"final_usd": result.get("final_usd", 0),
"n_trades": result.get("n_trades", 0),
"win_rate": result.get("win_rate", 0),
"max_drawdown": result.get("max_drawdown", 0),
"avg_trade": result.get("avg_trade", 0),
"total_fees_usd": result.get("total_fees_usd", 0),
"backtest_duration_seconds": result.get("backtest_duration_seconds", 0),
"data_points_processed": result.get("data_points_processed", 0)
}
# Add strategy parameters
strategy_params = result.get("strategy_params", {})
for key, value in strategy_params.items():
row[f"strategy_{key}"] = value
# Add trader parameters
trader_params = result.get("trader_params", {})
for key, value in trader_params.items():
row[f"trader_{key}"] = value
df_data.append(row)
# Save to CSV
df = pd.DataFrame(df_data)
full_path = os.path.join(self.results_dir, filename)
df.to_csv(full_path, index=False)
logger.info(f"Results saved to {full_path}: {len(df_data)} rows")
except Exception as e:
logger.error(f"Error saving results to {filename}: {e}")
raise
def save_comprehensive_results(self, results: List[Dict[str, Any]],
base_filename: str,
summary: Optional[Dict[str, Any]] = None,
action_log: Optional[List[Dict[str, Any]]] = None,
session_start_time: Optional[datetime] = None) -> None:
"""
Save comprehensive backtest results including summary, individual results, and logs.
Args:
results: List of backtest results
base_filename: Base filename (without extension)
summary: Optional summary statistics
action_log: Optional action log
session_start_time: Optional session start time
"""
try:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
session_start = session_start_time or datetime.now()
# 1. Save summary report
if summary is None:
summary = self._calculate_summary_statistics(results)
summary_data = {
"session_info": {
"timestamp": timestamp,
"session_start": session_start.isoformat(),
"session_duration_seconds": (datetime.now() - session_start).total_seconds()
},
"summary_statistics": summary,
"action_log_summary": {
"total_actions": len(action_log) if action_log else 0,
"action_types": list(set(action["action_type"] for action in action_log)) if action_log else []
}
}
summary_filename = f"{base_filename}_summary_{timestamp}.json"
self._save_json(summary_data, summary_filename)
# 2. Save detailed results CSV
self.save_results_csv(results, f"{base_filename}_detailed_{timestamp}.csv")
# 3. Save individual strategy results
valid_results = [r for r in results if r.get("success", True)]
for i, result in enumerate(valid_results):
strategy_filename = f"{base_filename}_strategy_{i+1}_{result['strategy_name']}_{timestamp}.json"
strategy_data = self._format_strategy_result(result)
self._save_json(strategy_data, strategy_filename)
# 4. Save action log if provided
if action_log:
action_log_filename = f"{base_filename}_actions_{timestamp}.json"
action_log_data = {
"session_info": {
"timestamp": timestamp,
"session_start": session_start.isoformat(),
"total_actions": len(action_log)
},
"actions": action_log
}
self._save_json(action_log_data, action_log_filename)
# 5. Create master index file
index_filename = f"{base_filename}_index_{timestamp}.json"
index_data = self._create_index_file(base_filename, timestamp, valid_results, summary)
self._save_json(index_data, index_filename)
# Print summary
print(f"\n📊 Comprehensive results saved:")
print(f" 📋 Summary: {self.results_dir}/{summary_filename}")
print(f" 📈 Detailed CSV: {self.results_dir}/{base_filename}_detailed_{timestamp}.csv")
if action_log:
print(f" 📝 Action Log: {self.results_dir}/{action_log_filename}")
print(f" 📁 Individual Strategies: {len(valid_results)} files")
print(f" 🗂️ Master Index: {self.results_dir}/{index_filename}")
except Exception as e:
logger.error(f"Error saving comprehensive results: {e}")
raise
def _save_json(self, data: Dict[str, Any], filename: str) -> None:
"""Save data to JSON file."""
full_path = os.path.join(self.results_dir, filename)
with open(full_path, 'w') as f:
json.dump(data, f, indent=2, default=str)
logger.info(f"JSON saved: {full_path}")
def _calculate_summary_statistics(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
"""Calculate summary statistics from results."""
valid_results = [r for r in results if r.get("success", True)]
if not valid_results:
return {
"total_runs": len(results),
"successful_runs": 0,
"failed_runs": len(results),
"error": "No valid results to summarize"
}
# Extract metrics
profit_ratios = [r["profit_ratio"] for r in valid_results]
final_balances = [r["final_usd"] for r in valid_results]
n_trades_list = [r["n_trades"] for r in valid_results]
win_rates = [r["win_rate"] for r in valid_results]
max_drawdowns = [r["max_drawdown"] for r in valid_results]
return {
"total_runs": len(results),
"successful_runs": len(valid_results),
"failed_runs": len(results) - len(valid_results),
"profit_ratio": {
"mean": np.mean(profit_ratios),
"std": np.std(profit_ratios),
"min": np.min(profit_ratios),
"max": np.max(profit_ratios),
"median": np.median(profit_ratios)
},
"final_usd": {
"mean": np.mean(final_balances),
"std": np.std(final_balances),
"min": np.min(final_balances),
"max": np.max(final_balances),
"median": np.median(final_balances)
},
"n_trades": {
"mean": np.mean(n_trades_list),
"std": np.std(n_trades_list),
"min": np.min(n_trades_list),
"max": np.max(n_trades_list),
"median": np.median(n_trades_list)
},
"win_rate": {
"mean": np.mean(win_rates),
"std": np.std(win_rates),
"min": np.min(win_rates),
"max": np.max(win_rates),
"median": np.median(win_rates)
},
"max_drawdown": {
"mean": np.mean(max_drawdowns),
"std": np.std(max_drawdowns),
"min": np.min(max_drawdowns),
"max": np.max(max_drawdowns),
"median": np.median(max_drawdowns)
},
"best_run": max(valid_results, key=lambda x: x["profit_ratio"]),
"worst_run": min(valid_results, key=lambda x: x["profit_ratio"])
}
def _format_strategy_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
"""Format individual strategy result for saving."""
return {
"strategy_info": {
"name": result['strategy_name'],
"params": result.get('strategy_params', {}),
"trader_params": result.get('trader_params', {})
},
"performance": {
"initial_usd": result['initial_usd'],
"final_usd": result['final_usd'],
"profit_ratio": result['profit_ratio'],
"n_trades": result['n_trades'],
"win_rate": result['win_rate'],
"max_drawdown": result['max_drawdown'],
"avg_trade": result['avg_trade'],
"total_fees_usd": result['total_fees_usd']
},
"execution": {
"backtest_duration_seconds": result.get('backtest_duration_seconds', 0),
"data_points_processed": result.get('data_points_processed', 0),
"warmup_complete": result.get('warmup_complete', False)
},
"trades": result.get('trades', [])
}
def _create_index_file(self, base_filename: str, timestamp: str,
valid_results: List[Dict[str, Any]],
summary: Dict[str, Any]) -> Dict[str, Any]:
"""Create master index file."""
return {
"session_info": {
"timestamp": timestamp,
"base_filename": base_filename,
"total_strategies": len(valid_results)
},
"files": {
"summary": f"{base_filename}_summary_{timestamp}.json",
"detailed_csv": f"{base_filename}_detailed_{timestamp}.csv",
"individual_strategies": [
f"{base_filename}_strategy_{i+1}_{result['strategy_name']}_{timestamp}.json"
for i, result in enumerate(valid_results)
]
},
"quick_stats": {
"best_profit": summary.get("profit_ratio", {}).get("max", 0) if summary.get("profit_ratio") else 0,
"worst_profit": summary.get("profit_ratio", {}).get("min", 0) if summary.get("profit_ratio") else 0,
"avg_profit": summary.get("profit_ratio", {}).get("mean", 0) if summary.get("profit_ratio") else 0,
"total_successful_runs": summary.get("successful_runs", 0),
"total_failed_runs": summary.get("failed_runs", 0)
}
}