""" Backtester Utilities This module provides utility functions for data loading, system resource management, and result saving for the incremental backtesting framework. """ import os import json import pandas as pd import numpy as np import psutil import hashlib from typing import Dict, List, Any, Optional import logging from datetime import datetime logger = logging.getLogger(__name__) class DataCache: """ Data caching utility for optimizing repeated data loading operations. This class provides intelligent caching of loaded market data to eliminate redundant I/O operations when running multiple strategies or parameter optimizations with the same data requirements. Features: - Automatic cache key generation based on file path and date range - Memory-efficient storage with DataFrame copying to prevent mutations - Cache statistics tracking for performance monitoring - File modification time tracking for cache invalidation - Configurable memory limits to prevent excessive memory usage Example: cache = DataCache(max_cache_size=10) data1 = cache.get_data("btc_data.csv", "2023-01-01", "2023-01-31", data_loader) data2 = cache.get_data("btc_data.csv", "2023-01-01", "2023-01-31", data_loader) # Cache hit print(cache.get_cache_stats()) # {'hits': 1, 'misses': 1, 'hit_ratio': 0.5} """ def __init__(self, max_cache_size: int = 20): """ Initialize data cache. Args: max_cache_size: Maximum number of datasets to cache (LRU eviction) """ self._cache: Dict[str, Dict[str, Any]] = {} self._access_order: List[str] = [] # For LRU tracking self._max_cache_size = max_cache_size self._cache_stats = { 'hits': 0, 'misses': 0, 'evictions': 0, 'total_requests': 0 } logger.info(f"DataCache initialized with max_cache_size={max_cache_size}") def get_data(self, file_path: str, start_date: str, end_date: str, data_loader: 'DataLoader') -> pd.DataFrame: """ Get data from cache or load if not cached. 

        Args:
            file_path: Path to the data file (relative to data_dir)
            start_date: Start date for filtering (YYYY-MM-DD format)
            end_date: End date for filtering (YYYY-MM-DD format)
            data_loader: DataLoader instance to use for loading data

        Returns:
            pd.DataFrame: Loaded OHLCV data with DatetimeIndex
        """
        self._cache_stats['total_requests'] += 1

        # Generate cache key
        cache_key = self._generate_cache_key(file_path, start_date, end_date, data_loader.data_dir)

        # Check if data is cached and still valid
        if cache_key in self._cache:
            cached_entry = self._cache[cache_key]

            # Check if file has been modified since caching
            if self._is_cache_valid(cached_entry, file_path, data_loader.data_dir):
                self._cache_stats['hits'] += 1
                self._update_access_order(cache_key)
                logger.debug(f"Cache HIT for {file_path} [{start_date} to {end_date}]")
                # Return a copy to prevent mutations affecting cached data
                return cached_entry['data'].copy()

        # Cache miss - load data
        self._cache_stats['misses'] += 1
        logger.debug(f"Cache MISS for {file_path} [{start_date} to {end_date}] - loading from disk")

        # Load data using the provided data loader
        data = data_loader.load_data(file_path, start_date, end_date)

        # Cache the loaded data
        self._store_in_cache(cache_key, data, file_path, data_loader.data_dir)

        # Return a copy to prevent mutations affecting cached data
        return data.copy()

    def _generate_cache_key(self, file_path: str, start_date: str, end_date: str,
                            data_dir: str) -> str:
        """Generate a unique cache key for the data request."""
        # Include file path, date range, and data directory in the key
        key_components = f"{data_dir}:{file_path}:{start_date}:{end_date}"

        # Use hash for consistent key length and to handle special characters
        cache_key = hashlib.md5(key_components.encode()).hexdigest()

        return cache_key

    def _is_cache_valid(self, cached_entry: Dict[str, Any], file_path: str, data_dir: str) -> bool:
        """Check if cached data is still valid (file not modified)."""
        try:
            full_path = os.path.join(data_dir, file_path)
            current_mtime = os.path.getmtime(full_path)
            cached_mtime = cached_entry['file_mtime']
            return current_mtime == cached_mtime
        except (OSError, KeyError):
            # File not found or missing metadata - consider invalid
            return False

    def _store_in_cache(self, cache_key: str, data: pd.DataFrame, file_path: str, data_dir: str) -> None:
        """Store data in cache with metadata."""
        # Enforce cache size limit using LRU eviction
        if len(self._cache) >= self._max_cache_size:
            self._evict_lru_entry()

        # Get file modification time for cache validation
        try:
            full_path = os.path.join(data_dir, file_path)
            file_mtime = os.path.getmtime(full_path)
        except OSError:
            file_mtime = 0  # Fallback if file not accessible

        # Store cache entry
        cache_entry = {
            'data': data.copy(),  # Store a copy to prevent external mutations
            'file_path': file_path,
            'file_mtime': file_mtime,
            'cached_at': datetime.now(),
            'data_shape': data.shape,
            'memory_usage_mb': data.memory_usage(deep=True).sum() / 1024 / 1024
        }

        self._cache[cache_key] = cache_entry
        self._update_access_order(cache_key)

        logger.debug(f"Cached data for {file_path}: {data.shape[0]} rows, "
                     f"{cache_entry['memory_usage_mb']:.1f}MB")

    def _update_access_order(self, cache_key: str) -> None:
        """Update LRU access order."""
        if cache_key in self._access_order:
            self._access_order.remove(cache_key)
        self._access_order.append(cache_key)

    def _evict_lru_entry(self) -> None:
        """Evict least recently used cache entry."""
        if not self._access_order:
            return

        lru_key = self._access_order.pop(0)
        evicted_entry = self._cache.pop(lru_key, None)

        if evicted_entry:
            self._cache_stats['evictions'] += 1
            logger.debug(f"Evicted LRU cache entry: {evicted_entry['file_path']} "
                         f"({evicted_entry['memory_usage_mb']:.1f}MB)")

    def get_cache_stats(self) -> Dict[str, Any]:
        """
        Get cache performance statistics.

        Returns:
            Dict containing cache statistics including hit ratio and memory usage
        """
        total_requests = self._cache_stats['total_requests']
        hits = self._cache_stats['hits']
        hit_ratio = hits / total_requests if total_requests > 0 else 0.0

        # Calculate total memory usage
        total_memory_mb = sum(
            entry['memory_usage_mb'] for entry in self._cache.values()
        )

        stats = {
            'hits': hits,
            'misses': self._cache_stats['misses'],
            'evictions': self._cache_stats['evictions'],
            'total_requests': total_requests,
            'hit_ratio': hit_ratio,
            'cached_datasets': len(self._cache),
            'max_cache_size': self._max_cache_size,
            'total_memory_mb': total_memory_mb
        }

        return stats

    def clear_cache(self) -> None:
        """Clear all cached data."""
        cleared_count = len(self._cache)
        cleared_memory_mb = sum(entry['memory_usage_mb'] for entry in self._cache.values())

        self._cache.clear()
        self._access_order.clear()

        # Count cleared entries as evictions so historical totals are preserved
        self._cache_stats['evictions'] += cleared_count

        logger.info(f"Cache cleared: {cleared_count} datasets, {cleared_memory_mb:.1f}MB freed")

    def get_cached_datasets_info(self) -> List[Dict[str, Any]]:
        """Get information about all cached datasets."""
        datasets_info = []

        for cache_key, entry in self._cache.items():
            dataset_info = {
                'cache_key': cache_key,
                'file_path': entry['file_path'],
                'cached_at': entry['cached_at'],
                'data_shape': entry['data_shape'],
                'memory_usage_mb': entry['memory_usage_mb']
            }
            datasets_info.append(dataset_info)

        # Sort by access order (most recent first)
        datasets_info.sort(
            key=lambda x: self._access_order.index(x['cache_key'])
            if x['cache_key'] in self._access_order else -1,
            reverse=True
        )

        return datasets_info


class DataLoader:
    """
    Data loading utilities for backtesting.

    This class handles loading and preprocessing of market data from various
    formats including CSV and JSON files.
    """

    def __init__(self, data_dir: str = "data"):
        """
        Initialize data loader.

        Args:
            data_dir: Directory containing data files
        """
        self.data_dir = data_dir
        os.makedirs(self.data_dir, exist_ok=True)

    def load_data(self, file_path: str, start_date: str, end_date: str) -> pd.DataFrame:
        """
        Load data with optimized dtypes and filtering, supporting CSV and JSON input.

        Args:
            file_path: Path to the data file (relative to data_dir)
            start_date: Start date for filtering (YYYY-MM-DD format)
            end_date: End date for filtering (YYYY-MM-DD format)

        Returns:
            pd.DataFrame: Loaded OHLCV data with DatetimeIndex
        """
        full_path = os.path.join(self.data_dir, file_path)

        if not os.path.exists(full_path):
            raise FileNotFoundError(f"Data file not found: {full_path}")

        # Determine file type
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        try:
            if ext == ".json":
                return self._load_json_data(full_path, start_date, end_date)
            else:
                return self._load_csv_data(full_path, start_date, end_date)
        except Exception as e:
            logger.error(f"Error loading data from {file_path}: {e}")
            # Return an empty DataFrame with a DatetimeIndex
            return pd.DataFrame(index=pd.to_datetime([]))

    def _load_json_data(self, file_path: str, start_date: str, end_date: str) -> pd.DataFrame:
        """Load data from JSON file."""
        with open(file_path, 'r') as f:
            raw = json.load(f)

        data = pd.DataFrame(raw["Data"])

        # Convert columns to lowercase
        data.columns = data.columns.str.lower()

        # Convert timestamp to datetime
        data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")

        # Filter by date range
        data = data[(data["timestamp"] >= start_date) & (data["timestamp"] <= end_date)]

        logger.info(f"JSON data loaded: {len(data)} rows for {start_date} to {end_date}")
        return data.set_index("timestamp")

    def _load_csv_data(self, file_path: str, start_date: str, end_date: str) -> pd.DataFrame:
        """Load data from CSV file."""
        # Define optimized dtypes
        dtypes = {
            'Open': 'float32',
            'High': 'float32',
            'Low': 'float32',
            'Close': 'float32',
            'Volume': 'float32'
        }

        # Read data with original capitalized column names
        try:
            data = pd.read_csv(file_path, dtype=dtypes)
        except Exception as e:
            logger.warning(f"Failed to read CSV with default engine, trying python engine: {e}")
            data = pd.read_csv(file_path, dtype=dtypes, engine='python')

        # Handle timestamp column
        if 'Timestamp' in data.columns:
            data['Timestamp'] = pd.to_datetime(data['Timestamp'], unit='s')

            # Filter by date range
            data = data[(data['Timestamp'] >= start_date) & (data['Timestamp'] <= end_date)]

            # Convert column names to lowercase
            data.columns = data.columns.str.lower()

            # Convert numpy float32 to Python float for compatibility
            numeric_columns = ['open', 'high', 'low', 'close', 'volume']
            for col in numeric_columns:
                if col in data.columns:
                    data[col] = data[col].astype(float)

            logger.info(f"CSV data loaded: {len(data)} rows for {start_date} to {end_date}")
            return data.set_index('timestamp')
        else:
            # Attempt to use the first column if 'Timestamp' is not present
            data.rename(columns={data.columns[0]: 'timestamp'}, inplace=True)
            data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')
            data = data[(data['timestamp'] >= start_date) & (data['timestamp'] <= end_date)]
            data.columns = data.columns.str.lower()

            # Convert numpy float32 to Python float for compatibility
            numeric_columns = ['open', 'high', 'low', 'close', 'volume']
            for col in numeric_columns:
                if col in data.columns:
                    data[col] = data[col].astype(float)

            logger.info(f"CSV data loaded (first column as timestamp): {len(data)} rows for {start_date} to {end_date}")
            return data.set_index('timestamp')

    def validate_data(self, data: pd.DataFrame) -> bool:
        """
        Validate loaded data for required columns and basic integrity.

        Args:
            data: DataFrame to validate

        Returns:
            bool: True if data is valid
        """
        if data.empty:
            logger.error("Data is empty")
            return False

        required_columns = ['open', 'high', 'low', 'close', 'volume']
        missing_columns = [col for col in required_columns if col not in data.columns]

        if missing_columns:
            logger.error(f"Missing required columns: {missing_columns}")
            return False

        # Check for NaN values
        if data[required_columns].isnull().any().any():
            logger.warning("Data contains NaN values")

        # Check for non-positive prices
        price_columns = ['open', 'high', 'low', 'close']
        if (data[price_columns] <= 0).any().any():
            logger.warning("Data contains non-positive prices")

        # Check OHLC consistency
        if not ((data['low'] <= data['open']) & (data['low'] <= data['close']) &
                (data['high'] >= data['open']) & (data['high'] >= data['close'])).all():
            logger.warning("Data contains OHLC inconsistencies")

        return True


class SystemUtils:
    """
    System resource management utilities.

    This class provides methods for determining optimal system resource usage
    for parallel processing and performance optimization.
    """

    def __init__(self):
        """Initialize system utilities."""
        pass

    def get_optimal_workers(self) -> int:
        """
        Determine optimal number of worker processes based on system resources.

        Returns:
            int: Optimal number of worker processes
        """
        cpu_count = os.cpu_count() or 4
        memory_gb = psutil.virtual_memory().total / (1024**3)

        # Heuristic: Use 75% of cores, but cap based on available memory
        # Assume each worker needs ~2GB for large datasets
        workers_by_memory = max(1, int(memory_gb / 2))
        workers_by_cpu = max(1, int(cpu_count * 0.75))

        optimal_workers = min(workers_by_cpu, workers_by_memory)

        logger.info(f"System resources: {cpu_count} CPUs, {memory_gb:.1f}GB RAM")
        logger.info(f"Using {optimal_workers} workers for processing")

        return optimal_workers

    def get_system_info(self) -> Dict[str, Any]:
        """
        Get comprehensive system information.

        Returns:
            Dict containing system information
        """
        memory = psutil.virtual_memory()

        return {
            "cpu_count": os.cpu_count(),
            "memory_total_gb": memory.total / (1024**3),
            "memory_available_gb": memory.available / (1024**3),
            "memory_percent": memory.percent,
            "optimal_workers": self.get_optimal_workers()
        }


class ResultsSaver:
    """
    Results saving utilities for backtesting.

    This class handles saving backtest results in various formats including
    CSV, JSON, and comprehensive reports.
    """

    def __init__(self, results_dir: str = "results"):
        """
        Initialize results saver.

        Args:
            results_dir: Directory for saving results
        """
        self.results_dir = results_dir
        os.makedirs(self.results_dir, exist_ok=True)

    def save_results_csv(self, results: List[Dict[str, Any]], filename: str) -> None:
        """
        Save backtest results to CSV file.

        Args:
            results: List of backtest results
            filename: Output filename
        """
        try:
            # Convert results to DataFrame for easy saving
            df_data = []
            for result in results:
                if result.get("success", True):
                    row = {
                        "strategy_name": result.get("strategy_name", ""),
                        "profit_ratio": result.get("profit_ratio", 0),
                        "final_usd": result.get("final_usd", 0),
                        "n_trades": result.get("n_trades", 0),
                        "win_rate": result.get("win_rate", 0),
                        "max_drawdown": result.get("max_drawdown", 0),
                        "avg_trade": result.get("avg_trade", 0),
                        "total_fees_usd": result.get("total_fees_usd", 0),
                        "backtest_duration_seconds": result.get("backtest_duration_seconds", 0),
                        "data_points_processed": result.get("data_points_processed", 0)
                    }

                    # Add strategy parameters
                    strategy_params = result.get("strategy_params", {})
                    for key, value in strategy_params.items():
                        row[f"strategy_{key}"] = value

                    # Add trader parameters
                    trader_params = result.get("trader_params", {})
                    for key, value in trader_params.items():
                        row[f"trader_{key}"] = value

                    df_data.append(row)

            # Save to CSV
            df = pd.DataFrame(df_data)
            full_path = os.path.join(self.results_dir, filename)
            df.to_csv(full_path, index=False)

            logger.info(f"Results saved to {full_path}: {len(df_data)} rows")

        except Exception as e:
            logger.error(f"Error saving results to {filename}: {e}")
            raise

    def save_comprehensive_results(self, results: List[Dict[str, Any]], base_filename: str,
                                   summary: Optional[Dict[str, Any]] = None,
                                   action_log: Optional[List[Dict[str, Any]]] = None,
                                   session_start_time: Optional[datetime] = None) -> None:
        """
        Save comprehensive backtest results including summary, individual results, and logs.

        Args:
            results: List of backtest results
            base_filename: Base filename (without extension)
            summary: Optional summary statistics
            action_log: Optional action log
            session_start_time: Optional session start time
        """
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            session_start = session_start_time or datetime.now()

            # 1. Save summary report
            if summary is None:
                summary = self._calculate_summary_statistics(results)

            summary_data = {
                "session_info": {
                    "timestamp": timestamp,
                    "session_start": session_start.isoformat(),
                    "session_duration_seconds": (datetime.now() - session_start).total_seconds()
                },
                "summary_statistics": summary,
                "action_log_summary": {
                    "total_actions": len(action_log) if action_log else 0,
                    "action_types": list(set(action["action_type"] for action in action_log)) if action_log else []
                }
            }

            summary_filename = f"{base_filename}_summary_{timestamp}.json"
            self._save_json(summary_data, summary_filename)

            # 2. Save detailed results CSV
            self.save_results_csv(results, f"{base_filename}_detailed_{timestamp}.csv")

            # 3. Save individual strategy results
            valid_results = [r for r in results if r.get("success", True)]
            for i, result in enumerate(valid_results):
                strategy_filename = f"{base_filename}_strategy_{i+1}_{result['strategy_name']}_{timestamp}.json"
                strategy_data = self._format_strategy_result(result)
                self._save_json(strategy_data, strategy_filename)

            # 4. Save action log if provided
            if action_log:
                action_log_filename = f"{base_filename}_actions_{timestamp}.json"
                action_log_data = {
                    "session_info": {
                        "timestamp": timestamp,
                        "session_start": session_start.isoformat(),
                        "total_actions": len(action_log)
                    },
                    "actions": action_log
                }
                self._save_json(action_log_data, action_log_filename)

            # 5. Create master index file
            index_filename = f"{base_filename}_index_{timestamp}.json"
            index_data = self._create_index_file(base_filename, timestamp, valid_results, summary)
            self._save_json(index_data, index_filename)

            # Print summary
            print(f"\nšŸ“Š Comprehensive results saved:")
            print(f"   šŸ“‹ Summary: {self.results_dir}/{summary_filename}")
            print(f"   šŸ“ˆ Detailed CSV: {self.results_dir}/{base_filename}_detailed_{timestamp}.csv")
            if action_log:
                print(f"   šŸ“ Action Log: {self.results_dir}/{action_log_filename}")
            print(f"   šŸ“ Individual Strategies: {len(valid_results)} files")
            print(f"   šŸ—‚ļø Master Index: {self.results_dir}/{index_filename}")

        except Exception as e:
            logger.error(f"Error saving comprehensive results: {e}")
            raise

    def _save_json(self, data: Dict[str, Any], filename: str) -> None:
        """Save data to JSON file."""
        full_path = os.path.join(self.results_dir, filename)
        with open(full_path, 'w') as f:
            json.dump(data, f, indent=2, default=str)
        logger.info(f"JSON saved: {full_path}")

    def _calculate_summary_statistics(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Calculate summary statistics from results."""
        valid_results = [r for r in results if r.get("success", True)]

        if not valid_results:
            return {
                "total_runs": len(results),
                "successful_runs": 0,
                "failed_runs": len(results),
                "error": "No valid results to summarize"
            }

        # Extract metrics
        profit_ratios = [r["profit_ratio"] for r in valid_results]
        final_balances = [r["final_usd"] for r in valid_results]
        n_trades_list = [r["n_trades"] for r in valid_results]
        win_rates = [r["win_rate"] for r in valid_results]
        max_drawdowns = [r["max_drawdown"] for r in valid_results]

        return {
            "total_runs": len(results),
            "successful_runs": len(valid_results),
            "failed_runs": len(results) - len(valid_results),
            "profit_ratio": {
                "mean": np.mean(profit_ratios),
                "std": np.std(profit_ratios),
                "min": np.min(profit_ratios),
                "max": np.max(profit_ratios),
                "median": np.median(profit_ratios)
            },
            "final_usd": {
                "mean": np.mean(final_balances),
                "std": np.std(final_balances),
                "min": np.min(final_balances),
                "max": np.max(final_balances),
                "median": np.median(final_balances)
            },
            "n_trades": {
                "mean": np.mean(n_trades_list),
                "std": np.std(n_trades_list),
                "min": np.min(n_trades_list),
                "max": np.max(n_trades_list),
                "median": np.median(n_trades_list)
            },
            "win_rate": {
                "mean": np.mean(win_rates),
                "std": np.std(win_rates),
                "min": np.min(win_rates),
                "max": np.max(win_rates),
                "median": np.median(win_rates)
            },
            "max_drawdown": {
                "mean": np.mean(max_drawdowns),
                "std": np.std(max_drawdowns),
                "min": np.min(max_drawdowns),
                "max": np.max(max_drawdowns),
                "median": np.median(max_drawdowns)
            },
            "best_run": max(valid_results, key=lambda x: x["profit_ratio"]),
            "worst_run": min(valid_results, key=lambda x: x["profit_ratio"])
        }

    def _format_strategy_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Format individual strategy result for saving."""
        return {
            "strategy_info": {
                "name": result['strategy_name'],
                "params": result.get('strategy_params', {}),
                "trader_params": result.get('trader_params', {})
            },
            "performance": {
                "initial_usd": result['initial_usd'],
                "final_usd": result['final_usd'],
                "profit_ratio": result['profit_ratio'],
                "n_trades": result['n_trades'],
                "win_rate": result['win_rate'],
                "max_drawdown": result['max_drawdown'],
                "avg_trade": result['avg_trade'],
                "total_fees_usd": result['total_fees_usd']
            },
            "execution": {
                "backtest_duration_seconds": result.get('backtest_duration_seconds', 0),
                "data_points_processed": result.get('data_points_processed', 0),
                "warmup_complete": result.get('warmup_complete', False)
            },
            "trades": result.get('trades', [])
        }

    def _create_index_file(self, base_filename: str, timestamp: str,
                           valid_results: List[Dict[str, Any]],
                           summary: Dict[str, Any]) -> Dict[str, Any]:
        """Create master index file."""
        return {
            "session_info": {
                "timestamp": timestamp,
                "base_filename": base_filename,
                "total_strategies": len(valid_results)
            },
            "files": {
                "summary": f"{base_filename}_summary_{timestamp}.json",
                "detailed_csv": f"{base_filename}_detailed_{timestamp}.csv",
                "individual_strategies": [
                    f"{base_filename}_strategy_{i+1}_{result['strategy_name']}_{timestamp}.json"
                    for i, result in enumerate(valid_results)
                ]
            },
            "quick_stats": {
                "best_profit": summary.get("profit_ratio", {}).get("max", 0) if summary.get("profit_ratio") else 0,
                "worst_profit": summary.get("profit_ratio", {}).get("min", 0) if summary.get("profit_ratio") else 0,
                "avg_profit": summary.get("profit_ratio", {}).get("mean", 0) if summary.get("profit_ratio") else 0,
                "total_successful_runs": summary.get("successful_runs", 0),
                "total_failed_runs": summary.get("failed_runs", 0)
            }
        }
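

# Minimal usage sketch showing how the utilities above are typically wired
# together. This block is illustrative only: the file name "btc_data.csv",
# the date range, and the example result dict are hypothetical placeholders,
# not part of the framework's API. Real results would come from the backtester.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    loader = DataLoader(data_dir="data")
    cache = DataCache(max_cache_size=5)

    if os.path.exists(os.path.join(loader.data_dir, "btc_data.csv")):
        # First call loads from disk (cache miss); the second is served from cache.
        data = cache.get_data("btc_data.csv", "2023-01-01", "2023-01-31", loader)
        data = cache.get_data("btc_data.csv", "2023-01-01", "2023-01-31", loader)
        print(cache.get_cache_stats())

        if loader.validate_data(data):
            print(f"Loaded {len(data)} rows of valid OHLCV data")

    # Inspect system resources before deciding how many parallel workers to use.
    print(SystemUtils().get_system_info())

    # Persist a dummy result set to demonstrate the comprehensive output files.
    example_results = [{
        "success": True,
        "strategy_name": "example_strategy",  # hypothetical strategy name
        "initial_usd": 1000.0,
        "final_usd": 1100.0,
        "profit_ratio": 0.10,
        "n_trades": 12,
        "win_rate": 0.58,
        "max_drawdown": 0.05,
        "avg_trade": 8.33,
        "total_fees_usd": 4.20,
    }]
    ResultsSaver(results_dir="results").save_comprehensive_results(
        example_results, base_filename="example_run"
    )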