Cycles/IncrementalTrader/backtester/utils.py

"""
Backtester Utilities

This module provides utility functions for data loading, system resource management,
and result saving for the incremental backtesting framework.
"""

import os
import json
import pandas as pd
import numpy as np
import psutil
import hashlib
from typing import Dict, List, Any, Optional
import logging
from datetime import datetime

logger = logging.getLogger(__name__)


class DataCache:
    """
    Data caching utility for optimizing repeated data loading operations.

    This class provides intelligent caching of loaded market data to eliminate
    redundant I/O operations when running multiple strategies or parameter
    optimizations with the same data requirements.

    Features:
    - Automatic cache key generation based on file path and date range
    - Memory-efficient storage with DataFrame copying to prevent mutations
    - Cache statistics tracking for performance monitoring
    - File modification time tracking for cache invalidation
    - Configurable memory limits to prevent excessive memory usage

    Example:
        cache = DataCache(max_cache_size=10)
        data1 = cache.get_data("btc_data.csv", "2023-01-01", "2023-01-31", data_loader)
        data2 = cache.get_data("btc_data.csv", "2023-01-01", "2023-01-31", data_loader)  # Cache hit
        print(cache.get_cache_stats())  # {'hits': 1, 'misses': 1, 'hit_ratio': 0.5}
    """

    def __init__(self, max_cache_size: int = 20):
        """
        Initialize data cache.

        Args:
            max_cache_size: Maximum number of datasets to cache (LRU eviction)
        """
        self._cache: Dict[str, Dict[str, Any]] = {}
        self._access_order: List[str] = []  # For LRU tracking
        self._max_cache_size = max_cache_size
        self._cache_stats = {
            'hits': 0,
            'misses': 0,
            'evictions': 0,
            'total_requests': 0
        }

        logger.info(f"DataCache initialized with max_cache_size={max_cache_size}")

    def get_data(self, file_path: str, start_date: str, end_date: str,
                 data_loader: 'DataLoader') -> pd.DataFrame:
        """
        Get data from cache or load if not cached.

        Args:
            file_path: Path to the data file (relative to data_dir)
            start_date: Start date for filtering (YYYY-MM-DD format)
            end_date: End date for filtering (YYYY-MM-DD format)
            data_loader: DataLoader instance to use for loading data

        Returns:
            pd.DataFrame: Loaded OHLCV data with DatetimeIndex
        """
        self._cache_stats['total_requests'] += 1

        # Generate cache key
        cache_key = self._generate_cache_key(file_path, start_date, end_date, data_loader.data_dir)

        # Check if data is cached and still valid
        if cache_key in self._cache:
            cached_entry = self._cache[cache_key]

            # Check if file has been modified since caching
            if self._is_cache_valid(cached_entry, file_path, data_loader.data_dir):
                self._cache_stats['hits'] += 1
                self._update_access_order(cache_key)

                logger.debug(f"Cache HIT for {file_path} [{start_date} to {end_date}]")

                # Return a copy to prevent mutations affecting cached data
                return cached_entry['data'].copy()

        # Cache miss - load data
        self._cache_stats['misses'] += 1
        logger.debug(f"Cache MISS for {file_path} [{start_date} to {end_date}] - loading from disk")

        # Load data using the provided data loader
        data = data_loader.load_data(file_path, start_date, end_date)

        # Cache the loaded data
        self._store_in_cache(cache_key, data, file_path, data_loader.data_dir)

        # Return a copy to prevent mutations affecting cached data
        return data.copy()

    def _generate_cache_key(self, file_path: str, start_date: str, end_date: str, data_dir: str) -> str:
        """Generate a unique cache key for the data request."""
        # Include file path, date range, and data directory in the key
        key_components = f"{data_dir}:{file_path}:{start_date}:{end_date}"

        # Use hash for consistent key length and to handle special characters
        cache_key = hashlib.md5(key_components.encode()).hexdigest()

        return cache_key

    def _is_cache_valid(self, cached_entry: Dict[str, Any], file_path: str, data_dir: str) -> bool:
        """Check if cached data is still valid (file not modified)."""
        try:
            full_path = os.path.join(data_dir, file_path)
            current_mtime = os.path.getmtime(full_path)
            cached_mtime = cached_entry['file_mtime']

            return current_mtime == cached_mtime
        except (OSError, KeyError):
            # File not found or missing metadata - consider invalid
            return False

    def _store_in_cache(self, cache_key: str, data: pd.DataFrame, file_path: str, data_dir: str) -> None:
        """Store data in cache with metadata."""
        # Enforce cache size limit using LRU eviction
        if len(self._cache) >= self._max_cache_size:
            self._evict_lru_entry()

        # Get file modification time for cache validation
        try:
            full_path = os.path.join(data_dir, file_path)
            file_mtime = os.path.getmtime(full_path)
        except OSError:
            file_mtime = 0  # Fallback if file not accessible

        # Store cache entry
        cache_entry = {
            'data': data.copy(),  # Store a copy to prevent external mutations
            'file_path': file_path,
            'file_mtime': file_mtime,
            'cached_at': datetime.now(),
            'data_shape': data.shape,
            'memory_usage_mb': data.memory_usage(deep=True).sum() / 1024 / 1024
        }

        self._cache[cache_key] = cache_entry
        self._update_access_order(cache_key)

        logger.debug(f"Cached data for {file_path}: {data.shape[0]} rows, "
                    f"{cache_entry['memory_usage_mb']:.1f}MB")

    def _update_access_order(self, cache_key: str) -> None:
        """Update LRU access order."""
        if cache_key in self._access_order:
            self._access_order.remove(cache_key)
        self._access_order.append(cache_key)

    def _evict_lru_entry(self) -> None:
        """Evict least recently used cache entry."""
        if not self._access_order:
            return

        lru_key = self._access_order.pop(0)
        evicted_entry = self._cache.pop(lru_key, None)

        if evicted_entry:
            self._cache_stats['evictions'] += 1
            logger.debug(f"Evicted LRU cache entry: {evicted_entry['file_path']} "
                        f"({evicted_entry['memory_usage_mb']:.1f}MB)")

    def get_cache_stats(self) -> Dict[str, Any]:
        """
        Get cache performance statistics.

        Returns:
            Dict containing cache statistics including hit ratio and memory usage
        """
        total_requests = self._cache_stats['total_requests']
        hits = self._cache_stats['hits']

        hit_ratio = hits / total_requests if total_requests > 0 else 0.0

        # Calculate total memory usage
        total_memory_mb = sum(
            entry['memory_usage_mb'] for entry in self._cache.values()
        )

        stats = {
            'hits': hits,
            'misses': self._cache_stats['misses'],
            'evictions': self._cache_stats['evictions'],
            'total_requests': total_requests,
            'hit_ratio': hit_ratio,
            'cached_datasets': len(self._cache),
            'max_cache_size': self._max_cache_size,
            'total_memory_mb': total_memory_mb
        }

        return stats

    def clear_cache(self) -> None:
        """Clear all cached data."""
        cleared_count = len(self._cache)
        cleared_memory_mb = sum(entry['memory_usage_mb'] for entry in self._cache.values())

        self._cache.clear()
        self._access_order.clear()

        # Reset stats except totals (for historical tracking)
        self._cache_stats['evictions'] += cleared_count

        logger.info(f"Cache cleared: {cleared_count} datasets, {cleared_memory_mb:.1f}MB freed")

    def get_cached_datasets_info(self) -> List[Dict[str, Any]]:
        """Get information about all cached datasets."""
        datasets_info = []

        for cache_key, entry in self._cache.items():
            dataset_info = {
                'cache_key': cache_key,
                'file_path': entry['file_path'],
                'cached_at': entry['cached_at'],
                'data_shape': entry['data_shape'],
                'memory_usage_mb': entry['memory_usage_mb']
            }
            datasets_info.append(dataset_info)

        # Sort by access order (most recent first)
        datasets_info.sort(
            key=lambda x: self._access_order.index(x['cache_key']) if x['cache_key'] in self._access_order else -1,
            reverse=True
        )

        return datasets_info


class DataLoader:
    """
    Data loading utilities for backtesting.

    This class handles loading and preprocessing of market data from various formats
    including CSV and JSON files.
    """

    def __init__(self, data_dir: str = "data"):
        """
        Initialize data loader.

        Args:
            data_dir: Directory containing data files
        """
        self.data_dir = data_dir
        os.makedirs(self.data_dir, exist_ok=True)

    def load_data(self, file_path: str, start_date: str, end_date: str) -> pd.DataFrame:
        """
        Load data with optimized dtypes and filtering, supporting CSV and JSON input.

        Args:
            file_path: Path to the data file (relative to data_dir)
            start_date: Start date for filtering (YYYY-MM-DD format)
            end_date: End date for filtering (YYYY-MM-DD format)

        Returns:
            pd.DataFrame: Loaded OHLCV data with DatetimeIndex
        """
        full_path = os.path.join(self.data_dir, file_path)

        if not os.path.exists(full_path):
            raise FileNotFoundError(f"Data file not found: {full_path}")

        # Determine file type
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        try:
            if ext == ".json":
                return self._load_json_data(full_path, start_date, end_date)
            else:
                return self._load_csv_data(full_path, start_date, end_date)

        except Exception as e:
            logger.error(f"Error loading data from {file_path}: {e}")
            # Return an empty DataFrame with a DatetimeIndex
            return pd.DataFrame(index=pd.to_datetime([]))

    def _load_json_data(self, file_path: str, start_date: str, end_date: str) -> pd.DataFrame:
        """Load data from JSON file."""
        with open(file_path, 'r') as f:
            raw = json.load(f)

        data = pd.DataFrame(raw["Data"])

        # Convert columns to lowercase
        data.columns = data.columns.str.lower()

        # Convert timestamp to datetime
        data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")

        # Filter by date range
        data = data[(data["timestamp"] >= start_date) & (data["timestamp"] <= end_date)]

        logger.info(f"JSON data loaded: {len(data)} rows for {start_date} to {end_date}")
        return data.set_index("timestamp")

    def _load_csv_data(self, file_path: str, start_date: str, end_date: str) -> pd.DataFrame:
        """Load data from CSV file."""
        # Define optimized dtypes
        dtypes = {
            'Open': 'float32',
            'High': 'float32',
            'Low': 'float32',
            'Close': 'float32',
            'Volume': 'float32'
        }

        # Read data with original capitalized column names
        try:
            data = pd.read_csv(file_path, dtype=dtypes)
        except Exception as e:
            logger.warning(f"Failed to read CSV with default engine, trying python engine: {e}")
            data = pd.read_csv(file_path, dtype=dtypes, engine='python')

        # Handle timestamp column
        if 'Timestamp' in data.columns:
            data['Timestamp'] = pd.to_datetime(data['Timestamp'], unit='s')
            # Filter by date range
            data = data[(data['Timestamp'] >= start_date) & (data['Timestamp'] <= end_date)]
            # Convert column names to lowercase
            data.columns = data.columns.str.lower()

            # Convert numpy float32 to Python float for compatibility
            numeric_columns = ['open', 'high', 'low', 'close', 'volume']
            for col in numeric_columns:
                if col in data.columns:
                    data[col] = data[col].astype(float)

            logger.info(f"CSV data loaded: {len(data)} rows for {start_date} to {end_date}")
            return data.set_index('timestamp')
        else:
            # Attempt to use the first column if 'Timestamp' is not present
            data.rename(columns={data.columns[0]: 'timestamp'}, inplace=True)
            data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')
            data = data[(data['timestamp'] >= start_date) & (data['timestamp'] <= end_date)]
            data.columns = data.columns.str.lower()

            # Convert numpy float32 to Python float for compatibility
            numeric_columns = ['open', 'high', 'low', 'close', 'volume']
            for col in numeric_columns:
                if col in data.columns:
                    data[col] = data[col].astype(float)

            logger.info(f"CSV data loaded (first column as timestamp): {len(data)} rows for {start_date} to {end_date}")
            return data.set_index('timestamp')

    def validate_data(self, data: pd.DataFrame) -> bool:
        """
        Validate loaded data for required columns and basic integrity.

        Args:
            data: DataFrame to validate

        Returns:
            bool: True if data is valid
        """
        if data.empty:
            logger.error("Data is empty")
            return False

        required_columns = ['open', 'high', 'low', 'close', 'volume']
        missing_columns = [col for col in required_columns if col not in data.columns]

        if missing_columns:
            logger.error(f"Missing required columns: {missing_columns}")
            return False

        # Check for NaN values
        if data[required_columns].isnull().any().any():
            logger.warning("Data contains NaN values")

        # Check for negative prices
        price_columns = ['open', 'high', 'low', 'close']
        if (data[price_columns] <= 0).any().any():
            logger.warning("Data contains non-positive prices")

        # Check OHLC consistency
        if not ((data['low'] <= data['open']) &
                (data['low'] <= data['close']) &
                (data['high'] >= data['open']) &
                (data['high'] >= data['close'])).all():
            logger.warning("Data contains OHLC inconsistencies")

        return True


class SystemUtils:
    """
    System resource management utilities.

    This class provides methods for determining optimal system resource usage
    for parallel processing and performance optimization.
    """

    def __init__(self):
        """Initialize system utilities."""
        pass

    def get_optimal_workers(self) -> int:
        """
        Determine optimal number of worker processes based on system resources.

        Returns:
            int: Optimal number of worker processes
        """
        cpu_count = os.cpu_count() or 4
        memory_gb = psutil.virtual_memory().total / (1024**3)

        # Heuristic: Use 75% of cores, but cap based on available memory
        # Assume each worker needs ~2GB for large datasets
        workers_by_memory = max(1, int(memory_gb / 2))
        workers_by_cpu = max(1, int(cpu_count * 0.75))

        optimal_workers = min(workers_by_cpu, workers_by_memory)

        logger.info(f"System resources: {cpu_count} CPUs, {memory_gb:.1f}GB RAM")
        logger.info(f"Using {optimal_workers} workers for processing")

        return optimal_workers

    def get_system_info(self) -> Dict[str, Any]:
        """
        Get comprehensive system information.

        Returns:
            Dict containing system information
        """
        memory = psutil.virtual_memory()

        return {
            "cpu_count": os.cpu_count(),
            "memory_total_gb": memory.total / (1024**3),
            "memory_available_gb": memory.available / (1024**3),
            "memory_percent": memory.percent,
            "optimal_workers": self.get_optimal_workers()
        }


class ResultsSaver:
    """
    Results saving utilities for backtesting.

    This class handles saving backtest results in various formats including
    CSV, JSON, and comprehensive reports.
    """

    def __init__(self, results_dir: str = "results"):
        """
        Initialize results saver.

        Args:
            results_dir: Directory for saving results
        """
        self.results_dir = results_dir
        os.makedirs(self.results_dir, exist_ok=True)

    def save_results_csv(self, results: List[Dict[str, Any]], filename: str) -> None:
        """
        Save backtest results to CSV file.

        Args:
            results: List of backtest results
            filename: Output filename
        """
        try:
            # Convert results to DataFrame for easy saving
            df_data = []
            for result in results:
                if result.get("success", True):
                    row = {
                        "strategy_name": result.get("strategy_name", ""),
                        "profit_ratio": result.get("profit_ratio", 0),
                        "final_usd": result.get("final_usd", 0),
                        "n_trades": result.get("n_trades", 0),
                        "win_rate": result.get("win_rate", 0),
                        "max_drawdown": result.get("max_drawdown", 0),
                        "avg_trade": result.get("avg_trade", 0),
                        "total_fees_usd": result.get("total_fees_usd", 0),
                        "backtest_duration_seconds": result.get("backtest_duration_seconds", 0),
                        "data_points_processed": result.get("data_points_processed", 0)
                    }

                    # Add strategy parameters
                    strategy_params = result.get("strategy_params", {})
                    for key, value in strategy_params.items():
                        row[f"strategy_{key}"] = value

                    # Add trader parameters
                    trader_params = result.get("trader_params", {})
                    for key, value in trader_params.items():
                        row[f"trader_{key}"] = value

                    df_data.append(row)

            # Save to CSV
            df = pd.DataFrame(df_data)
            full_path = os.path.join(self.results_dir, filename)
            df.to_csv(full_path, index=False)

            logger.info(f"Results saved to {full_path}: {len(df_data)} rows")

        except Exception as e:
            logger.error(f"Error saving results to {filename}: {e}")
            raise

    def save_comprehensive_results(self, results: List[Dict[str, Any]],
                                 base_filename: str,
                                 summary: Optional[Dict[str, Any]] = None,
                                 action_log: Optional[List[Dict[str, Any]]] = None,
                                 session_start_time: Optional[datetime] = None) -> None:
        """
        Save comprehensive backtest results including summary, individual results, and logs.

        Args:
            results: List of backtest results
            base_filename: Base filename (without extension)
            summary: Optional summary statistics
            action_log: Optional action log
            session_start_time: Optional session start time
        """
        try:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            session_start = session_start_time or datetime.now()

            # 1. Save summary report
            if summary is None:
                summary = self._calculate_summary_statistics(results)

            summary_data = {
                "session_info": {
                    "timestamp": timestamp,
                    "session_start": session_start.isoformat(),
                    "session_duration_seconds": (datetime.now() - session_start).total_seconds()
                },
                "summary_statistics": summary,
                "action_log_summary": {
                    "total_actions": len(action_log) if action_log else 0,
                    "action_types": list(set(action["action_type"] for action in action_log)) if action_log else []
                }
            }

            summary_filename = f"{base_filename}_summary_{timestamp}.json"
            self._save_json(summary_data, summary_filename)

            # 2. Save detailed results CSV
            self.save_results_csv(results, f"{base_filename}_detailed_{timestamp}.csv")

            # 3. Save individual strategy results
            valid_results = [r for r in results if r.get("success", True)]
            for i, result in enumerate(valid_results):
                strategy_filename = f"{base_filename}_strategy_{i+1}_{result['strategy_name']}_{timestamp}.json"
                strategy_data = self._format_strategy_result(result)
                self._save_json(strategy_data, strategy_filename)

            # 4. Save action log if provided
            if action_log:
                action_log_filename = f"{base_filename}_actions_{timestamp}.json"
                action_log_data = {
                    "session_info": {
                        "timestamp": timestamp,
                        "session_start": session_start.isoformat(),
                        "total_actions": len(action_log)
                    },
                    "actions": action_log
                }
                self._save_json(action_log_data, action_log_filename)

            # 5. Create master index file
            index_filename = f"{base_filename}_index_{timestamp}.json"
            index_data = self._create_index_file(base_filename, timestamp, valid_results, summary)
            self._save_json(index_data, index_filename)

            # Print summary
            print(f"\n📊 Comprehensive results saved:")
            print(f"  📋 Summary: {self.results_dir}/{summary_filename}")
            print(f"  📈 Detailed CSV: {self.results_dir}/{base_filename}_detailed_{timestamp}.csv")
            if action_log:
                print(f"  📝 Action Log: {self.results_dir}/{action_log_filename}")
            print(f"  📁 Individual Strategies: {len(valid_results)} files")
            print(f"  🗂️  Master Index: {self.results_dir}/{index_filename}")

        except Exception as e:
            logger.error(f"Error saving comprehensive results: {e}")
            raise

    def _save_json(self, data: Dict[str, Any], filename: str) -> None:
        """Save data to JSON file."""
        full_path = os.path.join(self.results_dir, filename)
        with open(full_path, 'w') as f:
            json.dump(data, f, indent=2, default=str)
        logger.info(f"JSON saved: {full_path}")

    def _calculate_summary_statistics(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Calculate summary statistics from results."""
        valid_results = [r for r in results if r.get("success", True)]

        if not valid_results:
            return {
                "total_runs": len(results),
                "successful_runs": 0,
                "failed_runs": len(results),
                "error": "No valid results to summarize"
            }

        # Extract metrics
        profit_ratios = [r["profit_ratio"] for r in valid_results]
        final_balances = [r["final_usd"] for r in valid_results]
        n_trades_list = [r["n_trades"] for r in valid_results]
        win_rates = [r["win_rate"] for r in valid_results]
        max_drawdowns = [r["max_drawdown"] for r in valid_results]

        return {
            "total_runs": len(results),
            "successful_runs": len(valid_results),
            "failed_runs": len(results) - len(valid_results),
            "profit_ratio": {
                "mean": np.mean(profit_ratios),
                "std": np.std(profit_ratios),
                "min": np.min(profit_ratios),
                "max": np.max(profit_ratios),
                "median": np.median(profit_ratios)
            },
            "final_usd": {
                "mean": np.mean(final_balances),
                "std": np.std(final_balances),
                "min": np.min(final_balances),
                "max": np.max(final_balances),
                "median": np.median(final_balances)
            },
            "n_trades": {
                "mean": np.mean(n_trades_list),
                "std": np.std(n_trades_list),
                "min": np.min(n_trades_list),
                "max": np.max(n_trades_list),
                "median": np.median(n_trades_list)
            },
            "win_rate": {
                "mean": np.mean(win_rates),
                "std": np.std(win_rates),
                "min": np.min(win_rates),
                "max": np.max(win_rates),
                "median": np.median(win_rates)
            },
            "max_drawdown": {
                "mean": np.mean(max_drawdowns),
                "std": np.std(max_drawdowns),
                "min": np.min(max_drawdowns),
                "max": np.max(max_drawdowns),
                "median": np.median(max_drawdowns)
            },
            "best_run": max(valid_results, key=lambda x: x["profit_ratio"]),
            "worst_run": min(valid_results, key=lambda x: x["profit_ratio"])
        }

    def _format_strategy_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Format individual strategy result for saving."""
        return {
            "strategy_info": {
                "name": result['strategy_name'],
                "params": result.get('strategy_params', {}),
                "trader_params": result.get('trader_params', {})
            },
            "performance": {
                "initial_usd": result['initial_usd'],
                "final_usd": result['final_usd'],
                "profit_ratio": result['profit_ratio'],
                "n_trades": result['n_trades'],
                "win_rate": result['win_rate'],
                "max_drawdown": result['max_drawdown'],
                "avg_trade": result['avg_trade'],
                "total_fees_usd": result['total_fees_usd']
            },
            "execution": {
                "backtest_duration_seconds": result.get('backtest_duration_seconds', 0),
                "data_points_processed": result.get('data_points_processed', 0),
                "warmup_complete": result.get('warmup_complete', False)
            },
            "trades": result.get('trades', [])
        }

    def _create_index_file(self, base_filename: str, timestamp: str,
                          valid_results: List[Dict[str, Any]],
                          summary: Dict[str, Any]) -> Dict[str, Any]:
        """Create master index file."""
        return {
            "session_info": {
                "timestamp": timestamp,
                "base_filename": base_filename,
                "total_strategies": len(valid_results)
            },
            "files": {
                "summary": f"{base_filename}_summary_{timestamp}.json",
                "detailed_csv": f"{base_filename}_detailed_{timestamp}.csv",
                "individual_strategies": [
                    f"{base_filename}_strategy_{i+1}_{result['strategy_name']}_{timestamp}.json"
                    for i, result in enumerate(valid_results)
                ]
            },
            "quick_stats": {
                "best_profit": summary.get("profit_ratio", {}).get("max", 0) if summary.get("profit_ratio") else 0,
                "worst_profit": summary.get("profit_ratio", {}).get("min", 0) if summary.get("profit_ratio") else 0,
                "avg_profit": summary.get("profit_ratio", {}).get("mean", 0) if summary.get("profit_ratio") else 0,
                "total_successful_runs": summary.get("successful_runs", 0),
                "total_failed_runs": summary.get("failed_runs", 0)
            }
        }