import json
import logging
import os
from typing import Optional, Union

import pandas as pd

from .storage_utils import (
    _parse_timestamp_column,
    _filter_by_date_range,
    _normalize_column_names,
    TimestampParsingError,
    DataLoadingError,
)
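
# The storage_utils helpers imported above are not shown in this file. Based
# on how this module uses them, they are assumed to behave roughly as follows
# (a sketch of the expected contracts, not the actual implementations):
#   - _parse_timestamp_column(df, col): parses df[col] into datetime64 values
#   - _filter_by_date_range(df, col, start, stop): keeps rows whose df[col]
#     falls between start and stop
#   - _normalize_column_names(df): standardizes column names to lowercase
#     (e.g. 'Timestamp' -> 'timestamp')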


class DataLoader:
    """Handles loading and preprocessing of data from CSV and JSON files."""

    def __init__(self, data_dir: str, logging_instance: Optional[logging.Logger] = None):
        """Initialize the data loader.

        Args:
            data_dir: Directory containing data files
            logging_instance: Optional logger for status and error messages
        """
        self.data_dir = data_dir
        self.logging = logging_instance

    def load_data(self, file_path: str, start_date: Union[str, pd.Timestamp],
                  stop_date: Union[str, pd.Timestamp]) -> pd.DataFrame:
        """Load data with optimized dtypes and date filtering from a CSV or JSON file.

        Args:
            file_path: Path to the data file, relative to data_dir
            start_date: Start date (string or datetime-like)
            stop_date: Stop date (string or datetime-like)

        Returns:
            pandas DataFrame with a timestamp index. If loading fails, the
            error is logged and an empty DataFrame with a DatetimeIndex is
            returned instead of raising.
        """
        try:
            # Convert string dates to pandas datetime objects for proper comparison
            start_date = pd.to_datetime(start_date)
            stop_date = pd.to_datetime(stop_date)

            # Dispatch on the file extension: .json goes to the JSON loader,
            # everything else is treated as CSV
            _, ext = os.path.splitext(file_path)
            ext = ext.lower()

            if ext == ".json":
                return self._load_json_data(file_path, start_date, stop_date)
            else:
                return self._load_csv_data(file_path, start_date, stop_date)

        except Exception as e:
            error_msg = f"Error loading data from {file_path}: {e}"
            if self.logging is not None:
                self.logging.error(error_msg)
            # Return an empty DataFrame with a DatetimeIndex rather than
            # propagating the error
            return pd.DataFrame(index=pd.to_datetime([]))

    def _load_json_data(self, file_path: str, start_date: pd.Timestamp,
                        stop_date: pd.Timestamp) -> pd.DataFrame:
        """Load and process a JSON data file.

        The file is expected to hold a top-level "Data" key containing
        records whose timestamp field is in Unix epoch seconds.

        Args:
            file_path: Path to the JSON file, relative to data_dir
            start_date: Start date for filtering
            stop_date: Stop date for filtering

        Returns:
            Processed DataFrame with a timestamp index
        """
        with open(os.path.join(self.data_dir, file_path), 'r') as f:
            raw = json.load(f)

        data = pd.DataFrame(raw["Data"])
        data = _normalize_column_names(data)

        # Convert epoch-second timestamps to datetime
        data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")

        # Filter by date range
        data = _filter_by_date_range(data, "timestamp", start_date, stop_date)

        if self.logging is not None:
            self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")

        return data.set_index("timestamp")

    def _load_csv_data(self, file_path: str, start_date: pd.Timestamp,
                       stop_date: pd.Timestamp) -> pd.DataFrame:
        """Load and process a CSV data file.

        Args:
            file_path: Path to the CSV file, relative to data_dir
            start_date: Start date for filtering
            stop_date: Stop date for filtering

        Returns:
            Processed DataFrame with a timestamp index
        """
        # Use float32 for the OHLCV columns to halve memory versus the
        # float64 default
        dtypes = {
            'Open': 'float32',
            'High': 'float32',
            'Low': 'float32',
            'Close': 'float32',
            'Volume': 'float32'
        }

        # Read data with the original capitalized column names
        data = pd.read_csv(os.path.join(self.data_dir, file_path), dtype=dtypes)

        return self._process_csv_timestamps(data, start_date, stop_date, file_path)

    def _process_csv_timestamps(self, data: pd.DataFrame, start_date: pd.Timestamp,
                                stop_date: pd.Timestamp, file_path: str) -> pd.DataFrame:
        """Process timestamps in CSV data and filter by date range.

        Args:
            data: DataFrame with CSV data
            start_date: Start date for filtering
            stop_date: Stop date for filtering
            file_path: Original file path, used for logging

        Returns:
            Processed DataFrame with a timestamp index
        """
        if 'Timestamp' in data.columns:
            ts_col = 'Timestamp'
            note = ''
        else:
            # Fall back to the first column when no 'Timestamp' column is present
            data = data.rename(columns={data.columns[0]: 'timestamp'})
            ts_col = 'timestamp'
            note = ' (using first column as timestamp)'

        data = _parse_timestamp_column(data, ts_col)
        data = _filter_by_date_range(data, ts_col, start_date, stop_date)
        data = _normalize_column_names(data)

        if self.logging is not None:
            self.logging.info(f"Data loaded from {file_path}{note} for date range {start_date} to {stop_date}")

        return data.set_index('timestamp')
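

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). The data
# directory, file names, and date range below are assumptions made for the
# sake of the example.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    loader = DataLoader(data_dir="data", logging_instance=logging.getLogger(__name__))

    # CSV input: OHLCV columns plus a 'Timestamp' (or leading timestamp-like)
    # column; 'prices.csv' is a hypothetical file name.
    csv_df = loader.load_data("prices.csv", "2023-01-01", "2023-06-30")
    print(csv_df.head())

    # JSON input: a top-level "Data" key with epoch-second timestamps;
    # 'prices.json' is likewise hypothetical.
    json_df = loader.load_data("prices.json", "2023-01-01", "2023-06-30")
    print(json_df.head())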