Cycles/cycles/utils/data_loader.py


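"""Utilities for loading and preprocessing time-series data from CSV and JSON files."""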
import os
import json
import logging
from typing import Optional, Union

import pandas as pd

from .storage_utils import (
    _parse_timestamp_column,
    _filter_by_date_range,
    _normalize_column_names,
    TimestampParsingError,
    DataLoadingError,
)


class DataLoader:
    """Handles loading and preprocessing of data from various file formats."""

    def __init__(self, data_dir: str, logging_instance: Optional[logging.Logger] = None):
        """Initialize the data loader.

        Args:
            data_dir: Directory containing data files.
            logging_instance: Optional logger used for status and error messages.
        """
        self.data_dir = data_dir
        self.logging = logging_instance

    def load_data(self, file_path: str, start_date: Union[str, pd.Timestamp],
                  stop_date: Union[str, pd.Timestamp]) -> pd.DataFrame:
        """Load data with optimized dtypes and date filtering, supporting CSV and JSON input.

        Args:
            file_path: Path to the data file, relative to ``data_dir``.
            start_date: Start date (string or datetime-like).
            stop_date: Stop date (string or datetime-like).

        Returns:
            pandas DataFrame with a DatetimeIndex.

        Raises:
            DataLoadingError: If loading or parsing the file fails.
        """
        try:
            # Convert string dates to pandas Timestamps for consistent comparison
            start_date = pd.to_datetime(start_date)
            stop_date = pd.to_datetime(stop_date)

            # Dispatch on the file extension; anything that is not JSON is
            # treated as CSV
            _, ext = os.path.splitext(file_path)
            if ext.lower() == ".json":
                return self._load_json_data(file_path, start_date, stop_date)
            return self._load_csv_data(file_path, start_date, stop_date)
        except Exception as e:
            error_msg = f"Error loading data from {file_path}: {e}"
            if self.logging is not None:
                self.logging.error(error_msg)
            # Raise the documented error instead of silently returning an empty
            # frame, so callers can distinguish "no data" from a failed load
            raise DataLoadingError(error_msg) from e

    def _load_json_data(self, file_path: str, start_date: pd.Timestamp,
                        stop_date: pd.Timestamp) -> pd.DataFrame:
        """Load and process a JSON data file.

        Args:
            file_path: Path to the JSON file, relative to ``data_dir``.
            start_date: Start date for filtering.
            stop_date: Stop date for filtering.

        Returns:
            Processed DataFrame with a timestamp index.
        """
        with open(os.path.join(self.data_dir, file_path), "r") as f:
            raw = json.load(f)

        # Records are expected under the top-level "Data" key of the payload
        data = pd.DataFrame(raw["Data"])
        data = _normalize_column_names(data)

        # Timestamps are stored as Unix epoch seconds
        data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")

        # Keep only rows inside the requested date range
        data = _filter_by_date_range(data, "timestamp", start_date, stop_date)

        if self.logging is not None:
            self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")
        return data.set_index("timestamp")

    def _load_csv_data(self, file_path: str, start_date: pd.Timestamp,
                       stop_date: pd.Timestamp) -> pd.DataFrame:
        """Load and process a CSV data file.

        Args:
            file_path: Path to the CSV file, relative to ``data_dir``.
            start_date: Start date for filtering.
            stop_date: Stop date for filtering.

        Returns:
            Processed DataFrame with a timestamp index.
        """
        # float32 halves memory use versus the float64 default, which matters
        # for large OHLCV histories
        dtypes = {
            'Open': 'float32',
            'High': 'float32',
            'Low': 'float32',
            'Close': 'float32',
            'Volume': 'float32',
        }
        # Read data with the original capitalized column names
        data = pd.read_csv(os.path.join(self.data_dir, file_path), dtype=dtypes)
        return self._process_csv_timestamps(data, start_date, stop_date, file_path)

    def _process_csv_timestamps(self, data: pd.DataFrame, start_date: pd.Timestamp,
                                stop_date: pd.Timestamp, file_path: str) -> pd.DataFrame:
        """Process timestamps in CSV data and filter by date range.

        Args:
            data: DataFrame with CSV data.
            start_date: Start date for filtering.
            stop_date: Stop date for filtering.
            file_path: Original file path, used for logging.

        Returns:
            Processed DataFrame with a timestamp index.
        """
        if 'Timestamp' in data.columns:
            ts_col = 'Timestamp'
            note = ""
        else:
            # Fall back to treating the first column as the timestamp
            ts_col = 'timestamp'
            data = data.rename(columns={data.columns[0]: ts_col})
            note = " (using first column as timestamp)"

        data = _parse_timestamp_column(data, ts_col)
        data = _filter_by_date_range(data, ts_col, start_date, stop_date)
        # _normalize_column_names lowercases headers, so the timestamp column
        # is named 'timestamp' in both branches
        data = _normalize_column_names(data)

        if self.logging is not None:
            self.logging.info(f"Data loaded from {file_path}{note} for date range {start_date} to {stop_date}")
        return data.set_index('timestamp')
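

if __name__ == "__main__":
    # Minimal usage sketch. The directory, file name, and date range below are
    # illustrative assumptions, not values shipped with the project.
    logging.basicConfig(level=logging.INFO)
    loader = DataLoader(data_dir="data", logging_instance=logging.getLogger(__name__))
    df = loader.load_data("BTCUSD_1h.csv", "2021-01-01", "2021-06-30")
    # The returned frame is indexed by timestamp with lowercase column names
    # (normalized by _normalize_column_names)
    print(df.head())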