import os
import json
import logging
from typing import Optional, Union

import pandas as pd

from .storage_utils import (
    _parse_timestamp_column,
    _filter_by_date_range,
    _normalize_column_names,
    TimestampParsingError,
    DataLoadingError,
)


class DataLoader:
    """Handles loading and preprocessing of data from various file formats."""

    def __init__(self, data_dir: str,
                 logging_instance: Optional[logging.Logger] = None):
        """Initialize data loader.

        Args:
            data_dir: Directory containing data files.
            logging_instance: Optional logging instance; when None, all
                logging is silently skipped.
        """
        self.data_dir = data_dir
        # NOTE: attribute keeps the historical name `logging` for backward
        # compatibility even though it shadows the stdlib module name.
        self.logging = logging_instance

    def _log_info(self, message: str) -> None:
        """Log *message* at INFO level if a logger was provided."""
        if self.logging is not None:
            self.logging.info(message)

    def load_data(self, file_path: str,
                  start_date: Union[str, pd.Timestamp],
                  stop_date: Union[str, pd.Timestamp]) -> pd.DataFrame:
        """Load data with optimized dtypes and filtering (CSV or JSON input).

        The file extension selects the parser: ``.json`` goes through the
        JSON loader, everything else is treated as CSV.

        Args:
            file_path: Path to the data file, joined onto ``self.data_dir``.
            start_date: Start date (string or datetime-like).
            stop_date: Stop date (string or datetime-like).

        Returns:
            DataFrame with a timestamp index. On ANY loading failure the
            error is logged and an empty DataFrame with a DatetimeIndex is
            returned instead of raising — callers iterating many files are
            not interrupted by one bad file.
        """
        try:
            # Normalize both bounds to pandas Timestamps so comparisons in
            # the range filter are well-defined for string inputs too.
            start_date = pd.to_datetime(start_date)
            stop_date = pd.to_datetime(stop_date)

            # Dispatch on the (case-insensitive) file extension.
            _, ext = os.path.splitext(file_path)
            if ext.lower() == ".json":
                return self._load_json_data(file_path, start_date, stop_date)
            return self._load_csv_data(file_path, start_date, stop_date)
        except Exception as e:
            # Best-effort contract: log (with traceback) and fall back to an
            # empty frame whose index type matches the success case.
            error_msg = f"Error loading data from {file_path}: {e}"
            if self.logging is not None:
                self.logging.error(error_msg, exc_info=True)
            return pd.DataFrame(index=pd.to_datetime([]))

    def _load_json_data(self, file_path: str, start_date: pd.Timestamp,
                        stop_date: pd.Timestamp) -> pd.DataFrame:
        """Load and process a JSON data file.

        Args:
            file_path: Path to JSON file, relative to ``self.data_dir``.
            start_date: Start date for filtering.
            stop_date: Stop date for filtering.

        Returns:
            Processed DataFrame with timestamp index.
        """
        with open(os.path.join(self.data_dir, file_path), 'r') as f:
            raw = json.load(f)

        # Assumes the payload carries row records under a top-level "Data"
        # key — TODO confirm against the producer of these files.
        data = pd.DataFrame(raw["Data"])
        data = _normalize_column_names(data)

        # JSON timestamps are Unix epoch seconds (unit="s").
        data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")
        data = _filter_by_date_range(data, "timestamp", start_date, stop_date)

        self._log_info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")
        return data.set_index("timestamp")

    def _load_csv_data(self, file_path: str, start_date: pd.Timestamp,
                       stop_date: pd.Timestamp) -> pd.DataFrame:
        """Load and process a CSV data file.

        Args:
            file_path: Path to CSV file, relative to ``self.data_dir``.
            start_date: Start date for filtering.
            stop_date: Stop date for filtering.

        Returns:
            Processed DataFrame with timestamp index.
        """
        # float32 halves memory versus the default float64; precision loss is
        # accepted for these OHLCV columns.
        dtypes = {
            'Open': 'float32',
            'High': 'float32',
            'Low': 'float32',
            'Close': 'float32',
            'Volume': 'float32',
        }

        # Read with the original capitalized column names; normalization
        # happens after timestamp handling.
        data = pd.read_csv(os.path.join(self.data_dir, file_path), dtype=dtypes)
        return self._process_csv_timestamps(data, start_date, stop_date, file_path)

    def _process_csv_timestamps(self, data: pd.DataFrame,
                                start_date: pd.Timestamp,
                                stop_date: pd.Timestamp,
                                file_path: str) -> pd.DataFrame:
        """Parse timestamps in CSV data and filter by date range.

        Args:
            data: DataFrame with raw CSV data.
            start_date: Start date for filtering.
            stop_date: Stop date for filtering.
            file_path: Original file path (used only for logging).

        Returns:
            Processed DataFrame with timestamp index.
        """
        if 'Timestamp' in data.columns:
            data = _parse_timestamp_column(data, 'Timestamp')
            data = _filter_by_date_range(data, 'Timestamp', start_date, stop_date)
            # Normalization is expected to lower-case 'Timestamp' so the
            # index column below is found — TODO confirm in storage_utils.
            data = _normalize_column_names(data)
            self._log_info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")
            return data.set_index('timestamp')

        # Fallback: no 'Timestamp' column — treat the FIRST column as the
        # timestamp, whatever its original name.
        data.rename(columns={data.columns[0]: 'timestamp'}, inplace=True)
        data = _parse_timestamp_column(data, 'timestamp')
        data = _filter_by_date_range(data, 'timestamp', start_date, stop_date)
        data = _normalize_column_names(data)
        self._log_info(f"Data loaded from {file_path} (using first column as timestamp) for date range {start_date} to {stop_date}")
        return data.set_index('timestamp')