import json
import logging
import os
from typing import Optional, Union

import pandas as pd

from .storage_utils import (
    _parse_timestamp_column,
    _filter_by_date_range,
    _normalize_column_names,
    TimestampParsingError,
    DataLoadingError,
)
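
# The storage_utils helpers imported above are not shown in this file. Based
# on how this module uses them, they are assumed to behave roughly as follows
# (a sketch of the expected contracts, not the actual implementations):
#   - _parse_timestamp_column(df, col): parses df[col] into datetime64 values
#   - _filter_by_date_range(df, col, start, stop): keeps rows whose df[col]
#     falls between start and stop
#   - _normalize_column_names(df): standardizes column names to lowercase
#     (e.g. 'Timestamp' -> 'timestamp')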


class DataLoader:
    """Handles loading and preprocessing of data from CSV and JSON files."""

    def __init__(self, data_dir: str, logging_instance: Optional[logging.Logger] = None):
        """Initialize the data loader.

        Args:
            data_dir: Directory containing data files
            logging_instance: Optional logger for status and error messages
        """
        self.data_dir = data_dir
        self.logging = logging_instance

    def load_data(self, file_path: str, start_date: Union[str, pd.Timestamp],
                  stop_date: Union[str, pd.Timestamp]) -> pd.DataFrame:
        """Load data with optimized dtypes and date filtering from a CSV or JSON file.

        Args:
            file_path: Path to the data file, relative to data_dir
            start_date: Start date (string or datetime-like)
            stop_date: Stop date (string or datetime-like)

        Returns:
            pandas DataFrame with a timestamp index. If loading fails, the
            error is logged and an empty DataFrame with a DatetimeIndex is
            returned instead of raising.
        """
        try:
            # Convert string dates to pandas datetime objects for proper comparison
            start_date = pd.to_datetime(start_date)
            stop_date = pd.to_datetime(stop_date)

            # Dispatch on the file extension: .json goes to the JSON loader,
            # everything else is treated as CSV
            _, ext = os.path.splitext(file_path)
            ext = ext.lower()

            if ext == ".json":
                return self._load_json_data(file_path, start_date, stop_date)
            else:
                return self._load_csv_data(file_path, start_date, stop_date)

        except Exception as e:
            error_msg = f"Error loading data from {file_path}: {e}"
            if self.logging is not None:
                self.logging.error(error_msg)
            # Return an empty DataFrame with a DatetimeIndex rather than
            # propagating the error
            return pd.DataFrame(index=pd.to_datetime([]))

    def _load_json_data(self, file_path: str, start_date: pd.Timestamp,
                        stop_date: pd.Timestamp) -> pd.DataFrame:
        """Load and process a JSON data file.

        The file is expected to hold a top-level "Data" key containing
        records whose timestamp field is in Unix epoch seconds.

        Args:
            file_path: Path to the JSON file, relative to data_dir
            start_date: Start date for filtering
            stop_date: Stop date for filtering

        Returns:
            Processed DataFrame with a timestamp index
        """
        with open(os.path.join(self.data_dir, file_path), 'r') as f:
            raw = json.load(f)

        data = pd.DataFrame(raw["Data"])
        data = _normalize_column_names(data)

        # Convert epoch-second timestamps to datetime
        data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")

        # Filter by date range
        data = _filter_by_date_range(data, "timestamp", start_date, stop_date)

        if self.logging is not None:
            self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")

        return data.set_index("timestamp")

    def _load_csv_data(self, file_path: str, start_date: pd.Timestamp,
                       stop_date: pd.Timestamp) -> pd.DataFrame:
        """Load and process a CSV data file.

        Args:
            file_path: Path to the CSV file, relative to data_dir
            start_date: Start date for filtering
            stop_date: Stop date for filtering

        Returns:
            Processed DataFrame with a timestamp index
        """
        # Use float32 for the OHLCV columns to halve memory versus the
        # float64 default
        dtypes = {
            'Open': 'float32',
            'High': 'float32',
            'Low': 'float32',
            'Close': 'float32',
            'Volume': 'float32'
        }

        # Read data with the original capitalized column names
        data = pd.read_csv(os.path.join(self.data_dir, file_path), dtype=dtypes)

        return self._process_csv_timestamps(data, start_date, stop_date, file_path)

    def _process_csv_timestamps(self, data: pd.DataFrame, start_date: pd.Timestamp,
                                stop_date: pd.Timestamp, file_path: str) -> pd.DataFrame:
        """Process timestamps in CSV data and filter by date range.

        Args:
            data: DataFrame with CSV data
            start_date: Start date for filtering
            stop_date: Stop date for filtering
            file_path: Original file path, used for logging

        Returns:
            Processed DataFrame with a timestamp index
        """
        if 'Timestamp' in data.columns:
            ts_col = 'Timestamp'
            note = ''
        else:
            # Fall back to the first column when no 'Timestamp' column is present
            data = data.rename(columns={data.columns[0]: 'timestamp'})
            ts_col = 'timestamp'
            note = ' (using first column as timestamp)'

        data = _parse_timestamp_column(data, ts_col)
        data = _filter_by_date_range(data, ts_col, start_date, stop_date)
        data = _normalize_column_names(data)

        if self.logging is not None:
            self.logging.info(f"Data loaded from {file_path}{note} for date range {start_date} to {stop_date}")

        return data.set_index('timestamp')
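

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). The data
# directory, file names, and date range below are assumptions made for the
# sake of the example.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    loader = DataLoader(data_dir="data", logging_instance=logging.getLogger(__name__))

    # CSV input: OHLCV columns plus a 'Timestamp' (or leading timestamp-like)
    # column; 'prices.csv' is a hypothetical file name.
    csv_df = loader.load_data("prices.csv", "2023-01-01", "2023-06-30")
    print(csv_df.head())

    # JSON input: a top-level "Data" key with epoch-second timestamps;
    # 'prices.json' is likewise hypothetical.
    json_df = loader.load_data("prices.json", "2023-01-01", "2023-06-30")
    print(json_df.head())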