import os
from typing import Tuple

import numpy as np
import pandas as pd

from .config import DataConfig


def load_and_filter_data(cfg: DataConfig) -> pd.DataFrame:
    """Load the CSV at ``cfg.csv_path``, filter its rows, and add ``log_return``.

    Steps, in order:

    1. Read the CSV and check that the ``'Timestamp'`` and ``'Close'``
       columns are present.
    2. If ``cfg.drop_volume_zero`` is truthy and a ``'Volume'`` column
       exists, drop rows where ``Volume == 0``.
    3. Convert ``'Timestamp'`` to datetime, interpreting the values as
       seconds (``unit='s'``).
    4. Keep rows with ``Timestamp >= cfg.min_date`` and, when
       ``cfg.max_date`` is truthy, ``Timestamp <= cfg.max_date``.
    5. Add ``'log_return' = log(Close / previous Close)``.

    The rows are not re-sorted and the index is not reset: ``log_return``
    is computed between consecutive rows in the order they remain after
    filtering, and the first remaining row is NaN because it has no
    predecessor.

    Args:
        cfg: Configuration object; the attributes read here are
            ``csv_path``, ``drop_volume_zero``, ``min_date`` and
            ``max_date``.

    Returns:
        The filtered DataFrame with a datetime ``'Timestamp'`` column and
        the added ``'log_return'`` column.

    Raises:
        FileNotFoundError: If ``cfg.csv_path`` does not exist.
        ValueError: If the CSV lacks a ``'Timestamp'`` or ``'Close'`` column.
    """
    if not os.path.exists(cfg.csv_path):
        raise FileNotFoundError(f"CSV not found: {cfg.csv_path}")

    df = pd.read_csv(cfg.csv_path)

    # Validate the required schema up front so a malformed file fails
    # before any filtering or conversion work is done.
    if 'Timestamp' not in df.columns:
        raise ValueError("Expected 'Timestamp' column in input CSV")
    if 'Close' not in df.columns:
        raise ValueError("Expected 'Close' column in input CSV")

    if cfg.drop_volume_zero and 'Volume' in df.columns:
        # Explicit copy: the column assignment below must not target a
        # filtered slice of the original frame (SettingWithCopyWarning).
        df = df[df['Volume'] != 0].copy()

    df['Timestamp'] = pd.to_datetime(df['Timestamp'], unit='s')

    # Combine both date bounds into one mask so the frame is sliced once.
    in_range = df['Timestamp'] >= cfg.min_date
    if cfg.max_date:
        in_range = in_range & (df['Timestamp'] <= cfg.max_date)

    # Copy again for the same reason: 'log_return' is assigned next.
    df = df[in_range].copy()

    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
    return df