39 lines
1.1 KiB
Python
Raw Permalink Normal View History

from typing import Tuple
import os
import pandas as pd
import numpy as np
from .config import DataConfig
def load_and_filter_data(cfg: DataConfig) -> pd.DataFrame:
    """Load the price CSV, filter its rows, and add a log-return column.

    Steps, in order:

    1. Read the CSV at ``cfg.csv_path``.
    2. Drop rows whose ``Volume`` equals 0 when ``cfg.drop_volume_zero`` is
       truthy and the file has a ``Volume`` column.
    3. Convert ``Timestamp`` from epoch seconds to datetime.
    4. Keep rows with ``cfg.min_date <= Timestamp`` (skipped when
       ``cfg.min_date`` is None) and ``Timestamp <= cfg.max_date`` (skipped
       when ``cfg.max_date`` is falsy).
    5. Add ``log_return = log(Close / previous Close)``, computed over the
       rows that survived filtering, in file order. The first remaining row
       has no predecessor, so its value is NaN.

    Args:
        cfg: Configuration object; this function reads ``csv_path``,
            ``drop_volume_zero``, ``min_date`` and ``max_date`` from it.

    Returns:
        A new DataFrame (not a view of an intermediate frame) holding the
        filtered rows, the converted ``Timestamp`` column and the appended
        ``log_return`` column. The original row index labels are preserved.

    Raises:
        FileNotFoundError: If ``cfg.csv_path`` does not exist.
        ValueError: If the CSV lacks a ``Timestamp`` or ``Close`` column.
    """
    if not os.path.exists(cfg.csv_path):
        raise FileNotFoundError(f"CSV not found: {cfg.csv_path}")

    df = pd.read_csv(cfg.csv_path)

    # Validate the schema up front so a malformed file fails before any
    # filtering or conversion work is done.
    for required in ('Timestamp', 'Close'):
        if required not in df.columns:
            raise ValueError(f"Expected '{required}' column in input CSV")

    # Volume filtering happens before timestamp conversion so that dropped
    # rows never take part in the conversion.
    if cfg.drop_volume_zero and 'Volume' in df.columns:
        df = df[df['Volume'] != 0]

    # assign() returns a fresh frame. Writing a column into the boolean-mask
    # selection above with df[...] = ... would be a chained assignment and
    # raise SettingWithCopyWarning on pandas without Copy-on-Write.
    df = df.assign(Timestamp=pd.to_datetime(df['Timestamp'], unit='s'))

    # Both bounds are optional: an unset bound leaves that side open.
    if cfg.min_date is not None:
        df = df[df['Timestamp'] >= cfg.min_date]
    if cfg.max_date:
        df = df[df['Timestamp'] <= cfg.max_date]

    close = df['Close']
    return df.assign(log_return=np.log(close / close.shift(1)))