39 lines
1.1 KiB
Python
39 lines
1.1 KiB
Python
|
|
from typing import Tuple
|
||
|
|
import os
|
||
|
|
import pandas as pd
|
||
|
|
import numpy as np
|
||
|
|
|
||
|
|
from .config import DataConfig
|
||
|
|
|
||
|
|
|
||
|
|
def load_and_filter_data(cfg: "DataConfig") -> pd.DataFrame:
    """Load the price CSV, filter its rows, and add a log-return column.

    Steps, in order:

    - Read the CSV at ``cfg.csv_path``.
    - Drop rows with ``Volume == 0`` when ``cfg.drop_volume_zero`` is truthy
      and the file has a 'Volume' column.
    - Convert 'Timestamp' from epoch seconds to datetime and keep rows with
      ``Timestamp >= cfg.min_date`` (and ``<= cfg.max_date`` when that value
      is truthy).
    - Add 'log_return' as ``log(Close / previous Close)``.

    The log return is computed *after* filtering, so "previous" means the
    previous surviving row, and the first surviving row is always NaN.
    The original row index labels are kept (the index is not reset).

    Args:
        cfg: Configuration object; this function reads ``csv_path``,
            ``drop_volume_zero``, ``min_date`` and ``max_date`` from it.

    Returns:
        A new DataFrame, independent of any intermediate frame, holding the
        filtered rows plus the 'log_return' column.

    Raises:
        FileNotFoundError: If ``cfg.csv_path`` does not exist.
        ValueError: If the CSV lacks a 'Timestamp' or 'Close' column.
    """
    if not os.path.exists(cfg.csv_path):
        raise FileNotFoundError(f"CSV not found: {cfg.csv_path}")

    df = pd.read_csv(cfg.csv_path)

    # Validate the schema up front so a malformed file fails before any
    # filtering or conversion work is done.
    if 'Timestamp' not in df.columns:
        raise ValueError("Expected 'Timestamp' column in input CSV")
    if 'Close' not in df.columns:
        raise ValueError("Expected 'Close' column in input CSV")

    if cfg.drop_volume_zero and 'Volume' in df.columns:
        df = df[df['Volume'] != 0]

    # assign() returns a fresh frame, so the converted column is never
    # written into a slice of the raw data (avoids SettingWithCopyWarning).
    df = df.assign(Timestamp=pd.to_datetime(df['Timestamp'], unit='s'))

    in_range = df['Timestamp'] >= cfg.min_date
    if cfg.max_date:
        in_range &= df['Timestamp'] <= cfg.max_date

    # Explicit copy: the column added below must land on an independent
    # frame, not on a view/slice of the unfiltered one.
    df = df[in_range].copy()

    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
    return df
|
||
|
|
|
||
|
|
|