# File-viewer metadata carried over with the paste: 40 lines, 1.1 KiB, Python.

from typing import List
import pandas as pd
import numpy as np
from .config import PreprocessConfig
def add_basic_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a parsed ``Timestamp`` and an ``hour`` column.

    The ``Timestamp`` column is converted with ``pd.to_datetime`` using
    ``errors='coerce'``, so values that cannot be parsed become ``NaT``.
    The ``hour`` column holds the hour-of-day taken from the parsed values.

    Args:
        df: Input frame containing a ``Timestamp`` column; it is not modified.

    Returns:
        A new DataFrame with ``Timestamp`` replaced by its parsed form and an
        ``hour`` column added.
    """
    # Parse once and reuse the result for both the replacement column and
    # the derived feature.
    parsed = pd.to_datetime(df['Timestamp'], errors='coerce')

    enriched = df.copy()
    enriched['Timestamp'] = parsed
    enriched['hour'] = parsed.dt.hour
    return enriched
def downcast_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast numeric columns to a smaller float dtype to save memory.

    Only columns that already have a numeric dtype are processed. Datetime,
    timedelta, boolean, string/object and other non-numeric columns are
    returned unchanged.

    Columns are selected by dtype instead of calling ``pd.to_numeric`` on
    every column and swallowing failures: ``pd.to_numeric`` does not fail on
    datetime-like data (it reinterprets it as integers) and it parses
    numeric-looking strings, so a blanket try/except would silently rewrite
    columns that were never meant to be numeric.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A copy of ``df`` whose numeric columns were passed through
        ``pd.to_numeric(..., downcast='float')``.
    """
    result = df.copy()
    numeric_cols = result.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        # NOTE: the downcast target is 'float', so pandas may also convert
        # integer columns to a float dtype when the values survive the cast.
        result[col] = pd.to_numeric(result[col], downcast='float')
    return result
def handle_nans(df: pd.DataFrame, cfg: PreprocessConfig) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values imputed or dropped.

    Args:
        df: Input frame; it is not modified.
        cfg: Configuration object; only its ``impute_nans`` flag is read.

    Returns:
        If ``cfg.impute_nans`` is truthy, a copy in which NaNs in each
        numeric column are replaced by that column's mean (non-numeric
        columns are left as they are). Otherwise, a copy with every row
        containing a NaN removed and the index reset to 0..n-1.
    """
    if not cfg.impute_nans:
        # Drop path: remove incomplete rows and renumber the index.
        return df.copy().dropna().reset_index(drop=True)

    filled = df.copy()
    for name in filled.select_dtypes(include=[np.number]).columns:
        column = filled[name]
        filled[name] = column.fillna(column.mean())
    return filled