40 lines
1.1 KiB
Python
40 lines
1.1 KiB
Python
from typing import List
|
|
import pandas as pd
|
|
import numpy as np
|
|
|
|
from .config import PreprocessConfig
|
|
|
|
|
|
def add_basic_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a parsed ``Timestamp`` and an ``hour`` column.

    The ``Timestamp`` column is converted with ``pd.to_datetime`` using
    ``errors='coerce'``, so values that cannot be parsed become ``NaT``
    instead of raising. The hour-of-day of each parsed timestamp is stored
    in a new ``hour`` column. The input frame is left unmodified.

    Raises:
        KeyError: if ``df`` has no ``Timestamp`` column.
    """
    enriched = df.copy()
    # Parse once and reuse the parsed series for both output columns.
    parsed = pd.to_datetime(enriched['Timestamp'], errors='coerce')
    enriched['Timestamp'] = parsed
    enriched['hour'] = parsed.dt.hour
    return enriched
|
|
|
|
|
|
def downcast_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with numeric columns downcast to save memory.

    Each column is passed through ``pd.to_numeric(..., downcast='float')``.
    Columns that cannot be converted (e.g. free-text strings) are left
    exactly as they are. Datetime and timedelta columns are skipped on
    purpose: ``pd.to_numeric`` would otherwise succeed on them and replace
    the timestamps with their raw integer representation.

    The input frame is not modified.
    """
    result = df.copy()
    for col in result.columns:
        column = result[col]
        if not isinstance(column, pd.Series):
            # Duplicate column labels select a DataFrame, which to_numeric
            # cannot process; leave those columns untouched.
            continue
        if (pd.api.types.is_datetime64_any_dtype(column.dtype)
                or pd.api.types.is_timedelta64_dtype(column.dtype)):
            # Datetime-like data is not numeric; converting it would
            # silently destroy the timestamps.
            continue
        try:
            result[col] = pd.to_numeric(column, downcast='float')
        except (ValueError, TypeError, OverflowError):
            # Best effort: a column that cannot be parsed as numbers is
            # non-numeric and stays unchanged.
            continue
    return result
|
|
|
|
|
|
def handle_nans(df: pd.DataFrame, cfg: PreprocessConfig) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values imputed or dropped.

    When ``cfg.impute_nans`` is truthy, NaNs in every numeric column are
    replaced by that column's mean; non-numeric columns are not touched.
    Otherwise every row containing at least one NaN is dropped and the
    index is reset to a fresh ``0..n-1`` range.

    The input frame is not modified.
    """
    cleaned = df.copy()

    if not cfg.impute_nans:
        # Drop path: remove incomplete rows and renumber the remainder.
        return cleaned.dropna().reset_index(drop=True)

    # Impute path: mean-fill each numeric column independently.
    for name in cleaned.select_dtypes(include=[np.number]).columns:
        column = cleaned[name]
        cleaned[name] = column.fillna(column.mean())
    return cleaned
|
|
|
|
|