61 lines
2.5 KiB
Python
61 lines
2.5 KiB
Python
import pandas as pd
|
|
|
|
def aggregate_to_daily(data_df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate time-series financial data to daily OHLCV bars.

    The input DataFrame is expected to have a DatetimeIndex. Each recognised
    column that is present is aggregated per calendar day as follows:

    * 'open'   -> first 'open' value of the day
    * 'high'   -> maximum 'high' value of the day
    * 'low'    -> minimum 'low' value of the day
    * 'close'  -> last 'close' value of the day
    * 'volume' -> sum of 'volume' values for the day

    Columns other than these five are not included in the result.

    Calendar days that contain no rows in the input (e.g. weekends between
    two trading days) are removed from the result. This holds even when a
    'volume' column is present: a sum over an empty day is 0 rather than
    NaN, so such days are detected by their row count instead of by NaN
    values. Days whose aggregated values are all NaN are removed as well.

    Args:
        data_df: DataFrame with a DatetimeIndex and lowercase columns such
            as 'open', 'high', 'low', 'close', and optionally 'volume'.

    Returns:
        DataFrame aggregated to daily OHLCV data. The index is a
        DatetimeIndex whose entries are each day's resample label shifted
        forward by 12 hours (noon). If none of the recognised columns are
        present, a warning is printed and an empty DataFrame with an empty
        DatetimeIndex is returned.

    Raises:
        ValueError: If the input DataFrame does not have a DatetimeIndex.
    """
    if not isinstance(data_df.index, pd.DatetimeIndex):
        raise ValueError("Input DataFrame must have a DatetimeIndex.")

    # Rule table in output-column order; only columns actually present in
    # the input are aggregated.
    column_rules = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
    }
    agg_rules = {
        column: how
        for column, how in column_rules.items()
        if column in data_df.columns
    }

    if not agg_rules:
        print("Warning: No standard OHLCV columns (open, high, low, close, volume) found for daily aggregation.")
        return pd.DataFrame(index=pd.to_datetime([]))  # Empty DF with datetime index

    # Resample to daily frequency and apply the aggregation rules.
    resampler = data_df.resample('D')
    daily_data = resampler.agg(agg_rules)

    if not daily_data.empty:
        # 'sum' yields 0 (not NaN) for days without any source rows, so a
        # NaN-based filter alone would keep those days whenever 'volume' is
        # aggregated. Drop them explicitly using the per-day row count.
        has_source_rows = (resampler.size() > 0).to_numpy()
        daily_data = daily_data[has_source_rows]

    # Shift the daily labels from midnight to noon.
    if not daily_data.empty and isinstance(daily_data.index, pd.DatetimeIndex):
        daily_data.index = daily_data.index + pd.Timedelta(hours=12)

    # Remove any remaining rows where every aggregated value is NaN.
    daily_data = daily_data.dropna(how='all')

    return daily_data
|