# Cycles/cycles/utils/data_utils.py

import pandas as pd

def aggregate_to_daily(data_df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate time-series financial data to daily OHLCV format.

    The input DataFrame is expected to have a DatetimeIndex. For each calendar
    day, whichever of the following columns are present are aggregated:

    - 'open':   first 'open' price of the day.
    - 'high':   maximum 'high' price of the day.
    - 'low':    minimum 'low' price of the day.
    - 'close':  last 'close' price of the day.
    - 'volume': sum of volumes for the day.

    Columns other than these five are ignored and do not appear in the result.

    Args:
        data_df (pd.DataFrame): DataFrame with a DatetimeIndex and columns
                                like 'open', 'high', 'low', 'close', and optionally 'volume'.
                                Column names are expected to be lowercase.

    Returns:
        pd.DataFrame: DataFrame aggregated to daily OHLCV data.
                      The index is a DatetimeIndex shifted 12 hours past each day's
                      midnight (i.e. noon for a timezone-naive index).
                      Days that contain no non-null value in any of the aggregated
                      columns (e.g. weekends or gaps in the data) are omitted.
                      Returns an empty DataFrame if no relevant OHLCV columns are found.

    Raises:
        ValueError: If the input DataFrame does not have a DatetimeIndex.
    """
    if not isinstance(data_df.index, pd.DatetimeIndex):
        raise ValueError("Input DataFrame must have a DatetimeIndex.")

    # Candidate columns and how each one is reduced within a day.
    column_rules = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
    }
    agg_rules = {
        column: rule
        for column, rule in column_rules.items()
        if column in data_df.columns
    }

    if not agg_rules:
        # Nothing to aggregate: warn and hand back an empty frame that still
        # carries a datetime index so downstream code can treat it uniformly.
        print("Warning: No standard OHLCV columns (open, high, low, close, volume) found for daily aggregation.")
        return pd.DataFrame(index=pd.to_datetime([]))  # Return empty DF with datetime index

    # Resample to daily frequency and apply aggregation rules
    daily_data = data_df.resample('D').agg(agg_rules)
    if daily_data.empty:
        return daily_data

    # Resampling emits a row for every calendar day between the first and last
    # timestamp. 'sum' yields 0 (not NaN) for days without rows, so filtering
    # on "all values are NaN" would keep empty days whenever 'volume' is
    # aggregated. Decide from the number of actual non-null observations
    # per day instead, which behaves the same with or without 'volume'.
    observations_per_day = data_df[list(agg_rules)].resample('D').count()
    has_observations = (observations_per_day.sum(axis=1) > 0).to_numpy()

    # Move the daily timestamps from midnight to noon.
    daily_data.index = daily_data.index + pd.Timedelta(hours=12)

    # Keep only days that actually had data in the original frame.
    return daily_data.loc[has_observations]