aggregate for specific condition

2025-05-22 16:53:23 +08:00
parent a924328c90
commit 736b278ee2
1 changed files with 117 additions and 16 deletions
--- a/cycles/utils/data_utils.py
+++ b/cycles/utils/data_utils.py
@@ -1,5 +1,80 @@
 import pandas as pd
 def check_data(data_df: pd.DataFrame) -> "dict[str, str] | bool":
    """
    Validates the input DataFrame and builds the OHLCV aggregation rules.

    The frame must be indexed by a DatetimeIndex and contain at least one of
    the standard OHLCV columns. Each column that is present is mapped to the
    aggregation used when resampling it:
    'open' -> 'first', 'high' -> 'max', 'low' -> 'min',
    'close' -> 'last', 'volume' -> 'sum'.

    Args:
        data_df (pd.DataFrame): DataFrame to check.

    Returns:
        dict | bool: A non-empty mapping of column name to aggregation name
                     when the frame is usable. False (after printing a
                     warning) when the index is not a DatetimeIndex or when
                     none of the standard OHLCV columns are present, so
                     callers can simply test the result for truthiness.
    """
    if not isinstance(data_df.index, pd.DatetimeIndex):
        print("Warning: Input DataFrame must have a DatetimeIndex.")
        return False

    # Column -> aggregation pairs, listed in canonical OHLCV order so the
    # resulting dict (and therefore the aggregated frame) keeps that order.
    standard_rules = (
        ('open', 'first'),
        ('high', 'max'),
        ('low', 'min'),
        ('close', 'last'),
        ('volume', 'sum'),
    )
    agg_rules = {
        column: how
        for column, how in standard_rules
        if column in data_df.columns
    }

    if not agg_rules:
        # This helper is shared by several aggregators, so the message must
        # not name one particular target frequency.
        print("Warning: No standard OHLCV columns (open, high, low, close, volume) found for aggregation.")
        return False
    return agg_rules
 def aggregate_to_weekly(data_df: pd.DataFrame, weeks: int = 1) -> pd.DataFrame:
    """
    Aggregates time-series financial data to weekly OHLCV format.

    The input DataFrame is expected to have a DatetimeIndex.
    'open' will be the first 'open' price of the window.
    'close' will be the last 'close' price of the window.
    'high' will be the maximum 'high' price of the window.
    'low' will be the minimum 'low' price of the window.
    'volume' (if present) will be the sum of volumes for the window.

    Args:
        data_df (pd.DataFrame): DataFrame with a DatetimeIndex and columns
                                like 'open', 'high', 'low', 'close', and optionally 'volume'.
        weeks (int): The number of weeks per aggregation window. Default is 1.

    Returns:
        pd.DataFrame: DataFrame aggregated to weekly OHLCV data.
                      The index is a DatetimeIndex whose entries are the Monday
                      on which each aggregation window starts.
                      Returns an empty DataFrame if check_data() rejects the
                      input (no DatetimeIndex or no relevant OHLCV columns).
    """
    agg_rules = check_data(data_df)
    if not agg_rules:
        print("Warning: No standard OHLCV columns (open, high, low, close, volume) found for weekly aggregation.")
        return pd.DataFrame(index=pd.to_datetime([]))

    # 'W' is anchored on Sunday and resample() labels each weekly bin with its
    # right edge, i.e. the Sunday that closes the window.
    weekly_data = data_df.resample(f'{weeks}W').agg(agg_rules)
    weekly_data = weekly_data.dropna(how='all')

    # Move the labels from the closing Sunday to the opening Monday.
    # DatetimeIndex.floor('W') cannot be used for this: weekly is not a fixed
    # frequency and floor() raises ValueError for it. A calendar-day offset
    # is used so wall-clock midnight is kept on timezone-aware indexes.
    if not weekly_data.empty and isinstance(weekly_data.index, pd.DatetimeIndex):
        weekly_data.index = weekly_data.index - pd.DateOffset(days=7 * weeks - 1)
    return weekly_data
 def aggregate_to_daily(data_df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregates time-series financial data to daily OHLCV format.
@@ -24,23 +99,9 @@ def aggregate_to_daily(data_df: pd.DataFrame) -> pd.DataFrame:
    Raises:
        ValueError: If the input DataFrame does not have a DatetimeIndex.
    """
    if not isinstance(data_df.index, pd.DatetimeIndex):
        raise ValueError("Input DataFrame must have a DatetimeIndex.")
    agg_rules = {}
    # Define aggregation rules based on available columns
    if 'open' in data_df.columns:
        agg_rules['open'] = 'first'
    if 'high' in data_df.columns:
        agg_rules['high'] = 'max'
    if 'low' in data_df.columns:
        agg_rules['low'] = 'min'
    if 'close' in data_df.columns:
        agg_rules['close'] = 'last'
    if 'volume' in data_df.columns:
        agg_rules['volume'] = 'sum'
    agg_rules = check_data(data_df)
    if not agg_rules:
        # Log a warning or raise an error if no relevant columns are found
        # For now, returning an empty DataFrame with a message might be suitable for some cases
@@ -58,3 +119,43 @@ def aggregate_to_daily(data_df: pd.DataFrame) -> pd.DataFrame:
    daily_data.dropna(how='all', inplace=True)
    return daily_data
 def aggregate_to_hourly(data_df: pd.DataFrame, hours: int = 1) -> pd.DataFrame:
    """
    Aggregates time-series financial data to hourly OHLCV format.

    The input DataFrame is expected to have a DatetimeIndex.
    'open' will be the first 'open' price of the window.
    'close' will be the last 'close' price of the window.
    'high' will be the maximum 'high' price of the window.
    'low' will be the minimum 'low' price of the window.
    'volume' (if present) will be the sum of volumes for the window.

    Args:
        data_df (pd.DataFrame): DataFrame with a DatetimeIndex and columns
                                like 'open', 'high', 'low', 'close', and optionally 'volume'.
        hours (int): The number of hours per aggregation window. Default is 1.

    Returns:
        pd.DataFrame: DataFrame aggregated to hourly OHLCV data.
                      The index will be a DatetimeIndex with the time set to the start of the hour.
                      Returns an empty DataFrame if check_data() rejects the
                      input (no DatetimeIndex or no relevant OHLCV columns).
    """
    agg_rules = check_data(data_df)
    if not agg_rules:
        print("Warning: No standard OHLCV columns (open, high, low, close, volume) found for hourly aggregation.")
        return pd.DataFrame(index=pd.to_datetime([]))

    # Use the lowercase 'h' alias: the uppercase 'H' spelling is deprecated
    # in recent pandas releases, while 'h' is also accepted by older ones.
    hourly_data = data_df.resample(f'{hours}h').agg(agg_rules)
    hourly_data = hourly_data.dropna(how='all')

    # Adjust timestamps to the start of the hour
    if not hourly_data.empty and isinstance(hourly_data.index, pd.DatetimeIndex):
        hourly_data.index = hourly_data.index.floor('h')
    return hourly_data