# TCPDashboard/data/common/aggregation/utils.py

"""
Utility functions for market data aggregation.

This module provides common utility functions for working with OHLCV candles
and trade data aggregation.
"""

import re
from typing import List, Tuple
from utils.timeframe_utils import load_timeframe_options
import pandas as pd

from ..data_types import StandardizedTrade, OHLCVCandle


def aggregate_trades_to_candles(trades: List[StandardizedTrade],
                              timeframes: List[str],
                              symbol: str,
                              exchange: str) -> List[OHLCVCandle]:
    """
    Build candles from an in-memory list of trades in a single call.

    Convenience wrapper: creates a ``BatchCandleProcessor`` for the given
    symbol, exchange and timeframes, then feeds it the trades as an iterator.

    Args:
        trades: Standardized trades to aggregate
        timeframes: Timeframe strings to build candles for
        symbol: Trading symbol handed to the processor
        exchange: Exchange name handed to the processor

    Returns:
        The result of ``BatchCandleProcessor.process_trades_to_candles``
        (annotated here as a list of candles)
    """
    # Imported at call time rather than at module level -- presumably to
    # avoid a circular import with the batch module (TODO confirm).
    from .batch import BatchCandleProcessor

    run_batch = BatchCandleProcessor(symbol, exchange, timeframes).process_trades_to_candles
    return run_batch(iter(trades))


def validate_timeframe(timeframe: str) -> bool:
    """
    Check a timeframe string against the configured timeframe options.

    Args:
        timeframe: Timeframe string (e.g., '1s', '5s', '10s', '1m', '5m', '1h')

    Returns:
        True when ``timeframe`` equals the 'value' entry of one of the
        options returned by ``load_timeframe_options()``, False otherwise
    """
    # Collect every option's 'value' first, then do a plain membership test.
    known_values = tuple(option['value'] for option in load_timeframe_options())
    return timeframe in known_values


def parse_timeframe(timeframe: str) -> Tuple[int, str]:
    """
    Parse timeframe string into number and unit.

    The string is lower-cased before parsing, so '1H' and '1h' are
    equivalent. The whole string must consist of one or more digits followed
    by exactly one unit letter; surrounding whitespace or a trailing newline
    is rejected.

    Args:
        timeframe: Timeframe string (e.g., '1s', '5m', '1h')

    Returns:
        Tuple of (number, unit) where unit is one of 's', 'm', 'h', 'd'

    Raises:
        ValueError: If the string does not have the ``<digits><unit>`` form,
            or if the numeric part is zero.

    Examples:
        '1s' -> (1, 's')
        '5m' -> (5, 'm')
        '1h' -> (1, 'h')
        '1d' -> (1, 'd')
    """
    # fullmatch instead of match + '$': '$' also matches just before a
    # trailing newline, which would let input such as '5m\n' slip through.
    match = re.fullmatch(r'(\d+)([smhd])', timeframe.lower())
    if not match:
        raise ValueError(f"Invalid timeframe format: {timeframe}")

    number = int(match.group(1))
    unit = match.group(2)

    # The pattern cannot produce a negative number, so this only rejects
    # zero, which is not a meaningful bucket interval.
    if number <= 0:
        raise ValueError(f"Timeframe value must be positive: {timeframe}")
    return number, unit


def resample_candles_to_timeframe(df: pd.DataFrame, target_timeframe: str) -> pd.DataFrame:
    """
    Resamples a DataFrame of OHLCV candles to a higher timeframe.

    The input DataFrame is never modified; when it lacks a datetime index the
    'timestamp' column is converted and set as the index on a copy.

    Args:
        df (pd.DataFrame): Input DataFrame with either a datetime index or a
                           'timestamp' column, plus 'open', 'high', 'low',
                           'close', 'volume', and optionally 'trades_count'
                           columns.
        target_timeframe (str): The target timeframe for resampling
                                (e.g., '30s', '5m', '1h', '1d'). Matching is
                                case-insensitive.

    Returns:
        pd.DataFrame: Resampled DataFrame with OHLCV data for the target
        timeframe, indexed by bucket start. Buckets with no data are dropped.
        An empty input yields an empty, column-less DataFrame.

    Raises:
        ValueError: If ``target_timeframe`` is not ``<digits><s|m|h|d>`` or
            its numeric part is zero.
        KeyError: If ``df`` has no datetime index and no 'timestamp' column.
    """
    if df.empty:
        return pd.DataFrame()

    # Ensure the data is indexed by datetime. assign() works on a copy, so
    # the caller's DataFrame is left untouched.
    if not isinstance(df.index, pd.DatetimeIndex):
        df = df.assign(timestamp=pd.to_datetime(df['timestamp'])).set_index('timestamp')

    # Timeframe unit -> pandas offset alias. The upper-case 'S'/'T'/'H'
    # aliases are deprecated since pandas 2.2; 's'/'min'/'h' are the
    # replacements and are also understood by older releases.
    unit_to_alias = {
        's': 's',
        'm': 'min',
        'h': 'h',
        'd': 'D',
    }

    # fullmatch so that stray trailing characters (including '\n') are rejected.
    match = re.fullmatch(r'(\d+)([smhd])', target_timeframe.lower())
    if not match:
        raise ValueError(f"Invalid target timeframe format: {target_timeframe}")

    number = int(match.group(1))
    if number <= 0:
        raise ValueError(f"Timeframe value must be positive: {target_timeframe}")

    resample_freq = f"{number}{unit_to_alias[match.group(2)]}"

    # How each column collapses within a bucket
    aggregations = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
    }

    # Only include 'trades_count' if it exists in the DataFrame
    if 'trades_count' in df.columns:
        aggregations['trades_count'] = 'sum'

    resampled_df = df.resample(resample_freq).agg(aggregations)

    # Drop buckets with a NaN in any of open/high/low/close
    # (e.g., periods that contained no candles)
    resampled_df = resampled_df.dropna(subset=['open', 'high', 'low', 'close'])

    # Normalize trades_count to integers, treating missing counts as 0
    if 'trades_count' in resampled_df.columns:
        resampled_df = resampled_df.assign(
            trades_count=resampled_df['trades_count'].fillna(0).astype(int)
        )

    return resampled_df


# Public names of this module; this list controls what
# `from <module> import *` brings into the importing namespace.
__all__ = [
    'aggregate_trades_to_candles',
    'validate_timeframe',
    'parse_timeframe',
    'resample_candles_to_timeframe'
]