data ingestion

2025-06-13 16:49:29 +08:00
parent f09864d61b
commit 622fda9d2e
6 changed files with 408 additions and 10 deletions
--- a/data/common/aggregation/utils.py
+++ b/data/common/aggregation/utils.py
@@ -8,6 +8,7 @@ and trade data aggregation.
 import re
 from typing import List, Tuple
 from utils.timeframe_utils import load_timeframe_options
+import pandas as pd

 from ..data_types import StandardizedTrade, OHLCVCandle

@@ -74,8 +75,75 @@ def parse_timeframe(timeframe: str) -> Tuple[int, str]:
    return number, unit


+def resample_candles_to_timeframe(df: pd.DataFrame, target_timeframe: str) -> pd.DataFrame:
+    """
+    Resample a DataFrame of OHLCV candles to a higher timeframe.
+
+    The input DataFrame is never modified; all work happens on derived frames.
+
+    Args:
+        df (pd.DataFrame): Candles with 'open', 'high', 'low', 'close' and 'volume'
+                           columns and, optionally, 'trades_count'. The index must be a
+                           DatetimeIndex; otherwise a 'timestamp' column is parsed with
+                           ``pd.to_datetime`` and used as the index.
+        target_timeframe (str): Target timeframe as '<positive integer><unit>' where unit
+                                is one of s, m, h, d (case-insensitive), e.g. '5m', '1h', '1d'.
+
+    Returns:
+        pd.DataFrame: One row per target bucket holding the first open, highest high,
+        lowest low, last close, summed volume and (if present) summed trades_count.
+        Buckets without any open/high/low/close data are omitted. An empty DataFrame
+        is returned when ``df`` is empty.
+
+    Raises:
+        ValueError: If ``target_timeframe`` is malformed or has a zero multiplier.
+        KeyError: If ``df`` has no DatetimeIndex and no 'timestamp' column.
+    """
+    if df.empty:
+        return pd.DataFrame()
+
+    # Validate the timeframe before touching the data so bad input fails fast.
+    match = re.match(r'^(\d+)([smhd])$', target_timeframe.lower())
+    if not match:
+        raise ValueError(f"Invalid target timeframe format: {target_timeframe}")
+    multiplier = int(match.group(1))
+    if multiplier == 0:
+        # A zero-length bucket is meaningless; reject it here with a clear message
+        # instead of letting pandas fail later with an unrelated one.
+        raise ValueError(f"Invalid target timeframe format: {target_timeframe}")
+
+    # Map our unit letters to pandas offset aliases. The lowercase 's'/'min'/'h'
+    # spellings are used because the legacy 'S'/'T'/'H' aliases are deprecated in
+    # recent pandas releases, while the lowercase forms are also accepted by older ones.
+    unit_to_alias = {
+        's': 's',
+        'm': 'min',
+        'h': 'h',
+        'd': 'D',
+    }
+    resample_freq = f"{multiplier}{unit_to_alias[match.group(2)]}"
+
+    # Build the datetime index on a copy: assign() returns a new frame, so the
+    # caller's DataFrame keeps its original 'timestamp' column and dtype.
+    if not isinstance(df.index, pd.DatetimeIndex):
+        df = df.assign(timestamp=pd.to_datetime(df['timestamp'])).set_index('timestamp')
+
+    # 'first' and 'last' pick rows by position inside each bucket, so the candles
+    # must be in chronological order. A stable sort keeps the relative order of
+    # rows that share a timestamp.
+    if not df.index.is_monotonic_increasing:
+        df = df.sort_index(kind='mergesort')
+
+    # Define how to aggregate each column
+    ohlcv_dict = {
+        'open': 'first',
+        'high': 'max',
+        'low': 'min',
+        'close': 'last',
+        'volume': 'sum',
+    }
+
+    # Only include 'trades_count' if it exists in the DataFrame
+    if 'trades_count' in df.columns:
+        ohlcv_dict['trades_count'] = 'sum'
+
+    resampled_df = df.resample(resample_freq).agg(ohlcv_dict)
+
+    # Buckets with no source candles come back with NaN prices; drop them.
+    resampled_df = resampled_df.dropna(subset=['open', 'high', 'low', 'close'])
+
+    # Normalise trades_count to integers (any NaN left after resampling becomes 0)
+    if 'trades_count' in resampled_df.columns:
+        resampled_df['trades_count'] = resampled_df['trades_count'].fillna(0).astype(int)
+
+    return resampled_df
+
+
 __all__ = [
    'aggregate_trades_to_candles',
    'validate_timeframe',
-    'parse_timeframe'
+    'parse_timeframe',
+    'resample_candles_to_timeframe'
 ]