data ingestion

This commit is contained in:
Vasily.onl
2025-06-13 16:49:29 +08:00
parent f09864d61b
commit 622fda9d2e
6 changed files with 408 additions and 10 deletions

View File

@@ -8,6 +8,7 @@ and trade data aggregation.
import re
from typing import List, Tuple
from utils.timeframe_utils import load_timeframe_options
import pandas as pd
from ..data_types import StandardizedTrade, OHLCVCandle
@@ -74,8 +75,75 @@ def parse_timeframe(timeframe: str) -> Tuple[int, str]:
return number, unit
def resample_candles_to_timeframe(df: pd.DataFrame, target_timeframe: str) -> pd.DataFrame:
    """
    Resample a DataFrame of OHLCV candles to a higher timeframe.

    The input DataFrame is never modified; a new DataFrame is returned.

    Args:
        df (pd.DataFrame): Input candles. Must either have a DatetimeIndex or a
            'timestamp' column convertible with ``pd.to_datetime``. Must contain
            'open', 'high', 'low', 'close' and 'volume' columns; 'trades_count'
            is aggregated too when present.
        target_timeframe (str): Target timeframe as ``<number><unit>`` where the
            unit is one of 's', 'm', 'h', 'd' (case-insensitive), e.g. '5m',
            '1h', '1d'.

    Returns:
        pd.DataFrame: Candles aggregated per target period (open=first,
        high=max, low=min, close=last, volume=sum, trades_count=sum). Periods
        without any source candle are dropped. An empty input yields an empty
        DataFrame.

    Raises:
        ValueError: If ``target_timeframe`` is malformed or has a zero length.
        KeyError: If ``df`` has no DatetimeIndex and no 'timestamp' column.
    """
    if df.empty:
        return pd.DataFrame()

    # Build a datetime-indexed frame without touching the caller's object:
    # ``assign`` returns a copy, so the original 'timestamp' column keeps its dtype.
    if not isinstance(df.index, pd.DatetimeIndex):
        df = df.assign(timestamp=pd.to_datetime(df['timestamp'])).set_index('timestamp')

    # Map our unit letters to pandas offset aliases. The legacy aliases
    # 'T', 'H' and 'S' are deprecated since pandas 2.2; 'min', 'h' and 's'
    # are accepted by both old and new pandas versions.
    unit_to_offset = {
        's': 's',
        'm': 'min',
        'h': 'h',
        'd': 'D',
    }

    match = re.match(r'^(\d+)([smhd])$', target_timeframe.lower())
    if not match:
        raise ValueError(f"Invalid target timeframe format: {target_timeframe}")

    number, unit = match.groups()
    if int(number) == 0:
        # A zero-length period cannot be resampled; fail with a clear message.
        raise ValueError(f"Invalid target timeframe format: {target_timeframe}")

    resample_freq = f"{int(number)}{unit_to_offset[unit]}"

    # How each column collapses within one target period.
    aggregations = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
    }
    # 'trades_count' is optional in the input.
    if 'trades_count' in df.columns:
        aggregations['trades_count'] = 'sum'

    resampled_df = df.resample(resample_freq).agg(aggregations)

    # Periods with no source candles come back with NaN prices; drop them.
    resampled_df.dropna(subset=['open', 'high', 'low', 'close'], inplace=True)

    # Keep trade counts as integers, treating any remaining gaps as zero.
    if 'trades_count' in resampled_df.columns:
        resampled_df['trades_count'] = resampled_df['trades_count'].fillna(0).astype(int)

    return resampled_df
# Names exported by `from <module> import *`.
# Each entry must be its own comma-terminated string: two adjacent string
# literals without a comma are silently concatenated into one bogus name.
__all__ = [
    'aggregate_trades_to_candles',
    'validate_timeframe',
    'parse_timeframe',
    'resample_candles_to_timeframe',
]