Implement backtesting framework with modular architecture for data loading, processing, and result management. Introduced BacktestRunner, ConfigManager, and ResultProcessor classes for improved maintainability and error handling. Updated main execution script to utilize new components and added comprehensive logging. Enhanced README with detailed project overview and usage instructions.

This commit is contained in:
Simon Moisy
2025-06-25 13:08:07 +08:00
parent 02e5db2a36
commit 6c5dcc1183
12 changed files with 2243 additions and 501 deletions

152
cycles/utils/data_loader.py Normal file
View File

@@ -0,0 +1,152 @@
import os
import json
import pandas as pd
from typing import Union, Optional
import logging
from .storage_utils import (
_parse_timestamp_column,
_filter_by_date_range,
_normalize_column_names,
TimestampParsingError,
DataLoadingError
)
class DataLoader:
"""Handles loading and preprocessing of data from various file formats"""
def __init__(self, data_dir: str, logging_instance: Optional[logging.Logger] = None):
"""Initialize data loader
Args:
data_dir: Directory containing data files
logging_instance: Optional logging instance
"""
self.data_dir = data_dir
self.logging = logging_instance
def load_data(self, file_path: str, start_date: Union[str, pd.Timestamp],
stop_date: Union[str, pd.Timestamp]) -> pd.DataFrame:
"""Load data with optimized dtypes and filtering, supporting CSV and JSON input
Args:
file_path: path to the data file
start_date: start date (string or datetime-like)
stop_date: stop date (string or datetime-like)
Returns:
pandas DataFrame with timestamp index
Raises:
DataLoadingError: If data loading fails
"""
try:
# Convert string dates to pandas datetime objects for proper comparison
start_date = pd.to_datetime(start_date)
stop_date = pd.to_datetime(stop_date)
# Determine file type
_, ext = os.path.splitext(file_path)
ext = ext.lower()
if ext == ".json":
return self._load_json_data(file_path, start_date, stop_date)
else:
return self._load_csv_data(file_path, start_date, stop_date)
except Exception as e:
error_msg = f"Error loading data from {file_path}: {e}"
if self.logging is not None:
self.logging.error(error_msg)
# Return an empty DataFrame with a DatetimeIndex
return pd.DataFrame(index=pd.to_datetime([]))
def _load_json_data(self, file_path: str, start_date: pd.Timestamp,
stop_date: pd.Timestamp) -> pd.DataFrame:
"""Load and process JSON data file
Args:
file_path: Path to JSON file
start_date: Start date for filtering
stop_date: Stop date for filtering
Returns:
Processed DataFrame with timestamp index
"""
with open(os.path.join(self.data_dir, file_path), 'r') as f:
raw = json.load(f)
data = pd.DataFrame(raw["Data"])
data = _normalize_column_names(data)
# Convert timestamp to datetime
data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")
# Filter by date range
data = _filter_by_date_range(data, "timestamp", start_date, stop_date)
if self.logging is not None:
self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")
return data.set_index("timestamp")
def _load_csv_data(self, file_path: str, start_date: pd.Timestamp,
stop_date: pd.Timestamp) -> pd.DataFrame:
"""Load and process CSV data file
Args:
file_path: Path to CSV file
start_date: Start date for filtering
stop_date: Stop date for filtering
Returns:
Processed DataFrame with timestamp index
"""
# Define optimized dtypes
dtypes = {
'Open': 'float32',
'High': 'float32',
'Low': 'float32',
'Close': 'float32',
'Volume': 'float32'
}
# Read data with original capitalized column names
data = pd.read_csv(os.path.join(self.data_dir, file_path), dtype=dtypes)
return self._process_csv_timestamps(data, start_date, stop_date, file_path)
def _process_csv_timestamps(self, data: pd.DataFrame, start_date: pd.Timestamp,
stop_date: pd.Timestamp, file_path: str) -> pd.DataFrame:
"""Process timestamps in CSV data and filter by date range
Args:
data: DataFrame with CSV data
start_date: Start date for filtering
stop_date: Stop date for filtering
file_path: Original file path for logging
Returns:
Processed DataFrame with timestamp index
"""
if 'Timestamp' in data.columns:
data = _parse_timestamp_column(data, 'Timestamp')
data = _filter_by_date_range(data, 'Timestamp', start_date, stop_date)
data = _normalize_column_names(data)
if self.logging is not None:
self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")
return data.set_index('timestamp')
else:
# Attempt to use the first column if 'Timestamp' is not present
data.rename(columns={data.columns[0]: 'timestamp'}, inplace=True)
data = _parse_timestamp_column(data, 'timestamp')
data = _filter_by_date_range(data, 'timestamp', start_date, stop_date)
data = _normalize_column_names(data)
if self.logging is not None:
self.logging.info(f"Data loaded from {file_path} (using first column as timestamp) for date range {start_date} to {stop_date}")
return data.set_index('timestamp')

106
cycles/utils/data_saver.py Normal file
View File

@@ -0,0 +1,106 @@
import os
import pandas as pd
from typing import Optional
import logging
from .storage_utils import DataSavingError
class DataSaver:
"""Handles saving data to various file formats"""
def __init__(self, data_dir: str, logging_instance: Optional[logging.Logger] = None):
"""Initialize data saver
Args:
data_dir: Directory for saving data files
logging_instance: Optional logging instance
"""
self.data_dir = data_dir
self.logging = logging_instance
def save_data(self, data: pd.DataFrame, file_path: str) -> None:
"""Save processed data to a CSV file.
If the DataFrame has a DatetimeIndex, it's converted to float Unix timestamps
(seconds since epoch) before saving. The index is saved as a column named 'timestamp'.
Args:
data: DataFrame to save
file_path: path to the data file relative to the data_dir
Raises:
DataSavingError: If saving fails
"""
try:
data_to_save = data.copy()
data_to_save = self._prepare_data_for_saving(data_to_save)
# Save to CSV, ensuring the 'timestamp' column (if created) is written
full_path = os.path.join(self.data_dir, file_path)
data_to_save.to_csv(full_path, index=False)
if self.logging is not None:
self.logging.info(f"Data saved to {full_path} with Unix timestamp column.")
except Exception as e:
error_msg = f"Failed to save data to {file_path}: {e}"
if self.logging is not None:
self.logging.error(error_msg)
raise DataSavingError(error_msg) from e
def _prepare_data_for_saving(self, data: pd.DataFrame) -> pd.DataFrame:
"""Prepare DataFrame for saving by handling different index types
Args:
data: DataFrame to prepare
Returns:
DataFrame ready for saving
"""
if isinstance(data.index, pd.DatetimeIndex):
return self._convert_datetime_index_to_timestamp(data)
elif pd.api.types.is_numeric_dtype(data.index.dtype):
return self._convert_numeric_index_to_timestamp(data)
else:
# For other index types, save with the current index
return data
def _convert_datetime_index_to_timestamp(self, data: pd.DataFrame) -> pd.DataFrame:
"""Convert DatetimeIndex to Unix timestamp column
Args:
data: DataFrame with DatetimeIndex
Returns:
DataFrame with timestamp column
"""
# Convert DatetimeIndex to Unix timestamp (float seconds since epoch)
data['timestamp'] = data.index.astype('int64') / 1e9
data.reset_index(drop=True, inplace=True)
# Ensure 'timestamp' is the first column if other columns exist
if 'timestamp' in data.columns and len(data.columns) > 1:
cols = ['timestamp'] + [col for col in data.columns if col != 'timestamp']
data = data[cols]
return data
def _convert_numeric_index_to_timestamp(self, data: pd.DataFrame) -> pd.DataFrame:
"""Convert numeric index to timestamp column
Args:
data: DataFrame with numeric index
Returns:
DataFrame with timestamp column
"""
# If index is already numeric (e.g. float Unix timestamps from a previous save/load cycle)
data['timestamp'] = data.index
data.reset_index(drop=True, inplace=True)
# Ensure 'timestamp' is the first column if other columns exist
if 'timestamp' in data.columns and len(data.columns) > 1:
cols = ['timestamp'] + [col for col in data.columns if col != 'timestamp']
data = data[cols]
return data

View File

@@ -0,0 +1,179 @@
import os
import csv
from typing import Dict, List, Optional, Any
from collections import defaultdict
import logging
from .storage_utils import DataSavingError
class ResultFormatter:
"""Handles formatting and writing of backtest results to CSV files"""
def __init__(self, results_dir: str, logging_instance: Optional[logging.Logger] = None):
"""Initialize result formatter
Args:
results_dir: Directory for saving result files
logging_instance: Optional logging instance
"""
self.results_dir = results_dir
self.logging = logging_instance
def format_row(self, row: Dict[str, Any]) -> Dict[str, str]:
"""Format a row for a combined results CSV file
Args:
row: Dictionary containing row data
Returns:
Dictionary with formatted values
"""
return {
"timeframe": row["timeframe"],
"stop_loss_pct": f"{row['stop_loss_pct']*100:.2f}%",
"n_trades": row["n_trades"],
"n_stop_loss": row["n_stop_loss"],
"win_rate": f"{row['win_rate']*100:.2f}%",
"max_drawdown": f"{row['max_drawdown']*100:.2f}%",
"avg_trade": f"{row['avg_trade']*100:.2f}%",
"profit_ratio": f"{row['profit_ratio']*100:.2f}%",
"final_usd": f"{row['final_usd']:.2f}",
"total_fees_usd": f"{row['total_fees_usd']:.2f}",
}
def write_results_chunk(self, filename: str, fieldnames: List[str],
rows: List[Dict], write_header: bool = False,
initial_usd: Optional[float] = None) -> None:
"""Write a chunk of results to a CSV file
Args:
filename: filename to write to
fieldnames: list of fieldnames
rows: list of rows
write_header: whether to write the header
initial_usd: initial USD value for header comment
Raises:
DataSavingError: If writing fails
"""
try:
mode = 'w' if write_header else 'a'
with open(filename, mode, newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
if write_header:
if initial_usd is not None:
csvfile.write(f"# initial_usd: {initial_usd}\n")
writer.writeheader()
for row in rows:
# Only keep keys that are in fieldnames
filtered_row = {k: v for k, v in row.items() if k in fieldnames}
writer.writerow(filtered_row)
except Exception as e:
error_msg = f"Failed to write results chunk to {filename}: {e}"
if self.logging is not None:
self.logging.error(error_msg)
raise DataSavingError(error_msg) from e
def write_backtest_results(self, filename: str, fieldnames: List[str],
rows: List[Dict], metadata_lines: Optional[List[str]] = None) -> str:
"""Write combined backtest results to a CSV file
Args:
filename: filename to write to
fieldnames: list of fieldnames
rows: list of result dictionaries
metadata_lines: optional list of strings to write as header comments
Returns:
Full path to the written file
Raises:
DataSavingError: If writing fails
"""
try:
fname = os.path.join(self.results_dir, filename)
with open(fname, "w", newline="") as csvfile:
if metadata_lines:
for line in metadata_lines:
csvfile.write(f"{line}\n")
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='\t')
writer.writeheader()
for row in rows:
writer.writerow(self.format_row(row))
if self.logging is not None:
self.logging.info(f"Combined results written to {fname}")
return fname
except Exception as e:
error_msg = f"Failed to write backtest results to {filename}: {e}"
if self.logging is not None:
self.logging.error(error_msg)
raise DataSavingError(error_msg) from e
def write_trades(self, all_trade_rows: List[Dict], trades_fieldnames: List[str]) -> None:
"""Write trades to separate CSV files grouped by timeframe and stop loss
Args:
all_trade_rows: list of trade dictionaries
trades_fieldnames: list of trade fieldnames
Raises:
DataSavingError: If writing fails
"""
try:
trades_by_combo = self._group_trades_by_combination(all_trade_rows)
for (tf, sl), trades in trades_by_combo.items():
self._write_single_trade_file(tf, sl, trades, trades_fieldnames)
except Exception as e:
error_msg = f"Failed to write trades: {e}"
if self.logging is not None:
self.logging.error(error_msg)
raise DataSavingError(error_msg) from e
def _group_trades_by_combination(self, all_trade_rows: List[Dict]) -> Dict:
"""Group trades by timeframe and stop loss combination
Args:
all_trade_rows: List of trade dictionaries
Returns:
Dictionary grouped by (timeframe, stop_loss_pct) tuples
"""
trades_by_combo = defaultdict(list)
for trade in all_trade_rows:
tf = trade.get("timeframe")
sl = trade.get("stop_loss_pct")
trades_by_combo[(tf, sl)].append(trade)
return trades_by_combo
def _write_single_trade_file(self, timeframe: str, stop_loss_pct: float,
trades: List[Dict], trades_fieldnames: List[str]) -> None:
"""Write trades for a single timeframe/stop-loss combination
Args:
timeframe: Timeframe identifier
stop_loss_pct: Stop loss percentage
trades: List of trades for this combination
trades_fieldnames: List of field names for trades
"""
sl_percent = int(round(stop_loss_pct * 100))
trades_filename = os.path.join(self.results_dir, f"trades_{timeframe}_ST{sl_percent}pct.csv")
with open(trades_filename, "w", newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=trades_fieldnames)
writer.writeheader()
for trade in trades:
writer.writerow({k: trade.get(k, "") for k in trades_fieldnames})
if self.logging is not None:
self.logging.info(f"Trades written to {trades_filename}")

View File

@@ -1,17 +1,32 @@
import os
import json
import pandas as pd
import csv
from collections import defaultdict
from typing import Optional, Union, Dict, Any, List
import logging
from .data_loader import DataLoader
from .data_saver import DataSaver
from .result_formatter import ResultFormatter
from .storage_utils import DataLoadingError, DataSavingError
RESULTS_DIR = "../results"
DATA_DIR = "../data"
RESULTS_DIR = "results"
DATA_DIR = "data"
class Storage:
"""Storage class for storing and loading results and data"""
"""Unified storage interface for data and results operations
Acts as a coordinator for DataLoader, DataSaver, and ResultFormatter components,
maintaining backward compatibility while providing a clean separation of concerns.
"""
def __init__(self, logging=None, results_dir=RESULTS_DIR, data_dir=DATA_DIR):
"""Initialize storage with component instances
Args:
logging: Optional logging instance
results_dir: Directory for results files
data_dir: Directory for data files
"""
self.results_dir = results_dir
self.data_dir = data_dir
self.logging = logging
@@ -20,196 +35,89 @@ class Storage:
os.makedirs(self.results_dir, exist_ok=True)
os.makedirs(self.data_dir, exist_ok=True)
def load_data(self, file_path, start_date, stop_date):
# Initialize component instances
self.data_loader = DataLoader(data_dir, logging)
self.data_saver = DataSaver(data_dir, logging)
self.result_formatter = ResultFormatter(results_dir, logging)
def load_data(self, file_path: str, start_date: Union[str, pd.Timestamp],
stop_date: Union[str, pd.Timestamp]) -> pd.DataFrame:
"""Load data with optimized dtypes and filtering, supporting CSV and JSON input
Args:
file_path: path to the data file
start_date: start date
stop_date: stop date
start_date: start date (string or datetime-like)
stop_date: stop date (string or datetime-like)
Returns:
pandas DataFrame
pandas DataFrame with timestamp index
Raises:
DataLoadingError: If data loading fails
"""
# Determine file type
_, ext = os.path.splitext(file_path)
ext = ext.lower()
try:
if ext == ".json":
with open(os.path.join(self.data_dir, file_path), 'r') as f:
raw = json.load(f)
data = pd.DataFrame(raw["Data"])
# Convert columns to lowercase
data.columns = data.columns.str.lower()
# Convert timestamp to datetime
data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")
# Filter by date range
data = data[(data["timestamp"] >= start_date) & (data["timestamp"] <= stop_date)]
if self.logging is not None:
self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")
return data.set_index("timestamp")
else:
# Define optimized dtypes
dtypes = {
'Open': 'float32',
'High': 'float32',
'Low': 'float32',
'Close': 'float32',
'Volume': 'float32'
}
# Read data with original capitalized column names
data = pd.read_csv(os.path.join(self.data_dir, file_path), dtype=dtypes)
# Convert timestamp to datetime
if 'Timestamp' in data.columns:
data['Timestamp'] = pd.to_datetime(data['Timestamp'], unit='s')
# Filter by date range
data = data[(data['Timestamp'] >= start_date) & (data['Timestamp'] <= stop_date)]
# Now convert column names to lowercase
data.columns = data.columns.str.lower()
if self.logging is not None:
self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")
return data.set_index('timestamp')
else: # Attempt to use the first column if 'Timestamp' is not present
data.rename(columns={data.columns[0]: 'timestamp'}, inplace=True)
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')
data = data[(data['timestamp'] >= start_date) & (data['timestamp'] <= stop_date)]
data.columns = data.columns.str.lower() # Ensure all other columns are lower
if self.logging is not None:
self.logging.info(f"Data loaded from {file_path} (using first column as timestamp) for date range {start_date} to {stop_date}")
return data.set_index('timestamp')
except Exception as e:
if self.logging is not None:
self.logging.error(f"Error loading data from {file_path}: {e}")
# Return an empty DataFrame with a DatetimeIndex
return pd.DataFrame(index=pd.to_datetime([]))
def save_data(self, data: pd.DataFrame, file_path: str):
"""Save processed data to a CSV file.
If the DataFrame has a DatetimeIndex, it's converted to float Unix timestamps
(seconds since epoch) before saving. The index is saved as a column named 'timestamp'.
return self.data_loader.load_data(file_path, start_date, stop_date)
def save_data(self, data: pd.DataFrame, file_path: str) -> None:
"""Save processed data to a CSV file
Args:
data (pd.DataFrame): data to save.
file_path (str): path to the data file relative to the data_dir.
data: DataFrame to save
file_path: path to the data file relative to the data_dir
Raises:
DataSavingError: If saving fails
"""
data_to_save = data.copy()
self.data_saver.save_data(data, file_path)
if isinstance(data_to_save.index, pd.DatetimeIndex):
# Convert DatetimeIndex to Unix timestamp (float seconds since epoch)
# and make it a column named 'timestamp'.
data_to_save['timestamp'] = data_to_save.index.astype('int64') / 1e9
# Reset index so 'timestamp' column is saved and old DatetimeIndex is not saved as a column.
# We want the 'timestamp' column to be the first one.
data_to_save.reset_index(drop=True, inplace=True)
# Ensure 'timestamp' is the first column if other columns exist
if 'timestamp' in data_to_save.columns and len(data_to_save.columns) > 1:
cols = ['timestamp'] + [col for col in data_to_save.columns if col != 'timestamp']
data_to_save = data_to_save[cols]
elif pd.api.types.is_numeric_dtype(data_to_save.index.dtype):
# If index is already numeric (e.g. float Unix timestamps from a previous save/load cycle),
# make it a column named 'timestamp'.
data_to_save['timestamp'] = data_to_save.index
data_to_save.reset_index(drop=True, inplace=True)
if 'timestamp' in data_to_save.columns and len(data_to_save.columns) > 1:
cols = ['timestamp'] + [col for col in data_to_save.columns if col != 'timestamp']
data_to_save = data_to_save[cols]
else:
# For other index types, or if no index that we want to specifically handle,
# save with the current index. pandas to_csv will handle it.
# This branch might be removed if we strictly expect either DatetimeIndex or a numeric one from previous save.
pass # data_to_save remains as is, to_csv will write its index if index=True
# Save to CSV, ensuring the 'timestamp' column (if created) is written, and not the DataFrame's active index.
full_path = os.path.join(self.data_dir, file_path)
data_to_save.to_csv(full_path, index=False) # index=False because timestamp is now a column
if self.logging is not None:
self.logging.info(f"Data saved to {full_path} with Unix timestamp column.")
def format_row(self, row):
def format_row(self, row: Dict[str, Any]) -> Dict[str, str]:
"""Format a row for a combined results CSV file
Args:
row: row to format
row: Dictionary containing row data
Returns:
formatted row
Dictionary with formatted values
"""
return self.result_formatter.format_row(row)
return {
"timeframe": row["timeframe"],
"stop_loss_pct": f"{row['stop_loss_pct']*100:.2f}%",
"n_trades": row["n_trades"],
"n_stop_loss": row["n_stop_loss"],
"win_rate": f"{row['win_rate']*100:.2f}%",
"max_drawdown": f"{row['max_drawdown']*100:.2f}%",
"avg_trade": f"{row['avg_trade']*100:.2f}%",
"profit_ratio": f"{row['profit_ratio']*100:.2f}%",
"final_usd": f"{row['final_usd']:.2f}",
"total_fees_usd": f"{row['total_fees_usd']:.2f}",
}
def write_results_chunk(self, filename, fieldnames, rows, write_header=False, initial_usd=None):
def write_results_chunk(self, filename: str, fieldnames: List[str],
rows: List[Dict], write_header: bool = False,
initial_usd: Optional[float] = None) -> None:
"""Write a chunk of results to a CSV file
Args:
filename: filename to write to
fieldnames: list of fieldnames
rows: list of rows
write_header: whether to write the header
initial_usd: initial USD
initial_usd: initial USD value for header comment
"""
mode = 'w' if write_header else 'a'
self.result_formatter.write_results_chunk(
filename, fieldnames, rows, write_header, initial_usd
)
def write_backtest_results(self, filename: str, fieldnames: List[str],
rows: List[Dict], metadata_lines: Optional[List[str]] = None) -> str:
"""Write combined backtest results to a CSV file
with open(filename, mode, newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
if write_header:
csvfile.write(f"# initial_usd: {initial_usd}\n")
writer.writeheader()
for row in rows:
# Only keep keys that are in fieldnames
filtered_row = {k: v for k, v in row.items() if k in fieldnames}
writer.writerow(filtered_row)
def write_backtest_results(self, filename, fieldnames, rows, metadata_lines=None):
"""Write a combined results to a CSV file
Args:
filename: filename to write to
fieldnames: list of fieldnames
rows: list of rows
rows: list of result dictionaries
metadata_lines: optional list of strings to write as header comments
Returns:
Full path to the written file
"""
fname = os.path.join(self.results_dir, filename)
with open(fname, "w", newline="") as csvfile:
if metadata_lines:
for line in metadata_lines:
csvfile.write(f"{line}\n")
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='\t')
writer.writeheader()
for row in rows:
writer.writerow(self.format_row(row))
if self.logging is not None:
self.logging.info(f"Combined results written to {fname}")
def write_trades(self, all_trade_rows, trades_fieldnames):
"""Write trades to a CSV file
return self.result_formatter.write_backtest_results(
filename, fieldnames, rows, metadata_lines
)
def write_trades(self, all_trade_rows: List[Dict], trades_fieldnames: List[str]) -> None:
"""Write trades to separate CSV files grouped by timeframe and stop loss
Args:
all_trade_rows: list of trade rows
all_trade_rows: list of trade dictionaries
trades_fieldnames: list of trade fieldnames
logging: logging object
"""
trades_by_combo = defaultdict(list)
for trade in all_trade_rows:
tf = trade.get("timeframe")
sl = trade.get("stop_loss_pct")
trades_by_combo[(tf, sl)].append(trade)
for (tf, sl), trades in trades_by_combo.items():
sl_percent = int(round(sl * 100))
trades_filename = os.path.join(self.results_dir, f"trades_{tf}_ST{sl_percent}pct.csv")
with open(trades_filename, "w", newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=trades_fieldnames)
writer.writeheader()
for trade in trades:
writer.writerow({k: trade.get(k, "") for k in trades_fieldnames})
if self.logging is not None:
self.logging.info(f"Trades written to {trades_filename}")
self.result_formatter.write_trades(all_trade_rows, trades_fieldnames)

View File

@@ -0,0 +1,73 @@
import pandas as pd
class TimestampParsingError(Exception):
"""Custom exception for timestamp parsing errors"""
pass
class DataLoadingError(Exception):
"""Custom exception for data loading errors"""
pass
class DataSavingError(Exception):
"""Custom exception for data saving errors"""
pass
def _parse_timestamp_column(data: pd.DataFrame, column_name: str) -> pd.DataFrame:
"""Parse timestamp column handling both Unix timestamps and datetime strings
Args:
data: DataFrame containing the timestamp column
column_name: Name of the timestamp column
Returns:
DataFrame with parsed timestamp column
Raises:
TimestampParsingError: If timestamp parsing fails
"""
try:
sample_timestamp = str(data[column_name].iloc[0])
try:
# Check if it's a Unix timestamp (numeric)
float(sample_timestamp)
# It's a Unix timestamp, convert using unit='s'
data[column_name] = pd.to_datetime(data[column_name], unit='s')
except ValueError:
# It's already in datetime string format, convert without unit
data[column_name] = pd.to_datetime(data[column_name])
return data
except Exception as e:
raise TimestampParsingError(f"Failed to parse timestamp column '{column_name}': {e}")
def _filter_by_date_range(data: pd.DataFrame, timestamp_col: str,
start_date: pd.Timestamp, stop_date: pd.Timestamp) -> pd.DataFrame:
"""Filter DataFrame by date range
Args:
data: DataFrame to filter
timestamp_col: Name of timestamp column
start_date: Start date for filtering
stop_date: Stop date for filtering
Returns:
Filtered DataFrame
"""
return data[(data[timestamp_col] >= start_date) & (data[timestamp_col] <= stop_date)]
def _normalize_column_names(data: pd.DataFrame) -> pd.DataFrame:
"""Convert all column names to lowercase
Args:
data: DataFrame to normalize
Returns:
DataFrame with lowercase column names
"""
data.columns = data.columns.str.lower()
return data