Merge branch 'xgboost'

# Conflicts:
#	.gitignore
#	README.md
#	cycles/backtest.py
#	main.py
#	pyproject.toml
#	uv.lock
This commit is contained in:
2025-07-11 09:04:49 +08:00
39 changed files with 6311 additions and 1332 deletions

152
cycles/utils/data_loader.py Normal file
View File

@@ -0,0 +1,152 @@
import os
import json
import pandas as pd
from typing import Union, Optional
import logging
from .storage_utils import (
_parse_timestamp_column,
_filter_by_date_range,
_normalize_column_names,
TimestampParsingError,
DataLoadingError
)
class DataLoader:
"""Handles loading and preprocessing of data from various file formats"""
def __init__(self, data_dir: str, logging_instance: Optional[logging.Logger] = None):
"""Initialize data loader
Args:
data_dir: Directory containing data files
logging_instance: Optional logging instance
"""
self.data_dir = data_dir
self.logging = logging_instance
def load_data(self, file_path: str, start_date: Union[str, pd.Timestamp],
stop_date: Union[str, pd.Timestamp]) -> pd.DataFrame:
"""Load data with optimized dtypes and filtering, supporting CSV and JSON input
Args:
file_path: path to the data file
start_date: start date (string or datetime-like)
stop_date: stop date (string or datetime-like)
Returns:
pandas DataFrame with timestamp index
Raises:
DataLoadingError: If data loading fails
"""
try:
# Convert string dates to pandas datetime objects for proper comparison
start_date = pd.to_datetime(start_date)
stop_date = pd.to_datetime(stop_date)
# Determine file type
_, ext = os.path.splitext(file_path)
ext = ext.lower()
if ext == ".json":
return self._load_json_data(file_path, start_date, stop_date)
else:
return self._load_csv_data(file_path, start_date, stop_date)
except Exception as e:
error_msg = f"Error loading data from {file_path}: {e}"
if self.logging is not None:
self.logging.error(error_msg)
# Return an empty DataFrame with a DatetimeIndex
return pd.DataFrame(index=pd.to_datetime([]))
def _load_json_data(self, file_path: str, start_date: pd.Timestamp,
stop_date: pd.Timestamp) -> pd.DataFrame:
"""Load and process JSON data file
Args:
file_path: Path to JSON file
start_date: Start date for filtering
stop_date: Stop date for filtering
Returns:
Processed DataFrame with timestamp index
"""
with open(os.path.join(self.data_dir, file_path), 'r') as f:
raw = json.load(f)
data = pd.DataFrame(raw["Data"])
data = _normalize_column_names(data)
# Convert timestamp to datetime
data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")
# Filter by date range
data = _filter_by_date_range(data, "timestamp", start_date, stop_date)
if self.logging is not None:
self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")
return data.set_index("timestamp")
def _load_csv_data(self, file_path: str, start_date: pd.Timestamp,
stop_date: pd.Timestamp) -> pd.DataFrame:
"""Load and process CSV data file
Args:
file_path: Path to CSV file
start_date: Start date for filtering
stop_date: Stop date for filtering
Returns:
Processed DataFrame with timestamp index
"""
# Define optimized dtypes
dtypes = {
'Open': 'float32',
'High': 'float32',
'Low': 'float32',
'Close': 'float32',
'Volume': 'float32'
}
# Read data with original capitalized column names
data = pd.read_csv(os.path.join(self.data_dir, file_path), dtype=dtypes)
return self._process_csv_timestamps(data, start_date, stop_date, file_path)
def _process_csv_timestamps(self, data: pd.DataFrame, start_date: pd.Timestamp,
stop_date: pd.Timestamp, file_path: str) -> pd.DataFrame:
"""Process timestamps in CSV data and filter by date range
Args:
data: DataFrame with CSV data
start_date: Start date for filtering
stop_date: Stop date for filtering
file_path: Original file path for logging
Returns:
Processed DataFrame with timestamp index
"""
if 'Timestamp' in data.columns:
data = _parse_timestamp_column(data, 'Timestamp')
data = _filter_by_date_range(data, 'Timestamp', start_date, stop_date)
data = _normalize_column_names(data)
if self.logging is not None:
self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")
return data.set_index('timestamp')
else:
# Attempt to use the first column if 'Timestamp' is not present
data.rename(columns={data.columns[0]: 'timestamp'}, inplace=True)
data = _parse_timestamp_column(data, 'timestamp')
data = _filter_by_date_range(data, 'timestamp', start_date, stop_date)
data = _normalize_column_names(data)
if self.logging is not None:
self.logging.info(f"Data loaded from {file_path} (using first column as timestamp) for date range {start_date} to {stop_date}")
return data.set_index('timestamp')

106
cycles/utils/data_saver.py Normal file
View File

@@ -0,0 +1,106 @@
import os
import pandas as pd
from typing import Optional
import logging
from .storage_utils import DataSavingError
class DataSaver:
"""Handles saving data to various file formats"""
def __init__(self, data_dir: str, logging_instance: Optional[logging.Logger] = None):
"""Initialize data saver
Args:
data_dir: Directory for saving data files
logging_instance: Optional logging instance
"""
self.data_dir = data_dir
self.logging = logging_instance
def save_data(self, data: pd.DataFrame, file_path: str) -> None:
"""Save processed data to a CSV file.
If the DataFrame has a DatetimeIndex, it's converted to float Unix timestamps
(seconds since epoch) before saving. The index is saved as a column named 'timestamp'.
Args:
data: DataFrame to save
file_path: path to the data file relative to the data_dir
Raises:
DataSavingError: If saving fails
"""
try:
data_to_save = data.copy()
data_to_save = self._prepare_data_for_saving(data_to_save)
# Save to CSV, ensuring the 'timestamp' column (if created) is written
full_path = os.path.join(self.data_dir, file_path)
data_to_save.to_csv(full_path, index=False)
if self.logging is not None:
self.logging.info(f"Data saved to {full_path} with Unix timestamp column.")
except Exception as e:
error_msg = f"Failed to save data to {file_path}: {e}"
if self.logging is not None:
self.logging.error(error_msg)
raise DataSavingError(error_msg) from e
def _prepare_data_for_saving(self, data: pd.DataFrame) -> pd.DataFrame:
"""Prepare DataFrame for saving by handling different index types
Args:
data: DataFrame to prepare
Returns:
DataFrame ready for saving
"""
if isinstance(data.index, pd.DatetimeIndex):
return self._convert_datetime_index_to_timestamp(data)
elif pd.api.types.is_numeric_dtype(data.index.dtype):
return self._convert_numeric_index_to_timestamp(data)
else:
# For other index types, save with the current index
return data
def _convert_datetime_index_to_timestamp(self, data: pd.DataFrame) -> pd.DataFrame:
"""Convert DatetimeIndex to Unix timestamp column
Args:
data: DataFrame with DatetimeIndex
Returns:
DataFrame with timestamp column
"""
# Convert DatetimeIndex to Unix timestamp (float seconds since epoch)
data['timestamp'] = data.index.astype('int64') / 1e9
data.reset_index(drop=True, inplace=True)
# Ensure 'timestamp' is the first column if other columns exist
if 'timestamp' in data.columns and len(data.columns) > 1:
cols = ['timestamp'] + [col for col in data.columns if col != 'timestamp']
data = data[cols]
return data
def _convert_numeric_index_to_timestamp(self, data: pd.DataFrame) -> pd.DataFrame:
"""Convert numeric index to timestamp column
Args:
data: DataFrame with numeric index
Returns:
DataFrame with timestamp column
"""
# If index is already numeric (e.g. float Unix timestamps from a previous save/load cycle)
data['timestamp'] = data.index
data.reset_index(drop=True, inplace=True)
# Ensure 'timestamp' is the first column if other columns exist
if 'timestamp' in data.columns and len(data.columns) > 1:
cols = ['timestamp'] + [col for col in data.columns if col != 'timestamp']
data = data[cols]
return data

View File

@@ -1,128 +0,0 @@
import threading
import time
import queue
from google.oauth2.service_account import Credentials
import gspread
import math
import numpy as np
from collections import defaultdict
class GSheetBatchPusher(threading.Thread):
def __init__(self, queue, timestamp, spreadsheet_name, interval=60, logging=None):
super().__init__(daemon=True)
self.queue = queue
self.timestamp = timestamp
self.spreadsheet_name = spreadsheet_name
self.interval = interval
self._stop_event = threading.Event()
self.logging = logging
def run(self):
while not self._stop_event.is_set():
self.push_all()
time.sleep(self.interval)
# Final push on stop
self.push_all()
def stop(self):
self._stop_event.set()
def push_all(self):
batch_results = []
batch_trades = []
while True:
try:
results, trades = self.queue.get_nowait()
batch_results.extend(results)
batch_trades.extend(trades)
except queue.Empty:
break
if batch_results or batch_trades:
self.write_results_per_combination_gsheet(batch_results, batch_trades, self.timestamp, self.spreadsheet_name)
def write_results_per_combination_gsheet(self, results_rows, trade_rows, timestamp, spreadsheet_name="GlimBit Backtest Results"):
scopes = [
"https://www.googleapis.com/auth/spreadsheets",
"https://www.googleapis.com/auth/drive"
]
creds = Credentials.from_service_account_file('credentials/service_account.json', scopes=scopes)
gc = gspread.authorize(creds)
sh = gc.open(spreadsheet_name)
try:
worksheet = sh.worksheet("Results")
except gspread.exceptions.WorksheetNotFound:
worksheet = sh.add_worksheet(title="Results", rows="1000", cols="20")
# Clear the worksheet before writing new results
worksheet.clear()
# Updated fieldnames to match your data rows
fieldnames = [
"timeframe", "stop_loss_pct", "n_trades", "n_stop_loss", "win_rate",
"max_drawdown", "avg_trade", "profit_ratio", "initial_usd", "final_usd"
]
def to_native(val):
if isinstance(val, (np.generic, np.ndarray)):
val = val.item()
if hasattr(val, 'isoformat'):
return val.isoformat()
# Handle inf, -inf, nan
if isinstance(val, float):
if math.isinf(val):
return "" if val > 0 else "-∞"
if math.isnan(val):
return ""
return val
# Write header if sheet is empty
if len(worksheet.get_all_values()) == 0:
worksheet.append_row(fieldnames)
for row in results_rows:
values = [to_native(row.get(field, "")) for field in fieldnames]
worksheet.append_row(values)
trades_fieldnames = [
"entry_time", "exit_time", "entry_price", "exit_price", "profit_pct", "type"
]
trades_by_combo = defaultdict(list)
for trade in trade_rows:
tf = trade.get("timeframe")
sl = trade.get("stop_loss_pct")
trades_by_combo[(tf, sl)].append(trade)
for (tf, sl), trades in trades_by_combo.items():
sl_percent = int(round(sl * 100))
sheet_name = f"Trades_{tf}_ST{sl_percent}%"
try:
trades_ws = sh.worksheet(sheet_name)
except gspread.exceptions.WorksheetNotFound:
trades_ws = sh.add_worksheet(title=sheet_name, rows="1000", cols="20")
# Clear the trades worksheet before writing new trades
trades_ws.clear()
if len(trades_ws.get_all_values()) == 0:
trades_ws.append_row(trades_fieldnames)
for trade in trades:
trade_row = [to_native(trade.get(field, "")) for field in trades_fieldnames]
try:
trades_ws.append_row(trade_row)
except gspread.exceptions.APIError as e:
if '429' in str(e):
if self.logging is not None:
self.logging.warning(f"Google Sheets API quota exceeded (429). Please wait one minute. Will retry on next batch push. Sheet: {sheet_name}")
# Re-queue the failed batch for retry
self.queue.put((results_rows, trade_rows))
return # Stop pushing for this batch, will retry next interval
else:
raise

View File

@@ -0,0 +1,233 @@
#!/usr/bin/env python3
"""
Progress Manager for tracking multiple parallel backtest tasks
"""
import threading
import time
import sys
from typing import Dict, Optional, Callable
from dataclasses import dataclass
@dataclass
class TaskProgress:
"""Represents progress information for a single task"""
task_id: str
name: str
current: int
total: int
start_time: float
last_update: float
@property
def percentage(self) -> float:
"""Calculate completion percentage"""
if self.total == 0:
return 0.0
return (self.current / self.total) * 100
@property
def elapsed_time(self) -> float:
"""Calculate elapsed time in seconds"""
return time.time() - self.start_time
@property
def eta(self) -> Optional[float]:
"""Estimate time to completion in seconds"""
if self.current == 0 or self.percentage >= 100:
return None
elapsed = self.elapsed_time
rate = self.current / elapsed
remaining = self.total - self.current
return remaining / rate if rate > 0 else None
class ProgressManager:
"""Manages progress tracking for multiple parallel tasks"""
def __init__(self, update_interval: float = 1.0, display_width: int = 50):
"""
Initialize progress manager
Args:
update_interval: How often to update display (seconds)
display_width: Width of progress bar in characters
"""
self.tasks: Dict[str, TaskProgress] = {}
self.update_interval = update_interval
self.display_width = display_width
self.lock = threading.Lock()
self.display_thread: Optional[threading.Thread] = None
self.running = False
self.last_display_height = 0
def start_task(self, task_id: str, name: str, total: int) -> None:
"""
Start tracking a new task
Args:
task_id: Unique identifier for the task
name: Human-readable name for the task
total: Total number of steps in the task
"""
with self.lock:
self.tasks[task_id] = TaskProgress(
task_id=task_id,
name=name,
current=0,
total=total,
start_time=time.time(),
last_update=time.time()
)
def update_progress(self, task_id: str, current: int) -> None:
"""
Update progress for a specific task
Args:
task_id: Task identifier
current: Current progress value
"""
with self.lock:
if task_id in self.tasks:
self.tasks[task_id].current = current
self.tasks[task_id].last_update = time.time()
def complete_task(self, task_id: str) -> None:
"""
Mark a task as completed
Args:
task_id: Task identifier
"""
with self.lock:
if task_id in self.tasks:
task = self.tasks[task_id]
task.current = task.total
task.last_update = time.time()
def start_display(self) -> None:
"""Start the progress display thread"""
if not self.running:
self.running = True
self.display_thread = threading.Thread(target=self._display_loop, daemon=True)
self.display_thread.start()
def stop_display(self) -> None:
"""Stop the progress display thread"""
self.running = False
if self.display_thread:
self.display_thread.join(timeout=1.0)
self._clear_display()
def _display_loop(self) -> None:
"""Main loop for updating the progress display"""
while self.running:
self._update_display()
time.sleep(self.update_interval)
def _update_display(self) -> None:
"""Update the console display with current progress"""
with self.lock:
if not self.tasks:
return
# Clear previous display
self._clear_display()
# Build display lines
lines = []
for task in sorted(self.tasks.values(), key=lambda t: t.task_id):
line = self._format_progress_line(task)
lines.append(line)
# Print all lines
for line in lines:
print(line, flush=True)
self.last_display_height = len(lines)
def _clear_display(self) -> None:
"""Clear the previous progress display"""
if self.last_display_height > 0:
# Move cursor up and clear lines
for _ in range(self.last_display_height):
sys.stdout.write('\033[F') # Move cursor up one line
sys.stdout.write('\033[K') # Clear line
sys.stdout.flush()
def _format_progress_line(self, task: TaskProgress) -> str:
"""
Format a single progress line for display
Args:
task: TaskProgress instance
Returns:
Formatted progress string
"""
# Progress bar
filled_width = int(task.percentage / 100 * self.display_width)
bar = '' * filled_width + '' * (self.display_width - filled_width)
# Time information
elapsed_str = self._format_time(task.elapsed_time)
eta_str = self._format_time(task.eta) if task.eta else "N/A"
# Format line
line = (f"{task.name:<25}{bar}"
f"{task.percentage:5.1f}% "
f"({task.current:,}/{task.total:,}) "
f"{elapsed_str} ETA: {eta_str}")
return line
def _format_time(self, seconds: float) -> str:
"""
Format time duration for display
Args:
seconds: Time in seconds
Returns:
Formatted time string
"""
if seconds < 60:
return f"{seconds:.0f}s"
elif seconds < 3600:
minutes = seconds / 60
return f"{minutes:.1f}m"
else:
hours = seconds / 3600
return f"{hours:.1f}h"
def get_task_progress_callback(self, task_id: str) -> Callable[[int], None]:
"""
Get a progress callback function for a specific task
Args:
task_id: Task identifier
Returns:
Callback function that updates progress for this task
"""
def callback(current: int) -> None:
self.update_progress(task_id, current)
return callback
def all_tasks_completed(self) -> bool:
"""Check if all tasks are completed"""
with self.lock:
return all(task.current >= task.total for task in self.tasks.values())
def get_summary(self) -> str:
"""Get a summary of all tasks"""
with self.lock:
total_tasks = len(self.tasks)
completed_tasks = sum(1 for task in self.tasks.values()
if task.current >= task.total)
return f"Tasks: {completed_tasks}/{total_tasks} completed"

View File

@@ -0,0 +1,179 @@
import os
import csv
from typing import Dict, List, Optional, Any
from collections import defaultdict
import logging
from .storage_utils import DataSavingError
class ResultFormatter:
"""Handles formatting and writing of backtest results to CSV files"""
def __init__(self, results_dir: str, logging_instance: Optional[logging.Logger] = None):
"""Initialize result formatter
Args:
results_dir: Directory for saving result files
logging_instance: Optional logging instance
"""
self.results_dir = results_dir
self.logging = logging_instance
def format_row(self, row: Dict[str, Any]) -> Dict[str, str]:
"""Format a row for a combined results CSV file
Args:
row: Dictionary containing row data
Returns:
Dictionary with formatted values
"""
return {
"timeframe": row["timeframe"],
"stop_loss_pct": f"{row['stop_loss_pct']*100:.2f}%",
"n_trades": row["n_trades"],
"n_stop_loss": row["n_stop_loss"],
"win_rate": f"{row['win_rate']*100:.2f}%",
"max_drawdown": f"{row['max_drawdown']*100:.2f}%",
"avg_trade": f"{row['avg_trade']*100:.2f}%",
"profit_ratio": f"{row['profit_ratio']*100:.2f}%",
"final_usd": f"{row['final_usd']:.2f}",
"total_fees_usd": f"{row['total_fees_usd']:.2f}",
}
def write_results_chunk(self, filename: str, fieldnames: List[str],
rows: List[Dict], write_header: bool = False,
initial_usd: Optional[float] = None) -> None:
"""Write a chunk of results to a CSV file
Args:
filename: filename to write to
fieldnames: list of fieldnames
rows: list of rows
write_header: whether to write the header
initial_usd: initial USD value for header comment
Raises:
DataSavingError: If writing fails
"""
try:
mode = 'w' if write_header else 'a'
with open(filename, mode, newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
if write_header:
if initial_usd is not None:
csvfile.write(f"# initial_usd: {initial_usd}\n")
writer.writeheader()
for row in rows:
# Only keep keys that are in fieldnames
filtered_row = {k: v for k, v in row.items() if k in fieldnames}
writer.writerow(filtered_row)
except Exception as e:
error_msg = f"Failed to write results chunk to {filename}: {e}"
if self.logging is not None:
self.logging.error(error_msg)
raise DataSavingError(error_msg) from e
def write_backtest_results(self, filename: str, fieldnames: List[str],
rows: List[Dict], metadata_lines: Optional[List[str]] = None) -> str:
"""Write combined backtest results to a CSV file
Args:
filename: filename to write to
fieldnames: list of fieldnames
rows: list of result dictionaries
metadata_lines: optional list of strings to write as header comments
Returns:
Full path to the written file
Raises:
DataSavingError: If writing fails
"""
try:
fname = os.path.join(self.results_dir, filename)
with open(fname, "w", newline="") as csvfile:
if metadata_lines:
for line in metadata_lines:
csvfile.write(f"{line}\n")
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='\t')
writer.writeheader()
for row in rows:
writer.writerow(self.format_row(row))
if self.logging is not None:
self.logging.info(f"Combined results written to {fname}")
return fname
except Exception as e:
error_msg = f"Failed to write backtest results to {filename}: {e}"
if self.logging is not None:
self.logging.error(error_msg)
raise DataSavingError(error_msg) from e
def write_trades(self, all_trade_rows: List[Dict], trades_fieldnames: List[str]) -> None:
"""Write trades to separate CSV files grouped by timeframe and stop loss
Args:
all_trade_rows: list of trade dictionaries
trades_fieldnames: list of trade fieldnames
Raises:
DataSavingError: If writing fails
"""
try:
trades_by_combo = self._group_trades_by_combination(all_trade_rows)
for (tf, sl), trades in trades_by_combo.items():
self._write_single_trade_file(tf, sl, trades, trades_fieldnames)
except Exception as e:
error_msg = f"Failed to write trades: {e}"
if self.logging is not None:
self.logging.error(error_msg)
raise DataSavingError(error_msg) from e
def _group_trades_by_combination(self, all_trade_rows: List[Dict]) -> Dict:
"""Group trades by timeframe and stop loss combination
Args:
all_trade_rows: List of trade dictionaries
Returns:
Dictionary grouped by (timeframe, stop_loss_pct) tuples
"""
trades_by_combo = defaultdict(list)
for trade in all_trade_rows:
tf = trade.get("timeframe")
sl = trade.get("stop_loss_pct")
trades_by_combo[(tf, sl)].append(trade)
return trades_by_combo
def _write_single_trade_file(self, timeframe: str, stop_loss_pct: float,
trades: List[Dict], trades_fieldnames: List[str]) -> None:
"""Write trades for a single timeframe/stop-loss combination
Args:
timeframe: Timeframe identifier
stop_loss_pct: Stop loss percentage
trades: List of trades for this combination
trades_fieldnames: List of field names for trades
"""
sl_percent = int(round(stop_loss_pct * 100))
trades_filename = os.path.join(self.results_dir, f"trades_{timeframe}_ST{sl_percent}pct.csv")
with open(trades_filename, "w", newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=trades_fieldnames)
writer.writeheader()
for trade in trades:
writer.writerow({k: trade.get(k, "") for k in trades_fieldnames})
if self.logging is not None:
self.logging.info(f"Trades written to {trades_filename}")

View File

@@ -1,17 +1,32 @@
import os
import json
import pandas as pd
import csv
from collections import defaultdict
from typing import Optional, Union, Dict, Any, List
import logging
from .data_loader import DataLoader
from .data_saver import DataSaver
from .result_formatter import ResultFormatter
from .storage_utils import DataLoadingError, DataSavingError
RESULTS_DIR = "../results"
DATA_DIR = "../data"
RESULTS_DIR = "results"
DATA_DIR = "data"
class Storage:
"""Storage class for storing and loading results and data"""
"""Unified storage interface for data and results operations
Acts as a coordinator for DataLoader, DataSaver, and ResultFormatter components,
maintaining backward compatibility while providing a clean separation of concerns.
"""
def __init__(self, logging=None, results_dir=RESULTS_DIR, data_dir=DATA_DIR):
"""Initialize storage with component instances
Args:
logging: Optional logging instance
results_dir: Directory for results files
data_dir: Directory for data files
"""
self.results_dir = results_dir
self.data_dir = data_dir
self.logging = logging
@@ -20,196 +35,89 @@ class Storage:
os.makedirs(self.results_dir, exist_ok=True)
os.makedirs(self.data_dir, exist_ok=True)
def load_data(self, file_path, start_date, stop_date):
# Initialize component instances
self.data_loader = DataLoader(data_dir, logging)
self.data_saver = DataSaver(data_dir, logging)
self.result_formatter = ResultFormatter(results_dir, logging)
def load_data(self, file_path: str, start_date: Union[str, pd.Timestamp],
stop_date: Union[str, pd.Timestamp]) -> pd.DataFrame:
"""Load data with optimized dtypes and filtering, supporting CSV and JSON input
Args:
file_path: path to the data file
start_date: start date
stop_date: stop date
start_date: start date (string or datetime-like)
stop_date: stop date (string or datetime-like)
Returns:
pandas DataFrame
pandas DataFrame with timestamp index
Raises:
DataLoadingError: If data loading fails
"""
# Determine file type
_, ext = os.path.splitext(file_path)
ext = ext.lower()
try:
if ext == ".json":
with open(os.path.join(self.data_dir, file_path), 'r') as f:
raw = json.load(f)
data = pd.DataFrame(raw["Data"])
# Convert columns to lowercase
data.columns = data.columns.str.lower()
# Convert timestamp to datetime
data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")
# Filter by date range
data = data[(data["timestamp"] >= start_date) & (data["timestamp"] <= stop_date)]
if self.logging is not None:
self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")
return data.set_index("timestamp")
else:
# Define optimized dtypes
dtypes = {
'Open': 'float32',
'High': 'float32',
'Low': 'float32',
'Close': 'float32',
'Volume': 'float32'
}
# Read data with original capitalized column names
data = pd.read_csv(os.path.join(self.data_dir, file_path), dtype=dtypes)
# Convert timestamp to datetime
if 'Timestamp' in data.columns:
data['Timestamp'] = pd.to_datetime(data['Timestamp'], unit='s')
# Filter by date range
data = data[(data['Timestamp'] >= start_date) & (data['Timestamp'] <= stop_date)]
# Now convert column names to lowercase
data.columns = data.columns.str.lower()
if self.logging is not None:
self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")
return data.set_index('timestamp')
else: # Attempt to use the first column if 'Timestamp' is not present
data.rename(columns={data.columns[0]: 'timestamp'}, inplace=True)
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')
data = data[(data['timestamp'] >= start_date) & (data['timestamp'] <= stop_date)]
data.columns = data.columns.str.lower() # Ensure all other columns are lower
if self.logging is not None:
self.logging.info(f"Data loaded from {file_path} (using first column as timestamp) for date range {start_date} to {stop_date}")
return data.set_index('timestamp')
except Exception as e:
if self.logging is not None:
self.logging.error(f"Error loading data from {file_path}: {e}")
# Return an empty DataFrame with a DatetimeIndex
return pd.DataFrame(index=pd.to_datetime([]))
def save_data(self, data: pd.DataFrame, file_path: str):
"""Save processed data to a CSV file.
If the DataFrame has a DatetimeIndex, it's converted to float Unix timestamps
(seconds since epoch) before saving. The index is saved as a column named 'timestamp'.
return self.data_loader.load_data(file_path, start_date, stop_date)
def save_data(self, data: pd.DataFrame, file_path: str) -> None:
"""Save processed data to a CSV file
Args:
data (pd.DataFrame): data to save.
file_path (str): path to the data file relative to the data_dir.
data: DataFrame to save
file_path: path to the data file relative to the data_dir
Raises:
DataSavingError: If saving fails
"""
data_to_save = data.copy()
self.data_saver.save_data(data, file_path)
if isinstance(data_to_save.index, pd.DatetimeIndex):
# Convert DatetimeIndex to Unix timestamp (float seconds since epoch)
# and make it a column named 'timestamp'.
data_to_save['timestamp'] = data_to_save.index.astype('int64') / 1e9
# Reset index so 'timestamp' column is saved and old DatetimeIndex is not saved as a column.
# We want the 'timestamp' column to be the first one.
data_to_save.reset_index(drop=True, inplace=True)
# Ensure 'timestamp' is the first column if other columns exist
if 'timestamp' in data_to_save.columns and len(data_to_save.columns) > 1:
cols = ['timestamp'] + [col for col in data_to_save.columns if col != 'timestamp']
data_to_save = data_to_save[cols]
elif pd.api.types.is_numeric_dtype(data_to_save.index.dtype):
# If index is already numeric (e.g. float Unix timestamps from a previous save/load cycle),
# make it a column named 'timestamp'.
data_to_save['timestamp'] = data_to_save.index
data_to_save.reset_index(drop=True, inplace=True)
if 'timestamp' in data_to_save.columns and len(data_to_save.columns) > 1:
cols = ['timestamp'] + [col for col in data_to_save.columns if col != 'timestamp']
data_to_save = data_to_save[cols]
else:
# For other index types, or if no index that we want to specifically handle,
# save with the current index. pandas to_csv will handle it.
# This branch might be removed if we strictly expect either DatetimeIndex or a numeric one from previous save.
pass # data_to_save remains as is, to_csv will write its index if index=True
# Save to CSV, ensuring the 'timestamp' column (if created) is written, and not the DataFrame's active index.
full_path = os.path.join(self.data_dir, file_path)
data_to_save.to_csv(full_path, index=False) # index=False because timestamp is now a column
if self.logging is not None:
self.logging.info(f"Data saved to {full_path} with Unix timestamp column.")
def format_row(self, row):
def format_row(self, row: Dict[str, Any]) -> Dict[str, str]:
"""Format a row for a combined results CSV file
Args:
row: row to format
row: Dictionary containing row data
Returns:
formatted row
Dictionary with formatted values
"""
return self.result_formatter.format_row(row)
return {
"timeframe": row["timeframe"],
"stop_loss_pct": f"{row['stop_loss_pct']*100:.2f}%",
"n_trades": row["n_trades"],
"n_stop_loss": row["n_stop_loss"],
"win_rate": f"{row['win_rate']*100:.2f}%",
"max_drawdown": f"{row['max_drawdown']*100:.2f}%",
"avg_trade": f"{row['avg_trade']*100:.2f}%",
"profit_ratio": f"{row['profit_ratio']*100:.2f}%",
"final_usd": f"{row['final_usd']:.2f}",
"total_fees_usd": f"{row['total_fees_usd']:.2f}",
}
def write_results_chunk(self, filename, fieldnames, rows, write_header=False, initial_usd=None):
def write_results_chunk(self, filename: str, fieldnames: List[str],
rows: List[Dict], write_header: bool = False,
initial_usd: Optional[float] = None) -> None:
"""Write a chunk of results to a CSV file
Args:
filename: filename to write to
fieldnames: list of fieldnames
rows: list of rows
write_header: whether to write the header
initial_usd: initial USD
initial_usd: initial USD value for header comment
"""
mode = 'w' if write_header else 'a'
self.result_formatter.write_results_chunk(
filename, fieldnames, rows, write_header, initial_usd
)
def write_backtest_results(self, filename: str, fieldnames: List[str],
rows: List[Dict], metadata_lines: Optional[List[str]] = None) -> str:
"""Write combined backtest results to a CSV file
with open(filename, mode, newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
if write_header:
csvfile.write(f"# initial_usd: {initial_usd}\n")
writer.writeheader()
for row in rows:
# Only keep keys that are in fieldnames
filtered_row = {k: v for k, v in row.items() if k in fieldnames}
writer.writerow(filtered_row)
def write_backtest_results(self, filename, fieldnames, rows, metadata_lines=None):
"""Write a combined results to a CSV file
Args:
filename: filename to write to
fieldnames: list of fieldnames
rows: list of rows
rows: list of result dictionaries
metadata_lines: optional list of strings to write as header comments
Returns:
Full path to the written file
"""
fname = os.path.join(self.results_dir, filename)
with open(fname, "w", newline="") as csvfile:
if metadata_lines:
for line in metadata_lines:
csvfile.write(f"{line}\n")
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='\t')
writer.writeheader()
for row in rows:
writer.writerow(self.format_row(row))
if self.logging is not None:
self.logging.info(f"Combined results written to {fname}")
def write_trades(self, all_trade_rows, trades_fieldnames):
"""Write trades to a CSV file
return self.result_formatter.write_backtest_results(
filename, fieldnames, rows, metadata_lines
)
def write_trades(self, all_trade_rows: List[Dict], trades_fieldnames: List[str]) -> None:
"""Write trades to separate CSV files grouped by timeframe and stop loss
Args:
all_trade_rows: list of trade rows
all_trade_rows: list of trade dictionaries
trades_fieldnames: list of trade fieldnames
logging: logging object
"""
trades_by_combo = defaultdict(list)
for trade in all_trade_rows:
tf = trade.get("timeframe")
sl = trade.get("stop_loss_pct")
trades_by_combo[(tf, sl)].append(trade)
for (tf, sl), trades in trades_by_combo.items():
sl_percent = int(round(sl * 100))
trades_filename = os.path.join(self.results_dir, f"trades_{tf}_ST{sl_percent}pct.csv")
with open(trades_filename, "w", newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=trades_fieldnames)
writer.writeheader()
for trade in trades:
writer.writerow({k: trade.get(k, "") for k in trades_fieldnames})
if self.logging is not None:
self.logging.info(f"Trades written to {trades_filename}")
self.result_formatter.write_trades(all_trade_rows, trades_fieldnames)

View File

@@ -0,0 +1,73 @@
import pandas as pd
class TimestampParsingError(Exception):
"""Custom exception for timestamp parsing errors"""
pass
class DataLoadingError(Exception):
"""Custom exception for data loading errors"""
pass
class DataSavingError(Exception):
"""Custom exception for data saving errors"""
pass
def _parse_timestamp_column(data: pd.DataFrame, column_name: str) -> pd.DataFrame:
"""Parse timestamp column handling both Unix timestamps and datetime strings
Args:
data: DataFrame containing the timestamp column
column_name: Name of the timestamp column
Returns:
DataFrame with parsed timestamp column
Raises:
TimestampParsingError: If timestamp parsing fails
"""
try:
sample_timestamp = str(data[column_name].iloc[0])
try:
# Check if it's a Unix timestamp (numeric)
float(sample_timestamp)
# It's a Unix timestamp, convert using unit='s'
data[column_name] = pd.to_datetime(data[column_name], unit='s')
except ValueError:
# It's already in datetime string format, convert without unit
data[column_name] = pd.to_datetime(data[column_name])
return data
except Exception as e:
raise TimestampParsingError(f"Failed to parse timestamp column '{column_name}': {e}")
def _filter_by_date_range(data: pd.DataFrame, timestamp_col: str,
start_date: pd.Timestamp, stop_date: pd.Timestamp) -> pd.DataFrame:
"""Filter DataFrame by date range
Args:
data: DataFrame to filter
timestamp_col: Name of timestamp column
start_date: Start date for filtering
stop_date: Stop date for filtering
Returns:
Filtered DataFrame
"""
return data[(data[timestamp_col] >= start_date) & (data[timestamp_col] <= stop_date)]
def _normalize_column_names(data: pd.DataFrame) -> pd.DataFrame:
"""Convert all column names to lowercase
Args:
data: DataFrame to normalize
Returns:
DataFrame with lowercase column names
"""
data.columns = data.columns.str.lower()
return data

View File

@@ -10,10 +10,12 @@ class SystemUtils:
"""Determine optimal number of worker processes based on system resources"""
cpu_count = os.cpu_count() or 4
memory_gb = psutil.virtual_memory().total / (1024**3)
# Heuristic: Use 75% of cores, but cap based on available memory
# Assume each worker needs ~2GB for large datasets
workers_by_memory = max(1, int(memory_gb / 2))
workers_by_cpu = max(1, int(cpu_count * 0.75))
# OPTIMIZATION: More aggressive worker allocation for better performance
workers_by_memory = max(1, int(memory_gb / 2)) # 2GB per worker
workers_by_cpu = max(1, int(cpu_count * 0.8)) # Use 80% of CPU cores
optimal_workers = min(workers_by_cpu, workers_by_memory, 8) # Cap at 8 workers
if self.logging is not None:
self.logging.info(f"Using {min(workers_by_cpu, workers_by_memory)} workers for processing")
return min(workers_by_cpu, workers_by_memory)
self.logging.info(f"Using {optimal_workers} workers for processing (CPU-based: {workers_by_cpu}, Memory-based: {workers_by_memory})")
return optimal_workers