Merge branch 'xgboost'
# Conflicts:
#	.gitignore
#	README.md
#	cycles/backtest.py
#	main.py
#	pyproject.toml
#	uv.lock
This commit is contained in:
152
cycles/utils/data_loader.py
Normal file
152
cycles/utils/data_loader.py
Normal file
@@ -0,0 +1,152 @@
|
||||
import os
|
||||
import json
|
||||
import pandas as pd
|
||||
from typing import Union, Optional
|
||||
import logging
|
||||
|
||||
from .storage_utils import (
|
||||
_parse_timestamp_column,
|
||||
_filter_by_date_range,
|
||||
_normalize_column_names,
|
||||
TimestampParsingError,
|
||||
DataLoadingError
|
||||
)
|
||||
|
||||
|
||||
class DataLoader:
    """Handles loading and preprocessing of data from various file formats"""

    def __init__(self, data_dir: str, logging_instance: Optional[logging.Logger] = None):
        """Initialize data loader

        Args:
            data_dir: Directory containing data files
            logging_instance: Optional logging instance
        """
        self.data_dir = data_dir
        self.logging = logging_instance

    def load_data(self, file_path: str, start_date: Union[str, pd.Timestamp],
                  stop_date: Union[str, pd.Timestamp]) -> pd.DataFrame:
        """Load data with optimized dtypes and filtering, supporting CSV and JSON input

        Args:
            file_path: path to the data file, relative to ``data_dir``
            start_date: start date (string or datetime-like)
            stop_date: stop date (string or datetime-like)

        Returns:
            pandas DataFrame with timestamp index. On any failure the error
            is logged (when a logger is configured) and an EMPTY DataFrame
            with a DatetimeIndex is returned; this method never raises.
        """
        try:
            # Convert string dates to pandas datetime objects for proper comparison
            start_date = pd.to_datetime(start_date)
            stop_date = pd.to_datetime(stop_date)

            # Dispatch on file extension; anything that is not JSON is read as CSV
            _, ext = os.path.splitext(file_path)
            if ext.lower() == ".json":
                return self._load_json_data(file_path, start_date, stop_date)
            return self._load_csv_data(file_path, start_date, stop_date)

        except Exception as e:
            # Best-effort contract: swallow the error so callers can treat an
            # empty frame as "no data" without try/except at every call site.
            error_msg = f"Error loading data from {file_path}: {e}"
            if self.logging is not None:
                self.logging.error(error_msg)
            # Return an empty DataFrame with a DatetimeIndex
            return pd.DataFrame(index=pd.to_datetime([]))

    def _load_json_data(self, file_path: str, start_date: pd.Timestamp,
                        stop_date: pd.Timestamp) -> pd.DataFrame:
        """Load and process a JSON data file.

        The file is expected to hold a top-level "Data" key containing a list
        of records with Unix-second timestamps.

        Args:
            file_path: Path to JSON file
            start_date: Start date for filtering
            stop_date: Stop date for filtering

        Returns:
            Processed DataFrame with timestamp index
        """
        with open(os.path.join(self.data_dir, file_path), 'r') as f:
            raw = json.load(f)

        data = pd.DataFrame(raw["Data"])
        data = _normalize_column_names(data)

        # Timestamps in the JSON payload are Unix seconds
        data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")

        data = _filter_by_date_range(data, "timestamp", start_date, stop_date)

        if self.logging is not None:
            self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")

        return data.set_index("timestamp")

    def _load_csv_data(self, file_path: str, start_date: pd.Timestamp,
                       stop_date: pd.Timestamp) -> pd.DataFrame:
        """Load and process a CSV data file.

        Args:
            file_path: Path to CSV file
            start_date: Start date for filtering
            stop_date: Stop date for filtering

        Returns:
            Processed DataFrame with timestamp index
        """
        # float32 halves memory versus the float64 default for OHLCV columns
        dtypes = {
            'Open': 'float32',
            'High': 'float32',
            'Low': 'float32',
            'Close': 'float32',
            'Volume': 'float32'
        }

        # Read data with original capitalized column names
        data = pd.read_csv(os.path.join(self.data_dir, file_path), dtype=dtypes)

        return self._process_csv_timestamps(data, start_date, stop_date, file_path)

    def _process_csv_timestamps(self, data: pd.DataFrame, start_date: pd.Timestamp,
                                stop_date: pd.Timestamp, file_path: str) -> pd.DataFrame:
        """Parse/filter the timestamp column and index the frame by it.

        Uses the 'Timestamp' column when present; otherwise falls back to
        treating the first column as the timestamp source.

        Args:
            data: DataFrame with CSV data
            start_date: Start date for filtering
            stop_date: Stop date for filtering
            file_path: Original file path for logging

        Returns:
            Processed DataFrame with timestamp index
        """
        if 'Timestamp' in data.columns:
            ts_col = 'Timestamp'
            log_note = ""
        else:
            # Attempt to use the first column if 'Timestamp' is not present
            data.rename(columns={data.columns[0]: 'timestamp'}, inplace=True)
            ts_col = 'timestamp'
            log_note = " (using first column as timestamp)"

        # Shared processing pipeline (previously duplicated in both branches)
        data = _parse_timestamp_column(data, ts_col)
        data = _filter_by_date_range(data, ts_col, start_date, stop_date)
        data = _normalize_column_names(data)

        if self.logging is not None:
            self.logging.info(f"Data loaded from {file_path}{log_note} for date range {start_date} to {stop_date}")

        return data.set_index('timestamp')
106
cycles/utils/data_saver.py
Normal file
106
cycles/utils/data_saver.py
Normal file
@@ -0,0 +1,106 @@
|
||||
import os
|
||||
import pandas as pd
|
||||
from typing import Optional
|
||||
import logging
|
||||
|
||||
from .storage_utils import DataSavingError
|
||||
|
||||
|
||||
class DataSaver:
    """Handles saving data to various file formats"""

    def __init__(self, data_dir: str, logging_instance: Optional[logging.Logger] = None):
        """Initialize data saver

        Args:
            data_dir: Directory for saving data files
            logging_instance: Optional logging instance
        """
        self.data_dir = data_dir
        self.logging = logging_instance

    def save_data(self, data: pd.DataFrame, file_path: str) -> None:
        """Save processed data to a CSV file.

        If the DataFrame has a DatetimeIndex, it's converted to float Unix
        timestamps (seconds since epoch) before saving. The index is saved as
        a column named 'timestamp'. The caller's DataFrame is never mutated
        (a copy is made first).

        Args:
            data: DataFrame to save
            file_path: path to the data file relative to the data_dir

        Raises:
            DataSavingError: If saving fails
        """
        try:
            data_to_save = data.copy()
            data_to_save = self._prepare_data_for_saving(data_to_save)

            # Save to CSV, ensuring the 'timestamp' column (if created) is written
            full_path = os.path.join(self.data_dir, file_path)
            data_to_save.to_csv(full_path, index=False)

            if self.logging is not None:
                self.logging.info(f"Data saved to {full_path} with Unix timestamp column.")

        except Exception as e:
            error_msg = f"Failed to save data to {file_path}: {e}"
            if self.logging is not None:
                self.logging.error(error_msg)
            raise DataSavingError(error_msg) from e

    def _prepare_data_for_saving(self, data: pd.DataFrame) -> pd.DataFrame:
        """Prepare DataFrame for saving by handling different index types.

        Note: mutates the frame it is given; callers pass in a private copy.

        Args:
            data: DataFrame to prepare

        Returns:
            DataFrame ready for saving
        """
        if isinstance(data.index, pd.DatetimeIndex):
            return self._convert_datetime_index_to_timestamp(data)
        elif pd.api.types.is_numeric_dtype(data.index.dtype):
            return self._convert_numeric_index_to_timestamp(data)
        else:
            # For other index types, save with the current index
            return data

    def _convert_datetime_index_to_timestamp(self, data: pd.DataFrame) -> pd.DataFrame:
        """Convert DatetimeIndex to a float Unix-timestamp column.

        Args:
            data: DataFrame with DatetimeIndex

        Returns:
            DataFrame with 'timestamp' as the first column
        """
        # int64 nanoseconds -> float seconds since epoch
        data['timestamp'] = data.index.astype('int64') / 1e9
        data.reset_index(drop=True, inplace=True)
        return self._move_timestamp_first(data)

    def _convert_numeric_index_to_timestamp(self, data: pd.DataFrame) -> pd.DataFrame:
        """Convert an already-numeric index to a timestamp column.

        Covers e.g. float Unix timestamps from a previous save/load cycle.

        Args:
            data: DataFrame with numeric index

        Returns:
            DataFrame with 'timestamp' as the first column
        """
        data['timestamp'] = data.index
        data.reset_index(drop=True, inplace=True)
        return self._move_timestamp_first(data)

    @staticmethod
    def _move_timestamp_first(data: pd.DataFrame) -> pd.DataFrame:
        """Reorder columns so 'timestamp' comes first (shared by both converters;
        this logic was previously duplicated verbatim in each).

        Args:
            data: DataFrame that may contain a 'timestamp' column

        Returns:
            DataFrame with 'timestamp' first when applicable
        """
        if 'timestamp' in data.columns and len(data.columns) > 1:
            cols = ['timestamp'] + [col for col in data.columns if col != 'timestamp']
            data = data[cols]
        return data
@@ -1,128 +0,0 @@
|
||||
import threading
|
||||
import time
|
||||
import queue
|
||||
from google.oauth2.service_account import Credentials
|
||||
import gspread
|
||||
import math
|
||||
import numpy as np
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
class GSheetBatchPusher(threading.Thread):
    """Daemon thread that drains (results, trades) tuples from a queue and
    pushes them to a Google Sheets spreadsheet in periodic batches.

    NOTE(review): relies on module-level imports of `threading`, `time`,
    `queue`, `gspread`, `Credentials`, `np`, `math` and `defaultdict` from
    the enclosing file.
    """

    def __init__(self, queue, timestamp, spreadsheet_name, interval=60, logging=None):
        """Initialize the pusher.

        Args:
            queue: queue.Queue yielding (results_rows, trade_rows) tuples.
                (Parameter name shadows the `queue` module inside this method only.)
            timestamp: run timestamp forwarded to the sheet writer
            spreadsheet_name: name of the target Google Sheets document
            interval: seconds between batch pushes
            logging: optional logger (shadows the `logging` name if imported)
        """
        super().__init__(daemon=True)
        self.queue = queue
        self.timestamp = timestamp
        self.spreadsheet_name = spreadsheet_name
        self.interval = interval
        self._stop_event = threading.Event()
        self.logging = logging

    def run(self):
        """Push queued batches every `interval` seconds until stopped."""
        while not self._stop_event.is_set():
            self.push_all()
            time.sleep(self.interval)
        # Final push on stop
        self.push_all()

    def stop(self):
        """Signal the thread to exit after one final push."""
        self._stop_event.set()

    def push_all(self):
        """Drain everything currently in the queue and write it in one batch."""
        batch_results = []
        batch_trades = []
        # Non-blocking drain: collect until the queue is momentarily empty.
        while True:
            try:
                results, trades = self.queue.get_nowait()
                batch_results.extend(results)
                batch_trades.extend(trades)
            except queue.Empty:
                break

        if batch_results or batch_trades:
            self.write_results_per_combination_gsheet(batch_results, batch_trades, self.timestamp, self.spreadsheet_name)


    def write_results_per_combination_gsheet(self, results_rows, trade_rows, timestamp, spreadsheet_name="GlimBit Backtest Results"):
        """Write summary rows to a "Results" worksheet and trades to one
        worksheet per (timeframe, stop-loss) combination.

        On an HTTP 429 (quota) error the whole batch is re-queued for retry
        on the next interval. Other API errors propagate.

        Args:
            results_rows: list of result dicts (see `fieldnames` below)
            trade_rows: list of trade dicts with 'timeframe'/'stop_loss_pct' keys
            timestamp: run timestamp (currently unused in the body — TODO confirm)
            spreadsheet_name: target spreadsheet document name
        """
        scopes = [
            "https://www.googleapis.com/auth/spreadsheets",
            "https://www.googleapis.com/auth/drive"
        ]
        creds = Credentials.from_service_account_file('credentials/service_account.json', scopes=scopes)
        gc = gspread.authorize(creds)
        sh = gc.open(spreadsheet_name)

        # Get or create the summary worksheet.
        try:
            worksheet = sh.worksheet("Results")
        except gspread.exceptions.WorksheetNotFound:
            worksheet = sh.add_worksheet(title="Results", rows="1000", cols="20")

        # Clear the worksheet before writing new results
        worksheet.clear()

        # Updated fieldnames to match your data rows
        fieldnames = [
            "timeframe", "stop_loss_pct", "n_trades", "n_stop_loss", "win_rate",
            "max_drawdown", "avg_trade", "profit_ratio", "initial_usd", "final_usd"
        ]

        def to_native(val):
            # Convert numpy scalars/arrays and datetimes to JSON-safe values
            # the Sheets API accepts.
            if isinstance(val, (np.generic, np.ndarray)):
                val = val.item()
            if hasattr(val, 'isoformat'):
                return val.isoformat()
            # Handle inf, -inf, nan
            if isinstance(val, float):
                if math.isinf(val):
                    return "∞" if val > 0 else "-∞"
                if math.isnan(val):
                    return ""
            return val

        # Write header if sheet is empty
        if len(worksheet.get_all_values()) == 0:
            worksheet.append_row(fieldnames)

        for row in results_rows:
            values = [to_native(row.get(field, "")) for field in fieldnames]
            worksheet.append_row(values)

        trades_fieldnames = [
            "entry_time", "exit_time", "entry_price", "exit_price", "profit_pct", "type"
        ]
        trades_by_combo = defaultdict(list)

        # Group trades so each (timeframe, stop-loss) pair gets its own sheet.
        for trade in trade_rows:
            tf = trade.get("timeframe")
            sl = trade.get("stop_loss_pct")
            trades_by_combo[(tf, sl)].append(trade)

        for (tf, sl), trades in trades_by_combo.items():
            sl_percent = int(round(sl * 100))
            sheet_name = f"Trades_{tf}_ST{sl_percent}%"

            try:
                trades_ws = sh.worksheet(sheet_name)
            except gspread.exceptions.WorksheetNotFound:
                trades_ws = sh.add_worksheet(title=sheet_name, rows="1000", cols="20")

            # Clear the trades worksheet before writing new trades
            trades_ws.clear()

            if len(trades_ws.get_all_values()) == 0:
                trades_ws.append_row(trades_fieldnames)

            for trade in trades:
                trade_row = [to_native(trade.get(field, "")) for field in trades_fieldnames]
                try:
                    trades_ws.append_row(trade_row)
                except gspread.exceptions.APIError as e:
                    if '429' in str(e):
                        if self.logging is not None:
                            self.logging.warning(f"Google Sheets API quota exceeded (429). Please wait one minute. Will retry on next batch push. Sheet: {sheet_name}")
                        # Re-queue the failed batch for retry
                        self.queue.put((results_rows, trade_rows))
                        return # Stop pushing for this batch, will retry next interval
                    else:
                        raise
233
cycles/utils/progress_manager.py
Normal file
233
cycles/utils/progress_manager.py
Normal file
@@ -0,0 +1,233 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Progress Manager for tracking multiple parallel backtest tasks
|
||||
"""
|
||||
|
||||
import threading
|
||||
import time
|
||||
import sys
|
||||
from typing import Dict, Optional, Callable
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class TaskProgress:
    """Progress snapshot for one tracked task."""
    task_id: str      # unique identifier
    name: str         # human-readable label
    current: int      # steps completed so far
    total: int        # total steps expected
    start_time: float # epoch seconds when tracking began
    last_update: float  # epoch seconds of the most recent update

    @property
    def percentage(self) -> float:
        """Completion as a value in [0, 100]; 0.0 when total is zero."""
        if not self.total:
            return 0.0
        return (self.current / self.total) * 100

    @property
    def elapsed_time(self) -> float:
        """Seconds since tracking started."""
        return time.time() - self.start_time

    @property
    def eta(self) -> Optional[float]:
        """Estimated seconds remaining, or None when not computable
        (nothing done yet, already finished, or a non-positive rate)."""
        if self.current == 0 or self.percentage >= 100:
            return None

        rate = self.current / self.elapsed_time
        if rate <= 0:
            return None
        return (self.total - self.current) / rate
|
||||
|
||||
class ProgressManager:
    """Tracks and renders progress for multiple parallel tasks."""

    def __init__(self, update_interval: float = 1.0, display_width: int = 50):
        """Create a manager.

        Args:
            update_interval: seconds between display refreshes
            display_width: progress-bar width in characters
        """
        self.tasks: Dict[str, TaskProgress] = {}
        self.update_interval = update_interval
        self.display_width = display_width
        self.lock = threading.Lock()
        self.display_thread: Optional[threading.Thread] = None
        self.running = False
        self.last_display_height = 0

    def start_task(self, task_id: str, name: str, total: int) -> None:
        """Register a new task (progress 0) and start its clock.

        Args:
            task_id: unique task identifier
            name: human-readable task name
            total: total number of steps
        """
        now = time.time()
        with self.lock:
            self.tasks[task_id] = TaskProgress(
                task_id=task_id,
                name=name,
                current=0,
                total=total,
                start_time=now,
                last_update=now,
            )

    def update_progress(self, task_id: str, current: int) -> None:
        """Set the current step count of a known task; unknown ids are ignored.

        Args:
            task_id: task identifier
            current: new progress value
        """
        with self.lock:
            task = self.tasks.get(task_id)
            if task is not None:
                task.current = current
                task.last_update = time.time()

    def complete_task(self, task_id: str) -> None:
        """Force a known task to 100% (current = total).

        Args:
            task_id: task identifier
        """
        with self.lock:
            task = self.tasks.get(task_id)
            if task is not None:
                task.current = task.total
                task.last_update = time.time()

    def start_display(self) -> None:
        """Spawn the daemon display thread, once."""
        if self.running:
            return
        self.running = True
        self.display_thread = threading.Thread(target=self._display_loop, daemon=True)
        self.display_thread.start()

    def stop_display(self) -> None:
        """Stop the display thread and erase any leftover output."""
        self.running = False
        if self.display_thread:
            self.display_thread.join(timeout=1.0)
        self._clear_display()

    def _display_loop(self) -> None:
        """Redraw the console every `update_interval` seconds until stopped."""
        while self.running:
            self._update_display()
            time.sleep(self.update_interval)

    def _update_display(self) -> None:
        """Erase the previous frame and print one line per task, sorted by id."""
        with self.lock:
            if not self.tasks:
                return

            self._clear_display()
            rendered = [
                self._format_progress_line(task)
                for task in sorted(self.tasks.values(), key=lambda t: t.task_id)
            ]
            for line in rendered:
                print(line, flush=True)
            self.last_display_height = len(rendered)

    def _clear_display(self) -> None:
        """Erase the previously printed block via ANSI cursor controls."""
        if self.last_display_height <= 0:
            return
        for _ in range(self.last_display_height):
            sys.stdout.write('\033[F')  # cursor up one line
            sys.stdout.write('\033[K')  # erase that line
        sys.stdout.flush()

    def _format_progress_line(self, task: TaskProgress) -> str:
        """Render one task as `name │bar│ pct (cur/total) elapsed ETA`.

        Args:
            task: TaskProgress instance

        Returns:
            Formatted progress string
        """
        filled = int(task.percentage / 100 * self.display_width)
        bar = '█' * filled + '░' * (self.display_width - filled)

        elapsed_str = self._format_time(task.elapsed_time)
        eta_str = self._format_time(task.eta) if task.eta else "N/A"

        return (f"{task.name:<25} │{bar}│ "
                f"{task.percentage:5.1f}% "
                f"({task.current:,}/{task.total:,}) "
                f"⏱ {elapsed_str} ETA: {eta_str}")

    def _format_time(self, seconds: float) -> str:
        """Format a duration as `Ns`, `N.Nm`, or `N.Nh`.

        Args:
            seconds: Time in seconds

        Returns:
            Formatted time string
        """
        if seconds < 60:
            return f"{seconds:.0f}s"
        if seconds < 3600:
            return f"{seconds / 60:.1f}m"
        return f"{seconds / 3600:.1f}h"

    def get_task_progress_callback(self, task_id: str) -> Callable[[int], None]:
        """Return a closure `cb(current)` bound to one task id.

        Args:
            task_id: task identifier

        Returns:
            Callback that updates this task's progress
        """
        def callback(current: int) -> None:
            self.update_progress(task_id, current)

        return callback

    def all_tasks_completed(self) -> bool:
        """True when every registered task has reached its total
        (vacuously True with no tasks)."""
        with self.lock:
            return all(task.current >= task.total for task in self.tasks.values())

    def get_summary(self) -> str:
        """One-line `Tasks: X/Y completed` summary."""
        with self.lock:
            done = sum(1 for task in self.tasks.values()
                       if task.current >= task.total)
            return f"Tasks: {done}/{len(self.tasks)} completed"
179
cycles/utils/result_formatter.py
Normal file
179
cycles/utils/result_formatter.py
Normal file
@@ -0,0 +1,179 @@
|
||||
import os
|
||||
import csv
|
||||
from typing import Dict, List, Optional, Any
|
||||
from collections import defaultdict
|
||||
import logging
|
||||
|
||||
from .storage_utils import DataSavingError
|
||||
|
||||
|
||||
class ResultFormatter:
    """Handles formatting and writing of backtest results to CSV files"""

    def __init__(self, results_dir: str, logging_instance: Optional[logging.Logger] = None):
        """Initialize result formatter

        Args:
            results_dir: Directory for saving result files
            logging_instance: Optional logging instance
        """
        self.results_dir = results_dir
        self.logging = logging_instance

    def format_row(self, row: Dict[str, Any]) -> Dict[str, str]:
        """Format a row for a combined results CSV file.

        Percentages are rendered as 'NN.NN%' and dollar amounts with two
        decimals. Raises KeyError if a required key is missing from `row`.

        Args:
            row: Dictionary containing row data

        Returns:
            Dictionary with formatted values
        """
        return {
            "timeframe": row["timeframe"],
            "stop_loss_pct": f"{row['stop_loss_pct']*100:.2f}%",
            "n_trades": row["n_trades"],
            "n_stop_loss": row["n_stop_loss"],
            "win_rate": f"{row['win_rate']*100:.2f}%",
            "max_drawdown": f"{row['max_drawdown']*100:.2f}%",
            "avg_trade": f"{row['avg_trade']*100:.2f}%",
            "profit_ratio": f"{row['profit_ratio']*100:.2f}%",
            "final_usd": f"{row['final_usd']:.2f}",
            "total_fees_usd": f"{row['total_fees_usd']:.2f}",
        }

    def write_results_chunk(self, filename: str, fieldnames: List[str],
                            rows: List[Dict], write_header: bool = False,
                            initial_usd: Optional[float] = None) -> None:
        """Write a chunk of results to a CSV file.

        Note: `filename` is used as given — NOT joined with `results_dir` —
        unlike `write_backtest_results`.

        Args:
            filename: filename to write to
            fieldnames: list of fieldnames
            rows: list of rows
            write_header: True truncates the file and writes the header;
                False appends to the existing file
            initial_usd: initial USD value written as a '#' comment before the header

        Raises:
            DataSavingError: If writing fails
        """
        try:
            mode = 'w' if write_header else 'a'

            with open(filename, mode, newline="") as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                if write_header:
                    if initial_usd is not None:
                        csvfile.write(f"# initial_usd: {initial_usd}\n")
                    writer.writeheader()

                for row in rows:
                    # Only keep keys that are in fieldnames
                    filtered_row = {k: v for k, v in row.items() if k in fieldnames}
                    writer.writerow(filtered_row)

        except Exception as e:
            # BUG FIX: message previously contained a literal "(unknown)"
            # placeholder instead of the actual filename.
            error_msg = f"Failed to write results chunk to {filename}: {e}"
            if self.logging is not None:
                self.logging.error(error_msg)
            raise DataSavingError(error_msg) from e

    def write_backtest_results(self, filename: str, fieldnames: List[str],
                               rows: List[Dict], metadata_lines: Optional[List[str]] = None) -> str:
        """Write combined backtest results to a tab-delimited CSV file in `results_dir`.

        Args:
            filename: filename to write to (joined with `results_dir`)
            fieldnames: list of fieldnames
            rows: list of result dictionaries
            metadata_lines: optional list of strings to write as header comments

        Returns:
            Full path to the written file

        Raises:
            DataSavingError: If writing fails
        """
        try:
            fname = os.path.join(self.results_dir, filename)
            with open(fname, "w", newline="") as csvfile:
                if metadata_lines:
                    for line in metadata_lines:
                        csvfile.write(f"{line}\n")

                writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='\t')
                writer.writeheader()

                for row in rows:
                    writer.writerow(self.format_row(row))

            if self.logging is not None:
                self.logging.info(f"Combined results written to {fname}")

            return fname

        except Exception as e:
            # BUG FIX: message previously contained a literal "(unknown)"
            # placeholder instead of the actual filename.
            error_msg = f"Failed to write backtest results to {filename}: {e}"
            if self.logging is not None:
                self.logging.error(error_msg)
            raise DataSavingError(error_msg) from e

    def write_trades(self, all_trade_rows: List[Dict], trades_fieldnames: List[str]) -> None:
        """Write trades to separate CSV files grouped by timeframe and stop loss.

        Args:
            all_trade_rows: list of trade dictionaries
            trades_fieldnames: list of trade fieldnames

        Raises:
            DataSavingError: If writing fails
        """
        try:
            trades_by_combo = self._group_trades_by_combination(all_trade_rows)

            for (tf, sl), trades in trades_by_combo.items():
                self._write_single_trade_file(tf, sl, trades, trades_fieldnames)

        except Exception as e:
            error_msg = f"Failed to write trades: {e}"
            if self.logging is not None:
                self.logging.error(error_msg)
            raise DataSavingError(error_msg) from e

    def _group_trades_by_combination(self, all_trade_rows: List[Dict]) -> Dict:
        """Group trades by timeframe and stop loss combination.

        Args:
            all_trade_rows: List of trade dictionaries

        Returns:
            Dictionary keyed by (timeframe, stop_loss_pct) tuples
        """
        trades_by_combo = defaultdict(list)
        for trade in all_trade_rows:
            tf = trade.get("timeframe")
            sl = trade.get("stop_loss_pct")
            trades_by_combo[(tf, sl)].append(trade)
        return trades_by_combo

    def _write_single_trade_file(self, timeframe: str, stop_loss_pct: float,
                                 trades: List[Dict], trades_fieldnames: List[str]) -> None:
        """Write trades for a single timeframe/stop-loss combination.

        Args:
            timeframe: Timeframe identifier
            stop_loss_pct: Stop loss percentage (fraction, e.g. 0.05)
            trades: List of trades for this combination
            trades_fieldnames: List of field names for trades
        """
        sl_percent = int(round(stop_loss_pct * 100))
        trades_filename = os.path.join(self.results_dir, f"trades_{timeframe}_ST{sl_percent}pct.csv")

        with open(trades_filename, "w", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=trades_fieldnames)
            writer.writeheader()
            for trade in trades:
                # Missing fields are written as empty strings
                writer.writerow({k: trade.get(k, "") for k in trades_fieldnames})

        if self.logging is not None:
            self.logging.info(f"Trades written to {trades_filename}")
@@ -1,17 +1,32 @@
|
||||
import os
|
||||
import json
|
||||
import pandas as pd
|
||||
import csv
|
||||
from collections import defaultdict
|
||||
from typing import Optional, Union, Dict, Any, List
|
||||
import logging
|
||||
|
||||
from .data_loader import DataLoader
|
||||
from .data_saver import DataSaver
|
||||
from .result_formatter import ResultFormatter
|
||||
from .storage_utils import DataLoadingError, DataSavingError
|
||||
|
||||
RESULTS_DIR = "../results"
|
||||
DATA_DIR = "../data"
|
||||
|
||||
RESULTS_DIR = "results"
|
||||
DATA_DIR = "data"
|
||||
|
||||
class Storage:
|
||||
|
||||
"""Storage class for storing and loading results and data"""
|
||||
"""Unified storage interface for data and results operations
|
||||
|
||||
Acts as a coordinator for DataLoader, DataSaver, and ResultFormatter components,
|
||||
maintaining backward compatibility while providing a clean separation of concerns.
|
||||
"""
|
||||
|
||||
def __init__(self, logging=None, results_dir=RESULTS_DIR, data_dir=DATA_DIR):
|
||||
|
||||
"""Initialize storage with component instances
|
||||
|
||||
Args:
|
||||
logging: Optional logging instance
|
||||
results_dir: Directory for results files
|
||||
data_dir: Directory for data files
|
||||
"""
|
||||
self.results_dir = results_dir
|
||||
self.data_dir = data_dir
|
||||
self.logging = logging
|
||||
@@ -20,196 +35,89 @@ class Storage:
|
||||
os.makedirs(self.results_dir, exist_ok=True)
|
||||
os.makedirs(self.data_dir, exist_ok=True)
|
||||
|
||||
def load_data(self, file_path, start_date, stop_date):
|
||||
# Initialize component instances
|
||||
self.data_loader = DataLoader(data_dir, logging)
|
||||
self.data_saver = DataSaver(data_dir, logging)
|
||||
self.result_formatter = ResultFormatter(results_dir, logging)
|
||||
|
||||
def load_data(self, file_path: str, start_date: Union[str, pd.Timestamp],
|
||||
stop_date: Union[str, pd.Timestamp]) -> pd.DataFrame:
|
||||
"""Load data with optimized dtypes and filtering, supporting CSV and JSON input
|
||||
|
||||
Args:
|
||||
file_path: path to the data file
|
||||
start_date: start date
|
||||
stop_date: stop date
|
||||
start_date: start date (string or datetime-like)
|
||||
stop_date: stop date (string or datetime-like)
|
||||
|
||||
Returns:
|
||||
pandas DataFrame
|
||||
pandas DataFrame with timestamp index
|
||||
|
||||
Raises:
|
||||
DataLoadingError: If data loading fails
|
||||
"""
|
||||
# Determine file type
|
||||
_, ext = os.path.splitext(file_path)
|
||||
ext = ext.lower()
|
||||
try:
|
||||
if ext == ".json":
|
||||
with open(os.path.join(self.data_dir, file_path), 'r') as f:
|
||||
raw = json.load(f)
|
||||
data = pd.DataFrame(raw["Data"])
|
||||
# Convert columns to lowercase
|
||||
data.columns = data.columns.str.lower()
|
||||
# Convert timestamp to datetime
|
||||
data["timestamp"] = pd.to_datetime(data["timestamp"], unit="s")
|
||||
# Filter by date range
|
||||
data = data[(data["timestamp"] >= start_date) & (data["timestamp"] <= stop_date)]
|
||||
if self.logging is not None:
|
||||
self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")
|
||||
return data.set_index("timestamp")
|
||||
else:
|
||||
# Define optimized dtypes
|
||||
dtypes = {
|
||||
'Open': 'float32',
|
||||
'High': 'float32',
|
||||
'Low': 'float32',
|
||||
'Close': 'float32',
|
||||
'Volume': 'float32'
|
||||
}
|
||||
# Read data with original capitalized column names
|
||||
data = pd.read_csv(os.path.join(self.data_dir, file_path), dtype=dtypes)
|
||||
|
||||
|
||||
# Convert timestamp to datetime
|
||||
if 'Timestamp' in data.columns:
|
||||
data['Timestamp'] = pd.to_datetime(data['Timestamp'], unit='s')
|
||||
# Filter by date range
|
||||
data = data[(data['Timestamp'] >= start_date) & (data['Timestamp'] <= stop_date)]
|
||||
# Now convert column names to lowercase
|
||||
data.columns = data.columns.str.lower()
|
||||
if self.logging is not None:
|
||||
self.logging.info(f"Data loaded from {file_path} for date range {start_date} to {stop_date}")
|
||||
return data.set_index('timestamp')
|
||||
else: # Attempt to use the first column if 'Timestamp' is not present
|
||||
data.rename(columns={data.columns[0]: 'timestamp'}, inplace=True)
|
||||
data['timestamp'] = pd.to_datetime(data['timestamp'], unit='s')
|
||||
data = data[(data['timestamp'] >= start_date) & (data['timestamp'] <= stop_date)]
|
||||
data.columns = data.columns.str.lower() # Ensure all other columns are lower
|
||||
if self.logging is not None:
|
||||
self.logging.info(f"Data loaded from {file_path} (using first column as timestamp) for date range {start_date} to {stop_date}")
|
||||
return data.set_index('timestamp')
|
||||
except Exception as e:
|
||||
if self.logging is not None:
|
||||
self.logging.error(f"Error loading data from {file_path}: {e}")
|
||||
# Return an empty DataFrame with a DatetimeIndex
|
||||
return pd.DataFrame(index=pd.to_datetime([]))
|
||||
|
||||
def save_data(self, data: pd.DataFrame, file_path: str):
|
||||
"""Save processed data to a CSV file.
|
||||
If the DataFrame has a DatetimeIndex, it's converted to float Unix timestamps
|
||||
(seconds since epoch) before saving. The index is saved as a column named 'timestamp'.
|
||||
return self.data_loader.load_data(file_path, start_date, stop_date)
|
||||
|
||||
def save_data(self, data: pd.DataFrame, file_path: str) -> None:
|
||||
"""Save processed data to a CSV file
|
||||
|
||||
Args:
|
||||
data (pd.DataFrame): data to save.
|
||||
file_path (str): path to the data file relative to the data_dir.
|
||||
data: DataFrame to save
|
||||
file_path: path to the data file relative to the data_dir
|
||||
|
||||
Raises:
|
||||
DataSavingError: If saving fails
|
||||
"""
|
||||
data_to_save = data.copy()
|
||||
self.data_saver.save_data(data, file_path)
|
||||
|
||||
if isinstance(data_to_save.index, pd.DatetimeIndex):
|
||||
# Convert DatetimeIndex to Unix timestamp (float seconds since epoch)
|
||||
# and make it a column named 'timestamp'.
|
||||
data_to_save['timestamp'] = data_to_save.index.astype('int64') / 1e9
|
||||
# Reset index so 'timestamp' column is saved and old DatetimeIndex is not saved as a column.
|
||||
# We want the 'timestamp' column to be the first one.
|
||||
data_to_save.reset_index(drop=True, inplace=True)
|
||||
# Ensure 'timestamp' is the first column if other columns exist
|
||||
if 'timestamp' in data_to_save.columns and len(data_to_save.columns) > 1:
|
||||
cols = ['timestamp'] + [col for col in data_to_save.columns if col != 'timestamp']
|
||||
data_to_save = data_to_save[cols]
|
||||
elif pd.api.types.is_numeric_dtype(data_to_save.index.dtype):
|
||||
# If index is already numeric (e.g. float Unix timestamps from a previous save/load cycle),
|
||||
# make it a column named 'timestamp'.
|
||||
data_to_save['timestamp'] = data_to_save.index
|
||||
data_to_save.reset_index(drop=True, inplace=True)
|
||||
if 'timestamp' in data_to_save.columns and len(data_to_save.columns) > 1:
|
||||
cols = ['timestamp'] + [col for col in data_to_save.columns if col != 'timestamp']
|
||||
data_to_save = data_to_save[cols]
|
||||
else:
|
||||
# For other index types, or if no index that we want to specifically handle,
|
||||
# save with the current index. pandas to_csv will handle it.
|
||||
# This branch might be removed if we strictly expect either DatetimeIndex or a numeric one from previous save.
|
||||
pass # data_to_save remains as is, to_csv will write its index if index=True
|
||||
|
||||
# Save to CSV, ensuring the 'timestamp' column (if created) is written, and not the DataFrame's active index.
|
||||
full_path = os.path.join(self.data_dir, file_path)
|
||||
data_to_save.to_csv(full_path, index=False) # index=False because timestamp is now a column
|
||||
if self.logging is not None:
|
||||
self.logging.info(f"Data saved to {full_path} with Unix timestamp column.")
|
||||
|
||||
|
||||
def format_row(self, row):
|
||||
def format_row(self, row: Dict[str, Any]) -> Dict[str, str]:
|
||||
"""Format a row for a combined results CSV file
|
||||
|
||||
Args:
|
||||
row: row to format
|
||||
row: Dictionary containing row data
|
||||
|
||||
Returns:
|
||||
formatted row
|
||||
Dictionary with formatted values
|
||||
"""
|
||||
return self.result_formatter.format_row(row)
|
||||
|
||||
return {
|
||||
"timeframe": row["timeframe"],
|
||||
"stop_loss_pct": f"{row['stop_loss_pct']*100:.2f}%",
|
||||
"n_trades": row["n_trades"],
|
||||
"n_stop_loss": row["n_stop_loss"],
|
||||
"win_rate": f"{row['win_rate']*100:.2f}%",
|
||||
"max_drawdown": f"{row['max_drawdown']*100:.2f}%",
|
||||
"avg_trade": f"{row['avg_trade']*100:.2f}%",
|
||||
"profit_ratio": f"{row['profit_ratio']*100:.2f}%",
|
||||
"final_usd": f"{row['final_usd']:.2f}",
|
||||
"total_fees_usd": f"{row['total_fees_usd']:.2f}",
|
||||
}
|
||||
|
||||
def write_results_chunk(self, filename, fieldnames, rows, write_header=False, initial_usd=None):
|
||||
def write_results_chunk(self, filename: str, fieldnames: List[str],
|
||||
rows: List[Dict], write_header: bool = False,
|
||||
initial_usd: Optional[float] = None) -> None:
|
||||
"""Write a chunk of results to a CSV file
|
||||
|
||||
Args:
|
||||
filename: filename to write to
|
||||
fieldnames: list of fieldnames
|
||||
rows: list of rows
|
||||
write_header: whether to write the header
|
||||
initial_usd: initial USD
|
||||
initial_usd: initial USD value for header comment
|
||||
"""
|
||||
mode = 'w' if write_header else 'a'
|
||||
self.result_formatter.write_results_chunk(
|
||||
filename, fieldnames, rows, write_header, initial_usd
|
||||
)
|
||||
|
||||
def write_backtest_results(self, filename: str, fieldnames: List[str],
|
||||
rows: List[Dict], metadata_lines: Optional[List[str]] = None) -> str:
|
||||
"""Write combined backtest results to a CSV file
|
||||
|
||||
with open(filename, mode, newline="") as csvfile:
|
||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
||||
if write_header:
|
||||
csvfile.write(f"# initial_usd: {initial_usd}\n")
|
||||
writer.writeheader()
|
||||
|
||||
for row in rows:
|
||||
# Only keep keys that are in fieldnames
|
||||
filtered_row = {k: v for k, v in row.items() if k in fieldnames}
|
||||
writer.writerow(filtered_row)
|
||||
|
||||
def write_backtest_results(self, filename, fieldnames, rows, metadata_lines=None):
|
||||
"""Write a combined results to a CSV file
|
||||
Args:
|
||||
filename: filename to write to
|
||||
fieldnames: list of fieldnames
|
||||
rows: list of rows
|
||||
rows: list of result dictionaries
|
||||
metadata_lines: optional list of strings to write as header comments
|
||||
|
||||
Returns:
|
||||
Full path to the written file
|
||||
"""
|
||||
fname = os.path.join(self.results_dir, filename)
|
||||
with open(fname, "w", newline="") as csvfile:
|
||||
if metadata_lines:
|
||||
for line in metadata_lines:
|
||||
csvfile.write(f"{line}\n")
|
||||
writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter='\t')
|
||||
writer.writeheader()
|
||||
for row in rows:
|
||||
writer.writerow(self.format_row(row))
|
||||
if self.logging is not None:
|
||||
self.logging.info(f"Combined results written to {fname}")
|
||||
|
||||
def write_trades(self, all_trade_rows, trades_fieldnames):
|
||||
"""Write trades to a CSV file
|
||||
return self.result_formatter.write_backtest_results(
|
||||
filename, fieldnames, rows, metadata_lines
|
||||
)
|
||||
|
||||
def write_trades(self, all_trade_rows: List[Dict], trades_fieldnames: List[str]) -> None:
|
||||
"""Write trades to separate CSV files grouped by timeframe and stop loss
|
||||
|
||||
Args:
|
||||
all_trade_rows: list of trade rows
|
||||
all_trade_rows: list of trade dictionaries
|
||||
trades_fieldnames: list of trade fieldnames
|
||||
logging: logging object
|
||||
"""
|
||||
|
||||
trades_by_combo = defaultdict(list)
|
||||
for trade in all_trade_rows:
|
||||
tf = trade.get("timeframe")
|
||||
sl = trade.get("stop_loss_pct")
|
||||
trades_by_combo[(tf, sl)].append(trade)
|
||||
|
||||
for (tf, sl), trades in trades_by_combo.items():
|
||||
sl_percent = int(round(sl * 100))
|
||||
trades_filename = os.path.join(self.results_dir, f"trades_{tf}_ST{sl_percent}pct.csv")
|
||||
with open(trades_filename, "w", newline="") as csvfile:
|
||||
writer = csv.DictWriter(csvfile, fieldnames=trades_fieldnames)
|
||||
writer.writeheader()
|
||||
for trade in trades:
|
||||
writer.writerow({k: trade.get(k, "") for k in trades_fieldnames})
|
||||
if self.logging is not None:
|
||||
self.logging.info(f"Trades written to {trades_filename}")
|
||||
self.result_formatter.write_trades(all_trade_rows, trades_fieldnames)
|
||||
73
cycles/utils/storage_utils.py
Normal file
73
cycles/utils/storage_utils.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class TimestampParsingError(Exception):
|
||||
"""Custom exception for timestamp parsing errors"""
|
||||
pass
|
||||
|
||||
|
||||
class DataLoadingError(Exception):
|
||||
"""Custom exception for data loading errors"""
|
||||
pass
|
||||
|
||||
|
||||
class DataSavingError(Exception):
|
||||
"""Custom exception for data saving errors"""
|
||||
pass
|
||||
|
||||
|
||||
def _parse_timestamp_column(data: pd.DataFrame, column_name: str) -> pd.DataFrame:
|
||||
"""Parse timestamp column handling both Unix timestamps and datetime strings
|
||||
|
||||
Args:
|
||||
data: DataFrame containing the timestamp column
|
||||
column_name: Name of the timestamp column
|
||||
|
||||
Returns:
|
||||
DataFrame with parsed timestamp column
|
||||
|
||||
Raises:
|
||||
TimestampParsingError: If timestamp parsing fails
|
||||
"""
|
||||
try:
|
||||
sample_timestamp = str(data[column_name].iloc[0])
|
||||
try:
|
||||
# Check if it's a Unix timestamp (numeric)
|
||||
float(sample_timestamp)
|
||||
# It's a Unix timestamp, convert using unit='s'
|
||||
data[column_name] = pd.to_datetime(data[column_name], unit='s')
|
||||
except ValueError:
|
||||
# It's already in datetime string format, convert without unit
|
||||
data[column_name] = pd.to_datetime(data[column_name])
|
||||
return data
|
||||
except Exception as e:
|
||||
raise TimestampParsingError(f"Failed to parse timestamp column '{column_name}': {e}")
|
||||
|
||||
|
||||
def _filter_by_date_range(data: pd.DataFrame, timestamp_col: str,
|
||||
start_date: pd.Timestamp, stop_date: pd.Timestamp) -> pd.DataFrame:
|
||||
"""Filter DataFrame by date range
|
||||
|
||||
Args:
|
||||
data: DataFrame to filter
|
||||
timestamp_col: Name of timestamp column
|
||||
start_date: Start date for filtering
|
||||
stop_date: Stop date for filtering
|
||||
|
||||
Returns:
|
||||
Filtered DataFrame
|
||||
"""
|
||||
return data[(data[timestamp_col] >= start_date) & (data[timestamp_col] <= stop_date)]
|
||||
|
||||
|
||||
def _normalize_column_names(data: pd.DataFrame) -> pd.DataFrame:
|
||||
"""Convert all column names to lowercase
|
||||
|
||||
Args:
|
||||
data: DataFrame to normalize
|
||||
|
||||
Returns:
|
||||
DataFrame with lowercase column names
|
||||
"""
|
||||
data.columns = data.columns.str.lower()
|
||||
return data
|
||||
@@ -10,10 +10,12 @@ class SystemUtils:
|
||||
"""Determine optimal number of worker processes based on system resources"""
|
||||
cpu_count = os.cpu_count() or 4
|
||||
memory_gb = psutil.virtual_memory().total / (1024**3)
|
||||
# Heuristic: Use 75% of cores, but cap based on available memory
|
||||
# Assume each worker needs ~2GB for large datasets
|
||||
workers_by_memory = max(1, int(memory_gb / 2))
|
||||
workers_by_cpu = max(1, int(cpu_count * 0.75))
|
||||
|
||||
# OPTIMIZATION: More aggressive worker allocation for better performance
|
||||
workers_by_memory = max(1, int(memory_gb / 2)) # 2GB per worker
|
||||
workers_by_cpu = max(1, int(cpu_count * 0.8)) # Use 80% of CPU cores
|
||||
optimal_workers = min(workers_by_cpu, workers_by_memory, 8) # Cap at 8 workers
|
||||
|
||||
if self.logging is not None:
|
||||
self.logging.info(f"Using {min(workers_by_cpu, workers_by_memory)} workers for processing")
|
||||
return min(workers_by_cpu, workers_by_memory)
|
||||
self.logging.info(f"Using {optimal_workers} workers for processing (CPU-based: {workers_by_cpu}, Memory-based: {workers_by_memory})")
|
||||
return optimal_workers
|
||||
Reference in New Issue
Block a user