106 lines
3.9 KiB
Python
106 lines
3.9 KiB
Python
|
|
import os
|
||
|
|
import pandas as pd
|
||
|
|
from typing import Optional
|
||
|
|
import logging
|
||
|
|
|
||
|
|
from .storage_utils import DataSavingError
|
||
|
|
|
||
|
|
|
||
|
|
class DataSaver:
    """Handles saving data to various file formats.

    The only format implemented here is CSV (see ``save_data``).
    """

    def __init__(self, data_dir: str, logging_instance: Optional[logging.Logger] = None):
        """Initialize data saver.

        Args:
            data_dir: Directory for saving data files.
            logging_instance: Optional logger; when ``None`` nothing is logged.
        """
        self.data_dir = data_dir
        self.logging = logging_instance

    def save_data(self, data: pd.DataFrame, file_path: str) -> None:
        """Save processed data to a CSV file.

        The frame's index is never written as a CSV index; instead it is
        turned into ordinary columns first (see ``_prepare_data_for_saving``):
        a DatetimeIndex or numeric index becomes a leading ``timestamp``
        column, any other index becomes one column per index level.
        The caller's DataFrame is not modified.

        Args:
            data: DataFrame to save.
            file_path: Path to the data file relative to ``data_dir``.

        Raises:
            DataSavingError: If preparing or writing the data fails; the
                original exception is chained as ``__cause__``.
        """
        try:
            # The prepare step always returns a new frame, so no defensive
            # copy of ``data`` is needed here.
            data_to_save = self._prepare_data_for_saving(data)

            full_path = os.path.join(self.data_dir, file_path)
            # index=False: everything worth keeping from the index has been
            # moved into columns by the prepare step.
            data_to_save.to_csv(full_path, index=False)

            if self.logging is not None:
                self.logging.info(f"Data saved to {full_path} with Unix timestamp column.")

        except Exception as e:
            error_msg = f"Failed to save data to {file_path}: {e}"
            if self.logging is not None:
                self.logging.error(error_msg)
            raise DataSavingError(error_msg) from e

    def _prepare_data_for_saving(self, data: pd.DataFrame) -> pd.DataFrame:
        """Prepare DataFrame for saving by handling different index types.

        Args:
            data: DataFrame to prepare (left unmodified).

        Returns:
            A new DataFrame with a fresh positional index whose former index
            has been converted to columns:

            * DatetimeIndex -> leading ``timestamp`` column of float Unix
              seconds.
            * numeric index -> leading ``timestamp`` column holding the index
              values as they are.
            * anything else -> the index levels become regular columns.
        """
        if isinstance(data.index, pd.DatetimeIndex):
            return self._convert_datetime_index_to_timestamp(data)
        if pd.api.types.is_numeric_dtype(data.index.dtype):
            return self._convert_numeric_index_to_timestamp(data)

        # Other index types (strings, MultiIndex, ...): the CSV is written
        # with index=False, so the labels must become columns or they would
        # be silently dropped.
        try:
            return data.reset_index()
        except ValueError as exc:
            # reset_index() refuses to insert a column whose name already
            # exists. Fall back to writing the columns only rather than
            # failing the whole save.
            if self.logging is not None:
                self.logging.warning(
                    f"Index could not be saved as a column ({exc}); saving without it."
                )
            return data.reset_index(drop=True)

    def _convert_datetime_index_to_timestamp(self, data: pd.DataFrame) -> pd.DataFrame:
        """Convert DatetimeIndex to Unix timestamp column.

        Args:
            data: DataFrame with DatetimeIndex (left unmodified).

        Returns:
            A new DataFrame whose first column ``timestamp`` holds float
            seconds since 1970-01-01 UTC; ``NaT`` entries become ``NaN``.
            An existing ``timestamp`` column is replaced.
        """
        index = data.index
        if index.tz is not None:
            # tz_convert(None) converts to UTC and drops the timezone, so the
            # subtraction below is against the UTC epoch.
            index = index.tz_convert(None)

        # Dividing a timedelta by one second is independent of the index's
        # stored resolution (ns, us, ms, s), unlike astype('int64') / 1e9
        # which is only correct for nanosecond data.
        seconds = (index - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)
        return self._with_timestamp_column(data, seconds)

    def _convert_numeric_index_to_timestamp(self, data: pd.DataFrame) -> pd.DataFrame:
        """Convert numeric index to timestamp column.

        A numeric index is taken to already contain timestamps (e.g. float
        Unix timestamps from a previous save/load cycle) and is copied
        verbatim into the ``timestamp`` column.

        Exception: when the index is a ``RangeIndex`` (plain row numbers) and
        the frame already has a ``timestamp`` column, that column is kept
        instead of being overwritten with row numbers.

        Args:
            data: DataFrame with numeric index (left unmodified).

        Returns:
            A new DataFrame with ``timestamp`` as the first column.
        """
        if isinstance(data.index, pd.RangeIndex) and "timestamp" in data.columns:
            existing = data["timestamp"].reset_index(drop=True)
            return self._with_timestamp_column(data, existing)
        return self._with_timestamp_column(data, data.index)

    @staticmethod
    def _with_timestamp_column(data: pd.DataFrame, values) -> pd.DataFrame:
        """Return a copy of ``data`` with ``values`` as leading ``timestamp`` column.

        Args:
            data: Source DataFrame (left unmodified).
            values: One value per row, in row order.

        Returns:
            A new DataFrame with a fresh 0..n-1 index, ``timestamp`` first and
            the remaining columns in their original order.
        """
        result = data.reset_index(drop=True)
        if "timestamp" in result.columns:
            # Drop first so insert() does not complain about a duplicate.
            result = result.drop(columns="timestamp")
        result.insert(0, "timestamp", values)
        return result
|