Refactor data module to enhance modularity and maintainability

- Extracted `OHLCVData` and validation logic into a new `common/ohlcv_data.py` module, promoting better organization and reusability.
- Updated `BaseDataCollector` to use the new `validate_ohlcv_data` function, improving the clarity and maintainability of its data validation.
- Refactored imports in `data/__init__.py` to reflect the new structure, ensuring consistent access to common data types and exceptions.
- Removed redundant data validation logic from `BaseDataCollector`, streamlining its responsibilities.
- Added unit tests for `OHLCVData` and validation functions to ensure correctness and reliability.

These changes improve the architecture of the data module, aligning with project standards for modularity and maintainability.
Vasily.onl 2025-06-10 12:04:58 +08:00
parent 3db8fb1c41
commit 33f2110f19
15 changed files with 511 additions and 1009 deletions

View File

@ -6,9 +6,10 @@ processing and validating the data, and storing it in the database.
"""
from .base_collector import (
BaseDataCollector, DataCollectorError, DataValidationError,
CollectorStatus, OHLCVData
BaseDataCollector, DataCollectorError
)
from .collector.collector_state_telemetry import CollectorStatus
from .common.ohlcv_data import OHLCVData, DataValidationError
from .common.data_types import DataType, MarketDataPoint
from .collector_manager import CollectorManager, ManagerStatus, CollectorConfig
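A hedged sketch of the import surface after this change, assuming the package root is importable as `data` and re-exports exactly the names shown above:

```python
# Package-root imports keep working after the move (names re-exported above)
from data import BaseDataCollector, CollectorStatus, OHLCVData, DataValidationError
# The new module can also be imported directly
from data.common.ohlcv_data import validate_ohlcv_data
```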

View File

@ -18,43 +18,7 @@ from .collector.collector_state_telemetry import CollectorStatus, CollectorState
from .collector.collector_connection_manager import ConnectionManager
from .collector.collector_callback_dispatcher import CallbackDispatcher
from .common.data_types import DataType, MarketDataPoint
@dataclass
class OHLCVData:
"""OHLCV (Open, High, Low, Close, Volume) data structure."""
symbol: str
timeframe: str
timestamp: datetime
open: Decimal
high: Decimal
low: Decimal
close: Decimal
volume: Decimal
trades_count: Optional[int] = None
def __post_init__(self):
"""Validate OHLCV data after initialization."""
if not self.timestamp.tzinfo:
self.timestamp = self.timestamp.replace(tzinfo=timezone.utc)
# Validate price data
if not all(isinstance(price, (Decimal, float, int)) for price in [self.open, self.high, self.low, self.close]):
raise DataValidationError("All OHLCV prices must be numeric")
if not isinstance(self.volume, (Decimal, float, int)):
raise DataValidationError("Volume must be numeric")
# Convert to Decimal for precision
self.open = Decimal(str(self.open))
self.high = Decimal(str(self.high))
self.low = Decimal(str(self.low))
self.close = Decimal(str(self.close))
self.volume = Decimal(str(self.volume))
# Validate price relationships
if not (self.low <= self.open <= self.high and self.low <= self.close <= self.high):
raise DataValidationError(f"Invalid OHLCV data: prices don't match expected relationships for {self.symbol}")
from .common.ohlcv_data import OHLCVData, DataValidationError, validate_ohlcv_data
class DataCollectorError(Exception):
@ -62,11 +26,6 @@ class DataCollectorError(Exception):
pass
class DataValidationError(DataCollectorError):
"""Exception raised when data validation fails."""
pass
class ConnectionError(DataCollectorError):
"""Exception raised when connection to data source fails."""
pass
@ -493,7 +452,17 @@ class BaseDataCollector(ABC):
Returns:
Dictionary containing status information
"""
return self._state_telemetry.get_status()
status = self._state_telemetry.get_status()
# Add BaseDataCollector specific information
status.update({
'symbols': list(self.symbols),
'data_types': [dt.value for dt in self.data_types],
'timeframes': self.timeframes,
'auto_restart': self.auto_restart
})
return status
def get_health_status(self) -> Dict[str, Any]:
"""
@ -553,38 +522,7 @@ class BaseDataCollector(ABC):
Raises:
DataValidationError: If data validation fails
"""
required_fields = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
# Check required fields
for field in required_fields:
if field not in data:
raise DataValidationError(f"Missing required field: {field}")
try:
# Parse timestamp
timestamp = data['timestamp']
if isinstance(timestamp, (int, float)):
# Assume Unix timestamp in milliseconds
timestamp = datetime.fromtimestamp(timestamp / 1000, tz=timezone.utc)
elif isinstance(timestamp, str):
timestamp = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
elif not isinstance(timestamp, datetime):
raise DataValidationError(f"Invalid timestamp format: {type(timestamp)}")
return OHLCVData(
symbol=symbol,
timeframe=timeframe,
timestamp=timestamp,
open=Decimal(str(data['open'])),
high=Decimal(str(data['high'])),
low=Decimal(str(data['low'])),
close=Decimal(str(data['close'])),
volume=Decimal(str(data['volume'])),
trades_count=data.get('trades_count')
)
except (ValueError, TypeError, KeyError) as e:
raise DataValidationError(f"Invalid OHLCV data for {symbol}: {e}")
return validate_ohlcv_data(data, symbol, timeframe)
def __repr__(self) -> str:
"""String representation of the collector."""

data/common/ohlcv_data.py (new file, +105 lines)
View File

@ -0,0 +1,105 @@
"""
OHLCV data structure and validation utilities.
This module provides standardized OHLCV (Open, High, Low, Close, Volume) data
structures and validation functions for financial market data.
"""
from dataclasses import dataclass
from datetime import datetime, timezone
from decimal import Decimal
from typing import Dict, Any, Optional
class DataValidationError(Exception):
"""Exception raised when OHLCV data validation fails."""
pass
@dataclass
class OHLCVData:
"""OHLCV (Open, High, Low, Close, Volume) data structure."""
symbol: str
timeframe: str
timestamp: datetime
open: Decimal
high: Decimal
low: Decimal
close: Decimal
volume: Decimal
trades_count: Optional[int] = None
def __post_init__(self):
"""Validate OHLCV data after initialization."""
if not self.timestamp.tzinfo:
self.timestamp = self.timestamp.replace(tzinfo=timezone.utc)
# Validate price data
if not all(isinstance(price, (Decimal, float, int)) for price in [self.open, self.high, self.low, self.close]):
raise DataValidationError("All OHLCV prices must be numeric")
if not isinstance(self.volume, (Decimal, float, int)):
raise DataValidationError("Volume must be numeric")
# Convert to Decimal for precision
self.open = Decimal(str(self.open))
self.high = Decimal(str(self.high))
self.low = Decimal(str(self.low))
self.close = Decimal(str(self.close))
self.volume = Decimal(str(self.volume))
# Validate price relationships
if not (self.low <= self.open <= self.high and self.low <= self.close <= self.high):
raise DataValidationError(f"Invalid OHLCV data: prices don't match expected relationships for {self.symbol}")
def validate_ohlcv_data(data: Dict[str, Any], symbol: str, timeframe: str) -> OHLCVData:
"""
Validate and convert raw OHLCV data to standardized format.
Args:
data: Raw OHLCV data dictionary
symbol: Trading symbol
timeframe: Timeframe (e.g., '1m', '5m', '1h')
Returns:
Validated OHLCVData object
Raises:
DataValidationError: If data validation fails
"""
required_fields = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
# Check required fields
for field in required_fields:
if field not in data:
raise DataValidationError(f"Missing required field: {field}")
try:
# Parse timestamp
timestamp = data['timestamp']
if isinstance(timestamp, (int, float)):
# Assume Unix timestamp in milliseconds
timestamp = datetime.fromtimestamp(timestamp / 1000, tz=timezone.utc)
elif isinstance(timestamp, str):
timestamp = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
elif not isinstance(timestamp, datetime):
raise DataValidationError(f"Invalid timestamp format: {type(timestamp)}")
return OHLCVData(
symbol=symbol,
timeframe=timeframe,
timestamp=timestamp,
open=Decimal(str(data['open'])),
high=Decimal(str(data['high'])),
low=Decimal(str(data['low'])),
close=Decimal(str(data['close'])),
volume=Decimal(str(data['volume'])),
trades_count=data.get('trades_count')
)
except (ValueError, TypeError, KeyError) as e:
raise DataValidationError(f"Invalid OHLCV data for {symbol}: {e}")
except Exception as e:
# Catch any other exceptions (like Decimal InvalidOperation)
raise DataValidationError(f"Invalid OHLCV data for {symbol}: {e}")

View File

@ -1,89 +0,0 @@
# Simplified Crypto Trading Bot Platform: Product Requirements Document
## Executive Summary
This simplified PRD addresses the need for a rapid-deployment crypto trading bot platform designed for internal testing and strategy development. The platform eliminates microservices complexity in favor of a monolithic architecture that can be functional within 1-2 weeks while supporting approximately 10 concurrent bots. The system focuses on core functionality including data collection, strategy execution, backtesting, and visualization without requiring advanced monitoring or orchestration tools.
## System Architecture Overview
The platform follows a streamlined monolithic design that consolidates all components within a single application boundary. This approach enables rapid development while maintaining clear separation between functional modules for future scalability. The architecture consists of six core components working together: Data Collection Module for exchange connectivity, Strategy Engine for unified signal generation, Bot Manager for concurrent bot orchestration, PostgreSQL database for data persistence, Backtesting Engine for historical simulation, and Dashboard for visualization and control.
## Simplified Technical Stack
### Core Technologies
The platform utilizes a Python-based technology stack optimized for rapid development. The backend employs Python 3.10+ with FastAPI or Flask for API services, PostgreSQL 14+ with TimescaleDB extension for time-series optimization, and Redis for real-time pub/sub messaging. The frontend leverages Dash with Plotly for interactive visualization and bot control interfaces.
### Database Design
The database schema emphasizes simplicity while supporting essential trading operations. Core tables include raw_market_data for exchange data storage, candles for OHLCV aggregation, strategies for algorithm definitions, bots for instance management, signals for trading decisions, trades for execution records, and bot_portfolio for performance tracking.
## Development Methodology
### Two-Week Implementation Timeline
The development follows a structured three-phase approach designed for rapid deployment. Phase 1 (Days 1-5) establishes foundational components including database setup, data collection implementation, and basic visualization. Phase 2 (Days 6-10) completes core functionality with backtesting engine development, trading logic implementation, and dashboard enhancement. Phase 3 (Days 11-14) focuses on system refinement, comprehensive testing, and deployment preparation.
### Strategy Implementation Example
The platform supports multiple trading strategies through a unified interface design. A simple moving average crossover strategy demonstrates the system's capability to generate buy and sell signals based on technical indicators. This example strategy shows how the system processes market data, calculates moving averages, generates trading signals, and tracks portfolio performance over time. The visualization includes price movements, moving average lines, signal markers, and portfolio value progression.
## Backtesting and Performance Analysis
### Strategy Validation Framework
The backtesting engine enables comprehensive strategy testing using historical market data. The system calculates key performance metrics including total returns, Sharpe ratios, maximum drawdown, and win/loss ratios to evaluate strategy effectiveness.
### Portfolio Management
The platform tracks portfolio allocation and performance throughout strategy execution. Real-time monitoring capabilities show the distribution between cryptocurrency holdings and cash reserves.
## Simplified Data Flow
### Real-Time Processing
The data collection module connects to exchange APIs to retrieve market information including order books, trades, and candlestick data. Raw data is stored in PostgreSQL while processed information is published through Redis channels for real-time distribution to active trading bots.
### Signal Generation and Execution
Strategies subscribe to relevant data streams and generate trading signals based on configured algorithms. The bot manager validates signals against portfolio constraints and executes simulated or live trades according to bot configurations.
## Future Scalability Considerations
### Microservices Migration Path
While implementing a monolithic architecture for rapid deployment, the system design maintains clear component boundaries that facilitate future extraction into microservices. API-first design principles ensure internal components communicate through well-defined interfaces that can be externalized as needed.
### Authentication and Multi-User Support
The current single-user design can be extended to support multiple users through role-based access control implementation. Database schema accommodates user management tables and permission structures without requiring significant architectural changes.
### Advanced Monitoring Integration
The simplified monitoring approach can be enhanced with Prometheus and Grafana integration when scaling requirements justify the additional complexity. Current basic monitoring provides foundation metrics that can be extended to comprehensive observability systems.
## Technical Implementation Details
### Time Series Data Management
The platform implements proper time aggregation aligned with exchange standards to ensure accurate candle formation. Timestamp alignment follows right-aligned methodology where 5-minute candles from 09:00:00-09:05:00 receive the 09:05:00 timestamp.
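For illustration, a small pandas sketch of right-aligned stamping (the timestamps are arbitrary):

```python
import pandas as pd

# A trade at 09:03:27 falls in the 09:00-09:05 window and is stamped 09:05:00
print(pd.Timestamp('2025-06-10 09:03:27').ceil('5min'))  # 2025-06-10 09:05:00
```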
### Performance Optimization
Database indexing on timestamp and symbol fields ensures efficient time-series queries. Connection pooling prevents database connection leaks while prepared statements optimize query execution. Memory management includes proper cleanup of data objects after processing to maintain system stability.
## Success Metrics and Validation
### Development Milestones
Platform success is measured through specific deliverables including core functionality completion within 14 days, system stability maintenance at 99% uptime during internal testing, successful backtesting of at least 3 different strategies, and concurrent operation of 2+ bots for 72+ hours.
### Strategy Testing Capabilities
The system enables comprehensive strategy validation through historical simulation, real-time testing with virtual portfolios, and performance comparison across multiple algorithms. Backtesting results provide insights into strategy effectiveness before live deployment.
## Conclusion
This simplified crypto trading bot platform balances rapid development requirements with future scalability needs. The monolithic architecture enables deployment within 1-2 weeks while maintaining architectural flexibility for future enhancements. Clear component separation, comprehensive database design, and strategic technology choices create a foundation that supports both immediate testing objectives and long-term platform evolution.
The platform's focus on essential functionality without unnecessary complexity ensures teams can begin strategy testing quickly while building toward more sophisticated implementations as requirements expand. This approach maximizes development velocity while preserving options for future architectural evolution and feature enhancement.

View File

@ -1,608 +0,0 @@
# Simplified Crypto Trading Bot Platform: Product Requirements Document (PRD)
**Version:** 1.0
**Date:** May 30, 2025
**Author:** Vasily
**Status:** Draft
## Executive Summary
This PRD outlines the development of a simplified crypto trading bot platform that enables strategy testing, development, and execution without the complexity of microservices and advanced monitoring. The goal is to create a functional system within 1-2 weeks that allows for strategy testing while establishing a foundation that can scale in the future. The platform addresses key requirements including data collection, strategy execution, visualization, and backtesting capabilities in a monolithic architecture optimized for internal use.
## Current Requirements & Constraints
- **Speed to Deployment**: System must be functional within 1-2 weeks
- **Scale**: Support for 5-10 concurrent trading bots
- **Architecture**: Monolithic application instead of microservices
- **User Access**: Internal use only initially (no multi-user authentication)
- **Infrastructure**: Simplified deployment without Kubernetes/Docker Swarm
- **Monitoring**: Basic logging for modules
## System Architecture
### High-Level Architecture
The platform will follow a monolithic architecture pattern to enable rapid development while providing clear separation between components:
### Data Flow Architecture
```
OKX Exchange API (WebSocket)
            ↓
Data Collector → OHLCV Aggregator → PostgreSQL (market_data)
            ↓                              ↓
[Optional] Raw Trade Storage        Redis Pub/Sub → Strategy Engine (JSON configs)
            ↓                              ↓
Files/Database (raw_trades)         Signal Generation → Bot Manager
                                           ↓
                   PostgreSQL (signals, trades, bot_performance)
                                           ↓
                   Dashboard (REST API) ← PostgreSQL (historical data)
                   Real-time Updates    ← Redis Channels
```
**Data Processing Priority**:
1. **Real-time**: Raw data → OHLCV candles → Redis → Bots (primary flow)
2. **Historical**: OHLCV data from PostgreSQL for backtesting and charts
3. **Advanced Analysis**: Raw trade data (if stored) for detailed backtesting
### Redis Channel Design
```python
# Real-time market data distribution
MARKET_DATA_CHANNEL = "market:{symbol}" # OHLCV updates
BOT_SIGNALS_CHANNEL = "signals:{bot_id}" # Trading decisions
BOT_STATUS_CHANNEL = "status:{bot_id}" # Bot lifecycle events
SYSTEM_EVENTS_CHANNEL = "system:events" # Global notifications
```
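A hedged sketch of how these channels could be used with redis-py; the client choice and payload shape are assumptions:

```python
import json
import redis

r = redis.Redis()

# Publisher side: push a closed candle to every bot watching BTC-USDT
r.publish(MARKET_DATA_CHANNEL.format(symbol='BTC-USDT'),
          json.dumps({'timeframe': '1m', 'close': 67020.5}))

# Subscriber side: a bot listening for market updates
p = r.pubsub()
p.subscribe(MARKET_DATA_CHANNEL.format(symbol='BTC-USDT'))
for message in p.listen():
    if message['type'] == 'message':
        candle = json.loads(message['data'])
```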
### Configuration Strategy
**PostgreSQL for**: Market data, bot instances, trades, signals, performance metrics
**JSON files for**: Strategy parameters, bot configurations (rapid testing and parameter tuning)
```json
// config/strategies/ema_crossover.json
{
  "strategy_name": "EMA_Crossover",
  "parameters": {
    "fast_period": 12,
    "slow_period": 26,
    "risk_percentage": 0.02
  }
}

// config/bots/bot_001.json
{
  "bot_id": "bot_001",
  "strategy_file": "ema_crossover.json",
  "symbol": "BTC-USDT",
  "virtual_balance": 10000,
  "enabled": true
}
```
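A minimal loader sketch under the layout above; the function name and `config/` paths are illustrative:

```python
import json

def load_bot_config(path: str) -> dict:
    """Load a bot config and attach its referenced strategy parameters."""
    with open(path) as f:
        bot = json.load(f)
    with open(f"config/strategies/{bot['strategy_file']}") as f:
        bot['strategy'] = json.load(f)
    return bot

bot = load_bot_config('config/bots/bot_001.json')
```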
### Error Handling Strategy
**Bot Crash Recovery**:
- Monitor bot processes every 30 seconds
- Auto-restart crashed bots if status = 'active'
- Log all crashes with stack traces
- Maximum 3 restart attempts per hour
**Exchange Connection Issues**:
- Retry with exponential backoff (1s, 2s, 4s, 8s, max 60s)
- Switch to backup WebSocket connection if available
- Log connection quality metrics
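A minimal sketch of the retry policy described above, assuming `connect` is any coroutine that raises `ConnectionError` on failure:

```python
import asyncio
import random

async def connect_with_backoff(connect, max_delay=60):
    """Retry with exponential backoff (1s, 2s, 4s, 8s, ... capped at max_delay)."""
    delay = 1
    while True:
        try:
            return await connect()
        except ConnectionError:
            await asyncio.sleep(delay + random.random())  # jitter avoids thundering herds
            delay = min(delay * 2, max_delay)
```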
**Database Errors**:
- Continue operation with in-memory cache for up to 5 minutes
- Queue operations for retry when connection restored
- Alert on prolonged database disconnection
**Application Restart Recovery**:
- Read bot states from database on startup
- Restore active bots to 'active' status
- Resume data collection for all monitored symbols
### Component Details and Functional Requirements
1. **Data Collection Module**
- Connect to exchange APIs (OKX initially) via WebSocket
- Aggregate real-time trades into OHLCV candles (1m, 5m, 15m, 1h, 4h, 1d)
- Store OHLCV data in PostgreSQL for bot operations and backtesting
- Send real-time candle updates through Redis
- Optional: Store raw trade data for advanced backtesting
**FR-001: Unified Data Provider Interface**
- Support multiple exchanges through standardized adapters
- Real-time OHLCV aggregation with WebSocket connections
- Primary focus on candle data, raw data storage optional
- Data validation and error handling mechanisms
**FR-002: Market Data Processing**
- OHLCV aggregation with configurable timeframes (1m base, higher timeframes derived)
- Technical indicator calculation (SMA, EMA, RSI, MACD, Bollinger Bands) on OHLCV data
- Data normalization across different exchanges
- Time alignment following exchange standards (right-aligned candles)
2. **Strategy Engine**
- Provide unified interface for all trading strategies
- Support multiple strategy types with common parameter structure
- Generate trading signals based on market data
- Log strategy performance and signals
- Strategy implementation as a class.
**FR-003: Strategy Framework**
- Base strategy class with standardized interface
- Support for multiple strategy types
- Parameter configuration and optimization tools (JSON for the parameters)
- Signal generation with confidence scoring
**FR-004: Signal Processing**
- Real-time signal calculation and validation
- Signal persistence for analysis and debugging
- Multi-timeframe analysis capabilities
- Custom indicator development support
3. **Bot Manager**
- Create and manage up to 10 concurrent trading bots
- Configure bot parameters and associated strategies
- Start/stop individual bots
- Track bot status and performance
**FR-005: Bot Lifecycle Management**
- Bot creation with strategy and parameter selection
- Start/stop/pause functionality with state persistence
- Configuration management
- Resource allocation and monitoring (in future)
**FR-006: Portfolio Management**
- Position tracking and balance management
- Risk management controls (stop-loss, take-profit, position sizing)
- Multi-bot coordination and conflict resolution (in future)
- Real-time portfolio valuation (in future)
4. **Trading Execution**
- Simulate or execute trades based on configuration
- Stores trade information in database
**FR-007: Order Management**
- Order placement with multiple order types (market, limit, stop)
- Order tracking and status monitoring (in future)
- Execution confirmation and reconciliation (in future)
- Fee calculation and tracking (in future)
**FR-008: Risk Controls**
- Pre-trade risk validation
- Position limits and exposure controls (in future)
- Emergency stop mechanisms (in future)
- Compliance monitoring and reporting (in future)
5. **Database (PostgreSQL)**
- Store market data, bot configurations, and trading history
- Optimized schema for time-series data without complexity
- Support for data querying and aggregation
**Database (JSON)**
- Store strategy parameters and bot configuration in JSON at the beginning, for simplicity of editing and testing
6. **Backtesting Engine**
- Run simulations on historical data using vectorized operations for speed
- Calculate performance metrics
- Support multiple timeframes and strategy parameter testing
- Generate comparison reports between strategies
**FR-009: Historical Simulation**
- Strategy backtesting on historical market data
- Performance metric calculation (Sharpe ratio, drawdown, win rate, total return)
- Parameter optimization through grid search (limited combinations for speed) (in future)
- Side-by-side strategy comparison with statistical significance
**FR-010: Simulation Engine**
- Vectorized signal calculation using pandas operations
- Realistic fee modeling (0.1% per trade for OKX)
- Look-ahead bias prevention with proper timestamp handling
- Configurable test periods (1 day to 24 months)
7. **Dashboard & Visualization**
- Display real-time market data and bot status
- Show portfolio value progression over time
- Visualize trade history with buy/sell markers on price charts
- Provide simple bot control interface (start/stop/configure)
**FR-011: Dashboard Interface**
- Real-time bot monitoring with status indicators
- Portfolio performance charts (total value, cash vs crypto allocation)
- Trade history table with P&L per trade
- Simple bot configuration forms for JSON parameter editing
**FR-012: Data Visualization**
- Interactive price charts with strategy signal overlays
- Portfolio value progression charts
- Performance comparison tables (multiple bots side-by-side)
- Fee tracking and total cost analysis
### Non-Functional Requirements
1. Performance Requirements
**NFR-001: Latency**
- Market data processing: <100ms from exchange to database
- Signal generation: <500ms for standard strategies
- API response time: <200ms for 95% of requests
- Dashboard updates: <2 seconds for real-time data
**NFR-002: Scalability**
- Database queries scalable to 1M+ records per table
- Horizontal scaling capability for all services (in future)
2. Reliability Requirements
**NFR-003: Availability**
- System uptime: 99.5% excluding planned maintenance
- Data collection: 99.9% uptime during market hours
- Automatic failover for critical services
- Graceful degradation during partial outages
**NFR-004: Data Integrity**
- Zero data loss for executed trades
- Transactional consistency for all financial operations
- Regular database backups with point-in-time recovery
- Data validation and error correction mechanisms
3. Security Requirements
**NFR-005: Authentication & Authorization** (in future)
**NFR-006: Data Protection**
- End-to-end encryption for sensitive data (in future)
- Secure storage of API keys and credentials
- Regular security audits and penetration testing (in future)
- Compliance with financial data protection regulations (in future)
## Technical Implementation
### Database Schema
The database schema separates frequently-accessed OHLCV data from raw tick data to optimize performance and storage.
```sql
-- OHLCV Market Data (primary table for bot operations)
CREATE TABLE market_data (
id SERIAL PRIMARY KEY,
exchange VARCHAR(50) NOT NULL DEFAULT 'okx',
symbol VARCHAR(20) NOT NULL,
timeframe VARCHAR(5) NOT NULL, -- 1m, 5m, 15m, 1h, 4h, 1d
timestamp TIMESTAMPTZ NOT NULL,
open DECIMAL(18,8) NOT NULL,
high DECIMAL(18,8) NOT NULL,
low DECIMAL(18,8) NOT NULL,
close DECIMAL(18,8) NOT NULL,
volume DECIMAL(18,8) NOT NULL,
trades_count INTEGER, -- number of trades in this candle
created_at TIMESTAMPTZ DEFAULT NOW(),
UNIQUE(exchange, symbol, timeframe, timestamp)
);
CREATE INDEX idx_market_data_lookup ON market_data(symbol, timeframe, timestamp);
-- Note: partial-index predicates must be immutable in PostgreSQL, so NOW() cannot appear here;
-- a plain descending index covers recent-data queries instead
CREATE INDEX idx_market_data_recent ON market_data(timestamp DESC);
-- Raw Trade Data (optional, for detailed backtesting only)
CREATE TABLE raw_trades (
id SERIAL,
exchange VARCHAR(50) NOT NULL DEFAULT 'okx',
symbol VARCHAR(20) NOT NULL,
timestamp TIMESTAMPTZ NOT NULL,
type VARCHAR(10) NOT NULL, -- trade, order, balance, tick, books
data JSONB NOT NULL, -- response from the exchange
created_at TIMESTAMPTZ DEFAULT NOW(),
PRIMARY KEY (id, timestamp) -- partitioned tables must include the partition key in the primary key
) PARTITION BY RANGE (timestamp);
CREATE INDEX idx_raw_trades_symbol_time ON raw_trades(symbol, timestamp);
-- Monthly partitions for raw data (if using raw data)
-- CREATE TABLE raw_trades_y2024m01 PARTITION OF raw_trades
-- FOR VALUES FROM ('2024-01-01') TO ('2024-02-01');
-- Bot Management (simplified)
CREATE TABLE bots (
id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL,
strategy_name VARCHAR(50) NOT NULL,
symbol VARCHAR(20) NOT NULL,
timeframe VARCHAR(5) NOT NULL,
status VARCHAR(20) NOT NULL DEFAULT 'inactive', -- active, inactive, error
config_file VARCHAR(200), -- path to JSON config
virtual_balance DECIMAL(18,8) DEFAULT 10000,
current_balance DECIMAL(18,8) DEFAULT 10000,
last_heartbeat TIMESTAMPTZ,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Trading Signals (for analysis and debugging)
CREATE TABLE signals (
id SERIAL PRIMARY KEY,
bot_id INTEGER REFERENCES bots(id),
timestamp TIMESTAMPTZ NOT NULL,
signal_type VARCHAR(10) NOT NULL, -- buy, sell, hold
price DECIMAL(18,8),
confidence DECIMAL(5,4),
indicators JSONB, -- technical indicator values
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX idx_signals_bot_time ON signals(bot_id, timestamp);
-- Trade Execution Records
CREATE TABLE trades (
id SERIAL PRIMARY KEY,
bot_id INTEGER REFERENCES bots(id),
signal_id INTEGER REFERENCES signals(id),
timestamp TIMESTAMPTZ NOT NULL,
side VARCHAR(5) NOT NULL, -- buy, sell
price DECIMAL(18,8) NOT NULL,
quantity DECIMAL(18,8) NOT NULL,
fees DECIMAL(18,8) DEFAULT 0,
pnl DECIMAL(18,8), -- profit/loss for this trade
balance_after DECIMAL(18,8), -- portfolio balance after trade
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX idx_trades_bot_time ON trades(bot_id, timestamp);
-- Performance Snapshots (for plotting portfolio over time)
CREATE TABLE bot_performance (
id SERIAL PRIMARY KEY,
bot_id INTEGER REFERENCES bots(id),
timestamp TIMESTAMPTZ NOT NULL,
total_value DECIMAL(18,8) NOT NULL, -- current portfolio value
cash_balance DECIMAL(18,8) NOT NULL,
crypto_balance DECIMAL(18,8) NOT NULL,
total_trades INTEGER DEFAULT 0,
winning_trades INTEGER DEFAULT 0,
total_fees DECIMAL(18,8) DEFAULT 0,
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX idx_bot_performance_bot_time ON bot_performance(bot_id, timestamp);
```
**Data Storage Strategy**:
- **OHLCV Data**: Primary source for bot operations, kept indefinitely, optimized indexes
- **Raw Trade Data**: Optional table, only if detailed backtesting needed, can be partitioned monthly
- **Alternative for Raw Data**: Store in compressed files (Parquet/CSV) instead of database for cost efficiency
**MVP Approach**: Start with OHLCV data only, add raw data storage later if advanced backtesting requires it.
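A hedged query sketch against the `market_data` table above; psycopg2 and the connection string are assumptions:

```python
import psycopg2

def fetch_recent_candles(symbol: str, timeframe: str, limit: int = 500):
    """Fetch the most recent OHLCV candles for a bot or a chart."""
    conn = psycopg2.connect('dbname=trading')  # connection details assumed
    with conn, conn.cursor() as cur:
        cur.execute(
            """
            SELECT timestamp, open, high, low, close, volume
            FROM market_data
            WHERE symbol = %s AND timeframe = %s
            ORDER BY timestamp DESC
            LIMIT %s
            """,
            (symbol, timeframe, limit),
        )
        return cur.fetchall()
```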
### Technology Stack
The platform will be built using the following technologies:
- **Backend Framework**: Python 3.10+ with Dash (includes built-in Flask server for REST API endpoints)
- **Database**: PostgreSQL 14+ (with TimescaleDB extension for time-series optimization)
- **Real-time Messaging**: Redis (for pub/sub messaging between components)
- **Frontend**: Dash with Plotly (for visualization and control interface) and Mantine UI components
- **Configuration**: JSON files for strategy parameters and bot configurations
- **Deployment**: Docker container setup for development and production
### API Design
**Dash Callbacks**: Real-time updates and user interactions
**REST Endpoints**: Historical data queries for backtesting and analysis
```python
# Built-in Flask routes for historical data; the query_* helpers are assumed
@app.server.route('/api/bot/<bot_id>/trades')
def bot_trades(bot_id): return query_trades(bot_id)
@app.server.route('/api/market/<symbol>/history')
def market_history(symbol): return query_candles(symbol)
@app.server.route('/api/backtest/results/<test_id>')
def backtest_results(test_id): return query_backtest(test_id)
```
### Data Flow
The data flow follows a simple pattern to ensure efficient processing:
1. **Market Data Collection**:
- Collector fetches data from exchange APIs
- Raw data is stored in PostgreSQL
- Processed data (e.g., OHLCV candles) are calculated and stored
- Real-time updates are published to Redis channels
2. **Signal Generation**:
- Bots subscribe to relevant data channels and generate signals based on the strategy
- Signals are stored in database and published to Redis
3. **Trade Execution**:
- Bot manager receives signals from strategies
- Validates signals against bot parameters and portfolio
- Simulates or executes trades based on configuration
- Stores trade information in database
4. **Visualization**:
- Dashboard subscribes to real-time data and trading updates
- Queries historical data for charts and performance metrics
- Provides interface for bot management and configuration
## Development Roadmap
### Phase 1: Foundation (Days 1-5)
**Objective**: Establish core system components and data flow
1. **Day 1-2**: Database Setup and Data Collection
- Set up PostgreSQL with initial schema
- Implement OKX API connector
- Create data storage and processing logic
2. **Day 3-4**: Strategy Engine and Bot Manager
- Develop strategy interface and 1-2 example strategies
- Create bot manager with basic controls
- Implement Redis for real-time messaging
3. **Day 5**: Basic Visualization
- Set up Dash/Plotly for simple charts
- Create basic dashboard layout
- Connect to real-time data sources
- Create mockup strategies and bots
### Phase 2: Core Functionality (Days 6-10)
**Objective**: Complete essential features for strategy testing
1. **Day 6-7**: Backtesting Engine
- Get historical data from the database or file (BTC/USDT history is already available in CSV format)
- Create performance calculation metrics
- Develop strategy comparison tools
2. **Day 8-9**: Trading Logic
- Implement virtual trading capability
- Create trade execution logic
- Develop portfolio tracking
3. **Day 10**: Dashboard Enhancement
- Improve visualization components
- Add bot control interface
- Implement real-time performance monitoring
### Phase 3: Refinement (Days 11-14)
**Objective**: Polish system and prepare for ongoing use
1. **Day 11-12**: Testing and Debugging
- Comprehensive system testing
- Fix identified issues
- Performance optimization
2. **Day 13-14**: Documentation and Deployment
- Create user documentation
- Prepare deployment process
- Set up basic monitoring
## Technical Considerations
### Scalability Path
While the initial system is designed as a monolithic application for rapid development, several considerations ensure future scalability:
1. **Module Separation**: Clear boundaries between components enable future extraction into microservices
2. **Database Design**: Schema supports partitioning and sharding for larger data volumes
3. **Message Queue**: Redis implementation paves way for more robust messaging (Kafka/RabbitMQ)
4. **API-First Design**: Internal components communicate through well-defined interfaces
### Time Aggregation
Special attention is given to time aggregation to ensure consistency with exchanges:
```python
import pandas as pd

def aggregate_candles(trades, timeframe, alignment='right'):
    """
    Aggregate trade data into OHLCV candles with consistent timestamp alignment.
    Parameters:
    - trades: List of trade dictionaries with timestamp, price and amount
    - timeframe: String representing the timeframe (e.g., '1m', '5m', '1h')
    - alignment: String indicating timestamp alignment ('right' or 'left')
    Returns:
    - DataFrame with OHLCV data
    """
    # Convert timeframe to pandas offset
    if timeframe.endswith('m'):
        offset = pd.Timedelta(minutes=int(timeframe[:-1]))
    elif timeframe.endswith('h'):
        offset = pd.Timedelta(hours=int(timeframe[:-1]))
    elif timeframe.endswith('d'):
        offset = pd.Timedelta(days=int(timeframe[:-1]))
    else:
        raise ValueError(f"Unsupported timeframe: {timeframe}")
    # Create DataFrame from trades
    df = pd.DataFrame(trades)
    # Convert timestamps to pandas datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
    # Right-aligned candles carry the close time of the interval (exchange standard);
    # left-aligned candles carry the open time
    if alignment == 'right':
        df['candle_time'] = df['timestamp'].dt.ceil(offset)
    else:
        df['candle_time'] = df['timestamp'].dt.floor(offset)
    # Aggregate to OHLCV
    candles = df.groupby('candle_time').agg({
        'price': ['first', 'max', 'min', 'last'],
        'amount': 'sum'
    }).reset_index()
    # Rename columns
    candles.columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
    return candles
```
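A quick usage sketch for the helper above; the epoch-millisecond timestamps and amounts are arbitrary:

```python
trades = [
    {'timestamp': 1718000000000, 'price': 67000.0, 'amount': 0.5},
    {'timestamp': 1718000030000, 'price': 67010.0, 'amount': 0.2},
]
candles = aggregate_candles(trades, '1m')
print(candles[['timestamp', 'open', 'close', 'volume']])  # one 1-minute candle
```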
### Performance Optimization
For the initial release, several performance optimizations are implemented:
1. **Database Indexing**: Proper indexes on timestamp and symbol fields
2. **Query Optimization**: Prepared statements and efficient query patterns
3. **Connection Pooling**: Database connection management to prevent leaks
4. **Data Aggregation**: Pre-calculation of common time intervals
5. **Memory Management**: Proper cleanup of data objects after processing
## User Interface
The initial user interface focuses on functionality over aesthetics, providing essential controls and visualizations in a minimalistic design.
1. **Market Data View**
- Real-time price charts for monitored symbols
- Order book visualization
- Recent trades list
2. **Bot Management**
- Create/configure bot interface
- Start/stop controls
- Status indicators
3. **Strategy Dashboard**
- Strategy selection and configuration
- Signal visualization
- Performance metrics
4. **Backtesting Interface**
- Historical data selection
- Strategy parameter configuration
- Results visualization
## Risk Management & Mitigation
### Technical Risks
**Risk:** Exchange API rate limiting affecting data collection
**Mitigation:** Implement intelligent rate limiting, multiple API keys, and fallback data sources
**Risk:** Database performance degradation with large datasets
**Mitigation:** Implement data partitioning, archival strategies, and query optimization (in future)
**Risk:** System downtime during market volatility
**Mitigation:** Design redundant systems, implement circuit breakers, and emergency procedures (in future)
### Business Risks
**Risk:** Regulatory changes affecting crypto trading
**Mitigation:** Implement compliance monitoring, maintain regulatory awareness, design for adaptability
**Risk:** Competition from established trading platforms
**Mitigation:** Focus on unique value propositions, rapid feature development, strong user experience
### User Risks
**Risk:** User losses due to platform errors
**Mitigation:** Comprehensive testing, simulation modes, risk warnings, and liability disclaimers
## Future Expansion
While keeping the initial implementation simple, the design accommodates future enhancements:
1. **Authentication System**: Add multi-user support with role-based access
2. **Advanced Strategies**: Support for machine learning and AI-based strategies
3. **Multi-Exchange Support**: Expand beyond OKX to other exchanges
4. **Microservices Migration**: Extract components into separate services
5. **Advanced Monitoring**: Integration with Prometheus/Grafana
6. **Cloud Deployment**: Support for AWS/GCP/Azure deployment
## Success Metrics
The platform's success will be measured by these key metrics:
1. **Development Timeline**: Complete core functionality within 14 days
2. **System Stability**: Maintain 99% uptime during internal testing; the system should monitor itself and restart as needed (the whole application or individual modules)
3. **Strategy Testing**: Successfully backtest at least 3 different strategies
4. **Bot Performance**: Run at least 2 bots concurrently for 72+ hours

View File

@ -1,165 +0,0 @@
## Architecture Components
### 1. Data Collector
**Responsibility**: Unified data collection from multiple exchanges
```python
class DataCollector:
    def __init__(self):
        self.providers = {}  # Registry of data providers

    def register_provider(self, name: str, provider: DataProvider):
        """Register a new data provider"""

    def start_collection(self, symbols: List[str]):
        """Start collecting data for specified symbols"""

    def process_raw_data(self, raw_data: dict):
        """Process raw data into OHLCV format"""

    def send_signal_to_bots(self, processed_data: dict):
        """Send Redis signal to active bots"""
```
### 2. Strategy Engine
**Responsibility**: Unified interface for all trading strategies
```python
class BaseStrategy:
    def __init__(self, parameters: dict):
        self.parameters = parameters

    def process_data(self, data: pd.DataFrame) -> Signal:
        """Process market data and generate signals"""
        raise NotImplementedError

    def get_indicators(self) -> dict:
        """Return calculated indicators for plotting"""
        return {}
```
### 3. Bot Manager
**Responsibility**: Orchestrate bot execution and state management
```python
class BotManager:
    def __init__(self):
        self.active_bots = {}

    def start_bot(self, bot_id: int):
        """Start a bot instance"""

    def stop_bot(self, bot_id: int):
        """Stop a bot instance"""

    def process_signal(self, bot_id: int, signal: Signal):
        """Process signal and make trading decision"""

    def update_bot_state(self, bot_id: int, state: dict):
        """Update bot state in database"""
```
## Communication Architecture
### Redis Pub/Sub Patterns
```python
# Real-time market data
MARKET_DATA_CHANNEL = "market_data:{symbol}"
# Bot-specific signals
BOT_SIGNAL_CHANNEL = "bot_signals:{bot_id}"
# Trade updates
TRADE_UPDATE_CHANNEL = "trade_updates:{bot_id}"
# System events
SYSTEM_EVENT_CHANNEL = "system_events"
```
### WebSocket Communication
```python
# Frontend real-time updates
WS_BOT_STATUS = "/ws/bot/{bot_id}/status"
WS_MARKET_DATA = "/ws/market/{symbol}"
WS_PORTFOLIO = "/ws/portfolio/{bot_id}"
```
## Time Aggregation Strategy
### Candlestick Alignment
- **Use RIGHT-ALIGNED timestamps** (industry standard)
- 5-minute candle with timestamp 09:05:00 represents data from 09:00:01 to 09:05:00
- Timestamp = close time of the candle
- Aligns with major exchanges (Binance, OKX, Coinbase)
### Aggregation Logic
```python
from typing import Iterator, List

def aggregate_to_timeframe(ticks: List[dict], timeframe: str) -> Iterator[dict]:
    """
    Aggregate tick data to the specified timeframe.
    timeframe: '1m', '5m', '15m', '1h', '4h', '1d'
    Yields one candle dict per interval (right-aligned timestamps).
    """
    # Convert timeframe to seconds (parse_timeframe is an assumed helper)
    interval_seconds = parse_timeframe(timeframe)
    # Group ticks by time intervals, right-aligned (group_by_interval assumed)
    for group in group_by_interval(ticks, interval_seconds):
        yield {
            'timestamp': group.end_time,  # Right-aligned: close time of the candle
            'open': group.first_price,
            'high': group.max_price,
            'low': group.min_price,
            'close': group.last_price,
            'volume': group.total_volume,
        }
```
## Backtesting Optimization
### Parallel Processing Strategy
```python
from joblib import Parallel, delayed
import numba
import numpy as np

@numba.jit(nopython=True)
def calculate_signals_vectorized(prices, threshold):
    """Vectorized signal calculation using Numba (illustrative momentum rule)"""
    signals = np.zeros(len(prices))
    for i in range(1, len(prices)):
        change = (prices[i] - prices[i - 1]) / prices[i - 1]
        if change > threshold:
            signals[i] = 1.0   # buy
        elif change < -threshold:
            signals[i] = -1.0  # sell
    return signals

def backtest_strategy_batch(data_batch, strategy_params):
    """Backtest a batch of price data in parallel"""
    # Process batch of signals (data_batch assumed to be a NumPy price array)
    signals = calculate_signals_vectorized(data_batch, strategy_params['threshold'])
    # Simulate trades incrementally (simulate_trades is an assumed helper)
    portfolio = simulate_trades(signals, data_batch)
    return portfolio

# Parallel backtesting (split_data_into_batches / combine_results are assumed helpers)
def run_parallel_backtest(data, strategy_params, n_jobs=4):
    data_batches = split_data_into_batches(data, n_jobs)
    results = Parallel(n_jobs=n_jobs)(
        delayed(backtest_strategy_batch)(batch, strategy_params)
        for batch in data_batches
    )
    return combine_results(results)
```
### Optimization Techniques
1. **Vectorized Operations**: Use NumPy/Pandas for bulk calculations
2. **Numba JIT**: Compile critical loops for C-like performance
3. **Batch Processing**: Process signals in batches, simulate trades incrementally
4. **Memory Management**: Use efficient data structures (arrays vs lists)
5. **Parallel Execution**: Utilize multiple CPU cores for independent calculations
## Key Design Principles
1. **Data Separation**: Raw and processed data stored separately for audit trail
2. **Signal Tracking**: All signals recorded (executed or not) for analysis
3. **Real-time State**: Bot states updated in real-time for monitoring
4. **Audit Trail**: Complete record of all trading activities
5. **Scalability**: Architecture supports multiple bots and strategies
6. **Modularity**: Clear separation between data collection, strategy execution, and trading
7. **Fault Tolerance**: Redis for reliable message delivery, database transactions for consistency

View File

@ -0,0 +1,36 @@
# ADR-002: BaseDataCollector Refactoring and Component Extraction
## Status
Accepted
## Context
The `BaseDataCollector` class was initially monolithic, handling connection management, state and telemetry, and callback dispatching directly. This led to a less modular, harder-to-test, and less maintainable codebase. Additionally, `OHLCVData` and its associated validation, although broadly applicable, were tightly coupled within the `data` module, leading to potential import complexities and naming conflicts.
## Decision
To improve modularity, maintainability, testability, and reusability, we decided to refactor `BaseDataCollector` by extracting its core responsibilities into dedicated, smaller, and focused components. We also decided to relocate `OHLCVData` to a more common and accessible location.
### Extracted Components:
1. **`CollectorStateAndTelemetry`**: Responsible for managing collector status, health, statistics, and logging.
2. **`ConnectionManager`**: Responsible for handling WebSocket connection lifecycle (connect, disconnect, reconnect) and related error management.
3. **`CallbackDispatcher`**: Responsible for managing and dispatching data callbacks to registered listeners.
### OHLCVData Relocation:
- The `OHLCVData` class and the `validate_ohlcv_data` function, along with the `DataValidationError` exception, were moved from `data/base_collector.py` to `data/common/ohlcv_data.py`.
## Consequences
**Positive:**
- **Improved Modularity**: `BaseDataCollector` is now leaner and focuses solely on orchestrating the new components.
- **Enhanced Testability**: Each extracted component can be unit-tested in isolation, reducing test complexity and improving test coverage.
- **Increased Maintainability**: Changes to connection logic, state management, or callback handling are isolated to their respective components, minimizing impact on other parts of the system.
- **Greater Reusability**: `CollectorStateAndTelemetry`, `ConnectionManager`, and `CallbackDispatcher` can potentially be reused in other contexts or for different types of collectors.
- **Clearer Separation of Concerns**: Each component has a single, well-defined responsibility.
- **Centralized `OHLCVData`**: Moving `OHLCVData` to `data/common` provides a more intuitive and accessible location for a common data structure, resolving potential import conflicts and improving code organization.
**Negative:**
- **Increased File Count**: More files are introduced, potentially increasing initial navigation overhead (mitigated by clear naming and directory structure).
- **Refactoring Overhead**: Required updating existing code to use the new components and adjusting imports across multiple files.
## Alternatives Considered
- **Keeping Monolithic `BaseDataCollector`**: Rejected due to the drawbacks of tightly coupled code (poor testability, maintainability).
- **Partial Extraction**: Considered extracting only one or two components, but decided against it to achieve maximum modularity benefits.
- **Different `OHLCVData` Location**: Considered `utils/data_types.py` or `data/models.py`, but `data/common/ohlcv_data.py` was deemed most appropriate given its nature as a common data structure within the `data` module.

View File

@ -62,15 +62,45 @@ For exchange-specific documentation, see [Exchange Implementations (`./exchanges
### 1. `BaseDataCollector`
An abstract base class that defines the common interface for all exchange collectors.
An abstract base class that defines the common interface for all exchange collectors. It now orchestrates specialized components for connection management, state and telemetry, and callback dispatching.
**Key Responsibilities:**
- Standardized `start`, `stop`, `restart` methods
- Built-in health monitoring with heartbeat and data silence detection
- Automatic reconnect and restart logic
- Asynchronous message handling
- Standardized `start`, `stop`, `restart` methods.
- Orchestrates connection handling via `ConnectionManager`.
- Delegates state, health, and statistics management to `CollectorStateAndTelemetry`.
- Utilizes `CallbackDispatcher` for managing and notifying data subscribers.
- Defines abstract methods for exchange-specific implementations (e.g., `_actual_connect`, `_actual_disconnect`, `_subscribe_channels`, `_process_message`).
### 2. `CollectorManager`
### 2. `CollectorStateAndTelemetry`
Manages the operational state, health, and performance statistics of a data collector.
**Key Responsibilities:**
- Tracks `CollectorStatus` (e.g., `RUNNING`, `STOPPED`, `ERROR`).
- Monitors health metrics like heartbeat and data silence.
- Collects and provides operational statistics (e.g., messages processed, errors).
- Provides centralized logging functionality for the collector.
### 3. `ConnectionManager`
Handles the WebSocket connection lifecycle and resilience for a data collector.
**Key Responsibilities:**
- Establishes and terminates WebSocket connections.
- Manages automatic reconnection attempts with exponential backoff.
- Handles connection-related errors and ensures robust connectivity.
- Tracks WebSocket connection state and statistics.
### 4. `CallbackDispatcher`
Manages and dispatches real-time data to registered callbacks.
**Key Responsibilities:**
- Registers and unregisters data callbacks for different `DataType`s.
- Notifies all subscribed listeners when new data points are received.
- Ensures efficient and reliable distribution of processed market data.
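A hypothetical wiring sketch; the exact registration method exposed by `BaseDataCollector`/`CallbackDispatcher` is assumed here:

```python
def on_trade(point: MarketDataPoint) -> None:
    print(point.symbol)  # react to each processed data point

collector.register_callback(DataType.TRADE, on_trade)  # method name assumed
```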
### 5. `CollectorManager`
A singleton class that manages all active data collectors in the system.
@ -80,7 +110,7 @@ A singleton class that manages all active data collectors in the system.
- Global health monitoring
- Coordination of restart policies
### 3. Exchange-Specific Collectors
### 6. Exchange-Specific Collectors
Concrete implementations of `BaseDataCollector` for each exchange (e.g., `OKXCollector`).

View File

@ -6,8 +6,8 @@
1. **Base Collector**
- Inherit from `BaseDataCollector`
- Implement required abstract methods
- Handle connection lifecycle
- Implement exchange-specific abstract methods (e.g., `_actual_connect`, `_actual_disconnect`, `_subscribe_channels`, `_process_message`)
- Leverage `ConnectionManager`, `CollectorStateAndTelemetry`, and `CallbackDispatcher` through the inherited `BaseDataCollector` functionalities
2. **WebSocket Client**
- Implement exchange-specific WebSocket handling

View File

@ -897,13 +897,13 @@ The OKX collector consists of three main components working together:
### `OKXCollector`
- **Main class**: `OKXCollector(BaseDataCollector)`
- **Responsibilities**:
- Manages WebSocket connection state
- Subscribes to required data channels
- Dispatches raw messages to the data processor
- Stores standardized data in the database
- Provides health and status monitoring
- **Main class**: `OKXCollector(BaseDataCollector)`
- **Responsibilities**:
- Implements exchange-specific connection and subscription logic (delegating to `ConnectionManager` for core connection handling).
- Processes and standardizes raw OKX WebSocket messages (delegating to `OKXDataProcessor`).
- Interacts with `CollectorStateAndTelemetry` for status, health, and logging.
- Uses `CallbackDispatcher` to notify subscribers of processed data.
- Stores standardized data in the database.
### `OKXWebSocketClient`
@ -915,12 +915,12 @@ The OKX collector consists of three main components working together:
### `OKXDataProcessor`
- **New in v2.0**: `OKXDataProcessor`
- **Responsibilities**:
- Validates incoming raw data from WebSocket
- Transforms data into standardized `StandardizedTrade` and `OHLCVCandle` formats
- Aggregates trades into OHLCV candles
- Invokes callbacks for processed trades and completed candles
- **New in v2.0**: `OKXDataProcessor`
- **Responsibilities**:
- Validates incoming raw data from WebSocket.
- Transforms data into standardized `MarketDataPoint` and `OHLCVData` formats (using the moved `OHLCVData`).
- Aggregates trades into OHLCV candles.
- Invokes callbacks for processed trades and completed candles.
## Configuration
@ -932,12 +932,12 @@ Configuration options for the `OKXCollector` class:
|-------------------------|---------------------|---------------------------------------|-----------------------------------------------------------------------------|
| `symbol` | `str` | - | Trading symbol (e.g., `BTC-USDT`) |
| `data_types` | `List[DataType]` | `[TRADE, ORDERBOOK]` | List of data types to collect |
| `auto_restart` | `bool` | `True` | Automatically restart on failures |
| `health_check_interval` | `float` | `30.0` | Seconds between health checks |
| `auto_restart` | `bool` | `True` | Automatically restart on failures (managed by `BaseDataCollector` via `ConnectionManager`) |
| `health_check_interval` | `float` | `30.0` | Seconds between health checks (managed by `BaseDataCollector` via `CollectorStateAndTelemetry`) |
| `store_raw_data` | `bool` | `True` | Store raw WebSocket data for debugging |
| `force_update_candles` | `bool` | `False` | If `True`, update existing candles; if `False`, keep existing ones unchanged |
| `logger` | `Logger` | `None` | Logger instance for conditional logging |
| `log_errors_only` | `bool` | `False` | If `True` and logger provided, only log error-level messages |
| `logger` | `Logger` | `None` | Logger instance for conditional logging (managed by `BaseDataCollector` via `CollectorStateAndTelemetry`) |
| `log_errors_only` | `bool` | `False` | If `True` and logger provided, only log error-level messages (managed by `BaseDataCollector` via `CollectorStateAndTelemetry`) |
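An illustrative instantiation using the options above; the keyword names come from the table, while the full constructor signature is assumed:

```python
import logging

collector = OKXCollector(
    symbol='BTC-USDT',
    data_types=[DataType.TRADE, DataType.ORDERBOOK],
    auto_restart=True,
    health_check_interval=30.0,
    store_raw_data=True,
    force_update_candles=False,
    logger=logging.getLogger('okx'),
    log_errors_only=False,
)
```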
### Health & Status Monitoring
@ -962,4 +962,4 @@ Example output:
}
```
## Database Integration
## Database Integration

View File

@ -26,10 +26,13 @@ This architecture allows for high scalability and fault tolerance.
- **Location**: `data/exchanges/okx/collector.py`
- **Responsibilities**:
- Connects to the OKX WebSocket API
- Subscribes to real-time data channels
- Processes and standardizes incoming data
- Stores data in the database
- Inherits from `BaseDataCollector` and implements exchange-specific data collection logic.
- Utilizes `ConnectionManager` for robust WebSocket connection management.
- Leverages `CollectorStateAndTelemetry` for internal status, health, and logging.
- Uses `CallbackDispatcher` to notify registered consumers of processed data.
- Subscribes to real-time data channels specific to OKX.
- Processes and standardizes incoming OKX data before dispatching.
- Stores processed data in the database.
## Configuration

View File

@ -18,7 +18,7 @@
## Tasks
- [ ] 0.0 Create `data/collector` directory
- [x] 0.0 Create `data/collector` directory
- [x] 1.0 Extract `CollectorStateAndTelemetry` Class
- [x] 1.1 Create `data/collector/collector_state_telemetry.py`.
- [x] 1.2 Move `CollectorStatus` enum to `data/collector/collector_state_telemetry.py`.
@ -44,23 +44,23 @@
- [x] 3.5 Add necessary imports to both `data/base_collector.py` and `data/collector/collector_callback_dispatcher.py`.
- [x] 3.6 Create `tests/data/collector/test_collector_callback_dispatcher.py` and add initial tests for the new class.
- [ ] 4.0 Refactor `BaseDataCollector` to use new components
- [ ] 4.1 Update `BaseDataCollector.__init__` to instantiate and use `CollectorStateAndTelemetry`, `ConnectionManager`, and `CallbackDispatcher` instances.
- [ ] 4.2 Replace direct access to moved attributes/methods with calls to the new component instances (e.g., `self.logger.info` becomes `self._state_telemetry.log_info`).
- [ ] 4.3 Modify `start`, `stop`, `restart`, `_message_loop`, `_health_monitor` to interact with the new components, delegating responsibilities appropriately.
- [ ] 4.4 Update `get_status` and `get_health_status` in `BaseDataCollector` to delegate to `CollectorStateAndTelemetry`.
- [ ] 4.5 Review and update abstract methods and their calls as needed, ensuring they interact correctly with the new components.
- [ ] 4.6 Ensure all existing tests for `BaseDataCollector` still pass after refactoring.
- [ ] 4.7 Update `data/exchanges/okx/collector.py` to use the new `CollectorStateAndTelemetry` and `ConnectionManager` classes for logging, status updates, and connection handling.
- [ ] 4.8 Update `data/collector_manager.py` to interact with the new `CollectorStateAndTelemetry` class for health checks and status retrieval from `BaseDataCollector` instances.
- [x] 4.0 Refactor `BaseDataCollector` to use new components
- [x] 4.1 Update `BaseDataCollector.__init__` to instantiate and use `CollectorStateAndTelemetry`, `ConnectionManager`, and `CallbackDispatcher` instances.
- [x] 4.2 Replace direct access to moved attributes/methods with calls to the new component instances (e.g., `self.logger.info` becomes `self._state_telemetry.log_info`).
- [x] 4.3 Modify `start`, `stop`, `restart`, `_message_loop`, `_health_monitor` to interact with the new components, delegating responsibilities appropriately.
- [x] 4.4 Update `get_status` and `get_health_status` in `BaseDataCollector` to delegate to `CollectorStateAndTelemetry`.
- [x] 4.5 Review and update abstract methods and their calls as needed, ensuring they interact correctly with the new components.
- [x] 4.6 Ensure all existing tests for `BaseDataCollector` still pass after refactoring.
- [x] 4.7 Update `data/exchanges/okx/collector.py` to use the new `CollectorStateAndTelemetry` and `ConnectionManager` classes for logging, status updates, and connection handling.
- [x] 4.8 Update `data/collector_manager.py` to interact with the new `CollectorStateAndTelemetry` class for health checks and status retrieval from `BaseDataCollector` instances.
- [ ] 5.0 Review and potentially extract `OHLCVData` and related validation
- [ ] 5.1 Analyze if `OHLCVData` and `validate_ohlcv_data` are frequently used outside of `data/base_collector.py`.
- [ ] 5.2 If analysis indicates external usage or clear separation benefits, move `OHLCVData` class and `DataValidationError` to a new `data/ohlcv_data.py` file.
- [ ] 5.3 Update imports in `data/base_collector.py` and any other affected files.
- [ ] 5.4 If `OHLCVData` is extracted, create `tests/data/test_ohlcv_data.py` with tests for its structure and validation logic.
- [x] 5.0 Review and potentially extract `OHLCVData` and related validation
- [x] 5.1 Analyze if `OHLCVData` and `validate_ohlcv_data` are frequently used outside of `data/base_collector.py`.
- [x] 5.2 If analysis indicates external usage or clear separation benefits, move `OHLCVData` class and `DataValidationError` to a new `data/ohlcv_data.py` file.
- [x] 5.3 Update imports in `data/base_collector.py` and any other affected files.
- [x] 5.4 If `OHLCVData` is extracted, create `tests/data/test_ohlcv_data.py` with tests for its structure and validation logic.
- [ ] 6.0 Update Module Imports
- [ ] 6.1 Update imports in `data/__init__.py` to reflect the new locations of `CollectorStatus`, `DataCollectorError`, `DataValidationError`, `DataType`, `MarketDataPoint`, and `OHLCVData` (if moved).
- [ ] 6.2 Update imports in `data/common/data_types.py` for `DataType` and `MarketDataPoint`.
- [ ] 6.3 Review and update imports in all test files (`tests/test_refactored_okx.py`, `tests/test_real_storage.py`, `tests/test_okx_collector.py`, `tests/test_exchange_factory.py`, `tests/test_data_collection_aggregation.py`, `tests/test_collector_manager.py`, `tests/test_base_collector.py`, `tests/database/test_database_operations.py`) and scripts (`scripts/production_clean.py`) that import directly from `data.base_collector`.
- [x] 6.0 Update Module Imports
- [x] 6.1 Update imports in `data/__init__.py` to reflect the new locations of `CollectorStatus`, `DataCollectorError`, `DataValidationError`, `DataType`, `MarketDataPoint`, and `OHLCVData` (if moved).
- [x] 6.2 Update imports in `data/common/data_types.py` for `DataType` and `MarketDataPoint`.
- [x] 6.3 Review and update imports in all test files (`tests/test_refactored_okx.py`, `tests/test_real_storage.py`, `tests/test_okx_collector.py`, `tests/test_exchange_factory.py`, `tests/test_data_collection_aggregation.py`, `tests/test_collector_manager.py`, `tests/test_base_collector.py`, `tests/database/test_database_operations.py`) and scripts (`scripts/production_clean.py`) that import directly from `data.base_collector`.
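
The 4.x tasks above replace BaseDataCollector's inlined state, connection, and callback handling with three collaborators. A minimal sketch of that wiring, assuming the component constructors take the exchange name (their real signatures may differ):

# Sketch of the delegation wiring described in tasks 4.1-4.4; the component
# constructor signatures here are assumptions, not the project's actual API.
from data.collector.collector_state_telemetry import (
    CollectorStateAndTelemetry, CollectorStatus,
)
from data.collector.collector_connection_manager import ConnectionManager
from data.collector.collector_callback_dispatcher import CallbackDispatcher

class SketchCollector:
    def __init__(self, exchange_name: str):
        self.exchange_name = exchange_name
        # Status, logging, and message stats live in one component,
        # connection/retry state in a second, callbacks in a third.
        self._state_telemetry = CollectorStateAndTelemetry(exchange_name)
        self._connection_manager = ConnectionManager(exchange_name)
        self._callback_dispatcher = CallbackDispatcher()

    async def _actual_connect(self) -> bool:
        return True  # exchange-specific handshake goes here

    async def start(self) -> None:
        self._state_telemetry.log_info("starting")  # was: self.logger.info(...)
        if await self._connection_manager.connect(self._actual_connect):
            self._state_telemetry.update_status(CollectorStatus.RUNNING)
        else:
            self._state_telemetry.update_status(CollectorStatus.ERROR)

    def get_status(self) -> dict:
        return self._state_telemetry.get_status()  # task 4.4: pure delegation

Keeping the base class as a thin coordinator over these components is what lets the tests below assert directly against `_state_telemetry`, `_connection_manager`, and `_callback_dispatcher`.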

tests/test_base_collector.py View File

@ -23,16 +23,26 @@ class TestDataCollector(BaseDataCollector):
self.subscribed = False
self.messages = []
async def connect(self) -> bool:
async def _actual_connect(self) -> bool:
"""Implementation of actual connection logic for testing."""
await asyncio.sleep(0.01) # Simulate connection delay
self.connected = True
return True
async def disconnect(self) -> None:
async def _actual_disconnect(self) -> None:
"""Implementation of actual disconnection logic for testing."""
await asyncio.sleep(0.01) # Simulate disconnection delay
self.connected = False
self.subscribed = False
async def connect(self) -> bool:
"""Connect using the connection manager."""
return await self._connection_manager.connect(self._actual_connect)
async def disconnect(self) -> None:
"""Disconnect using the connection manager."""
await self._connection_manager.disconnect(self._actual_disconnect)
async def subscribe_to_data(self, symbols: list, data_types: list) -> bool:
if not self.connected:
return False
@ -44,7 +54,7 @@ class TestDataCollector(BaseDataCollector):
return True
async def _process_message(self, message) -> MarketDataPoint:
self._stats['messages_received'] += 1
self._state_telemetry.increment_messages_received()
return MarketDataPoint(
exchange=self.exchange_name,
symbol=message.get('symbol', 'BTC-USDT'),
@ -58,8 +68,7 @@ class TestDataCollector(BaseDataCollector):
if self.messages:
message = self.messages.pop(0)
data_point = await self._process_message(message)
self._stats['messages_processed'] += 1
self._stats['last_message_time'] = datetime.now(timezone.utc)
# Note: increment_messages_processed() is called in _notify_callbacks()
await self._notify_callbacks(data_point)
else:
await asyncio.sleep(0.1) # Wait for messages
@ -83,7 +92,7 @@ class TestBaseDataCollector:
assert collector.symbols == {"BTC-USDT", "ETH-USDT"}
assert collector.data_types == [DataType.TICKER]
assert collector.status == CollectorStatus.STOPPED
assert not collector._running
assert not collector._state_telemetry._running
@pytest.mark.asyncio
async def test_start_stop_cycle(self, collector):
@ -94,7 +103,7 @@ class TestBaseDataCollector:
assert collector.status == CollectorStatus.RUNNING
assert collector.connected
assert collector.subscribed
assert collector._running
assert collector._state_telemetry._running
# Wait a bit for the message loop to start
await asyncio.sleep(0.1)
@ -102,7 +111,7 @@ class TestBaseDataCollector:
# Test stop
await collector.stop()
assert collector.status == CollectorStatus.STOPPED
assert not collector._running
assert not collector._state_telemetry._running
assert not collector.connected
assert not collector.subscribed
@ -131,8 +140,8 @@ class TestBaseDataCollector:
assert len(received_data) == 1
assert received_data[0].symbol == "BTC-USDT"
assert received_data[0].data_type == DataType.TICKER
assert collector._stats['messages_received'] == 1
assert collector._stats['messages_processed'] == 1
assert collector._state_telemetry._stats['messages_received'] == 1
assert collector._state_telemetry._stats['messages_processed'] == 1
def test_symbol_management(self, collector):
"""Test adding and removing symbols."""
@ -160,12 +169,12 @@ class TestBaseDataCollector:
# Add callbacks
collector.add_data_callback(DataType.TICKER, callback1)
collector.add_data_callback(DataType.TICKER, callback2)
assert len(collector._data_callbacks[DataType.TICKER]) == 2
assert len(collector._callback_dispatcher._data_callbacks[DataType.TICKER]) == 2
# Remove callback
collector.remove_data_callback(DataType.TICKER, callback1)
assert len(collector._data_callbacks[DataType.TICKER]) == 1
assert callback2 in collector._data_callbacks[DataType.TICKER]
assert len(collector._callback_dispatcher._data_callbacks[DataType.TICKER]) == 1
assert callback2 in collector._callback_dispatcher._data_callbacks[DataType.TICKER]
def test_get_status(self, collector):
"""Test status reporting."""
@ -302,11 +311,11 @@ async def test_connection_error_handling():
self.connect_attempts = 0
self.should_fail = True
async def connect(self) -> bool:
async def _actual_connect(self) -> bool:
self.connect_attempts += 1
if self.should_fail and self.connect_attempts < 3:
return False # Fail first 2 attempts
return await super().connect()
return await super()._actual_connect()
collector = FailingCollector()
@ -316,8 +325,8 @@ async def test_connection_error_handling():
assert collector.status == CollectorStatus.ERROR
# Reset for retry and allow success
collector._reconnect_attempts = 0
collector.status = CollectorStatus.STOPPED
collector._connection_manager._reconnect_attempts = 0
collector._state_telemetry.update_status(CollectorStatus.STOPPED)
collector.connect_attempts = 0 # Reset connection attempts
collector.should_fail = False # Allow connection to succeed
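
The `connect`/`disconnect` split above is the template pattern the refactor introduces: subclasses implement `_actual_connect`, and `ConnectionManager` owns the retry bookkeeping. A rough sketch of the manager side, consistent with the `_reconnect_attempts` reset in the failure test; the `max_attempts` threshold and the `can_retry` helper are hypothetical:

# Rough sketch only: one connection attempt per connect() call, with the
# retry decision left to the collector's start/reconnect loop. The
# max_attempts parameter and can_retry() helper are assumptions.
from typing import Awaitable, Callable

class ConnectionManagerSketch:
    def __init__(self, max_attempts: int = 3):
        self._max_attempts = max_attempts
        self._reconnect_attempts = 0  # the failure test resets this to retry

    async def connect(self, actual_connect: Callable[[], Awaitable[bool]]) -> bool:
        # The collector passes its exchange-specific _actual_connect in,
        # so retry accounting stays out of every subclass.
        if await actual_connect():
            self._reconnect_attempts = 0
            return True
        self._reconnect_attempts += 1
        return False

    def can_retry(self) -> bool:  # hypothetical helper
        return self._reconnect_attempts < self._max_attempts

    async def disconnect(self, actual_disconnect: Callable[[], Awaitable[None]]) -> None:
        await actual_disconnect()
        self._reconnect_attempts = 0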

tests/test_collector_manager.py View File

@ -7,6 +7,7 @@ import pytest
from datetime import datetime, timezone
from unittest.mock import AsyncMock, MagicMock
from utils.logger import get_logger
from data.collector_manager import CollectorManager, ManagerStatus, CollectorConfig
from data.base_collector import BaseDataCollector, DataType, CollectorStatus
@ -22,7 +23,8 @@ class MockDataCollector(BaseDataCollector):
self.should_fail_subscribe = False
self.fail_count = 0
async def connect(self) -> bool:
async def _actual_connect(self) -> bool:
"""Implementation of actual connection logic for testing."""
if self.should_fail_connect and self.fail_count < 2:
self.fail_count += 1
return False
@ -30,10 +32,19 @@ class MockDataCollector(BaseDataCollector):
self.connected = True
return True
async def disconnect(self) -> None:
async def _actual_disconnect(self) -> None:
"""Implementation of actual disconnection logic for testing."""
await asyncio.sleep(0.01)
self.connected = False
self.subscribed = False
async def connect(self) -> bool:
"""Connect using the connection manager."""
return await self._connection_manager.connect(self._actual_connect)
async def disconnect(self) -> None:
"""Disconnect using the connection manager."""
await self._connection_manager.disconnect(self._actual_disconnect)
async def subscribe_to_data(self, symbols: list, data_types: list) -> bool:
if self.should_fail_subscribe:
@ -62,7 +73,8 @@ class TestCollectorManager:
@pytest.fixture
def manager(self):
"""Create a test manager instance."""
return CollectorManager("test_manager", global_health_check_interval=1.0)
test_logger = get_logger("test_manager_logger")
return CollectorManager("test_manager", global_health_check_interval=1.0, logger=test_logger)
@pytest.fixture
def mock_collector(self):
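
The fixture change above makes the logger an explicit dependency rather than something CollectorManager builds internally. A usage sketch (the 30-second interval and logger name are illustrative):

# Usage sketch for the injected logger seen in the fixture above.
from utils.logger import get_logger
from data.collector_manager import CollectorManager

manager = CollectorManager(
    "prod_manager",
    global_health_check_interval=30.0,  # seconds; the test fixture uses 1.0
    logger=get_logger("collector_manager"),
)

Injecting the logger lets tests substitute a quiet or mock logger without touching global logging configuration.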

tests/data/test_ohlcv_data.py View File

@ -0,0 +1,230 @@
"""
Unit tests for the OHLCVData module.
"""
import pytest
from datetime import datetime, timezone
from decimal import Decimal
from data.common.ohlcv_data import OHLCVData, DataValidationError, validate_ohlcv_data
class TestOHLCVData:
"""Test cases for OHLCVData validation."""
def test_valid_ohlcv_data(self):
"""Test creating valid OHLCV data."""
ohlcv = OHLCVData(
symbol="BTC-USDT",
timeframe="1m",
timestamp=datetime.now(timezone.utc),
open=Decimal("50000"),
high=Decimal("50100"),
low=Decimal("49900"),
close=Decimal("50050"),
volume=Decimal("1.5"),
trades_count=100
)
assert ohlcv.symbol == "BTC-USDT"
assert ohlcv.timeframe == "1m"
assert isinstance(ohlcv.open, Decimal)
assert ohlcv.trades_count == 100
def test_invalid_ohlcv_relationships(self):
"""Test OHLCV validation for invalid price relationships."""
with pytest.raises(DataValidationError):
OHLCVData(
symbol="BTC-USDT",
timeframe="1m",
timestamp=datetime.now(timezone.utc),
open=Decimal("50000"),
high=Decimal("49000"), # High is less than open
low=Decimal("49900"),
close=Decimal("50050"),
volume=Decimal("1.5")
)
def test_ohlcv_decimal_conversion(self):
"""Test automatic conversion to Decimal."""
ohlcv = OHLCVData(
symbol="BTC-USDT",
timeframe="1m",
timestamp=datetime.now(timezone.utc),
open=50000.0, # float
high=50100, # int
low=49900, # int
close=50050.0, # float
volume=1.5 # float
)
assert isinstance(ohlcv.open, Decimal)
assert isinstance(ohlcv.high, Decimal)
assert isinstance(ohlcv.low, Decimal)
assert isinstance(ohlcv.close, Decimal)
assert isinstance(ohlcv.volume, Decimal)
def test_timezone_handling(self):
"""Test that naive datetimes get UTC timezone."""
naive_timestamp = datetime(2023, 1, 1, 12, 0, 0)
ohlcv = OHLCVData(
symbol="BTC-USDT",
timeframe="1m",
timestamp=naive_timestamp,
open=50000,
high=50100,
low=49900,
close=50050,
volume=1.5
)
assert ohlcv.timestamp.tzinfo == timezone.utc
def test_invalid_price_types(self):
"""Test validation fails for invalid price types."""
with pytest.raises(DataValidationError, match="All OHLCV prices must be numeric"):
OHLCVData(
symbol="BTC-USDT",
timeframe="1m",
timestamp=datetime.now(timezone.utc),
open="invalid", # Invalid type
high=50100,
low=49900,
close=50050,
volume=1.5
)
def test_invalid_volume_type(self):
"""Test validation fails for invalid volume type."""
with pytest.raises(DataValidationError, match="Volume must be numeric"):
OHLCVData(
symbol="BTC-USDT",
timeframe="1m",
timestamp=datetime.now(timezone.utc),
open=50000,
high=50100,
low=49900,
close=50050,
volume="invalid" # Invalid type
)
class TestValidateOhlcvData:
"""Test cases for validate_ohlcv_data function."""
def test_validate_success(self):
"""Test successful OHLCV data validation."""
raw_data = {
"timestamp": 1609459200000, # Unix timestamp in ms
"open": "50000",
"high": "50100",
"low": "49900",
"close": "50050",
"volume": "1.5",
"trades_count": 100
}
ohlcv = validate_ohlcv_data(raw_data, "BTC-USDT", "1m")
assert ohlcv.symbol == "BTC-USDT"
assert ohlcv.timeframe == "1m"
assert ohlcv.trades_count == 100
assert isinstance(ohlcv.open, Decimal)
assert ohlcv.open == Decimal("50000")
def test_validate_missing_field(self):
"""Test validation with missing required field."""
raw_data = {
"timestamp": 1609459200000,
"open": "50000",
"high": "50100",
# Missing 'low' field
"close": "50050",
"volume": "1.5"
}
with pytest.raises(DataValidationError, match="Missing required field: low"):
validate_ohlcv_data(raw_data, "BTC-USDT", "1m")
def test_validate_invalid_timestamp_string(self):
"""Test validation with invalid timestamp string."""
raw_data = {
"timestamp": "invalid_timestamp",
"open": "50000",
"high": "50100",
"low": "49900",
"close": "50050",
"volume": "1.5"
}
with pytest.raises(DataValidationError):
validate_ohlcv_data(raw_data, "BTC-USDT", "1m")
def test_validate_timestamp_formats(self):
"""Test validation with different timestamp formats."""
base_data = {
"open": "50000",
"high": "50100",
"low": "49900",
"close": "50050",
"volume": "1.5"
}
# Unix timestamp in milliseconds
data1 = {**base_data, "timestamp": 1609459200000}
ohlcv1 = validate_ohlcv_data(data1, "BTC-USDT", "1m")
assert isinstance(ohlcv1.timestamp, datetime)
# Unix timestamp in seconds (float)
data2 = {**base_data, "timestamp": 1609459200.5}
ohlcv2 = validate_ohlcv_data(data2, "BTC-USDT", "1m")
assert isinstance(ohlcv2.timestamp, datetime)
# ISO format string
data3 = {**base_data, "timestamp": "2021-01-01T00:00:00Z"}
ohlcv3 = validate_ohlcv_data(data3, "BTC-USDT", "1m")
assert isinstance(ohlcv3.timestamp, datetime)
# Already a datetime object
data4 = {**base_data, "timestamp": datetime.now(timezone.utc)}
ohlcv4 = validate_ohlcv_data(data4, "BTC-USDT", "1m")
assert isinstance(ohlcv4.timestamp, datetime)
def test_validate_invalid_numeric_data(self):
"""Test validation with invalid numeric price data."""
raw_data = {
"timestamp": 1609459200000,
"open": "invalid_number",
"high": "50100",
"low": "49900",
"close": "50050",
"volume": "1.5"
}
with pytest.raises(DataValidationError, match="Invalid OHLCV data for BTC-USDT"):
validate_ohlcv_data(raw_data, "BTC-USDT", "1m")
def test_validate_with_optional_fields(self):
"""Test validation works correctly with optional fields."""
raw_data = {
"timestamp": 1609459200000,
"open": "50000",
"high": "50100",
"low": "49900",
"close": "50050",
"volume": "1.5"
# No trades_count
}
ohlcv = validate_ohlcv_data(raw_data, "BTC-USDT", "1m")
assert ohlcv.trades_count is None
# With trades_count
raw_data["trades_count"] = 250
ohlcv = validate_ohlcv_data(raw_data, "BTC-USDT", "1m")
assert ohlcv.trades_count == 250
if __name__ == "__main__":
pytest.main([__file__, "-v"])
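
This suite pins down the contract of `validate_ohlcv_data` fairly tightly: required-field checks, four accepted timestamp formats, Decimal coercion, optional `trades_count`, and symbol-tagged error messages. A plausible reconstruction under those constraints; the actual implementation in `data/common/ohlcv_data.py` may differ, and the milliseconds-versus-seconds heuristic in particular is an assumption:

# Plausible reconstruction of validate_ohlcv_data, inferred from the tests
# above; not the project's actual implementation.
from datetime import datetime, timezone
from decimal import Decimal
from typing import Any, Dict

from data.common.ohlcv_data import OHLCVData, DataValidationError

REQUIRED_FIELDS = ("timestamp", "open", "high", "low", "close", "volume")

def validate_ohlcv_data(raw: Dict[str, Any], symbol: str, timeframe: str) -> OHLCVData:
    # Required-field check drives the "Missing required field: low" message.
    for field in REQUIRED_FIELDS:
        if field not in raw:
            raise DataValidationError(f"Missing required field: {field}")

    ts = raw["timestamp"]
    if isinstance(ts, datetime):
        timestamp = ts
    elif isinstance(ts, (int, float)):
        # Heuristic (assumption): epoch values this large are milliseconds.
        seconds = ts / 1000 if ts > 1e12 else ts
        timestamp = datetime.fromtimestamp(seconds, tz=timezone.utc)
    else:
        try:
            timestamp = datetime.fromisoformat(str(ts).replace("Z", "+00:00"))
        except ValueError as exc:
            raise DataValidationError(f"Invalid timestamp: {ts}") from exc

    try:
        return OHLCVData(
            symbol=symbol,
            timeframe=timeframe,
            timestamp=timestamp,
            open=Decimal(str(raw["open"])),
            high=Decimal(str(raw["high"])),
            low=Decimal(str(raw["low"])),
            close=Decimal(str(raw["close"])),
            volume=Decimal(str(raw["volume"])),
            trades_count=raw.get("trades_count"),
        )
    except (DataValidationError, ArithmeticError) as exc:
        # decimal.InvalidOperation subclasses ArithmeticError, so bad numeric
        # strings land here and get wrapped with the symbol, matching
        # "Invalid OHLCV data for BTC-USDT" in the tests.
        raise DataValidationError(f"Invalid OHLCV data for {symbol}: {exc}") from exc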