Implement data collection architecture with modular components
- Introduced a comprehensive data collection framework, including `CollectorServiceConfig`, `BaseDataCollector`, and `CollectorManager`, enhancing modularity and maintainability.
- Developed `CollectorFactory` for streamlined collector creation, promoting separation of concerns and improved configuration handling.
- Enhanced `DataCollectionService` to utilize the new architecture, ensuring robust error handling and logging practices.
- Added `TaskManager` for efficient management of asynchronous tasks, improving performance and resource management.
- Implemented health monitoring and auto-recovery features in `CollectorManager`, ensuring reliable operation of data collectors.
- Updated imports across the codebase to reflect the new structure, ensuring consistent access to components.

These changes significantly improve the architecture and maintainability of the data collection service, aligning with project standards for modularity, performance, and error handling.
This commit is contained in:
@@ -5,14 +5,14 @@ This package contains modules for collecting market data from various exchanges,
|
||||
processing and validating the data, and storing it in the database.
|
||||
"""
|
||||
|
||||
from .base_collector import (
|
||||
from .collector.base_collector import (
|
||||
BaseDataCollector, DataCollectorError
|
||||
)
|
||||
from .collector.collector_state_telemetry import CollectorStatus
|
||||
from .common.ohlcv_data import OHLCVData, DataValidationError
|
||||
from .common.data_types import DataType, MarketDataPoint
|
||||
from .collector_manager import CollectorManager
|
||||
from .collector_types import ManagerStatus, CollectorConfig
|
||||
from .collector.collector_manager import CollectorManager
|
||||
from .collector.collector_types import ManagerStatus, CollectorConfig
|
||||
|
||||
__all__ = [
|
||||
'BaseDataCollector',
|
||||
|
||||
@@ -14,11 +14,11 @@ from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
from utils.logger import get_logger
|
||||
from .collector.collector_state_telemetry import CollectorStatus, CollectorStateAndTelemetry
|
||||
from .collector.collector_connection_manager import ConnectionManager
|
||||
from .collector.collector_callback_dispatcher import CallbackDispatcher
|
||||
from .common.data_types import DataType, MarketDataPoint
|
||||
from .common.ohlcv_data import OHLCVData, DataValidationError, validate_ohlcv_data
|
||||
from .collector_state_telemetry import CollectorStatus, CollectorStateAndTelemetry
|
||||
from .collector_connection_manager import ConnectionManager
|
||||
from .collector_callback_dispatcher import CallbackDispatcher
|
||||
from ..common.data_types import DataType, MarketDataPoint
|
||||
from ..common.ohlcv_data import OHLCVData, DataValidationError, validate_ohlcv_data
|
||||
|
||||
|
||||
class DataCollectorError(Exception):
|
||||
@@ -14,6 +14,7 @@ from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, Any
|
||||
import logging
|
||||
import json
|
||||
|
||||
# Add project root to path
|
||||
project_root = Path(__file__).parent.parent
|
||||
@@ -30,11 +31,12 @@ logging.getLogger('sqlalchemy.pool').setLevel(logging.WARNING)
|
||||
logging.getLogger('sqlalchemy.dialects').setLevel(logging.WARNING)
|
||||
logging.getLogger('sqlalchemy.orm').setLevel(logging.WARNING)
|
||||
|
||||
from data.collector_manager import CollectorManager
|
||||
from config.service_config import ServiceConfig
|
||||
from data.collector_factory import CollectorFactory
|
||||
from .collector_manager import CollectorManager
|
||||
from config.collector_service_config import CollectorServiceConfig
|
||||
from .collector_factory import CollectorFactory
|
||||
from database.connection import init_database
|
||||
from utils.logger import get_logger
|
||||
from utils.async_task_manager import TaskManager
|
||||
|
||||
|
||||
class DataCollectionService:
|
||||
@@ -46,11 +48,12 @@ class DataCollectionService:
|
||||
self.logger = get_logger("data_collection_service", log_level="INFO", verbose=False)
|
||||
|
||||
# Initialize configuration and factory
|
||||
self.service_config = ServiceConfig(config_path, logger=self.logger)
|
||||
self.service_config = CollectorServiceConfig(config_path, logger=self.logger)
|
||||
self.config = self.service_config.load_config()
|
||||
self.collector_factory = CollectorFactory(logger=self.logger)
|
||||
|
||||
# Core components
|
||||
self.task_manager = TaskManager("data_collection_service", logger=self.logger)
|
||||
self.collector_manager = CollectorManager(logger=self.logger, log_errors_only=True)
|
||||
self.collectors: List = []
|
||||
|
||||
@@ -230,6 +233,9 @@ class DataCollectionService:
|
||||
sanitized_message = self._sanitize_error(f"Unexpected error during service shutdown: {e}")
|
||||
self.logger.error(sanitized_message, exc_info=True)
|
||||
self.stats['errors_count'] += 1
|
||||
finally:
|
||||
# Always cleanup task manager
|
||||
await self.task_manager.shutdown(graceful=True)
|
||||
|
||||
def get_status(self) -> Dict[str, Any]:
|
||||
"""Get current service status."""
|
||||
@@ -6,8 +6,8 @@ and error handling, separating collector creation logic from the main service.
|
||||
"""
|
||||
|
||||
from typing import Dict, Any, List, Optional
|
||||
from data.exchanges.factory import ExchangeFactory, ExchangeCollectorConfig
|
||||
from data.base_collector import DataType
|
||||
from ..exchanges.factory import ExchangeFactory, ExchangeCollectorConfig
|
||||
from .base_collector import DataType
|
||||
|
||||
|
||||
class CollectorFactory:
|
||||
@@ -9,9 +9,10 @@ import asyncio
|
||||
from typing import Dict, List, Optional, Any, Set
|
||||
|
||||
from utils.logger import get_logger
|
||||
from utils.async_task_manager import TaskManager
|
||||
from .base_collector import BaseDataCollector, CollectorStatus
|
||||
from .collector_types import ManagerStatus, CollectorConfig
|
||||
from .manager_components import (
|
||||
from ..manager_components import (
|
||||
CollectorLifecycleManager,
|
||||
ManagerHealthMonitor,
|
||||
ManagerStatsTracker,
|
||||
@@ -42,6 +43,7 @@ class CollectorManager:
|
||||
|
||||
# Initialize components
|
||||
self.logger_manager = ManagerLogger(logger, log_errors_only)
|
||||
self.task_manager = TaskManager(f"{manager_name}_tasks", logger=logger)
|
||||
self.lifecycle_manager = CollectorLifecycleManager(self.logger_manager)
|
||||
self.health_monitor = ManagerHealthMonitor(
|
||||
global_health_check_interval, self.logger_manager, self.lifecycle_manager)
|
||||
@@ -51,7 +53,6 @@ class CollectorManager:
|
||||
# Manager state
|
||||
self.status = ManagerStatus.STOPPED
|
||||
self._running = False
|
||||
self._tasks: Set[asyncio.Task] = set()
|
||||
|
||||
if self.logger_manager.is_debug_enabled():
|
||||
self.logger_manager.log_info(f"Initialized collector manager: {manager_name}")
|
||||
@@ -106,11 +107,13 @@ class CollectorManager:
|
||||
await self.lifecycle_manager.start_all_enabled_collectors()
|
||||
await self.health_monitor.start_monitoring()
|
||||
|
||||
# Track health monitoring task
|
||||
# Track health monitoring task with task manager
|
||||
health_task = self.health_monitor.get_health_task()
|
||||
if health_task:
|
||||
self._tasks.add(health_task)
|
||||
health_task.add_done_callback(self._tasks.discard)
|
||||
# Transfer task to task manager for better tracking
|
||||
self.task_manager._tasks.add(health_task)
|
||||
self.task_manager._task_names[health_task] = "health_monitor"
|
||||
health_task.add_done_callback(self.task_manager._task_done_callback)
|
||||
|
||||
# Start statistics cache updates
|
||||
await self.stats_tracker.start_cache_updates()
|
||||
@@ -164,11 +167,8 @@ class CollectorManager:
|
||||
await self.health_monitor.stop_monitoring()
|
||||
await self.stats_tracker.stop_cache_updates()
|
||||
|
||||
# Cancel manager tasks
|
||||
for task in list(self._tasks):
|
||||
task.cancel()
|
||||
if self._tasks:
|
||||
await asyncio.gather(*self._tasks, return_exceptions=True)
|
||||
# Gracefully shutdown task manager
|
||||
await self.task_manager.shutdown(graceful=True)
|
||||
|
||||
# Stop all collectors
|
||||
await self.lifecycle_manager.stop_all_collectors()
|
||||
@@ -10,7 +10,8 @@ from typing import Dict, List, Optional, Any, Type, Tuple
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from utils.logger import get_logger
|
||||
from ..base_collector import BaseDataCollector, DataType
|
||||
from ..collector.base_collector import BaseDataCollector
|
||||
from ..common.data_types import DataType
|
||||
from ..common import CandleProcessingConfig
|
||||
from .registry import EXCHANGE_REGISTRY, get_supported_exchanges, get_exchange_info
|
||||
from .exceptions import (
|
||||
|
||||
@@ -11,7 +11,7 @@ from datetime import datetime, timezone
|
||||
from typing import Dict, List, Optional, Any
|
||||
from dataclasses import dataclass
|
||||
|
||||
from ...base_collector import (
|
||||
from ...collector.base_collector import (
|
||||
BaseDataCollector, DataType, CollectorStatus, MarketDataPoint,
|
||||
OHLCVData, DataValidationError, ConnectionError
|
||||
)
|
||||
|
||||
@@ -11,7 +11,7 @@ from decimal import Decimal
|
||||
from typing import Dict, List, Optional, Any, Union, Tuple
|
||||
from enum import Enum
|
||||
|
||||
from ...base_collector import DataType, MarketDataPoint
|
||||
from ...collector.base_collector import DataType, MarketDataPoint
|
||||
from ...common import (
|
||||
DataValidationResult,
|
||||
StandardizedTrade,
|
||||
|
||||
@@ -8,8 +8,8 @@ enabling, disabling, starting, and restarting collectors.
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Dict, Set, Optional
|
||||
from ..base_collector import BaseDataCollector, CollectorStatus
|
||||
from ..collector_types import CollectorConfig
|
||||
from ..collector.base_collector import BaseDataCollector, CollectorStatus
|
||||
from ..collector.collector_types import CollectorConfig
|
||||
|
||||
|
||||
class CollectorLifecycleManager:
|
||||
|
||||
@@ -8,7 +8,7 @@ auto-restart functionality, and health status tracking.
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
from typing import Set, Dict, Optional
|
||||
from ..base_collector import BaseDataCollector, CollectorStatus
|
||||
from ..collector.base_collector import BaseDataCollector, CollectorStatus
|
||||
|
||||
|
||||
class ManagerHealthMonitor:
|
||||
|
||||
@@ -8,7 +8,7 @@ to optimize performance by avoiding real-time calculations on every status reque
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, Any, Optional, List
|
||||
from ..base_collector import BaseDataCollector, CollectorStatus
|
||||
from ..collector.base_collector import BaseDataCollector, CollectorStatus
|
||||
|
||||
|
||||
class ManagerStatsTracker:
|
||||
@@ -48,6 +48,11 @@ class ManagerStatsTracker:
|
||||
self._cache_last_updated: Optional[datetime] = None
|
||||
self._cache_update_task: Optional[asyncio.Task] = None
|
||||
self._running = False
|
||||
|
||||
# Performance tracking for cache optimization
|
||||
self._cache_hit_count = 0
|
||||
self._cache_miss_count = 0
|
||||
self._last_performance_log = datetime.now(timezone.utc)
|
||||
|
||||
def set_running_state(self, running: bool) -> None:
|
||||
"""Set the running state of the tracker."""
|
||||
@@ -180,8 +185,13 @@ class ManagerStatsTracker:
|
||||
# Check if cache is recent enough (within 2x the update interval)
|
||||
cache_age = (datetime.now(timezone.utc) - self._cache_last_updated).total_seconds()
|
||||
if cache_age <= (self.cache_update_interval * 2):
|
||||
self._cache_hit_count += 1
|
||||
self._log_cache_performance_if_needed()
|
||||
return self._cached_status.copy()
|
||||
|
||||
# Cache miss - increment counter
|
||||
self._cache_miss_count += 1
|
||||
|
||||
# Calculate real-time status
|
||||
uptime_seconds = None
|
||||
if self._stats['uptime_start']:
|
||||
@@ -264,6 +274,9 @@ class ManagerStatsTracker:
|
||||
|
||||
def get_cache_info(self) -> Dict[str, Any]:
|
||||
"""Get information about the cache state."""
|
||||
total_requests = self._cache_hit_count + self._cache_miss_count
|
||||
hit_rate = (self._cache_hit_count / total_requests * 100) if total_requests > 0 else 0
|
||||
|
||||
return {
|
||||
'cache_enabled': True,
|
||||
'cache_update_interval': self.cache_update_interval,
|
||||
@@ -271,5 +284,27 @@ class ManagerStatsTracker:
|
||||
'cache_age_seconds': (
|
||||
(datetime.now(timezone.utc) - self._cache_last_updated).total_seconds()
|
||||
if self._cache_last_updated else None
|
||||
)
|
||||
}
|
||||
),
|
||||
'cache_hit_count': self._cache_hit_count,
|
||||
'cache_miss_count': self._cache_miss_count,
|
||||
'cache_hit_rate_percent': round(hit_rate, 2),
|
||||
'total_cache_requests': total_requests
|
||||
}
|
||||
|
||||
def _log_cache_performance_if_needed(self) -> None:
|
||||
"""Log cache performance metrics periodically."""
|
||||
current_time = datetime.now(timezone.utc)
|
||||
|
||||
# Log every 5 minutes
|
||||
if (current_time - self._last_performance_log).total_seconds() >= 300:
|
||||
total_requests = self._cache_hit_count + self._cache_miss_count
|
||||
if total_requests > 0:
|
||||
hit_rate = (self._cache_hit_count / total_requests * 100)
|
||||
|
||||
if self.logger_manager:
|
||||
self.logger_manager.log_debug(
|
||||
f"Cache performance: {hit_rate:.1f}% hit rate "
|
||||
f"({self._cache_hit_count} hits, {self._cache_miss_count} misses)"
|
||||
)
|
||||
|
||||
self._last_performance_log = current_time
|
||||
Reference in New Issue
Block a user