Implement data collection architecture with modular components

- Introduced a comprehensive data collection framework, including `CollectorServiceConfig`, `BaseDataCollector`, and `CollectorManager`, enhancing modularity and maintainability.
- Developed `CollectorFactory` for streamlined collector creation, promoting separation of concerns and improved configuration handling.
- Enhanced `DataCollectionService` to utilize the new architecture, ensuring robust error handling and logging practices.
- Added `TaskManager` for efficient management of asynchronous tasks, improving performance and resource management.
- Implemented health monitoring and auto-recovery features in `CollectorManager`, ensuring reliable operation of data collectors.
- Updated imports across the codebase to reflect the new structure, ensuring consistent access to components.

These changes significantly improve the architecture and maintainability of the data collection service, aligning with project standards for modularity, performance, and error handling.
This commit is contained in:
Vasily.onl
2025-06-10 13:40:28 +08:00
parent c28e4a9aaf
commit f6cb1485b1
18 changed files with 384 additions and 45 deletions

View File

@@ -5,14 +5,14 @@ This package contains modules for collecting market data from various exchanges,
processing and validating the data, and storing it in the database.
"""
from .base_collector import (
from .collector.base_collector import (
BaseDataCollector, DataCollectorError
)
from .collector.collector_state_telemetry import CollectorStatus
from .common.ohlcv_data import OHLCVData, DataValidationError
from .common.data_types import DataType, MarketDataPoint
from .collector_manager import CollectorManager
from .collector_types import ManagerStatus, CollectorConfig
from .collector.collector_manager import CollectorManager
from .collector.collector_types import ManagerStatus, CollectorConfig
__all__ = [
'BaseDataCollector',

View File

@@ -14,11 +14,11 @@ from dataclasses import dataclass
from enum import Enum
from utils.logger import get_logger
from .collector.collector_state_telemetry import CollectorStatus, CollectorStateAndTelemetry
from .collector.collector_connection_manager import ConnectionManager
from .collector.collector_callback_dispatcher import CallbackDispatcher
from .common.data_types import DataType, MarketDataPoint
from .common.ohlcv_data import OHLCVData, DataValidationError, validate_ohlcv_data
from .collector_state_telemetry import CollectorStatus, CollectorStateAndTelemetry
from .collector_connection_manager import ConnectionManager
from .collector_callback_dispatcher import CallbackDispatcher
from ..common.data_types import DataType, MarketDataPoint
from ..common.ohlcv_data import OHLCVData, DataValidationError, validate_ohlcv_data
class DataCollectorError(Exception):

View File

@@ -14,6 +14,7 @@ from datetime import datetime
from pathlib import Path
from typing import List, Optional, Dict, Any
import logging
import json
# Add project root to path
project_root = Path(__file__).parent.parent
@@ -30,11 +31,12 @@ logging.getLogger('sqlalchemy.pool').setLevel(logging.WARNING)
logging.getLogger('sqlalchemy.dialects').setLevel(logging.WARNING)
logging.getLogger('sqlalchemy.orm').setLevel(logging.WARNING)
from data.collector_manager import CollectorManager
from config.service_config import ServiceConfig
from data.collector_factory import CollectorFactory
from .collector_manager import CollectorManager
from config.collector_service_config import CollectorServiceConfig
from .collector_factory import CollectorFactory
from database.connection import init_database
from utils.logger import get_logger
from utils.async_task_manager import TaskManager
class DataCollectionService:
@@ -46,11 +48,12 @@ class DataCollectionService:
self.logger = get_logger("data_collection_service", log_level="INFO", verbose=False)
# Initialize configuration and factory
self.service_config = ServiceConfig(config_path, logger=self.logger)
self.service_config = CollectorServiceConfig(config_path, logger=self.logger)
self.config = self.service_config.load_config()
self.collector_factory = CollectorFactory(logger=self.logger)
# Core components
self.task_manager = TaskManager("data_collection_service", logger=self.logger)
self.collector_manager = CollectorManager(logger=self.logger, log_errors_only=True)
self.collectors: List = []
@@ -230,6 +233,9 @@ class DataCollectionService:
sanitized_message = self._sanitize_error(f"Unexpected error during service shutdown: {e}")
self.logger.error(sanitized_message, exc_info=True)
self.stats['errors_count'] += 1
finally:
# Always cleanup task manager
await self.task_manager.shutdown(graceful=True)
def get_status(self) -> Dict[str, Any]:
"""Get current service status."""

View File

@@ -6,8 +6,8 @@ and error handling, separating collector creation logic from the main service.
"""
from typing import Dict, Any, List, Optional
from data.exchanges.factory import ExchangeFactory, ExchangeCollectorConfig
from data.base_collector import DataType
from ..exchanges.factory import ExchangeFactory, ExchangeCollectorConfig
from .base_collector import DataType
class CollectorFactory:

View File

@@ -9,9 +9,10 @@ import asyncio
from typing import Dict, List, Optional, Any, Set
from utils.logger import get_logger
from utils.async_task_manager import TaskManager
from .base_collector import BaseDataCollector, CollectorStatus
from .collector_types import ManagerStatus, CollectorConfig
from .manager_components import (
from ..manager_components import (
CollectorLifecycleManager,
ManagerHealthMonitor,
ManagerStatsTracker,
@@ -42,6 +43,7 @@ class CollectorManager:
# Initialize components
self.logger_manager = ManagerLogger(logger, log_errors_only)
self.task_manager = TaskManager(f"{manager_name}_tasks", logger=logger)
self.lifecycle_manager = CollectorLifecycleManager(self.logger_manager)
self.health_monitor = ManagerHealthMonitor(
global_health_check_interval, self.logger_manager, self.lifecycle_manager)
@@ -51,7 +53,6 @@ class CollectorManager:
# Manager state
self.status = ManagerStatus.STOPPED
self._running = False
self._tasks: Set[asyncio.Task] = set()
if self.logger_manager.is_debug_enabled():
self.logger_manager.log_info(f"Initialized collector manager: {manager_name}")
@@ -106,11 +107,13 @@ class CollectorManager:
await self.lifecycle_manager.start_all_enabled_collectors()
await self.health_monitor.start_monitoring()
# Track health monitoring task
# Track health monitoring task with task manager
health_task = self.health_monitor.get_health_task()
if health_task:
self._tasks.add(health_task)
health_task.add_done_callback(self._tasks.discard)
# Transfer task to task manager for better tracking
self.task_manager._tasks.add(health_task)
self.task_manager._task_names[health_task] = "health_monitor"
health_task.add_done_callback(self.task_manager._task_done_callback)
# Start statistics cache updates
await self.stats_tracker.start_cache_updates()
@@ -164,11 +167,8 @@ class CollectorManager:
await self.health_monitor.stop_monitoring()
await self.stats_tracker.stop_cache_updates()
# Cancel manager tasks
for task in list(self._tasks):
task.cancel()
if self._tasks:
await asyncio.gather(*self._tasks, return_exceptions=True)
# Gracefully shutdown task manager
await self.task_manager.shutdown(graceful=True)
# Stop all collectors
await self.lifecycle_manager.stop_all_collectors()

View File

@@ -10,7 +10,8 @@ from typing import Dict, List, Optional, Any, Type, Tuple
from dataclasses import dataclass, field
from utils.logger import get_logger
from ..base_collector import BaseDataCollector, DataType
from ..collector.base_collector import BaseDataCollector
from ..common.data_types import DataType
from ..common import CandleProcessingConfig
from .registry import EXCHANGE_REGISTRY, get_supported_exchanges, get_exchange_info
from .exceptions import (

View File

@@ -11,7 +11,7 @@ from datetime import datetime, timezone
from typing import Dict, List, Optional, Any
from dataclasses import dataclass
from ...base_collector import (
from ...collector.base_collector import (
BaseDataCollector, DataType, CollectorStatus, MarketDataPoint,
OHLCVData, DataValidationError, ConnectionError
)

View File

@@ -11,7 +11,7 @@ from decimal import Decimal
from typing import Dict, List, Optional, Any, Union, Tuple
from enum import Enum
from ...base_collector import DataType, MarketDataPoint
from ...collector.base_collector import DataType, MarketDataPoint
from ...common import (
DataValidationResult,
StandardizedTrade,

View File

@@ -8,8 +8,8 @@ enabling, disabling, starting, and restarting collectors.
import asyncio
import time
from typing import Dict, Set, Optional
from ..base_collector import BaseDataCollector, CollectorStatus
from ..collector_types import CollectorConfig
from ..collector.base_collector import BaseDataCollector, CollectorStatus
from ..collector.collector_types import CollectorConfig
class CollectorLifecycleManager:

View File

@@ -8,7 +8,7 @@ auto-restart functionality, and health status tracking.
import asyncio
from datetime import datetime, timezone
from typing import Set, Dict, Optional
from ..base_collector import BaseDataCollector, CollectorStatus
from ..collector.base_collector import BaseDataCollector, CollectorStatus
class ManagerHealthMonitor:

View File

@@ -8,7 +8,7 @@ to optimize performance by avoiding real-time calculations on every status reque
import asyncio
from datetime import datetime, timezone
from typing import Dict, Any, Optional, List
from ..base_collector import BaseDataCollector, CollectorStatus
from ..collector.base_collector import BaseDataCollector, CollectorStatus
class ManagerStatsTracker:
@@ -48,6 +48,11 @@ class ManagerStatsTracker:
self._cache_last_updated: Optional[datetime] = None
self._cache_update_task: Optional[asyncio.Task] = None
self._running = False
# Performance tracking for cache optimization
self._cache_hit_count = 0
self._cache_miss_count = 0
self._last_performance_log = datetime.now(timezone.utc)
def set_running_state(self, running: bool) -> None:
"""Set the running state of the tracker."""
@@ -180,8 +185,13 @@ class ManagerStatsTracker:
# Check if cache is recent enough (within 2x the update interval)
cache_age = (datetime.now(timezone.utc) - self._cache_last_updated).total_seconds()
if cache_age <= (self.cache_update_interval * 2):
self._cache_hit_count += 1
self._log_cache_performance_if_needed()
return self._cached_status.copy()
# Cache miss - increment counter
self._cache_miss_count += 1
# Calculate real-time status
uptime_seconds = None
if self._stats['uptime_start']:
@@ -264,6 +274,9 @@ class ManagerStatsTracker:
def get_cache_info(self) -> Dict[str, Any]:
"""Get information about the cache state."""
total_requests = self._cache_hit_count + self._cache_miss_count
hit_rate = (self._cache_hit_count / total_requests * 100) if total_requests > 0 else 0
return {
'cache_enabled': True,
'cache_update_interval': self.cache_update_interval,
@@ -271,5 +284,27 @@ class ManagerStatsTracker:
'cache_age_seconds': (
(datetime.now(timezone.utc) - self._cache_last_updated).total_seconds()
if self._cache_last_updated else None
)
}
),
'cache_hit_count': self._cache_hit_count,
'cache_miss_count': self._cache_miss_count,
'cache_hit_rate_percent': round(hit_rate, 2),
'total_cache_requests': total_requests
}
def _log_cache_performance_if_needed(self) -> None:
"""Log cache hit/miss statistics once enough time has passed since the last report.

Compares the current UTC time with ``self._last_performance_log``. When at
least 300 seconds have elapsed and at least one request has been counted,
the hit rate is computed as ``hits / (hits + misses) * 100`` and passed to
``self.logger_manager.log_debug``.

Returns:
    None. The method reads the hit/miss counters and updates
    ``self._last_performance_log``.
"""
# Timezone-aware UTC timestamp, so it can be subtracted from
# self._last_performance_log (also created with timezone.utc).
current_time = datetime.now(timezone.utc)
# Log every 5 minutes
if (current_time - self._last_performance_log).total_seconds() >= 300:
total_requests = self._cache_hit_count + self._cache_miss_count
# Avoid a division by zero when no requests have been counted yet.
if total_requests > 0:
hit_rate = (self._cache_hit_count / total_requests * 100)
# Logging is skipped entirely when self.logger_manager is falsy.
if self.logger_manager:
self.logger_manager.log_debug(
f"Cache performance: {hit_rate:.1f}% hit rate "
f"({self._cache_hit_count} hits, {self._cache_miss_count} misses)"
)
# NOTE(review): indentation is not visible in this view, so it is unclear
# which of the enclosing `if` blocks this reset belongs to. It presumably
# sits directly under the 300-second check so the interval restarts even
# when nothing was logged -- confirm against the real file.
self._last_performance_log = current_time