Enhance error handling and security measures in data collection services

- Implemented `_sanitize_error` method in `DataCollectionService` and `CollectorManager` to prevent leaking internal error details. - Improved error handling across various methods by catching specific exceptions and logging sanitized messages with `exc_info=True`. - Added file permission validation in `ServiceConfig` to ensure secure configuration file handling, including detailed logging for permission issues. - Refactored logging practices to enhance clarity and maintainability, ensuring consistent error reporting. These changes significantly bolster the security and robustness of the data collection services, aligning with project standards for error handling and maintainability.
2025-06-10 13:12:13 +08:00
parent 6ab40414e7
commit c28e4a9aaf
5 changed files with 312 additions and 30 deletions
--- a/config/service_config.py
+++ b/config/service_config.py
@@ -73,9 +73,25 @@ class ServiceConfig:
            
            return validated_config
            
-        except Exception as e:
+        except (FileNotFoundError, IsADirectoryError, PermissionError) as e:
+            # Handle file system related errors
            if self.logger:
-                self.logger.error(f"❌ Failed to load configuration: {e}", exc_info=True)
+                self.logger.error(f"❌ File system error loading configuration: {e}", exc_info=True)
+            raise
+        except (json.JSONDecodeError, UnicodeDecodeError) as e:
+            # Handle file parsing errors
+            if self.logger:
+                self.logger.error(f"❌ Configuration file parsing error: {e}", exc_info=True)
+            raise ValueError(f"Invalid configuration file format: {e}")
+        except (KeyError, ValueError, TypeError) as e:
+            # Handle configuration validation errors
+            if self.logger:
+                self.logger.error(f"❌ Configuration validation error: {e}", exc_info=True)
+            raise
+        except Exception as e:
+            # Catch any other unexpected errors
+            if self.logger:
+                self.logger.error(f"❌ Unexpected error loading configuration: {e}", exc_info=True)
            raise
    
    def _validate_file_permissions(self, config_file: Path) -> None:
@@ -87,24 +103,84 @@ class ServiceConfig:
            
        Raises:
            PermissionError: If file permissions are too permissive
+            ValueError: If file validation fails
        """
        try:
            file_stat = config_file.stat()
            file_mode = file_stat.st_mode
            
+            # List of security violations
+            violations = []
+            
            # Check if file is readable by others (security risk)
            if file_mode & stat.S_IROTH:
+                violations.append("readable by others")
                if self.logger:
                    self.logger.warning(f"⚠️ Configuration file {config_file} is readable by others")
            
            # Check if file is writable by others (security risk)
            if file_mode & stat.S_IWOTH:
+                violations.append("writable by others")
                if self.logger:
                    self.logger.warning(f"⚠️ Configuration file {config_file} is writable by others")
+            
+            # Check if file is writable by group (security risk)
+            if file_mode & stat.S_IWGRP:
+                violations.append("writable by group")
+                if self.logger:
+                    self.logger.warning(f"⚠️ Configuration file {config_file} is writable by group")
+            
+            # Check if file is readable by group (potential security risk)
+            if file_mode & stat.S_IRGRP:
+                violations.append("readable by group")
+                if self.logger:
+                    self.logger.warning(f"⚠️ Configuration file {config_file} is readable by group")
+            
+            # Check if file is executable (unnecessary for config files)
+            if file_mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH):
+                violations.append("executable")
+                if self.logger:
+                    self.logger.warning(f"⚠️ Configuration file {config_file} has execute permissions")
+            
+            # Enforce security policy - raise exception if critical violations found
+            critical_violations = [v for v in violations if "writable by" in v or "readable by others" in v]
+            if critical_violations:
+                error_msg = f"Configuration file {config_file} has insecure permissions: {', '.join(critical_violations)}. " \
+                           f"File should only be readable/writable by owner (mode 600 recommended)."
+                
+                if self.logger:
+                    self.logger.error(f"🔒 Security violation: {error_msg}", exc_info=True)
+                
+                # On Windows, permission checks might not work as expected, so we log but don't fail
+                if os.name == 'nt':  # Windows
+                    if self.logger:
+                        self.logger.warning("⚠️ On Windows, file permission validation is limited. Please ensure config files are properly secured.")
+                else:
+                    # On Unix-like systems, enforce strict permissions
+                    raise PermissionError(error_msg)
+            
+            # Validate file ownership (only on Unix-like systems)
+            if os.name != 'nt':
+                current_uid = os.getuid() if hasattr(os, 'getuid') else None
+                file_uid = file_stat.st_uid
+                
+                if current_uid is not None and file_uid != current_uid:
+                    warning_msg = f"Configuration file {config_file} is owned by different user (UID: {file_uid}, current: {current_uid})"
+                    if self.logger:
+                        self.logger.warning(f"⚠️ {warning_msg}")
+            
+            # Log successful validation
+            if not violations and self.logger:
+                self.logger.debug(f"✅ Configuration file {config_file} has secure permissions")
                    
+        except PermissionError:
+            # Re-raise permission errors
+            raise
        except Exception as e:
+            error_msg = f"Could not validate file permissions for {config_file}: {e}"
            if self.logger:
-                self.logger.warning(f"⚠️ Could not validate file permissions: {e}")
+                self.logger.error(f"❌ {error_msg}", exc_info=True)
+            raise ValueError(error_msg)
    
    def _validate_config_schema(self, config: Dict[str, Any]) -> Dict[str, Any]:
        """
@@ -194,15 +270,78 @@ class ServiceConfig:
            json.dump(default_config, f, indent=2)
        
        # Set secure file permissions (owner read/write only)
-        try:
-            os.chmod(config_file, stat.S_IRUSR | stat.S_IWUSR)
-        except Exception as e:
-            if self.logger:
-                self.logger.warning(f"⚠️ Could not set secure file permissions: {e}")
+        self._set_secure_permissions(config_file)
        
        if self.logger:
            self.logger.info(f"📄 Created default configuration: {config_file}")
    
+    def _set_secure_permissions(self, config_file: Path) -> None:
+        """
+        Set secure file permissions for configuration file.
+        
+        Args:
+            config_file: Path to the configuration file
+        """
+        try:
+            # Set permissions to owner read/write only (mode 600)
+            secure_mode = stat.S_IRUSR | stat.S_IWUSR
+            os.chmod(config_file, secure_mode)
+            
+            if self.logger:
+                self.logger.debug(f"Set secure permissions (600) for {config_file}")
+                
+        except (OSError, PermissionError) as e:
+            # Handle file system permission errors
+            if self.logger:
+                self.logger.warning(f"Could not set secure file permissions for {config_file}: {e}")
+        except Exception as e:
+            # Handle any other unexpected errors
+            if self.logger:
+                self.logger.warning(f"Unexpected error setting file permissions for {config_file}: {e}")
+    
+    def fix_file_permissions(self, config_file_path: Optional[str] = None) -> bool:
+        """
+        Fix file permissions for configuration file to be secure.
+        
+        Args:
+            config_file_path: Optional specific file path, defaults to instance config_path
+            
+        Returns:
+            bool: True if permissions were fixed successfully, False otherwise
+        """
+        try:
+            file_path = Path(config_file_path or self.config_path)
+            
+            if not file_path.exists():
+                if self.logger:
+                    self.logger.error(f"Configuration file {file_path} does not exist", exc_info=True)
+                return False
+            
+            # Set secure permissions
+            self._set_secure_permissions(file_path)
+            
+            # Verify the fix worked
+            try:
+                self._validate_file_permissions(file_path)
+                if self.logger:
+                    self.logger.info(f"✅ Successfully fixed permissions for {file_path}")
+                return True
+            except (PermissionError, ValueError):
+                if self.logger:
+                    self.logger.warning(f"⚠️ Permissions may still be insecure for {file_path}")
+                return False
+                
+        except (FileNotFoundError, PermissionError, OSError) as e:
+            # Handle file system related errors
+            if self.logger:
+                self.logger.error(f"❌ File system error fixing permissions: {e}", exc_info=True)
+            return False
+        except Exception as e:
+            # Handle any other unexpected errors
+            if self.logger:
+                self.logger.error(f"❌ Unexpected error fixing file permissions: {e}", exc_info=True)
+            return False
+    
    def _get_default_connection_config(self) -> Dict[str, Any]:
        """Get default connection configuration."""
        return {
@@ -318,13 +457,18 @@ class ServiceConfig:
            self.logger.info("Configuration updated in memory")
    
    def save_config(self) -> None:
-        """Save current configuration to file."""
+        """Save current configuration to file with secure permissions."""
        if self._config is None:
            raise RuntimeError("Configuration has not been loaded. Call load_config() first.")
        
        config_file = Path(self.config_path)
+        
+        # Save configuration
        with open(config_file, 'w') as f:
            json.dump(self._config, f, indent=2)
        
+        # Ensure secure permissions after saving
+        self._set_secure_permissions(config_file)
+        
        if self.logger:
-            self.logger.info(f"Configuration saved to {self.config_path}") 
+            self.logger.info(f"Configuration saved to {self.config_path} with secure permissions") 
--- a/data/collection_service.py
+++ b/data/collection_service.py
@@ -71,13 +71,57 @@ class DataCollectionService:
        self.logger.info("🚀 Data Collection Service initialized")
        self.logger.info(f"📁 Configuration: {config_path}")
    
+    def _sanitize_error(self, message: str) -> str:
+        """
+        Sanitize error message to prevent leaking internal details.
+        
+        Args:
+            message: Original error message
+            
+        Returns:
+            Sanitized error message
+        """
+        # Remove sensitive patterns that might leak internal information
+        sensitive_patterns = [
+            'password=',
+            'token=',
+            'key=',
+            'secret=',
+            'auth=',
+            'api_key=',
+            'api_secret=',
+            'access_token=',
+            'refresh_token='
+        ]
+        
+        sanitized = message
+        for pattern in sensitive_patterns:
+            if pattern.lower() in sanitized.lower():
+                # Replace the value part after = with [REDACTED]
+                parts = sanitized.split(pattern)
+                if len(parts) > 1:
+                    # Find the end of the value (space, comma, or end of string)
+                    value_part = parts[1]
+                    end_chars = [' ', ',', ')', ']', '}', '\n', '\t']
+                    end_idx = len(value_part)
+                    
+                    for char in end_chars:
+                        char_idx = value_part.find(char)
+                        if char_idx != -1 and char_idx < end_idx:
+                            end_idx = char_idx
+                    
+                    # Replace the value with [REDACTED]
+                    sanitized = parts[0] + pattern + '[REDACTED]' + value_part[end_idx:]
+        
+        return sanitized
+    
    async def initialize_collectors(self) -> bool:
        """Initialize all data collectors based on configuration."""
        try:
            collectors = await self.collector_factory.create_collectors_from_config(self.config)
            
            if not collectors:
-                self.logger.error("❌ No collectors were successfully created")
+                self.logger.error("❌ No collectors were successfully created", exc_info=True)
                return False
            
            for collector in collectors:
@@ -88,8 +132,22 @@ class DataCollectionService:
            self.logger.info(f"✅ Successfully initialized {len(collectors)} data collectors")
            return True
                
+        except (KeyError, AttributeError, TypeError) as e:
+            # Handle configuration and data structure errors
+            sanitized_message = self._sanitize_error(f"❌ Configuration error initializing collectors: {e}")
+            self.logger.error(sanitized_message, exc_info=True)
+            self.stats['errors_count'] += 1
+            return False
+        except (ConnectionError, OSError, IOError) as e:
+            # Handle connection and I/O related errors
+            sanitized_message = self._sanitize_error(f"❌ Connection/IO error initializing collectors: {e}")
+            self.logger.error(sanitized_message, exc_info=True)
+            self.stats['errors_count'] += 1
+            return False
        except Exception as e:
-            self.logger.error(f"❌ Failed to initialize collectors: {e}", exc_info=True)
+            # Catch any other unexpected errors
+            sanitized_message = self._sanitize_error(f"❌ Unexpected error initializing collectors: {e}")
+            self.logger.error(sanitized_message, exc_info=True)
            self.stats['errors_count'] += 1
            return False
    
@@ -117,12 +175,26 @@ class DataCollectionService:
                self.logger.info(f"📈 Active collectors: {self.stats['collectors_running']}")
                return True
            else:
-                self.logger.error("❌ Failed to start data collectors")
+                self.logger.error("Failed to start data collectors", exc_info=True)
                self.stats['errors_count'] += 1
                return False
                
+        except (ConnectionError, OSError, IOError) as e:
+            # Handle database and connection errors
+            sanitized_message = self._sanitize_error(f"Database/Connection error starting service: {e}")
+            self.logger.error(sanitized_message, exc_info=True)
+            self.stats['errors_count'] += 1
+            return False
+        except (AttributeError, TypeError, ValueError) as e:
+            # Handle configuration and data validation errors
+            sanitized_message = self._sanitize_error(f"❌ Configuration error starting service: {e}")
+            self.logger.error(sanitized_message, exc_info=True)
+            self.stats['errors_count'] += 1
+            return False
        except Exception as e:
-            self.logger.error(f"❌ Failed to start service: {e}", exc_info=True)
+            # Catch any other unexpected errors
+            sanitized_message = self._sanitize_error(f"❌ Unexpected error starting service: {e}")
+            self.logger.error(sanitized_message, exc_info=True)
            self.stats['errors_count'] += 1
            return False
    
@@ -144,8 +216,19 @@ class DataCollectionService:
            self.logger.info("✅ Data Collection Service stopped gracefully")
            self.logger.info(f"📊 Total uptime: {self.stats['total_uptime_seconds']:.1f} seconds")
            
+        except (asyncio.CancelledError, KeyboardInterrupt):
+            # Handle graceful shutdown scenarios
+            self.logger.warning("Service shutdown was interrupted")
+            self.stats['errors_count'] += 1
+        except (ConnectionError, OSError, IOError) as e:
+            # Handle connection and I/O related errors during shutdown
+            sanitized_message = self._sanitize_error(f"Connection/IO error during service shutdown: {e}")
+            self.logger.error(sanitized_message, exc_info=True)
+            self.stats['errors_count'] += 1
        except Exception as e:
-            self.logger.error(f"❌ Error during service shutdown: {e}", exc_info=True)
+            # Catch any other unexpected errors during shutdown
+            sanitized_message = self._sanitize_error(f"Unexpected error during service shutdown: {e}")
+            self.logger.error(sanitized_message, exc_info=True)
            self.stats['errors_count'] += 1
    
    def get_status(self) -> Dict[str, Any]:
@@ -234,8 +317,20 @@ class DataCollectionService:
            
            return True
            
+        except (asyncio.CancelledError, KeyboardInterrupt):
+            # Handle graceful shutdown scenarios
+            self.logger.info("Service run was cancelled gracefully")
+            return True
+        except (asyncio.TimeoutError, ConnectionError, OSError, IOError) as e:
+            # Handle timeout, connection and I/O related errors
+            sanitized_message = self._sanitize_error(f"Connection/Timeout error during service run: {e}")
+            self.logger.error(sanitized_message, exc_info=True)
+            self.stats['errors_count'] += 1
+            return False
        except Exception as e:
-            self.logger.error(f"❌ Service error: {e}", exc_info=True)
+            # Catch any other unexpected errors
+            sanitized_message = self._sanitize_error(f"Unexpected service error: {e}")
+            self.logger.error(sanitized_message, exc_info=True)
            self.stats['errors_count'] += 1
            return False
        finally:
--- a/data/collector_factory.py
+++ b/data/collector_factory.py
@@ -71,7 +71,7 @@ class CollectorFactory:
                return collector
            else:
                if self.logger:
-                    self.logger.error(f"Failed to create collector for {symbol}")
+                    self.logger.error(f"Failed to create collector for {symbol}", exc_info=True)
                return None
                
        except Exception as e:
@@ -94,13 +94,13 @@ class CollectorFactory:
            # Validate required fields
            if 'symbol' not in pair_config:
                if self.logger:
-                    self.logger.error("Trading pair missing required 'symbol' field")
+                    self.logger.error("Trading pair missing required 'symbol' field", exc_info=True)
                return None
            
            symbol = pair_config['symbol']
            if not isinstance(symbol, str) or '-' not in symbol:
                if self.logger:
-                    self.logger.error(f"Invalid symbol format: {symbol}. Expected format: 'BASE-QUOTE'")
+                    self.logger.error(f"Invalid symbol format: {symbol}. Expected format: 'BASE-QUOTE'", exc_info=True)
                return None
            
            # Apply defaults and validate data types
@@ -239,7 +239,7 @@ class CollectorFactory:
                else:
                    symbol = pair_config.get('symbol', 'unknown')
                    if self.logger:
-                        self.logger.error(f"Failed to create collector for {symbol}")
+                        self.logger.error(f"Failed to create collector for {symbol}", exc_info=True)
            
            if self.logger:
                self.logger.info(f"Successfully created {len(collectors)} collectors")
--- a/data/collector_manager.py
+++ b/data/collector_manager.py
@@ -56,6 +56,19 @@ class CollectorManager:
        if self.logger_manager.is_debug_enabled():
            self.logger_manager.log_info(f"Initialized collector manager: {manager_name}")
    
+    def _sanitize_error(self, message: str) -> str:
+        """
+        Sanitize error message to prevent leaking internal details.
+        
+        Args:
+            message: Original error message
+            
+        Returns:
+            Sanitized error message
+        """
+        # Delegate to the logger manager's sanitization method
+        return self.logger_manager._sanitize_error(message)
+    
    def add_collector(self, collector: BaseDataCollector, config: Optional[CollectorConfig] = None) -> None:
        """Add a collector to be managed."""
        self.lifecycle_manager.add_collector(collector, config)
@@ -107,9 +120,28 @@ class CollectorManager:
            self.logger_manager.log_info(f"Collector manager started - Managing {enabled_count} collectors")
            return True
            
-        except Exception as e:
+        except (asyncio.CancelledError, KeyboardInterrupt):
+            # Handle graceful shutdown scenarios
            self.status = ManagerStatus.ERROR
-            self.logger_manager.log_error(f"Failed to start collector manager: {e}", exc_info=True)
+            self.logger_manager.log_warning("Collector manager startup was cancelled")
+            return False
+        except (ConnectionError, OSError, IOError) as e:
+            # Handle connection and I/O related errors
+            self.status = ManagerStatus.ERROR
+            sanitized_message = self._sanitize_error(f"Connection/IO error starting collector manager: {e}")
+            self.logger_manager.log_error(sanitized_message, exc_info=True)
+            return False
+        except (AttributeError, TypeError, ValueError) as e:
+            # Handle configuration and data validation errors
+            self.status = ManagerStatus.ERROR
+            sanitized_message = self._sanitize_error(f"Configuration error starting collector manager: {e}")
+            self.logger_manager.log_error(sanitized_message, exc_info=True)
+            return False
+        except Exception as e:
+            # Catch any other unexpected errors
+            self.status = ManagerStatus.ERROR
+            sanitized_message = self._sanitize_error(f"Unexpected error starting collector manager: {e}")
+            self.logger_manager.log_error(sanitized_message, exc_info=True)
            return False
    
    async def stop(self) -> None:
@@ -144,9 +176,20 @@ class CollectorManager:
            self.status = ManagerStatus.STOPPED
            self.logger_manager.log_info("Collector manager stopped")
            
-        except Exception as e:
+        except (asyncio.CancelledError, KeyboardInterrupt):
+            # Handle graceful shutdown scenarios
            self.status = ManagerStatus.ERROR
-            self.logger_manager.log_error(f"Error stopping collector manager: {e}", exc_info=True)
+            self.logger_manager.log_warning("Collector manager shutdown was interrupted")
+        except (ConnectionError, OSError, IOError) as e:
+            # Handle connection and I/O related errors during shutdown
+            self.status = ManagerStatus.ERROR
+            sanitized_message = self._sanitize_error(f"Connection/IO error stopping collector manager: {e}")
+            self.logger_manager.log_error(sanitized_message, exc_info=True)
+        except Exception as e:
+            # Catch any other unexpected errors during shutdown
+            self.status = ManagerStatus.ERROR
+            sanitized_message = self._sanitize_error(f"Unexpected error stopping collector manager: {e}")
+            self.logger_manager.log_error(sanitized_message, exc_info=True)
    
    async def restart_collector(self, collector_name: str) -> bool:
        """Restart a specific collector."""
--- a/tasks/collector-service-tasks-optimization.md
+++ b/tasks/collector-service-tasks-optimization.md
@@ -105,12 +105,12 @@ Both files show good foundational architecture but exceed the recommended file s
  - [x] 2.4 Refactor `run()` method to be under 50 lines by extracting sub-logics (e.g., `_run_main_loop`).
  - [x] 2.5 Test './scripts/start_data_collection.py' and './scripts/production_clean.py' to ensure they work as expected.

- [ ] 3.0 Enhance Error Handling and Security
-  - [ ] 3.1 Implement a `_sanitize_error` method in `CollectorManager` and `DataCollectionService` to prevent leaking internal error details.
-  - [ ] 3.2 Add file permission validation for configuration files in `config/service_config.py`.
-  - [ ] 3.3 Review all `try-except` blocks to ensure specific exceptions are caught rather than broad `Exception`.
-  - [ ] 3.4 Ensure all logger calls include `exc_info=True` for error and critical logs.
-  - [ ] 3.5 Test './scripts/start_data_collection.py' and './scripts/production_clean.py' to ensure they work as expected.
+- [x] 3.0 Enhance Error Handling and Security
+  - [x] 3.1 Implement a `_sanitize_error` method in `CollectorManager` and `DataCollectionService` to prevent leaking internal error details.
+  - [x] 3.2 Add file permission validation for configuration files in `config/service_config.py`.
+  - [x] 3.3 Review all `try-except` blocks to ensure specific exceptions are caught rather than broad `Exception`.
+  - [x] 3.4 Ensure all logger calls include `exc_info=True` for error and critical logs.
+  - [x] 3.5 Test './scripts/start_data_collection.py' and './scripts/production_clean.py' to ensure they work as expected.


 - [ ] 4.0 Optimize Performance and Resource Management