""" 配置管理模块 提供配置加载和访问功能 """ import os import yaml from pathlib import Path from typing import Dict, Any, Optional from dataclasses import dataclass, field @dataclass class DatabaseConfig: """数据库配置""" host: str port: int user: str password: str database: str charset: str = "utf8mb4" pool_size: int = 5 pool_timeout: int = 30 connect_timeout: int = 10 def to_dict(self) -> Dict[str, Any]: return { "host": self.host, "port": self.port, "user": self.user, "password": self.password, "database": self.database, "charset": self.charset, } @dataclass class LoggingConfig: """日志配置""" level: str = "INFO" format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" console: bool = True file_enabled: bool = True file_path: str = "logs/crawler.log" max_bytes: int = 10485760 backup_count: int = 5 @dataclass class SeleniumConfig: """Selenium配置""" headless: bool = True log_level: int = 3 window_size: str = "1920,1080" page_load_timeout: int = 30 script_timeout: int = 30 implicit_wait: int = 10 scroll_pause_time: float = 5 max_scroll_times: int = 10 @dataclass class HttpConfig: """HTTP配置""" timeout: int = 10 retry_times: int = 3 retry_delay: int = 1 headers: Dict[str, str] = field(default_factory=dict) @dataclass class CrawlerConfig: """爬虫配置""" max_articles: int = 10 delay_between_requests: int = 0 concurrent_limit: int = 3 @dataclass class SystemConfig: """系统配置""" auto_refresh: bool = True refresh_interval: int = 30 log_retention: bool = False class Config: """全局配置管理器""" _instance: Optional['Config'] = None _config_data: Dict[str, Any] = {} def __new__(cls) -> 'Config': if cls._instance is None: cls._instance = super().__new__(cls) return cls._instance def __init__(self): if not self._config_data: self._load_config() def _load_config(self): """加载配置文件""" config_path = Path(__file__).parent / "config.yaml" if not config_path.exists(): raise FileNotFoundError(f"配置文件不存在: {config_path}") with open(config_path, 'r', encoding='utf-8') as f: self._config_data = yaml.safe_load(f) # 支持环境变量覆盖 self._apply_env_overrides() def _apply_env_overrides(self): """应用环境变量覆盖""" env_mappings = { 'DB_HOST': ['database', 'host'], 'DB_PORT': ['database', 'port'], 'DB_USER': ['database', 'user'], 'DB_PASSWORD': ['database', 'password'], 'DB_NAME': ['database', 'database'], } for env_key, config_path in env_mappings.items(): env_value = os.environ.get(env_key) if env_value: self._set_nested_value(config_path, env_value) def _set_nested_value(self, path: list, value: Any): """设置嵌套配置值""" data = self._config_data for key in path[:-1]: if key not in data: data[key] = {} data = data[key] data[path[-1]] = value def get(self, key: str, default: Any = None) -> Any: """获取配置值""" keys = key.split('.') data = self._config_data for k in keys: if isinstance(data, dict) and k in data: data = data[k] else: return default return data @property def database(self) -> DatabaseConfig: """获取数据库配置""" db = self.get('database', {}) return DatabaseConfig(**db) @property def logging(self) -> LoggingConfig: """获取日志配置""" log = self.get('logging', {}) return LoggingConfig(**log) @property def selenium(self) -> SeleniumConfig: """获取Selenium配置""" sel = self.get('selenium', {}) return SeleniumConfig(**sel) @property def http(self) -> HttpConfig: """获取HTTP配置""" http = self.get('http', {}) return HttpConfig(**http) @property def crawler(self) -> CrawlerConfig: """获取爬虫配置""" crawler = self.get('crawlers', {}) return CrawlerConfig(**crawler) @property def system(self) -> SystemConfig: """获取系统配置""" sys = self.get('system', {}) return SystemConfig(**sys) def get_source_config(self, source: str, category: str) -> Optional[Dict[str, Any]]: """获取新闻源配置""" return self.get(f'sources.{source}.categories.{category}') # 全局配置实例 config = Config()