"""
Configuration management module.

Provides configuration loading and access.
"""
|
|
|
|
import os
|
|
import yaml
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional
|
|
from dataclasses import dataclass, field
|
|
|
|
|
|
@dataclass
class DatabaseConfig:
    """Database connection settings.

    ``host``, ``port``, ``user``, ``password`` and ``database`` are required;
    the remaining fields have defaults.
    """

    host: str
    port: int
    user: str
    password: str
    database: str
    charset: str = "utf8mb4"
    # Pool / timeout tuning values; units are presumably seconds for the
    # timeouts -- TODO confirm against the code that consumes them.
    pool_size: int = 5
    pool_timeout: int = 30
    connect_timeout: int = 10

    def __post_init__(self) -> None:
        """Normalize ``port`` to ``int`` when it arrives as a decimal string.

        Values taken from environment variables are always strings, so a port
        such as ``"3306"`` is converted here. Anything that is not a plain
        decimal string is left untouched.
        """
        if isinstance(self.port, str) and self.port.strip().isdecimal():
            self.port = int(self.port)

    def to_dict(self) -> Dict[str, Any]:
        """Return the basic connection fields as a plain dictionary.

        Only ``host``, ``port``, ``user``, ``password``, ``database`` and
        ``charset`` are included; the pool and timeout fields are omitted.
        """
        return {
            "host": self.host,
            "port": self.port,
            "user": self.user,
            "password": self.password,
            "database": self.database,
            "charset": self.charset,
        }
|
|
|
|
|
|
@dataclass
class LoggingConfig:
    """Logging settings; every field has a default."""

    level: str = "INFO"
    # %-style format string using the standard logging record attributes.
    format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    console: bool = True
    file_enabled: bool = True
    file_path: str = "logs/crawler.log"
    # 10485760 == 10 * 1024 * 1024; presumably a size limit in bytes for log
    # rotation -- TODO confirm against the logger setup code.
    max_bytes: int = 10485760
    backup_count: int = 5
|
|
|
|
|
|
@dataclass
class SeleniumConfig:
    """Selenium settings; every field has a default."""

    headless: bool = True
    # NOTE(review): meaning of the numeric log level is not visible in this
    # file -- confirm against the driver setup code.
    log_level: int = 3
    # "width,height" as a single comma-separated string.
    window_size: str = "1920,1080"
    # Timeouts and waits; presumably seconds -- TODO confirm.
    page_load_timeout: int = 30
    script_timeout: int = 30
    implicit_wait: int = 10
    scroll_pause_time: float = 5
    max_scroll_times: int = 10
|
|
|
|
|
|
@dataclass
class HttpConfig:
    """HTTP settings; every field has a default."""

    # Presumably seconds -- TODO confirm against the HTTP client code.
    timeout: int = 10
    retry_times: int = 3
    retry_delay: int = 1
    # default_factory gives each instance its own dict instead of sharing one.
    headers: Dict[str, str] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class CrawlerConfig:
    """Crawler settings; every field has a default."""

    max_articles: int = 10
    # Presumably seconds between requests -- TODO confirm against the crawler.
    delay_between_requests: int = 0
    concurrent_limit: int = 3
|
|
|
|
|
|
class Config:
|
|
"""全局配置管理器"""
|
|
|
|
_instance: Optional['Config'] = None
|
|
_config_data: Dict[str, Any] = {}
|
|
|
|
def __new__(cls) -> 'Config':
|
|
if cls._instance is None:
|
|
cls._instance = super().__new__(cls)
|
|
return cls._instance
|
|
|
|
def __init__(self):
|
|
if not self._config_data:
|
|
self._load_config()
|
|
|
|
def _load_config(self):
|
|
"""加载配置文件"""
|
|
config_path = Path(__file__).parent / "config.yaml"
|
|
|
|
if not config_path.exists():
|
|
raise FileNotFoundError(f"配置文件不存在: {config_path}")
|
|
|
|
with open(config_path, 'r', encoding='utf-8') as f:
|
|
self._config_data = yaml.safe_load(f)
|
|
|
|
# 支持环境变量覆盖
|
|
self._apply_env_overrides()
|
|
|
|
def _apply_env_overrides(self):
|
|
"""应用环境变量覆盖"""
|
|
env_mappings = {
|
|
'DB_HOST': ['database', 'host'],
|
|
'DB_PORT': ['database', 'port'],
|
|
'DB_USER': ['database', 'user'],
|
|
'DB_PASSWORD': ['database', 'password'],
|
|
'DB_NAME': ['database', 'database'],
|
|
}
|
|
|
|
for env_key, config_path in env_mappings.items():
|
|
env_value = os.environ.get(env_key)
|
|
if env_value:
|
|
self._set_nested_value(config_path, env_value)
|
|
|
|
def _set_nested_value(self, path: list, value: Any):
|
|
"""设置嵌套配置值"""
|
|
data = self._config_data
|
|
for key in path[:-1]:
|
|
if key not in data:
|
|
data[key] = {}
|
|
data = data[key]
|
|
data[path[-1]] = value
|
|
|
|
def get(self, key: str, default: Any = None) -> Any:
|
|
"""获取配置值"""
|
|
keys = key.split('.')
|
|
data = self._config_data
|
|
|
|
for k in keys:
|
|
if isinstance(data, dict) and k in data:
|
|
data = data[k]
|
|
else:
|
|
return default
|
|
|
|
return data
|
|
|
|
@property
|
|
def database(self) -> DatabaseConfig:
|
|
"""获取数据库配置"""
|
|
db = self.get('database', {})
|
|
return DatabaseConfig(**db)
|
|
|
|
@property
|
|
def logging(self) -> LoggingConfig:
|
|
"""获取日志配置"""
|
|
log = self.get('logging', {})
|
|
return LoggingConfig(**log)
|
|
|
|
@property
|
|
def selenium(self) -> SeleniumConfig:
|
|
"""获取Selenium配置"""
|
|
sel = self.get('selenium', {})
|
|
return SeleniumConfig(**sel)
|
|
|
|
@property
|
|
def http(self) -> HttpConfig:
|
|
"""获取HTTP配置"""
|
|
http = self.get('http', {})
|
|
return HttpConfig(**http)
|
|
|
|
@property
|
|
def crawler(self) -> CrawlerConfig:
|
|
"""获取爬虫配置"""
|
|
crawler = self.get('crawlers', {})
|
|
return CrawlerConfig(**crawler)
|
|
|
|
def get_source_config(self, source: str, category: str) -> Optional[Dict[str, Any]]:
|
|
"""获取新闻源配置"""
|
|
return self.get(f'sources.{source}.categories.{category}')
|
|
|
|
|
|
# Global configuration instance, created when this module is imported.
config = Config()
|