# news-classifier/crawler-module/config/settings.py

"""
配置管理模块
提供配置加载和访问功能
"""
import os
import yaml
from pathlib import Path
from typing import Dict, Any, Optional
from dataclasses import dataclass, field
@dataclass
class DatabaseConfig:
    """MySQL connection settings for the crawler's storage layer."""

    host: str                     # database server hostname or IP
    port: int
    user: str
    password: str
    database: str                 # schema/database name
    charset: str = "utf8mb4"      # full-Unicode charset (emoji-safe)
    pool_size: int = 5            # connection-pool capacity
    pool_timeout: int = 30        # seconds to wait for a free pooled connection
    connect_timeout: int = 10     # seconds to wait when opening a connection

    def to_dict(self) -> Dict[str, Any]:
        """Return only the keyword arguments a driver-style connect() expects.

        NOTE(review): pool_size/pool_timeout/connect_timeout are deliberately
        excluded — presumably consumed by a pooling layer instead; confirm
        against callers.
        """
        connect_keys = ("host", "port", "user", "password", "database", "charset")
        return {name: getattr(self, name) for name in connect_keys}
@dataclass
class LoggingConfig:
    """Settings for the crawler's logging handlers (console + rotating file)."""

    level: str = "INFO"                  # root log level name
    format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    console: bool = True                 # emit records to a console handler
    file_enabled: bool = True            # also write to a file handler
    file_path: str = "logs/crawler.log"  # relative path for the log file
    max_bytes: int = 10485760            # rotate after 10 MiB
    backup_count: int = 5                # number of rotated files to keep
@dataclass
class SeleniumConfig:
    """Browser-automation settings for Selenium-driven page fetches."""

    headless: bool = True            # run the browser without a visible window
    log_level: int = 3               # driver log verbosity — presumably chromedriver's "errors only"; TODO confirm
    window_size: str = "1920,1080"   # "width,height" string passed to the driver
    page_load_timeout: int = 30      # seconds before a page load is abandoned
    script_timeout: int = 30         # seconds allowed for injected scripts
    implicit_wait: int = 10          # seconds for implicit element lookups
    scroll_pause_time: float = 1.2   # pause between scrolls (lazy-loaded pages)
    max_scroll_times: int = 10       # cap on scroll-to-bottom iterations
@dataclass
class HttpConfig:
    """Plain-HTTP request settings (non-Selenium fetches)."""

    timeout: int = 10      # per-request timeout in seconds
    retry_times: int = 3   # number of retry attempts on failure
    retry_delay: int = 1   # seconds to sleep between retries
    # default_factory avoids the shared-mutable-default pitfall: every
    # instance gets its own independent headers dict.
    headers: Dict[str, str] = field(default_factory=dict)
@dataclass
class CrawlerConfig:
    """Global crawl-behavior knobs."""

    max_articles: int = 10            # cap on articles fetched — TODO confirm scope (per run vs per category)
    delay_between_requests: int = 0   # politeness delay between requests, seconds
    concurrent_limit: int = 3         # maximum simultaneous crawl tasks
class Config:
"""全局配置管理器"""
_instance: Optional['Config'] = None
_config_data: Dict[str, Any] = {}
def __new__(cls) -> 'Config':
if cls._instance is None:
cls._instance = super().__new__(cls)
return cls._instance
def __init__(self):
if not self._config_data:
self._load_config()
def _load_config(self):
"""加载配置文件"""
config_path = Path(__file__).parent / "config.yaml"
if not config_path.exists():
raise FileNotFoundError(f"配置文件不存在: {config_path}")
with open(config_path, 'r', encoding='utf-8') as f:
self._config_data = yaml.safe_load(f)
# 支持环境变量覆盖
self._apply_env_overrides()
def _apply_env_overrides(self):
"""应用环境变量覆盖"""
env_mappings = {
'DB_HOST': ['database', 'host'],
'DB_PORT': ['database', 'port'],
'DB_USER': ['database', 'user'],
'DB_PASSWORD': ['database', 'password'],
'DB_NAME': ['database', 'database'],
}
for env_key, config_path in env_mappings.items():
env_value = os.environ.get(env_key)
if env_value:
self._set_nested_value(config_path, env_value)
def _set_nested_value(self, path: list, value: Any):
"""设置嵌套配置值"""
data = self._config_data
for key in path[:-1]:
if key not in data:
data[key] = {}
data = data[key]
data[path[-1]] = value
def get(self, key: str, default: Any = None) -> Any:
"""获取配置值"""
keys = key.split('.')
data = self._config_data
for k in keys:
if isinstance(data, dict) and k in data:
data = data[k]
else:
return default
return data
@property
def database(self) -> DatabaseConfig:
"""获取数据库配置"""
db = self.get('database', {})
return DatabaseConfig(**db)
@property
def logging(self) -> LoggingConfig:
"""获取日志配置"""
log = self.get('logging', {})
return LoggingConfig(**log)
@property
def selenium(self) -> SeleniumConfig:
"""获取Selenium配置"""
sel = self.get('selenium', {})
return SeleniumConfig(**sel)
@property
def http(self) -> HttpConfig:
"""获取HTTP配置"""
http = self.get('http', {})
return HttpConfig(**http)
@property
def crawler(self) -> CrawlerConfig:
"""获取爬虫配置"""
crawler = self.get('crawlers', {})
return CrawlerConfig(**crawler)
def get_source_config(self, source: str, category: str) -> Optional[Dict[str, Any]]:
"""获取新闻源配置"""
return self.get(f'sources.{source}.categories.{category}')
# Global singleton configuration instance, shared by every importer of this
# module. NOTE: instantiating here triggers _load_config at import time, so
# importing this module requires config.yaml to exist.
config = Config()