# File: news-classifier/crawler-module/config/config.yaml
# Listing metadata: 113 lines, 2.6 KiB, YAML

# Configuration file for the news crawler system.

# Application settings.
app:
  name: "News Crawler"
  version: "2.0.0"
  debug: false
  timezone: "Asia/Shanghai"
# Database connection settings.
# NOTE(review): the account name and password are hard-coded here. Prefer
# supplying them via environment variables or a secret store; confirm how the
# config loader reads this section before changing the values.
database:
  host: "localhost"
  port: 3306
  user: "root"
  password: "root"
  # Name of the schema to connect to (distinct from the enclosing section key).
  database: "news"
  charset: "utf8mb4"
  pool_size: 5
  # Presumably seconds; confirm against the code that consumes these values.
  pool_timeout: 30
  connect_timeout: 10
# HTTP request settings.
http:
  # Presumably seconds; confirm against the HTTP client wrapper.
  timeout: 10
  retry_times: 3
  retry_delay: 1
  # Header fields listed for outgoing requests.
  headers:
    User-Agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    Accept-Language: "zh-CN,zh;q=0.9,en;q=0.8"
# Selenium settings.
selenium:
  headless: true
  # NOTE(review): the meaning of this numeric level depends on the consumer;
  # confirm before changing.
  log_level: 3
  # Width and height as a single comma-separated string.
  window_size: "1920,1080"
  # Timeouts and waits below are presumably in seconds; confirm against the
  # driver setup code.
  page_load_timeout: 30
  script_timeout: 30
  implicit_wait: 10
  scroll_pause_time: 1.2
  max_scroll_times: 10
# Logging settings.
logging:
  level: "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL
  # Quoted because the value begins with '%', which is a YAML indicator.
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  console: true
  file_enabled: true
  file_path: "logs/crawler.log"
  max_bytes: 10485760  # 10 MiB (10 * 1024 * 1024)
  backup_count: 5
# Crawler settings.
crawlers:
  max_articles: 10
  delay_between_requests: 0  # seconds
  concurrent_limit: 3
# News source settings.
# Layout: one entry per source, each with a base_url and a `categories`
# mapping; every category carries url, category_id, name and css_selector.
# category_id values are unique across all sources in this file.
# NOTE(review): whether `sources` sits at the top level or under `crawlers`
# could not be confirmed from this copy of the file; verify against the loader.
sources:
  netease:
    base_url: "https://www.163.com"
    categories:
      entertainment:
        url: "https://ent.163.com/"
        category_id: 1
        name: "娱乐"
        css_selector: "ul.newsdata_list"
      tech:
        url: "https://tech.163.com/"
        category_id: 4
        name: "科技"
        css_selector: "ul.newsdata_list"
      sports:
        url: "https://sports.163.com/"
        category_id: 2
        name: "体育"
        css_selector: "ul.channel_news_ul"
      money:
        url: "https://money.163.com/"
        category_id: 3
        name: "财经"
        css_selector: "ul.channel_news_ul"
      auto:
        url: "https://auto.163.com/"
        category_id: 6
        name: "汽车"
        css_selector: "ul.news-list"
      gov:
        url: "https://gov.163.com/"
        category_id: 7
        name: "政务"
        css_selector: "div.datalist"
      health:
        url: "https://jiankang.163.com/"
        category_id: 8
        name: "健康"
        css_selector: "div.column_content.trade"
      war:
        url: "https://war.163.com/"
        category_id: 5
        name: "军事"
        css_selector: "ul.newsdata_list"
  kr36:
    base_url: "https://www.36kr.com"
    categories:
      ai:
        url: "https://www.36kr.com/information/AI/"
        category_id: 9
        name: "AI"
        css_selector: "div.kr-information-left"