# News crawler system configuration file
|
|
|
|
# Application settings: identity and global runtime flags.
app:
  name: 'News Crawler'
  version: '2.0.0'
  debug: false
  timezone: 'Asia/Shanghai'
|
|
|
|
# Database connection settings.
# NOTE(review): port 3306 and charset utf8mb4 suggest a MySQL-compatible
# server — confirm against the code that consumes this section.
# NOTE(review): user and password are stored here in plain text and are both
# 'root'. Presumably local-development defaults; confirm they are overridden
# (environment variables or a secret store) anywhere else.
database:
  host: 'localhost'
  port: 3306
  user: 'root'
  password: 'root'
  # Name of the schema/database to connect to.
  database: 'news'
  charset: 'utf8mb4'
  # Pool and timeout values; units are not stated in this file
  # (presumably seconds for the timeouts — TODO confirm).
  pool_size: 5
  pool_timeout: 30
  connect_timeout: 10
|
|
|
|
# HTTP request settings.
http:
  # Units are not stated in this file (presumably seconds — TODO confirm).
  timeout: 10
  retry_times: 3
  retry_delay: 1
  # Header name/value pairs for outgoing requests.
  headers:
    # Folded scalar: the lines below are joined with single spaces, giving
    # one User-Agent string with no trailing newline.
    User-Agent: >-
      Mozilla/5.0 (Windows NT 10.0; Win64; x64)
      AppleWebKit/537.36 (KHTML, like Gecko)
      Chrome/120.0.0.0 Safari/537.36
    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    Accept-Language: 'zh-CN,zh;q=0.9,en;q=0.8'
|
|
|
|
# Selenium (browser automation) settings.
selenium:
  headless: true
  log_level: 3
  # Kept as a quoted string; presumably "width,height" — TODO confirm.
  window_size: '1920,1080'
  # Timeouts and waits; units are not stated in this file
  # (presumably seconds — TODO confirm).
  page_load_timeout: 60
  script_timeout: 30
  implicit_wait: 20
  # Scrolling parameters: pause between scrolls and the scroll-count cap.
  scroll_pause_time: 1.2
  max_scroll_times: 10
|
|
|
|
# Logging settings.
logging:
  # One of: DEBUG, INFO, WARNING, ERROR, CRITICAL
  level: 'INFO'
  # %(name)s-style placeholders; this looks like a Python `logging`
  # format string — confirm against the consumer.
  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  console: true
  file_enabled: true
  file_path: 'logs/crawler.log'
  # 10 MB (10 * 1024 * 1024 bytes)
  max_bytes: 10485760
  backup_count: 5
|
|
|
|
# Crawler settings.
crawlers:
  max_articles: 10
  # Seconds.
  # NOTE(review): 0 means no pause between requests — confirm this is
  # intentional given that concurrent_limit is also greater than 1.
  delay_between_requests: 0
  concurrent_limit: 3
|
|
|
|
# News source settings.
# Layout: sources -> <site> -> categories -> <category> -> fields.
# Each category has a url, a numeric category_id, a display name and a
# css_selector. The same category_id appears under more than one site
# (6 for auto, 8 for health).
sources:
  netease:
    base_url: 'https://www.163.com'
    categories:
      entertainment:
        url: 'https://ent.163.com/'
        category_id: 1
        name: '娱乐'
        css_selector: 'ul.newsdata_list'
      tech:
        url: 'https://tech.163.com/'
        category_id: 4
        name: '科技'
        css_selector: 'ul.newsdata_list'
      sports:
        url: 'https://sports.163.com/'
        category_id: 2
        name: '体育'
        css_selector: 'ul.channel_news_ul'
      money:
        url: 'https://money.163.com/'
        category_id: 3
        name: '财经'
        css_selector: 'ul.channel_news_ul'
      auto:
        url: 'https://auto.163.com/'
        category_id: 6
        name: '汽车'
        css_selector: 'ul.news-list'
      gov:
        url: 'https://gov.163.com/'
        category_id: 7
        name: '政务'
        css_selector: 'div.datalist'
      health:
        url: 'https://jiankang.163.com/'
        category_id: 8
        name: '健康'
        css_selector: 'div.column_content.trade'
      war:
        url: 'https://war.163.com/'
        category_id: 5
        name: '军事'
        css_selector: 'ul.newsdata_list'

  kr36:
    base_url: 'https://www.36kr.com'
    categories:
      ai:
        url: 'https://www.36kr.com/information/AI/'
        category_id: 9
        name: 'AI'
        css_selector: 'div.kr-information-left'
      health:
        # The last path segment is the percent-encoded (UTF-8) form of
        # this category's name.
        url: 'https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7'
        category_id: 8
        name: '健康'
        css_selector: 'div.kr-search-result-list'

  sina:
    base_url: 'https://sina.com.cn'
    categories:
      auto:
        url: 'https://auto.sina.com.cn/'
        category_id: 6
        name: '汽车'
        css_selector: 'div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1'
        # Only this category defines a detail_css_selector.
        detail_css_selector: 'div.main-content'
|