# Listing metadata carried over from the original paste: 190 lines, 4.6 KiB, YAML.
# Application identity and global flags.
app:
  debug: false
  name: News Crawler
  timezone: Asia/Shanghai
  # Quoted so the version is always loaded as a string, never as a number.
  version: '2.0.0'
# Crawl limits.
# NOTE(review): units are not stated in this file; `timeout` is presumably
# seconds — confirm against the consumer.
crawler:
  max_articles: 10
  retry_times: 3
  timeout: 20
# Database connection and pool settings.
# NOTE(review): port 3306 and charset utf8mb4 suggest MySQL — confirm.
database:
  charset: utf8mb4
  connect_timeout: 10
  database: news
  host: localhost
  # NOTE(review): credentials are hard-coded in this file. Values are left
  # unchanged here; consider sourcing them from the environment or a secret
  # store instead of committing them.
  password: 'root'
  pool_size: 5
  pool_timeout: 30
  port: 3306
  user: 'root'
# Outgoing HTTP request settings.
http:
  # Header names are kept exactly as written (case and hyphens included).
  headers:
    # Quoted because the value contains `*` and `;`.
    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
    # Folded scalar: the two lines below are joined with a single space and
    # no trailing newline, yielding one User-Agent string.
    User-Agent: >-
      Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,
      like Gecko) Chrome/120.0.0.0 Safari/537.36
  retry_delay: 1
  retry_times: 3
  timeout: 10
# Logging configuration.
logging:
  backup_count: 5
  console: true
  file_enabled: true
  file_path: logs/crawler.log
  # Uses %-style placeholders; quoted because the value starts with `%`.
  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  level: INFO
  # 10485760 = 10 * 1024 * 1024.
  max_bytes: 10485760
# Selenium browser-automation settings.
# NOTE(review): the wait/timeout values are presumably seconds — confirm.
selenium:
  headless: true
  implicit_wait: 20
  log_level: 3
  max_scroll_times: 10
  page_load_timeout: 60
  script_timeout: 30
  scroll_pause_time: 1.2
  # Quoted so the comma-separated pair is always loaded as a single string.
  window_size: '1920,1080'
# News sources. Each source has a `base_url` and a `categories` mapping; each
# category has a `category_id`, a `css_selector`, a display `name` and a `url`.
# The same `category_id` is used for the same topic across sources
# (for example, every `auto` entry uses 6 and every `health` entry uses 8).
sources:
  kr36:
    base_url: https://www.36kr.com
    categories:
      ai:
        category_id: 9
        css_selector: div.kr-information-left
        name: AI
        url: https://www.36kr.com/information/AI/
      health:
        category_id: 8
        css_selector: div.kr-search-result-list
        name: 健康
        # The last path segment is the percent-encoded form of 健康.
        url: https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7

  netease:
    base_url: https://www.163.com
    categories:
      auto:
        category_id: 6
        css_selector: ul.news-list
        name: 汽车
        url: https://auto.163.com/
      entertainment:
        category_id: 1
        css_selector: ul.newsdata_list
        name: 娱乐
        url: https://ent.163.com/
      gov:
        category_id: 7
        css_selector: div.datalist
        name: 政务
        url: https://gov.163.com/
      health:
        category_id: 8
        css_selector: div.column_content.trade
        name: 健康
        url: https://jiankang.163.com/
      money:
        category_id: 3
        css_selector: ul.channel_news_ul
        name: 财经
        url: https://money.163.com/
      sports:
        category_id: 2
        css_selector: ul.channel_news_ul
        name: 体育
        url: https://sports.163.com/
      tech:
        category_id: 4
        css_selector: ul.newsdata_list
        name: 科技
        url: https://tech.163.com/
      war:
        category_id: 5
        css_selector: ul.newsdata_list
        name: 军事
        url: https://war.163.com/

  sina:
    base_url: https://sina.com.cn
    categories:
      auto:
        category_id: 6
        css_selector: div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1
        # This is the only category in the file with a `detail_css_selector`.
        detail_css_selector: div.main-content
        name: 汽车
        url: https://auto.sina.com.cn/
      gov:
        category_id: 7
        # Quoted because the selector contains `[` and `]`.
        css_selector: 'a[href]'
        name: 政务
        url: https://gov.sina.com.cn/

  # NOTE(review): the key is spelled `souhu` while base_url is on sohu.com,
  # and the `house` URL is on house.focus.cn. The key is left unchanged
  # because consumers may look it up by name — confirm before renaming.
  souhu:
    base_url: https://news.sohu.com/
    categories:
      house:
        category_id: 10
        # Quoted because the selector list contains `, `.
        css_selector: '.TPLTextFeedItem, .TPLImageTextFeedItem'
        name: 房产
        url: https://house.focus.cn/zixun/

  tencent:
    base_url: https://news.qq.com/
    categories:
      # NOTE(review): most tencent categories have an empty css_selector;
      # presumably they are not scraped via CSS selection — confirm against
      # the consumer.
      ai:
        category_id: 9
        css_selector: ''
        name: AI
        url: https://i.news.qq.com/gw/pc_search/result
      auto:
        category_id: 6
        css_selector: ''
        name: 汽车
        url: https://news.qq.com/ch/auto
      entertainment:
        category_id: 1
        css_selector: ''
        name: 娱乐
        url: https://news.qq.com/ch/ent
      finance:
        category_id: 3
        css_selector: ''
        name: 财经
        url: https://news.qq.com/ch/finance
      health:
        category_id: 8
        css_selector: ''
        name: 健康
        url: https://news.qq.com/ch/health
      house:
        category_id: 10
        css_selector: ''
        name: 房产
        url: https://news.qq.com/ch/house/
      tech:
        category_id: 4
        css_selector: ''
        name: 科技
        url: https://news.qq.com/ch/tech
      war:
        category_id: 5
        css_selector: ''
        name: 军事
        url: https://news.qq.com/ch/milite
      # Same url and category_id as `war`, but with a non-empty selector.
      war_web:
        category_id: 5
        # Double-quoted because the selector itself contains single quotes.
        css_selector: "div[id='channel-feed-area']"
        name: 军事(网页版)
        url: https://news.qq.com/ch/milite
# System-level toggles.
# NOTE(review): the unit of `refresh_interval` is not stated in this file —
# confirm against the consumer.
system:
  auto_refresh: true
  log_retention: false
  refresh_interval: 30
# Web server settings.
web:
  # Origins listed for CORS: ports 5173 and 3000 on localhost and 127.0.0.1.
  cors_origins:
    - http://localhost:5173
    - http://localhost:3000
    - http://127.0.0.1:5173
    - http://127.0.0.1:3000
  debug: false
  # Quoted so the address is always loaded as a string.
  # NOTE(review): 0.0.0.0 conventionally means "listen on all interfaces" —
  # confirm that this exposure is intended.
  host: '0.0.0.0'
  # NOTE(review): `system.log_retention` is false elsewhere in this file;
  # confirm how that flag interacts with this retention period.
  log_retention_days: 30
  port: 8000
  task_log_dir: logs/tasks
  ws_ping_interval: 20
  ws_ping_timeout: 20