# news-classifier/crawler-module/config/config.yaml
#
# File-viewer metadata (as captured): 190 lines, 4.6 KiB, YAML

# Application identity and global runtime flags.
app:
  debug: false
  name: News Crawler
  timezone: Asia/Shanghai
  # Quoted so the version is always loaded as a string.
  version: '2.0.0'
# Crawl limits.
# NOTE(review): `retry_times` and `timeout` also exist under `http` with
# their own values; confirm which section each consumer reads.
crawler:
  max_articles: 10
  retry_times: 3
  timeout: 20  # presumably seconds — confirm
# Database connection and pool settings.
# NOTE(review): port 3306 and charset utf8mb4 suggest MySQL — confirm against
# the code that consumes this section.
database:
  charset: utf8mb4
  connect_timeout: 10  # presumably seconds — confirm
  database: news
  host: localhost
  # NOTE(review): plaintext root credentials are stored in this file; prefer
  # supplying them from an environment variable or a secret store, and use a
  # least-privilege account rather than root.
  password: root
  pool_size: 5
  pool_timeout: 30  # presumably seconds — confirm
  port: 3306
  user: root
# Settings for plain HTTP requests: default request headers plus retry and
# timeout values.
http:
  headers:
    Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
    Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
    # Folded scalar: the two lines are joined with a single space and no
    # trailing newline, yielding one header value.
    User-Agent: >-
      Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML,
      like Gecko) Chrome/120.0.0.0 Safari/537.36
  retry_delay: 1  # presumably seconds between retries — confirm
  retry_times: 3
  timeout: 10  # presumably seconds — confirm
# Logging output: console and file targets, rotation limits, and line format.
logging:
  backup_count: 5
  console: true
  file_enabled: true
  file_path: logs/crawler.log
  # printf-style record format (looks like Python `logging` placeholders —
  # confirm against the consumer).
  format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  level: INFO
  max_bytes: 10485760  # 10 * 1024 * 1024
# Browser-automation settings used for pages that need rendering/scrolling.
selenium:
  headless: true
  implicit_wait: 20  # presumably seconds — confirm
  log_level: 3
  max_scroll_times: 10
  page_load_timeout: 60  # presumably seconds — confirm
  script_timeout: 30  # presumably seconds — confirm
  scroll_pause_time: 1.2  # presumably seconds — confirm
  # Quoted: a bare 1920,1080 can be read as an integer by loaders that accept
  # comma digit separators. Value is the string "1920,1080".
  window_size: '1920,1080'
# News sources. Layout: sources.<source>.categories.<category> where each
# category carries:
#   category_id          numeric category shared across sources
#   css_selector         selector for the listing container/items
#                        ('' where none is configured)
#   detail_css_selector  optional selector for the article page
#   name                 display name
#   url                  listing page to crawl
#
# category_id values used in this file:
#   1 entertainment, 2 sports, 3 finance/money, 4 tech, 5 war,
#   6 auto, 7 gov, 8 health, 9 ai, 10 house
sources:
  kr36:
    base_url: https://www.36kr.com
    categories:
      ai:
        category_id: 9
        css_selector: div.kr-information-left
        name: AI
        url: https://www.36kr.com/information/AI/
      health:
        category_id: 8
        css_selector: div.kr-search-result-list
        name: 健康
        # Percent-encoded search term; decodes to 健康.
        url: https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7

  netease:
    base_url: https://www.163.com
    categories:
      auto:
        category_id: 6
        css_selector: ul.news-list
        name: 汽车
        url: https://auto.163.com/
      entertainment:
        category_id: 1
        css_selector: ul.newsdata_list
        name: 娱乐
        url: https://ent.163.com/
      gov:
        category_id: 7
        css_selector: div.datalist
        name: 政务
        url: https://gov.163.com/
      health:
        category_id: 8
        css_selector: div.column_content.trade
        name: 健康
        url: https://jiankang.163.com/
      money:
        category_id: 3
        css_selector: ul.channel_news_ul
        name: 财经
        url: https://money.163.com/
      sports:
        category_id: 2
        css_selector: ul.channel_news_ul
        name: 体育
        url: https://sports.163.com/
      tech:
        category_id: 4
        css_selector: ul.newsdata_list
        name: 科技
        url: https://tech.163.com/
      war:
        category_id: 5
        css_selector: ul.newsdata_list
        name: 军事
        url: https://war.163.com/

  sina:
    base_url: https://sina.com.cn
    categories:
      auto:
        category_id: 6
        css_selector: div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1
        detail_css_selector: div.main-content
        name: 汽车
        url: https://auto.sina.com.cn/
      gov:
        category_id: 7
        # Quoted because of the brackets; matches every anchor with an href.
        css_selector: 'a[href]'
        name: 政务
        url: https://gov.sina.com.cn/

  souhu:
    base_url: https://news.sohu.com/
    categories:
      house:
        category_id: 10
        # Quoted because of the comma; this is a two-selector group.
        css_selector: '.TPLTextFeedItem, .TPLImageTextFeedItem'
        name: 房产
        # NOTE(review): this URL is on focus.cn, not under base_url — confirm
        # that is intended.
        url: https://house.focus.cn/zixun/

  tencent:
    base_url: https://news.qq.com/
    # NOTE(review): every category except war_web has an empty css_selector;
    # presumably those are fetched without HTML selection — confirm against
    # the tencent crawler.
    categories:
      ai:
        category_id: 9
        css_selector: ''
        name: AI
        url: https://i.news.qq.com/gw/pc_search/result
      auto:
        category_id: 6
        css_selector: ''
        name: 汽车
        url: https://news.qq.com/ch/auto
      entertainment:
        category_id: 1
        css_selector: ''
        name: 娱乐
        url: https://news.qq.com/ch/ent
      finance:
        category_id: 3
        css_selector: ''
        name: 财经
        url: https://news.qq.com/ch/finance
      health:
        category_id: 8
        css_selector: ''
        name: 健康
        url: https://news.qq.com/ch/health
      house:
        category_id: 10
        css_selector: ''
        name: 房产
        url: https://news.qq.com/ch/house/
      tech:
        category_id: 4
        css_selector: ''
        name: 科技
        url: https://news.qq.com/ch/tech
      war:
        category_id: 5
        css_selector: ''
        name: 军事
        url: https://news.qq.com/ch/milite
      # Same URL and category_id as `war`, but with a selector configured.
      war_web:
        category_id: 5
        css_selector: "div[id='channel-feed-area']"
        name: 军事(网页版)
        url: https://news.qq.com/ch/milite
# System-level toggles.
system:
  auto_refresh: true
  log_retention: false
  refresh_interval: 30  # unit not stated here — confirm with the consumer
# Web/API server settings.
web:
  # Origins allowed for cross-origin requests (local dev front-ends).
  cors_origins:
    - http://localhost:5173
    - http://localhost:3000
    - http://127.0.0.1:5173
    - http://127.0.0.1:3000
  debug: false
  # NOTE(review): 0.0.0.0 binds on all interfaces; confirm this is intended
  # outside of containerized/local deployments.
  host: '0.0.0.0'
  log_retention_days: 30
  port: 8000
  task_log_dir: logs/tasks
  ws_ping_interval: 20  # presumably seconds — confirm
  ws_ping_timeout: 20  # presumably seconds — confirm