---
# News crawler system configuration file.
#
# NOTE(review): this file was reconstructed from a whitespace-collapsed copy.
# Keys and values are unchanged; nesting was inferred from the key order and
# the section comments, so confirm it against the code that loads this file.

# Application settings
app:
  name: "News Crawler"
  version: "2.0.0"
  debug: false
  timezone: "Asia/Shanghai"

# Database settings
database:
  host: "localhost"
  port: 3306
  user: "root"
  # NOTE(review): credentials are stored in plain text here; consider
  # supplying them from the environment or a secret store instead.
  password: "root"
  database: "news"
  charset: "utf8mb4"
  pool_size: 5
  pool_timeout: 30
  connect_timeout: 10

# HTTP request settings
http:
  timeout: 10
  retry_times: 3
  retry_delay: 1
  headers:
    User-Agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"
    Accept-Language: "zh-CN,zh;q=0.9,en;q=0.8"

# Selenium settings
selenium:
  headless: true
  log_level: 3
  window_size: "1920,1080"
  page_load_timeout: 30
  script_timeout: 30
  implicit_wait: 10
  scroll_pause_time: 1.2
  max_scroll_times: 10

# Logging settings
logging:
  level: "INFO"  # DEBUG, INFO, WARNING, ERROR, CRITICAL
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  console: true
  file_enabled: true
  file_path: "logs/crawler.log"
  max_bytes: 10485760  # 10MB
  backup_count: 5

# Crawler settings
crawlers:
  max_articles: 10
  # NOTE(review): a delay of 0 presumably means no pause between requests;
  # confirm this is intended for the target sites.
  delay_between_requests: 0  # seconds
  concurrent_limit: 3

# News source settings
# NOTE(review): placed at top level to match the other sections; the collapsed
# original does not show whether `sources` was nested under `crawlers`.
sources:
  netease:
    base_url: "https://www.163.com"
    categories:
      entertainment:
        url: "https://ent.163.com/"
        category_id: 1
        name: "娱乐"
        css_selector: "ul.newsdata_list"
      tech:
        url: "https://tech.163.com/"
        category_id: 4
        name: "科技"
        css_selector: "ul.newsdata_list"
      sports:
        url: "https://sports.163.com/"
        category_id: 2
        name: "体育"
        css_selector: "ul.channel_news_ul"
      money:
        url: "https://money.163.com/"
        category_id: 3
        name: "财经"
        css_selector: "ul.channel_news_ul"
      auto:
        url: "https://auto.163.com/"
        category_id: 6
        name: "汽车"
        css_selector: "ul.news-list"
      gov:
        url: "https://gov.163.com/"
        category_id: 7
        name: "政务"
        css_selector: "div.datalist"
      health:
        url: "https://jiankang.163.com/"
        category_id: 8
        name: "健康"
        css_selector: "div.column_content.trade"
      war:
        url: "https://war.163.com/"
        category_id: 5
        name: "军事"
        css_selector: "ul.newsdata_list"
  kr36:
    base_url: "https://www.36kr.com"
    categories:
      ai:
        url: "https://www.36kr.com/information/AI/"
        category_id: 9
        name: "AI"
        css_selector: "div.kr-information-left"