"""
Tencent military news crawler (web version).

Uses Selenium to load the page dynamically; intended for web-page scraping.
"""

import os
import sys
from typing import List

from bs4 import BeautifulSoup

# Put the directory two levels above this file's own directory on the import
# path (presumably the project root -- verify against the repository layout)
# so the `base` and `parsers` packages imported below can be resolved.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import DynamicCrawler, Article
from parsers.tencent_parser import TencentParser

# Origin used to turn site-relative links ("/rain/a/...") into absolute URLs.
_BASE_URL = "https://news.qq.com"

# Feed items of the military channel. The dt-params*='article_type=0' filter
# drops video news so that only regular articles are selected.
_ARTICLE_ITEM_SELECTOR = (
    "div[id='channel-feed-area'] div.channel-feed-list "
    "div.channel-feed-item[dt-params*='article_type=0']"
)


class WarWebCrawler(DynamicCrawler):
    """Tencent military news crawler (web version)."""

    def _extract_article_urls(self, html: str) -> List[str]:
        """
        Extract the list of article URLs from the channel page HTML.

        Args:
            html: HTML content of the page.

        Returns:
            Absolute article URLs in page order, without duplicates.
        """
        soup = BeautifulSoup(html, "lxml")
        urls: List[str] = []
        seen = set()

        for item in soup.select(_ARTICLE_ITEM_SELECTOR):
            link = item.select_one("a.article-title")
            if link is None:
                continue

            href = link.get("href")
            if not href:
                continue
            href = href.strip()
            if not href:
                continue

            if href.startswith("//"):
                # Protocol-relative link: it already names its host, so only
                # the scheme is missing. This must be checked before the
                # single-slash case, which it would otherwise also match.
                href = f"https:{href}"
            elif href.startswith("/"):
                # Site-relative path: resolve it against the news origin.
                href = f"{_BASE_URL}{href}"

            # Keep first-seen order but never return the same article twice,
            # so one article is not fetched and parsed repeatedly.
            if href not in seen:
                seen.add(href)
                urls.append(href)

        return urls

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """
        Fetch and parse the article detail pages.

        At most ``self.max_articles`` URLs are processed. A failure on one URL
        is logged and does not abort the remaining ones.

        Args:
            urls: List of article URLs.

        Returns:
            The parsed articles for which ``is_valid()`` returned a true value.
        """
        articles: List[Article] = []
        parser = TencentParser()

        # Slice once so the progress total reflects what is really processed,
        # not the full candidate list.
        selected = urls[:self.max_articles]
        total = len(selected)

        for position, url in enumerate(selected, start=1):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "腾讯"

                # Fall back to a channel-level author when the parser found none.
                if not article.author:
                    article.author = "腾讯军事"

                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{position}/{total}] {article.title}")
            except Exception as e:
                # Best-effort crawl: record the failure and move on.
                self.logger.error(f"解析文章失败: {url} - {e}")

        return articles