news-classifier/crawler-module/src/crawlers/tencent/war_web.py

"""
腾讯军事新闻爬虫(网页版)
使用 Selenium 动态加载页面,适用于网页抓取
"""
import os
import sys
from typing import List

from bs4 import BeautifulSoup

# Put the crawler module's src root on the path so the local imports below resolve.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import DynamicCrawler, Article
from parsers.tencent_parser import TencentParser

class WarWebCrawler(DynamicCrawler):
    """Tencent military news crawler (web version)."""

    def _extract_article_urls(self, html: str) -> List[str]:
        """
        Extract article URLs from the page HTML.

        Args:
            html: Page HTML content.

        Returns:
            List of article URLs.
        """
        soup = BeautifulSoup(html, "lxml")
        urls = []
        # Select the article entries in the military channel feed;
        # dt-params*='article_type=0' filters out video news.
        div_list = soup.select(
            "div[id='channel-feed-area'] div.channel-feed-list "
            "div.channel-feed-item[dt-params*='article_type=0']"
        )
        for div in div_list:
            article_link = div.select_one("a.article-title")
            if article_link:
                href = article_link.get('href')
                if href:
                    # Resolve relative paths against the site root.
                    if href.startswith('/'):
                        href = f"https://news.qq.com{href}"
                    urls.append(href)
        return urls
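
    # For reference, the selector above assumes feed markup roughly of this
    # shape (reconstructed from the selector itself; surrounding attributes
    # and nesting on the live page may differ):
    #
    #   <div id="channel-feed-area">
    #     <div class="channel-feed-list">
    #       <div class="channel-feed-item" dt-params="...article_type=0...">
    #         <a class="article-title" href="/...">Title</a>
    #       </div>
    #     </div>
    #   </div>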
    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """
        Fetch article details.

        Args:
            urls: List of article URLs.

        Returns:
            List of parsed articles.
        """
        articles = []
        parser = TencentParser()
        # Cap the workload at max_articles so the progress counter matches
        # what is actually fetched.
        capped = urls[:self.max_articles]
        for i, url in enumerate(capped):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "腾讯"  # stored metadata stays in Chinese, matching the rest of the pipeline
                if not article.author:
                    article.author = "腾讯军事"
                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(capped)}] {article.title}")
            except Exception as e:
                self.logger.error(f"Failed to parse article: {url} - {e}")
                continue
        return articles
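

# Minimal usage sketch, assuming the DynamicCrawler base class accepts these
# keyword arguments and exposes a run() entry point; neither is shown in this
# file, and the channel URL is illustrative.
if __name__ == "__main__":
    crawler = WarWebCrawler(
        url="https://news.qq.com/ch/milite",  # assumed military-channel URL
        category_id=3,                        # assumed category id for military news
        max_articles=20,
    )
    for article in crawler.run():  # run() is assumed from DynamicCrawler
        print(f"{article.title} - {article.author}")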