news-classifier/crawler-module/src/crawlers/tencent/auto.py

"""
腾讯汽车新闻爬虫
"""

import time
import random
import hashlib
from typing import List
import requests

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import StaticCrawler, Article
from parsers.tencent_parser import TencentParser


class AutoCrawler(StaticCrawler):
    """腾讯汽车新闻爬虫"""

    def __init__(self, source: str, category: str):
        super().__init__(source, category)

        # 腾讯API配置
        self.api_url = "https://i.news.qq.com/web_feed/getPCList"
        self.channel_id = "news_news_auto"  # 汽车频道
        self.seen_ids = set()
        self.item_count = 20  # 每页固定请求20条

    def _generate_trace_id(self):
        """生成trace_id"""
        random_str = str(random.random()) + str(time.time())
        return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12]

    def crawl(self) -> List[Article]:
        """
        执行爬取任务（重写基类方法以支持API接口）

        Returns:
            文章列表
        """
        self.logger.info(f"开始爬取腾讯{self.category_name}新闻")

        try:
            # 生成设备ID
            device_id = self._generate_trace_id()

            # 获取文章URL列表
            article_urls = self._fetch_article_urls_from_api(device_id)
            self.logger.info(f"找到 {len(article_urls)} 篇文章")

            # 爬取文章详情
            articles = self._fetch_articles(article_urls)

            self.logger.info(f"成功爬取 {len(articles)} 篇文章")
            return articles

        except Exception as e:
            self.logger.error(f"爬取失败: {e}", exc_info=True)
            return []
        finally:
            self._cleanup()

    def _fetch_article_urls_from_api(self, device_id: str) -> List[str]:
        """
        从API获取文章URL列表

        Args:
            device_id: 设备ID

        Returns:
            文章URL列表
        """
        urls = []

        # 根据 max_articles 动态计算需要抓取的页数
        # 每页20条，向上取整
        import math
        max_pages = math.ceil(self.max_articles / self.item_count)
        self.logger.info(f"根据 max_articles={self.max_articles}，计算需要抓取 {max_pages} 页")

        for flush_num in range(max_pages):
            payload = {
                "base_req": {"from": "pc"},
                "forward": "1",
                "qimei36": device_id,
                "device_id": device_id,
                "flush_num": flush_num + 1,
                "channel_id": self.channel_id,
                "item_count": self.item_count,
                "is_local_chlid": "0"
            }

            try:
                headers = {
                    "User-Agent": self.http_client.session.headers.get("User-Agent"),
                    "Referer": "https://new.qq.com/",
                    "Origin": "https://new.qq.com",
                    "Content-Type": "application/json"
                }

                response = requests.post(
                    self.api_url,
                    headers=headers,
                    json=payload,
                    timeout=10
                )

                if response.status_code == 200:
                    data = response.json()
                    if data.get("code") == 0 and "data" in data:
                        news_list = data["data"]
                        if not news_list:
                            self.logger.info("没有更多数据了")
                            break

                        # 提取URL
                        for item in news_list:
                            news_id = item.get("id")

                            # 去重
                            if news_id in self.seen_ids:
                                continue
                            self.seen_ids.add(news_id)

                            # 过滤视频新闻（articletype == "4"）
                            article_type = item.get("articletype")
                            if article_type == "4":
                                continue

                            # 提取URL
                            url = item.get("link_info", {}).get("url")
                            if url:
                                urls.append(url)

                                # 如果已经获取到足够的文章数量，提前终止
                                if len(urls) >= self.max_articles:
                                    self.logger.info(f"已获取 {len(urls)} 篇文章，达到目标数量，停止抓取")
                                    break

                        # 如果外层循环也需要终止
                        if len(urls) >= self.max_articles:
                            break

                    else:
                        self.logger.warning(f"接口返回错误: {data.get('message')}")
                else:
                    self.logger.warning(f"HTTP请求失败: {response.status_code}")

            except Exception as e:
                self.logger.error(f"获取API数据失败: {e}")

            # 延迟，避免请求过快
            time.sleep(random.uniform(1, 2))

        return urls

    def _fetch_page(self) -> str:
        """
        获取页面HTML（腾讯爬虫不使用此方法）

        Returns:
            空字符串
        """
        return ""

    def _extract_article_urls(self, html: str) -> List[str]:
        """
        从HTML中提取文章URL列表（腾讯爬虫不使用此方法）

        Args:
            html: 页面HTML内容

        Returns:
            空列表
        """
        return []

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """
        爬取文章详情

        Args:
            urls: 文章URL列表

        Returns:
            文章列表
        """
        articles = []
        parser = TencentParser()

        for i, url in enumerate(urls[:self.max_articles]):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "腾讯"

                if not article.author:
                    article.author = "腾讯汽车"

                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")

            except Exception as e:
                self.logger.error(f"解析文章失败: {url} - {e}")
                continue

        return articles