""" 腾讯汽车新闻爬虫 """ import time import random import hashlib from typing import List import requests import sys import os sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) from base.crawler_base import StaticCrawler, Article from parsers.tencent_parser import TencentParser class AutoCrawler(StaticCrawler): """腾讯汽车新闻爬虫""" def __init__(self, source: str, category: str): super().__init__(source, category) # 腾讯API配置 self.api_url = "https://i.news.qq.com/web_feed/getPCList" self.channel_id = "news_news_auto" # 汽车频道 self.seen_ids = set() self.item_count = 20 # 每页固定请求20条 def _generate_trace_id(self): """生成trace_id""" random_str = str(random.random()) + str(time.time()) return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12] def crawl(self) -> List[Article]: """ 执行爬取任务(重写基类方法以支持API接口) Returns: 文章列表 """ self.logger.info(f"开始爬取腾讯{self.category_name}新闻") try: # 生成设备ID device_id = self._generate_trace_id() # 获取文章URL列表 article_urls = self._fetch_article_urls_from_api(device_id) self.logger.info(f"找到 {len(article_urls)} 篇文章") # 爬取文章详情 articles = self._fetch_articles(article_urls) self.logger.info(f"成功爬取 {len(articles)} 篇文章") return articles except Exception as e: self.logger.error(f"爬取失败: {e}", exc_info=True) return [] finally: self._cleanup() def _fetch_article_urls_from_api(self, device_id: str) -> List[str]: """ 从API获取文章URL列表 Args: device_id: 设备ID Returns: 文章URL列表 """ urls = [] # 根据 max_articles 动态计算需要抓取的页数 # 每页20条,向上取整 import math max_pages = math.ceil(self.max_articles / self.item_count) self.logger.info(f"根据 max_articles={self.max_articles},计算需要抓取 {max_pages} 页") for flush_num in range(max_pages): payload = { "base_req": {"from": "pc"}, "forward": "1", "qimei36": device_id, "device_id": device_id, "flush_num": flush_num + 1, "channel_id": self.channel_id, "item_count": self.item_count, "is_local_chlid": "0" } try: headers = { "User-Agent": self.http_client.session.headers.get("User-Agent"), "Referer": "https://new.qq.com/", "Origin": "https://new.qq.com", "Content-Type": "application/json" } response = requests.post( self.api_url, headers=headers, json=payload, timeout=10 ) if response.status_code == 200: data = response.json() if data.get("code") == 0 and "data" in data: news_list = data["data"] if not news_list: self.logger.info("没有更多数据了") break # 提取URL for item in news_list: news_id = item.get("id") # 去重 if news_id in self.seen_ids: continue self.seen_ids.add(news_id) # 过滤视频新闻(articletype == "4") article_type = item.get("articletype") if article_type == "4": continue # 提取URL url = item.get("link_info", {}).get("url") if url: urls.append(url) # 如果已经获取到足够的文章数量,提前终止 if len(urls) >= self.max_articles: self.logger.info(f"已获取 {len(urls)} 篇文章,达到目标数量,停止抓取") break # 如果外层循环也需要终止 if len(urls) >= self.max_articles: break else: self.logger.warning(f"接口返回错误: {data.get('message')}") else: self.logger.warning(f"HTTP请求失败: {response.status_code}") except Exception as e: self.logger.error(f"获取API数据失败: {e}") # 延迟,避免请求过快 time.sleep(random.uniform(1, 2)) return urls def _fetch_page(self) -> str: """ 获取页面HTML(腾讯爬虫不使用此方法) Returns: 空字符串 """ return "" def _extract_article_urls(self, html: str) -> List[str]: """ 从HTML中提取文章URL列表(腾讯爬虫不使用此方法) Args: html: 页面HTML内容 Returns: 空列表 """ return [] def _fetch_articles(self, urls: List[str]) -> List[Article]: """ 爬取文章详情 Args: urls: 文章URL列表 Returns: 文章列表 """ articles = [] parser = TencentParser() for i, url in enumerate(urls[:self.max_articles]): try: article = parser.parse(url) article.category_id = self.category_id article.source = "腾讯" if not article.author: article.author = "腾讯汽车" if article.is_valid(): articles.append(article) self.logger.info(f"[{i+1}/{len(urls)}] {article.title}") except Exception as e: self.logger.error(f"解析文章失败: {url} - {e}") continue return articles