210 lines
6.5 KiB
Python
210 lines
6.5 KiB
Python
"""
|
||
腾讯汽车新闻爬虫
|
||
"""
|
||
|
||
import time
|
||
import random
|
||
import hashlib
|
||
from typing import List
|
||
import requests
|
||
|
||
import sys
|
||
import os
|
||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||
|
||
from base.crawler_base import StaticCrawler, Article
|
||
from parsers.tencent_parser import TencentParser
|
||
|
||
|
||
class AutoCrawler(StaticCrawler):
|
||
"""腾讯汽车新闻爬虫"""
|
||
|
||
def __init__(self, source: str, category: str):
|
||
super().__init__(source, category)
|
||
|
||
# 腾讯API配置
|
||
self.api_url = "https://i.news.qq.com/web_feed/getPCList"
|
||
self.channel_id = "news_news_auto" # 汽车频道
|
||
self.seen_ids = set()
|
||
self.item_count = 20 # 每页固定请求20条
|
||
|
||
def _generate_trace_id(self):
|
||
"""生成trace_id"""
|
||
random_str = str(random.random()) + str(time.time())
|
||
return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12]
|
||
|
||
def crawl(self) -> List[Article]:
|
||
"""
|
||
执行爬取任务(重写基类方法以支持API接口)
|
||
|
||
Returns:
|
||
文章列表
|
||
"""
|
||
self.logger.info(f"开始爬取腾讯{self.category_name}新闻")
|
||
|
||
try:
|
||
# 生成设备ID
|
||
device_id = self._generate_trace_id()
|
||
|
||
# 获取文章URL列表
|
||
article_urls = self._fetch_article_urls_from_api(device_id)
|
||
self.logger.info(f"找到 {len(article_urls)} 篇文章")
|
||
|
||
# 爬取文章详情
|
||
articles = self._fetch_articles(article_urls)
|
||
|
||
self.logger.info(f"成功爬取 {len(articles)} 篇文章")
|
||
return articles
|
||
|
||
except Exception as e:
|
||
self.logger.error(f"爬取失败: {e}", exc_info=True)
|
||
return []
|
||
finally:
|
||
self._cleanup()
|
||
|
||
def _fetch_article_urls_from_api(self, device_id: str) -> List[str]:
|
||
"""
|
||
从API获取文章URL列表
|
||
|
||
Args:
|
||
device_id: 设备ID
|
||
|
||
Returns:
|
||
文章URL列表
|
||
"""
|
||
urls = []
|
||
|
||
# 根据 max_articles 动态计算需要抓取的页数
|
||
# 每页20条,向上取整
|
||
import math
|
||
max_pages = math.ceil(self.max_articles / self.item_count)
|
||
self.logger.info(f"根据 max_articles={self.max_articles},计算需要抓取 {max_pages} 页")
|
||
|
||
for flush_num in range(max_pages):
|
||
payload = {
|
||
"base_req": {"from": "pc"},
|
||
"forward": "1",
|
||
"qimei36": device_id,
|
||
"device_id": device_id,
|
||
"flush_num": flush_num + 1,
|
||
"channel_id": self.channel_id,
|
||
"item_count": self.item_count,
|
||
"is_local_chlid": "0"
|
||
}
|
||
|
||
try:
|
||
headers = {
|
||
"User-Agent": self.http_client.session.headers.get("User-Agent"),
|
||
"Referer": "https://new.qq.com/",
|
||
"Origin": "https://new.qq.com",
|
||
"Content-Type": "application/json"
|
||
}
|
||
|
||
response = requests.post(
|
||
self.api_url,
|
||
headers=headers,
|
||
json=payload,
|
||
timeout=10
|
||
)
|
||
|
||
if response.status_code == 200:
|
||
data = response.json()
|
||
if data.get("code") == 0 and "data" in data:
|
||
news_list = data["data"]
|
||
if not news_list:
|
||
self.logger.info("没有更多数据了")
|
||
break
|
||
|
||
# 提取URL
|
||
for item in news_list:
|
||
news_id = item.get("id")
|
||
|
||
# 去重
|
||
if news_id in self.seen_ids:
|
||
continue
|
||
self.seen_ids.add(news_id)
|
||
|
||
# 过滤视频新闻(articletype == "4")
|
||
article_type = item.get("articletype")
|
||
if article_type == "4":
|
||
continue
|
||
|
||
# 提取URL
|
||
url = item.get("link_info", {}).get("url")
|
||
if url:
|
||
urls.append(url)
|
||
|
||
# 如果已经获取到足够的文章数量,提前终止
|
||
if len(urls) >= self.max_articles:
|
||
self.logger.info(f"已获取 {len(urls)} 篇文章,达到目标数量,停止抓取")
|
||
break
|
||
|
||
# 如果外层循环也需要终止
|
||
if len(urls) >= self.max_articles:
|
||
break
|
||
|
||
else:
|
||
self.logger.warning(f"接口返回错误: {data.get('message')}")
|
||
else:
|
||
self.logger.warning(f"HTTP请求失败: {response.status_code}")
|
||
|
||
except Exception as e:
|
||
self.logger.error(f"获取API数据失败: {e}")
|
||
|
||
# 延迟,避免请求过快
|
||
time.sleep(random.uniform(1, 2))
|
||
|
||
return urls
|
||
|
||
def _fetch_page(self) -> str:
|
||
"""
|
||
获取页面HTML(腾讯爬虫不使用此方法)
|
||
|
||
Returns:
|
||
空字符串
|
||
"""
|
||
return ""
|
||
|
||
def _extract_article_urls(self, html: str) -> List[str]:
|
||
"""
|
||
从HTML中提取文章URL列表(腾讯爬虫不使用此方法)
|
||
|
||
Args:
|
||
html: 页面HTML内容
|
||
|
||
Returns:
|
||
空列表
|
||
"""
|
||
return []
|
||
|
||
def _fetch_articles(self, urls: List[str]) -> List[Article]:
|
||
"""
|
||
爬取文章详情
|
||
|
||
Args:
|
||
urls: 文章URL列表
|
||
|
||
Returns:
|
||
文章列表
|
||
"""
|
||
articles = []
|
||
parser = TencentParser()
|
||
|
||
for i, url in enumerate(urls[:self.max_articles]):
|
||
try:
|
||
article = parser.parse(url)
|
||
article.category_id = self.category_id
|
||
article.source = "腾讯"
|
||
|
||
if not article.author:
|
||
article.author = "腾讯汽车"
|
||
|
||
if article.is_valid():
|
||
articles.append(article)
|
||
self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
|
||
|
||
except Exception as e:
|
||
self.logger.error(f"解析文章失败: {url} - {e}")
|
||
continue
|
||
|
||
return articles |