news-classifier/crawler-module/src/crawlers/tencent/auto.py

210 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
腾讯汽车新闻爬虫
"""
import time
import random
import hashlib
from typing import List
import requests
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from base.crawler_base import StaticCrawler, Article
from parsers.tencent_parser import TencentParser
class AutoCrawler(StaticCrawler):
"""腾讯汽车新闻爬虫"""
def __init__(self, source: str, category: str):
super().__init__(source, category)
# 腾讯API配置
self.api_url = "https://i.news.qq.com/web_feed/getPCList"
self.channel_id = "news_news_auto" # 汽车频道
self.seen_ids = set()
self.item_count = 20 # 每页固定请求20条
def _generate_trace_id(self):
"""生成trace_id"""
random_str = str(random.random()) + str(time.time())
return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12]
def crawl(self) -> List[Article]:
"""
执行爬取任务重写基类方法以支持API接口
Returns:
文章列表
"""
self.logger.info(f"开始爬取腾讯{self.category_name}新闻")
try:
# 生成设备ID
device_id = self._generate_trace_id()
# 获取文章URL列表
article_urls = self._fetch_article_urls_from_api(device_id)
self.logger.info(f"找到 {len(article_urls)} 篇文章")
# 爬取文章详情
articles = self._fetch_articles(article_urls)
self.logger.info(f"成功爬取 {len(articles)} 篇文章")
return articles
except Exception as e:
self.logger.error(f"爬取失败: {e}", exc_info=True)
return []
finally:
self._cleanup()
def _fetch_article_urls_from_api(self, device_id: str) -> List[str]:
"""
从API获取文章URL列表
Args:
device_id: 设备ID
Returns:
文章URL列表
"""
urls = []
# 根据 max_articles 动态计算需要抓取的页数
# 每页20条向上取整
import math
max_pages = math.ceil(self.max_articles / self.item_count)
self.logger.info(f"根据 max_articles={self.max_articles},计算需要抓取 {max_pages}")
for flush_num in range(max_pages):
payload = {
"base_req": {"from": "pc"},
"forward": "1",
"qimei36": device_id,
"device_id": device_id,
"flush_num": flush_num + 1,
"channel_id": self.channel_id,
"item_count": self.item_count,
"is_local_chlid": "0"
}
try:
headers = {
"User-Agent": self.http_client.session.headers.get("User-Agent"),
"Referer": "https://new.qq.com/",
"Origin": "https://new.qq.com",
"Content-Type": "application/json"
}
response = requests.post(
self.api_url,
headers=headers,
json=payload,
timeout=10
)
if response.status_code == 200:
data = response.json()
if data.get("code") == 0 and "data" in data:
news_list = data["data"]
if not news_list:
self.logger.info("没有更多数据了")
break
# 提取URL
for item in news_list:
news_id = item.get("id")
# 去重
if news_id in self.seen_ids:
continue
self.seen_ids.add(news_id)
# 过滤视频新闻articletype == "4"
article_type = item.get("articletype")
if article_type == "4":
continue
# 提取URL
url = item.get("link_info", {}).get("url")
if url:
urls.append(url)
# 如果已经获取到足够的文章数量,提前终止
if len(urls) >= self.max_articles:
self.logger.info(f"已获取 {len(urls)} 篇文章,达到目标数量,停止抓取")
break
# 如果外层循环也需要终止
if len(urls) >= self.max_articles:
break
else:
self.logger.warning(f"接口返回错误: {data.get('message')}")
else:
self.logger.warning(f"HTTP请求失败: {response.status_code}")
except Exception as e:
self.logger.error(f"获取API数据失败: {e}")
# 延迟,避免请求过快
time.sleep(random.uniform(1, 2))
return urls
def _fetch_page(self) -> str:
"""
获取页面HTML腾讯爬虫不使用此方法
Returns:
空字符串
"""
return ""
def _extract_article_urls(self, html: str) -> List[str]:
"""
从HTML中提取文章URL列表腾讯爬虫不使用此方法
Args:
html: 页面HTML内容
Returns:
空列表
"""
return []
def _fetch_articles(self, urls: List[str]) -> List[Article]:
"""
爬取文章详情
Args:
urls: 文章URL列表
Returns:
文章列表
"""
articles = []
parser = TencentParser()
for i, url in enumerate(urls[:self.max_articles]):
try:
article = parser.parse(url)
article.category_id = self.category_id
article.source = "腾讯"
if not article.author:
article.author = "腾讯汽车"
if article.is_valid():
articles.append(article)
self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
except Exception as e:
self.logger.error(f"解析文章失败: {url} - {e}")
continue
return articles