From 4cb71256e6ccd43531e1b88ce280c2db3f56340f Mon Sep 17 00:00:00 2001 From: shenjianZ Date: Sat, 17 Jan 2026 09:02:41 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=96=B0=E5=A2=9E=E7=88=AC=E8=99=AB?= =?UTF-8?q?=E7=BB=9F=E8=AE=A1=E5=8A=9F=E8=83=BD=E3=80=81=E5=A4=9A=E7=88=AC?= =?UTF-8?q?=E8=99=AB=E6=94=AF=E6=8C=81=E5=8F=8A=E8=85=BE=E8=AE=AF=E8=B4=A2?= =?UTF-8?q?=E7=BB=8FAPI=E7=88=AC=E8=99=AB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 主要更新: 1. 新增统计展示功能 - 添加 CrawlerStats 数据类,记录爬取/插入/重复数量 - run_crawler() 返回详细统计信息而非简单布尔值 - 新增 display_stats() 函数,支持单个/汇总两种展示格式 - 自动按数据源分组展示统计信息 2. CLI支持多爬虫运行 - 修改 crawler 参数支持多个值(nargs='*') - 支持三种运行方式:单个爬虫、多个爬虫、--all全部爬虫 - 自动识别单个/多个场景并切换展示格式 3. 新增腾讯财经API爬虫 - 创建 src/crawlers/tencent/finance.py - 使用腾讯新闻 API 接口,性能优于Selenium爬虫 - channel_id: news_news_finance - 支持 API 分页和去重 4. 更新配置和文档 - config.yaml 新增腾讯财经分类配置(category_id: 3) - 更新《添加新爬虫指南》v2.0,包含API爬虫示例和统计功能说明 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- crawler-module/config/config.yaml | 16 ++ crawler-module/docs/添加新爬虫指南.md | 267 ++++++++++++++++-- crawler-module/src/cli/main.py | 212 ++++++++++++-- .../src/crawlers/tencent/finance.py | 211 ++++++++++++++ 4 files changed, 654 insertions(+), 52 deletions(-) create mode 100644 crawler-module/src/crawlers/tencent/finance.py diff --git a/crawler-module/config/config.yaml b/crawler-module/config/config.yaml index 1cf3125..bb1ad19 100644 --- a/crawler-module/config/config.yaml +++ b/crawler-module/config/config.yaml @@ -164,3 +164,19 @@ sources: category_id: 4 name: "科技" css_selector: "" + entertainment: + url: "https://news.qq.com/ch/ent" + category_id: 1 + name: "娱乐" + css_selector: "" + finance: + url: "https://news.qq.com/ch/finance" + category_id: 3 + name: "财经" + css_selector: "" + ai: + url: "https://i.news.qq.com/gw/pc_search/result" + category_id: 9 + name: "AI" + css_selector: "" + # 注意:此分类通过搜索接口获取数据,而非正常的分类列表接口 diff --git a/crawler-module/docs/添加新爬虫指南.md b/crawler-module/docs/添加新爬虫指南.md index ba4ec27..b7ced3a 100644 --- a/crawler-module/docs/添加新爬虫指南.md +++ b/crawler-module/docs/添加新爬虫指南.md @@ -49,6 +49,20 @@ crawler-module/ 3. **配置驱动模式**: 通过 YAML 配置文件管理爬虫参数 4. **工厂模式**: CLI 通过动态导入创建爬虫实例 +### 爬虫类型 + +| 类型 | 基类 | 适用场景 | 依赖 | 示例 | +|------|------|----------|------|------| +| **API爬虫** | `StaticCrawler` | 有数据API接口 | requests | 腾讯科技/财经 | +| **静态爬虫** | `StaticCrawler` | HTML直接渲染 | requests | 简单网站 | +| **动态爬虫** | `DynamicCrawler` | JS动态加载 | Selenium | 网易/36氪 | + +### 新增功能(v2.0) + +- **多爬虫运行**: 支持同时运行多个指定爬虫 +- **统计展示**: 自动展示爬取数、插入数、重复数 +- **分组统计**: 按数据源分组展示汇总信息 + --- ## 添加新爬虫的完整流程 @@ -77,7 +91,8 @@ crawler-module/ 在编写代码之前,需要分析目标网站的以下信息: -#### 1.1 确定网站类型 +#### 1.1 确定网站类型和爬虫方式 +- **API接口**: 网站提供数据API(优先选择,性能最好) - **静态网站**: 内容直接在 HTML 中,使用 `StaticCrawler` - **动态网站**: 内容通过 JavaScript 加载,使用 `DynamicCrawler` @@ -195,8 +210,26 @@ class TechCrawler(DynamicCrawler): #### 2.3 爬虫类说明 **继承基类选择**: -- `DynamicCrawler`: 使用 Selenium,适合动态网站 -- `StaticCrawler`: 使用 requests,适合静态网站 +- `DynamicCrawler`: 使用 Selenium,适合动态网站(需滚动加载) +- `StaticCrawler`: 使用 requests,适合静态网站或API接口 + +**三种实现方式**: + +1. **API爬虫**(推荐,性能最好) + - 继承 `StaticCrawler` + - 重写 `crawl()` 方法 + - 直接调用API接口获取数据 + - 参考:`src/crawlers/tencent/tech.py` + +2. **静态爬虫** + - 继承 `StaticCrawler` + - 实现 `_extract_article_urls()` 和 `_fetch_articles()` + - 使用 BeautifulSoup 解析HTML + +3. 
**动态爬虫** + - 继承 `DynamicCrawler` + - 实现 `_extract_article_urls()` 和 `_fetch_articles()` + - 使用 Selenium 自动化浏览器 **必须实现的方法**: - `_extract_article_urls(html)`: 从列表页提取文章 URL @@ -414,26 +447,81 @@ CRAWLER_CLASSES = { ### 步骤 6: 测试和调试 -#### 6.1 运行单个爬虫 +#### 6.1 运行爬虫 +**方式1:单个爬虫** ```bash -# 进入项目目录 -cd D:\tmp\write\news-classifier\crawler-module - -# 运行新爬虫 python -m src.cli.main example:tech - -# 限制爬取数量 python -m src.cli.main example:tech --max 3 ``` -#### 6.2 列出所有爬虫 +**方式2:多个爬虫**(v2.0新增) +```bash +python -m src.cli.main example:tech example:finance netease:tech --max 5 +``` + +**方式3:所有爬虫** +```bash +python -m src.cli.main --all +python -m src.cli.main --all --max 5 +``` + +#### 6.2 统计信息展示(v2.0新增) + +**单个爬虫输出:** +``` +============================================================ +爬虫统计: example:tech +============================================================ +状态: [成功] +爬取数量: 10 篇 +插入数量: 8 条 +重复数量: 2 条 +============================================================ +``` + +**多个爬虫输出:** +``` +================================================================================ +爬虫任务汇总统计 +================================================================================ + +【EXAMPLE】 +-------------------------------------------------------------------------------- +分类 状态 爬取 插入 重复 +-------------------------------------------------------------------------------- +tech [成功] 10 8 2 +finance [成功] 10 9 1 +-------------------------------------------------------------------------------- +小计 2/2 成功 20 17 3 + +【NETEASE】 +-------------------------------------------------------------------------------- +分类 状态 爬取 插入 重复 +-------------------------------------------------------------------------------- +tech [成功] 10 10 0 +-------------------------------------------------------------------------------- +小计 1/1 成功 10 10 0 + +================================================================================ +总计统计 +================================================================================ +总爬虫数: 3 +成功数: 3 +失败数: 0 +总爬取: 30 篇 +总插入: 27 条 +总重复: 3 条 +================================================================================ +``` + +#### 6.3 列出所有爬虫 ```bash python -m src.cli.main --list ``` -应该能看到新添加的爬虫: +输出: ``` 可用的爬虫: - netease:entertainment @@ -443,11 +531,13 @@ python -m src.cli.main --list - example:entertainment ``` -#### 6.3 查看日志 +#### 6.4 查看日志 ```bash # 日志文件位置 type logs\crawler.log +# Linux/Mac: +tail -f logs/crawler.log ``` #### 6.4 调试技巧 @@ -483,6 +573,126 @@ for article in articles: ## 示例代码 +### 示例1:API爬虫(腾讯财经) + +**适用场景**: 网站提供数据API接口,性能最好 + +**爬虫类**: `src/crawlers/tencent/finance.py` + +```python +""" +腾讯财经新闻爬虫(API版) +""" +import time +import random +import hashlib +from typing import List +import requests + +from base.crawler_base import StaticCrawler, Article +from parsers.tencent_parser import TencentParser + +class FinanceCrawler(StaticCrawler): + """腾讯财经新闻爬虫(API版)""" + + def __init__(self, source: str, category: str): + super().__init__(source, category) + + # 腾讯API配置 + self.api_url = "https://i.news.qq.com/web_feed/getPCList" + self.channel_id = "news_news_finance" # 财经频道ID + self.seen_ids = set() + self.item_count = 20 # 每页固定请求20条 + + def crawl(self) -> List[Article]: + """执行爬取任务(重写基类方法以支持API接口)""" + self.logger.info(f"开始爬取腾讯{self.category_name}新闻") + + try: + device_id = self._generate_trace_id() + article_urls = self._fetch_article_urls_from_api(device_id) + self.logger.info(f"找到 {len(article_urls)} 篇文章") + + articles = self._fetch_articles(article_urls) + self.logger.info(f"成功爬取 {len(articles)} 篇文章") + 
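+            # Note: crawl() only returns the Article list; DB insertion, dedup
+            # and CrawlerStats reporting are handled by run_crawler() in the CLI.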
return articles + + except Exception as e: + self.logger.error(f"爬取失败: {e}", exc_info=True) + return [] + finally: + self._cleanup() + + def _fetch_article_urls_from_api(self, device_id: str) -> List[str]: + """从API获取文章URL列表""" + urls = [] + import math + max_pages = math.ceil(self.max_articles / self.item_count) + + for flush_num in range(max_pages): + payload = { + "base_req": {"from": "pc"}, + "forward": "1", + "qimei36": device_id, + "device_id": device_id, + "flush_num": flush_num + 1, + "channel_id": self.channel_id, + "item_count": self.item_count, + "is_local_chlid": "0" + } + + try: + response = requests.post(self.api_url, json=payload, timeout=10) + + if response.status_code == 200: + data = response.json() + if data.get("code") == 0 and "data" in data: + news_list = data["data"] + for item in news_list: + news_id = item.get("id") + if news_id not in self.seen_ids: + self.seen_ids.add(news_id) + url = item.get("link_info", {}).get("url") + if url: + urls.append(url) + if len(urls) >= self.max_articles: + break + if len(urls) >= self.max_articles: + break + + except Exception as e: + self.logger.error(f"获取API数据失败: {e}") + + time.sleep(random.uniform(1, 2)) + + return urls + + def _generate_trace_id(self): + """生成trace_id""" + random_str = str(random.random()) + str(time.time()) + return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12] +``` + +**配置**: `config/config.yaml` +```yaml +tencent: + categories: + finance: + url: "https://news.qq.com/ch/finance" + category_id: 3 + name: "财经" + css_selector: "" +``` + +**运行**: +```bash +python -m src.cli.main tencent:finance --max 5 +``` + +--- + +### 示例2:动态爬虫(网易科技) + ### 完整示例:添加新浪娱乐爬虫 假设我们要为新浪网站添加娱乐分类爬虫: @@ -772,16 +982,25 @@ GROUP BY url HAVING count > 1; ``` -### Q8: 如何批量运行所有爬虫? +### Q8: 如何批量运行爬虫? + +**v2.0支持三种运行方式:** ```bash -# 运行所有爬虫 -python -m src.cli.main --all +# 1. 单个爬虫 +python -m src.cli.main tencent:finance --max 5 -# 限制每个爬虫的数量 +# 2. 指定多个爬虫(跨数据源) +python -m src.cli.main tencent:finance tencent:tech netease:tech --max 3 + +# 3. 所有爬虫 python -m src.cli.main --all --max 5 ``` +**统计功能自动启用:** +- 单个爬虫:显示简明统计 +- 多个爬虫:显示按数据源分组的汇总统计 + ### Q9: 如何修改最大爬取数量? **方法 1: 命令行参数** @@ -913,17 +1132,21 @@ PyYAML>=6.0 添加新爬虫的核心步骤: -1. ✅ 分析目标网站结构 +1. ✅ 分析目标网站结构(确定使用API/静态/动态方式) 2. ✅ 创建爬虫类(继承 `DynamicCrawler` 或 `StaticCrawler`) 3. ✅ 创建解析器类(继承 `BaseParser`) 4. ✅ 更新配置文件(`config.yaml`) 5. ✅ 注册爬虫到 CLI(`src/cli/main.py`) -6. ✅ 测试运行 +6. 
✅ 测试运行(单个/多个/全部) 遵循本指南,您可以为新闻爬虫系统添加任意数量的新网站和分类爬虫。 --- -**文档版本**: 1.0 -**最后更新**: 2026-01-15 -**维护者**: 新闻爬虫项目组 \ No newline at end of file +**文档版本**: 2.0 +**最后更新**: 2026-01-17 +**维护者**: 新闻爬虫项目组 + +**版本更新说明:** +- v2.0: 新增API爬虫类型、多爬虫支持、统计展示功能 +- v1.0: 初始版本 \ No newline at end of file diff --git a/crawler-module/src/cli/main.py b/crawler-module/src/cli/main.py index aa90c9c..5978946 100644 --- a/crawler-module/src/cli/main.py +++ b/crawler-module/src/cli/main.py @@ -5,7 +5,9 @@ import argparse import sys -from typing import List +from typing import List, Optional, Union +from dataclasses import dataclass +from collections import defaultdict import os sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -43,10 +45,29 @@ CRAWLER_CLASSES = { 'health': ('crawlers.tencent.health', 'HealthCrawler'), 'house': ('crawlers.tencent.house', 'HouseCrawler'), 'tech': ('crawlers.tencent.tech', 'TechCrawler'), + 'entertainment': ('crawlers.tencent.entertainment', 'EntertainmentCrawler'), + 'finance': ('crawlers.tencent.finance', 'FinanceCrawler'), + 'ai': ('crawlers.tencent.ai', 'SearchAICrawler'), }, } +@dataclass +class CrawlerStats: + """单个爬虫的统计信息""" + source: str + category: str + success: bool + crawled_count: int # 爬取的文章数 + inserted_count: int # 插入成功的文章数 + duplicate_count: int # 重复的文章数 + error: Optional[str] = None + + @property + def crawler_name(self) -> str: + return f"{self.source}:{self.category}" + + def init_logging(): """初始化日志系统""" Logger.get_logger("news-crawler") @@ -104,7 +125,7 @@ def get_crawler_class(source: str, category: str): return getattr(module, class_name) -def run_crawler(source: str, category: str, max_articles: int = None) -> bool: +def run_crawler(source: str, category: str, max_articles: int = None) -> CrawlerStats: """ 运行指定爬虫 @@ -114,7 +135,7 @@ def run_crawler(source: str, category: str, max_articles: int = None) -> bool: max_articles: 最大文章数 Returns: - 是否成功 + CrawlerStats: 统计信息对象 """ logger = get_logger(__name__) @@ -128,13 +149,22 @@ def run_crawler(source: str, category: str, max_articles: int = None) -> bool: # 覆盖最大文章数 if max_articles: crawler.max_articles = max_articles - + articles = crawler.crawl() + crawled_count = len(articles) if not articles: logger.warning(f"未爬取到任何文章") - return False + return CrawlerStats( + source=source, + category=category, + success=False, + crawled_count=0, + inserted_count=0, + duplicate_count=0, + error="未爬取到任何文章" + ) # 转换为数据模型 news_list = [ @@ -151,16 +181,119 @@ def run_crawler(source: str, category: str, max_articles: int = None) -> bool: if article.is_valid() ] + valid_count = len(news_list) + # 保存到数据库 repository = NewsRepository() - count = repository.save_news(news_list) + inserted_count = repository.save_news(news_list) + duplicate_count = valid_count - inserted_count - logger.info(f"任务完成,保存了 {count} 条新闻") - return count > 0 + success = inserted_count > 0 + + logger.info(f"任务完成,爬取 {crawled_count} 篇,保存 {inserted_count} 条") + + return CrawlerStats( + source=source, + category=category, + success=success, + crawled_count=crawled_count, + inserted_count=inserted_count, + duplicate_count=duplicate_count + ) except Exception as e: logger.error(f"运行爬虫失败: {e}", exc_info=True) - return False + return CrawlerStats( + source=source, + category=category, + success=False, + crawled_count=0, + inserted_count=0, + duplicate_count=0, + error=str(e) + ) + + +def display_stats(stats: Union[CrawlerStats, List[CrawlerStats]]): + """ + 展示统计信息 + + Args: + stats: 单个统计对象或统计列表 + """ + if isinstance(stats, CrawlerStats): + # 
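+        # A single CrawlerStats prints the compact per-crawler format;
+        # a List[CrawlerStats] falls through to the grouped summary branch below.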
单个爬虫的统计信息 + print("\n" + "="*60) + print(f"爬虫统计: {stats.crawler_name}") + print("="*60) + print(f"状态: {'[成功]' if stats.success else '[失败]'}") + print(f"爬取数量: {stats.crawled_count} 篇") + print(f"插入数量: {stats.inserted_count} 条") + if stats.duplicate_count > 0: + print(f"重复数量: {stats.duplicate_count} 条") + if stats.error: + print(f"错误信息: {stats.error}") + print("="*60 + "\n") + + elif isinstance(stats, list) and len(stats) > 0: + # 多个爬虫的统计信息(汇总) + print("\n" + "="*80) + print("爬虫任务汇总统计") + print("="*80) + + # 按数据源分组 + grouped = defaultdict(list) + for stat in stats: + grouped[stat.source].append(stat) + + # 按数据源展示 + for source, source_stats in grouped.items(): + print(f"\n【{source.upper()}】") + print("-"*80) + + # 表头 + print(f"{'分类':<12} {'状态':<8} {'爬取':<8} {'插入':<8} {'重复':<8}") + print("-"*80) + + total_crawled = 0 + total_inserted = 0 + total_duplicate = 0 + success_count = 0 + + for stat in source_stats: + status = "[成功]" if stat.success else "[失败]" + print(f"{stat.category:<12} {status:<8} " + f"{stat.crawled_count:<8} {stat.inserted_count:<8} " + f"{stat.duplicate_count:<8}") + + total_crawled += stat.crawled_count + total_inserted += stat.inserted_count + total_duplicate += stat.duplicate_count + if stat.success: + success_count += 1 + + # 汇总行 + print("-"*80) + print(f"{'小计':<12} {success_count}/{len(source_stats)} 成功 " + f" {total_crawled:<8} {total_inserted:<8} {total_duplicate:<8}") + + # 总计 + print("\n" + "="*80) + print("总计统计") + print("="*80) + + total_crawled_all = sum(s.crawled_count for s in stats) + total_inserted_all = sum(s.inserted_count for s in stats) + total_duplicate_all = sum(s.duplicate_count for s in stats) + success_count_all = sum(1 for s in stats if s.success) + + print(f"总爬虫数: {len(stats)}") + print(f"成功数: {success_count_all}") + print(f"失败数: {len(stats) - success_count_all}") + print(f"总爬取: {total_crawled_all} 篇") + print(f"总插入: {total_inserted_all} 条") + print(f"总重复: {total_duplicate_all} 条") + print("="*80 + "\n") def main(): @@ -170,18 +303,18 @@ def main(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" 示例: - %(prog)s --list # 列出所有可用爬虫 - %(prog)s netease:tech # 爬取网易科技新闻 - %(prog)s kr36:ai # 爬取36氪AI新闻 - %(prog)s netease:tech --max 5 # 爬取5篇网易科技新闻 - %(prog)s --all # 运行所有爬虫 + %(prog)s --list # 列出所有可用爬虫 + %(prog)s netease:tech # 爬取单个网易科技新闻 + %(prog)s netease:tech kr36:ai tencent:tech # 爬取多个指定的爬虫 + %(prog)s --all # 运行所有爬虫 + %(prog)s netease:tech --max 5 # 爬取5篇网易科技新闻 """ ) parser.add_argument( 'crawler', - nargs='?', - help='爬虫名称 (格式: source:category)' + nargs='*', + help='爬虫名称 (格式: source:category),可指定多个' ) parser.add_argument( @@ -225,32 +358,51 @@ def main(): if args.all: logger.info("运行所有爬虫...") crawlers = list_crawlers() - success_count = 0 + all_stats = [] for crawler_name in crawlers: source, category = crawler_name.split(':') logger.info(f"正在运行 {crawler_name}...") - if run_crawler(source, category, args.max): - success_count += 1 + stats = run_crawler(source, category, args.max) + all_stats.append(stats) - logger.info(f"完成: {success_count}/{len(crawlers)} 个爬虫成功") - return 0 if success_count == len(crawlers) else 1 + # 展示汇总统计 + display_stats(all_stats) - # 处理单个爬虫 + # 返回码:全部成功返回0,否则返回1 + return 0 if all(s.success for s in all_stats) else 1 + + # 处理爬虫列表(支持单个或多个) if not args.crawler: parser.print_help() return 1 - try: - source, category = args.crawler.split(':') - except ValueError: - logger.error(f"爬虫名称格式错误,应为 'source:category'") - return 1 + # 验证爬虫格式 + crawler_list = [] + for crawler_name in args.crawler: + try: + source, category = 
crawler_name.split(':') + crawler_list.append((source, category)) + except ValueError: + logger.error(f"爬虫名称格式错误: '{crawler_name}',应为 'source:category'") + return 1 - if run_crawler(source, category, args.max): - return 0 + # 运行爬虫并收集统计 + all_stats = [] + for source, category in crawler_list: + crawler_name = f"{source}:{category}" + logger.info(f"正在运行 {crawler_name}...") + stats = run_crawler(source, category, args.max) + all_stats.append(stats) + + # 展示统计(单个或汇总) + if len(all_stats) == 1: + display_stats(all_stats[0]) else: - return 1 + display_stats(all_stats) + + # 返回码:全部成功返回0,否则返回1 + return 0 if all(s.success for s in all_stats) else 1 if __name__ == "__main__": diff --git a/crawler-module/src/crawlers/tencent/finance.py b/crawler-module/src/crawlers/tencent/finance.py new file mode 100644 index 0000000..8046e12 --- /dev/null +++ b/crawler-module/src/crawlers/tencent/finance.py @@ -0,0 +1,211 @@ +""" +腾讯财经新闻爬虫(API版) +使用腾讯新闻 API 接口获取数据,性能更好 +""" + +import time +import random +import hashlib +from typing import List +import requests + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from base.crawler_base import StaticCrawler, Article +from parsers.tencent_parser import TencentParser + + +class FinanceCrawler(StaticCrawler): + """腾讯财经新闻爬虫(API版)""" + + def __init__(self, source: str, category: str): + super().__init__(source, category) + + # 腾讯API配置 + self.api_url = "https://i.news.qq.com/web_feed/getPCList" + self.channel_id = "news_news_finance" # 财经频道 + self.seen_ids = set() + self.item_count = 20 # 每页固定请求20条 + + def _generate_trace_id(self): + """生成trace_id""" + random_str = str(random.random()) + str(time.time()) + return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12] + + def crawl(self) -> List[Article]: + """ + 执行爬取任务(重写基类方法以支持API接口) + + Returns: + 文章列表 + """ + self.logger.info(f"开始爬取腾讯{self.category_name}新闻") + + try: + # 生成设备ID + device_id = self._generate_trace_id() + + # 获取文章URL列表 + article_urls = self._fetch_article_urls_from_api(device_id) + self.logger.info(f"找到 {len(article_urls)} 篇文章") + + # 爬取文章详情 + articles = self._fetch_articles(article_urls) + + self.logger.info(f"成功爬取 {len(articles)} 篇文章") + return articles + + except Exception as e: + self.logger.error(f"爬取失败: {e}", exc_info=True) + return [] + finally: + self._cleanup() + + def _fetch_article_urls_from_api(self, device_id: str) -> List[str]: + """ + 从API获取文章URL列表 + + Args: + device_id: 设备ID + + Returns: + 文章URL列表 + """ + urls = [] + + # 根据 max_articles 动态计算需要抓取的页数 + # 每页20条,向上取整 + import math + max_pages = math.ceil(self.max_articles / self.item_count) + self.logger.info(f"根据 max_articles={self.max_articles},计算需要抓取 {max_pages} 页") + + for flush_num in range(max_pages): + payload = { + "base_req": {"from": "pc"}, + "forward": "1", + "qimei36": device_id, + "device_id": device_id, + "flush_num": flush_num + 1, + "channel_id": self.channel_id, + "item_count": self.item_count, + "is_local_chlid": "0" + } + + try: + headers = { + "User-Agent": self.http_client.session.headers.get("User-Agent"), + "Referer": "https://new.qq.com/", + "Origin": "https://new.qq.com", + "Content-Type": "application/json" + } + + response = requests.post( + self.api_url, + headers=headers, + json=payload, + timeout=10 + ) + + if response.status_code == 200: + data = response.json() + if data.get("code") == 0 and "data" in data: + news_list = data["data"] + if not news_list: + self.logger.info("没有更多数据了") + break + + # 提取URL + for item in news_list: + news_id = 
item.get("id") + + # 去重 + if news_id in self.seen_ids: + continue + self.seen_ids.add(news_id) + + # 过滤视频新闻(articletype == "4") + article_type = item.get("articletype") + if article_type == "4": + continue + + # 提取URL + url = item.get("link_info", {}).get("url") + if url: + urls.append(url) + + # 如果已经获取到足够的文章数量,提前终止 + if len(urls) >= self.max_articles: + self.logger.info(f"已获取 {len(urls)} 篇文章,达到目标数量,停止抓取") + break + + # 如果外层循环也需要终止 + if len(urls) >= self.max_articles: + break + + else: + self.logger.warning(f"接口返回错误: {data.get('message')}") + else: + self.logger.warning(f"HTTP请求失败: {response.status_code}") + + except Exception as e: + self.logger.error(f"获取API数据失败: {e}") + + # 延迟,避免请求过快 + time.sleep(random.uniform(1, 2)) + + return urls + + def _fetch_page(self) -> str: + """ + 获取页面HTML(腾讯爬虫不使用此方法) + + Returns: + 空字符串 + """ + return "" + + def _extract_article_urls(self, html: str) -> List[str]: + """ + 从HTML中提取文章URL列表(腾讯爬虫不使用此方法) + + Args: + html: 页面HTML内容 + + Returns: + 空列表 + """ + return [] + + def _fetch_articles(self, urls: List[str]) -> List[Article]: + """ + 爬取文章详情 + + Args: + urls: 文章URL列表 + + Returns: + 文章列表 + """ + articles = [] + parser = TencentParser() + + for i, url in enumerate(urls[:self.max_articles]): + try: + article = parser.parse(url) + article.category_id = self.category_id + article.source = "腾讯" + + if not article.author: + article.author = "腾讯财经" + + if article.is_valid(): + articles.append(article) + self.logger.info(f"[{i+1}/{len(urls)}] {article.title}") + + except Exception as e: + self.logger.error(f"解析文章失败: {url} - {e}") + continue + + return articles
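
Below is a minimal standalone sketch for smoke-testing the Tencent feed endpoint that the new FinanceCrawler depends on, without touching the project's config or database. The endpoint URL, channel_id, payload fields, and response handling mirror what finance.py above sends; the exact response schema ("code", "data", "link_info.url") is assumed to behave as the crawler already expects and is not independently verified.

```python
"""Standalone smoke test for the Tencent PC feed API used by FinanceCrawler."""
import hashlib
import random
import time

import requests

API_URL = "https://i.news.qq.com/web_feed/getPCList"
CHANNEL_ID = "news_news_finance"  # finance channel id, same as finance.py


def generate_trace_id() -> str:
    """Same device-id scheme as FinanceCrawler._generate_trace_id()."""
    random_str = str(random.random()) + str(time.time())
    return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12]


def fetch_one_page(flush_num: int = 1, item_count: int = 20) -> list:
    """POST one page of the feed and return the raw item list."""
    device_id = generate_trace_id()
    payload = {
        "base_req": {"from": "pc"},
        "forward": "1",
        "qimei36": device_id,
        "device_id": device_id,
        "flush_num": flush_num,
        "channel_id": CHANNEL_ID,
        "item_count": item_count,
        "is_local_chlid": "0",
    }
    headers = {
        "Referer": "https://new.qq.com/",
        "Origin": "https://new.qq.com",
        "Content-Type": "application/json",
    }
    resp = requests.post(API_URL, json=payload, headers=headers, timeout=10)
    resp.raise_for_status()
    data = resp.json()
    if data.get("code") != 0:
        raise RuntimeError(f"API returned error: {data.get('message')}")
    return data.get("data", [])


if __name__ == "__main__":
    # Print id -> article URL for one page, skipping video items (articletype "4")
    for item in fetch_one_page():
        if item.get("articletype") == "4":
            continue
        url = item.get("link_info", {}).get("url")
        if url:
            print(f"{item.get('id')} -> {url}")
```

If the script prints article URLs, the remaining pieces to verify are the config.yaml entry and the CLI registration, e.g. `python -m src.cli.main tencent:finance --max 5`.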