""" 命令行接口 提供统一的爬虫启动入口 """ import argparse import sys from typing import List import os sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from config.settings import config from utils.logger import get_logger, Logger from database.repository import NewsRepository from database.models import NewsModel # 爬虫类映射 CRAWLER_CLASSES = { 'netease': { 'entertainment': ('crawlers.netease.entertainment', 'EntertainmentCrawler'), 'tech': ('crawlers.netease.tech', 'TechCrawler'), 'sports': ('crawlers.netease.sports', 'SportsCrawler'), 'money': ('crawlers.netease.money', 'MoneyCrawler'), 'auto': ('crawlers.netease.auto', 'AutoCrawler'), 'gov': ('crawlers.netease.gov', 'GovCrawler'), 'health': ('crawlers.netease.health', 'HealthCrawler'), 'war': ('crawlers.netease.war', 'WarCrawler'), }, 'kr36': { 'ai': ('crawlers.kr36.ai', 'AICrawler'), 'health': ('crawlers.kr36.health', 'HealthCrawler'), }, 'sina': { 'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'), 'gov': ('crawlers.sina.gov', 'SinaGovCrawler'), }, 'tencent': { 'auto': ('crawlers.tencent.auto', 'AutoCrawler'), 'war': ('crawlers.tencent.war', 'WarCrawler'), 'war_web': ('crawlers.tencent.war_web', 'WarWebCrawler'), }, } def init_logging(): """初始化日志系统""" Logger.get_logger("news-crawler") def list_crawlers() -> List[str]: """列出所有可用的爬虫""" crawlers = [] # 网易爬虫 netease_categories = config.get('sources.netease.categories', {}) for category in netease_categories.keys(): crawlers.append(f"netease:{category}") # 36氪爬虫 kr36_categories = config.get('sources.kr36.categories', {}) for category in kr36_categories.keys(): crawlers.append(f"kr36:{category}") # 新浪爬虫 sina_categories = config.get('sources.sina.categories', {}) for category in sina_categories.keys(): crawlers.append(f"sina:{category}") # 腾讯爬虫 tencent_categories = config.get('sources.tencent.categories', {}) for category in tencent_categories.keys(): crawlers.append(f"tencent:{category}") return crawlers def get_crawler_class(source: str, category: str): """ 动态导入爬虫类 Args: source: 新闻源 category: 分类 Returns: 爬虫类 """ if source not in CRAWLER_CLASSES: raise ValueError(f"不支持的数据源: {source}") if category not in CRAWLER_CLASSES[source]: raise ValueError(f"不支持的分类: {source}:{category}") module_name, class_name = CRAWLER_CLASSES[source][category] # 动态导入模块 import importlib module = importlib.import_module(f"src.{module_name}") return getattr(module, class_name) def run_crawler(source: str, category: str, max_articles: int = None) -> bool: """ 运行指定爬虫 Args: source: 新闻源 category: 分类 max_articles: 最大文章数 Returns: 是否成功 """ logger = get_logger(__name__) try: # 动态导入爬虫类 crawler_class = get_crawler_class(source, category) # 创建并运行爬虫 crawler = crawler_class(source, category) print("创建并运行爬虫") # 覆盖最大文章数 if max_articles: crawler.max_articles = max_articles articles = crawler.crawl() if not articles: logger.warning(f"未爬取到任何文章") return False # 转换为数据模型 news_list = [ NewsModel( url=article.url, title=article.title or "", content=article.content, category_id=article.category_id, source=article.source, publish_time=article.publish_time, author=article.author, ) for article in articles if article.is_valid() ] # 保存到数据库 repository = NewsRepository() count = repository.save_news(news_list) logger.info(f"任务完成,保存了 {count} 条新闻") return count > 0 except Exception as e: logger.error(f"运行爬虫失败: {e}", exc_info=True) return False def main(): """主函数""" parser = argparse.ArgumentParser( description="新闻爬虫系统", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" 示例: %(prog)s --list # 列出所有可用爬虫 %(prog)s netease:tech # 
爬取网易科技新闻 %(prog)s kr36:ai # 爬取36氪AI新闻 %(prog)s netease:tech --max 5 # 爬取5篇网易科技新闻 %(prog)s --all # 运行所有爬虫 """ ) parser.add_argument( 'crawler', nargs='?', help='爬虫名称 (格式: source:category)' ) parser.add_argument( '--list', action='store_true', help='列出所有可用爬虫' ) parser.add_argument( '--all', action='store_true', help='运行所有爬虫' ) parser.add_argument( '--max', type=int, help='最大爬取文章数' ) parser.add_argument( '--debug', action='store_true', help='开启调试模式' ) args = parser.parse_args() # 初始化日志 init_logging() logger = get_logger(__name__) # 处理--list参数 if args.list: print("可用的爬虫:") for crawler in list_crawlers(): print(f" - {crawler}") return 0 # 处理--all参数 if args.all: logger.info("运行所有爬虫...") crawlers = list_crawlers() success_count = 0 for crawler_name in crawlers: source, category = crawler_name.split(':') logger.info(f"正在运行 {crawler_name}...") if run_crawler(source, category, args.max): success_count += 1 logger.info(f"完成: {success_count}/{len(crawlers)} 个爬虫成功") return 0 if success_count == len(crawlers) else 1 # 处理单个爬虫 if not args.crawler: parser.print_help() return 1 try: source, category = args.crawler.split(':') except ValueError: logger.error(f"爬虫名称格式错误,应为 'source:category'") return 1 if run_crawler(source, category, args.max): return 0 else: return 1 if __name__ == "__main__": sys.exit(main())
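
# ---------------------------------------------------------------------------
# Illustrative sketch only, kept as comments so it has no runtime effect: the
# interface that run_crawler() above assumes every mapped crawler class
# provides. The names ExampleCrawler and ExampleArticle are hypothetical and
# do not exist in this repository; attribute and method names mirror how the
# objects are used in run_crawler().
#
# class ExampleArticle:
#     url: str
#     title: str
#     content: str
#     category_id: int
#     source: str
#     publish_time: str
#     author: str
#
#     def is_valid(self) -> bool:
#         return bool(self.url and self.content)
#
# class ExampleCrawler:
#     def __init__(self, source: str, category: str):
#         self.source = source
#         self.category = category
#         self.max_articles = 10  # may be overridden via the --max option
#
#     def crawl(self) -> list:
#         # Must return a list of article-like objects as described above.
#         return []
# ---------------------------------------------------------------------------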