news-classifier/crawler-module/src/cli/main.py

255 lines
6.7 KiB
Python

"""
命令行接口
提供统一的爬虫启动入口
"""
import argparse
import sys
from typing import List
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from config.settings import config
from utils.logger import get_logger, Logger
from database.repository import NewsRepository
from database.models import NewsModel
# 爬虫类映射
CRAWLER_CLASSES = {
'netease': {
'entertainment': ('crawlers.netease.entertainment', 'EntertainmentCrawler'),
'tech': ('crawlers.netease.tech', 'TechCrawler'),
'sports': ('crawlers.netease.sports', 'SportsCrawler'),
'money': ('crawlers.netease.money', 'MoneyCrawler'),
'auto': ('crawlers.netease.auto', 'AutoCrawler'),
'gov': ('crawlers.netease.gov', 'GovCrawler'),
'health': ('crawlers.netease.health', 'HealthCrawler'),
'war': ('crawlers.netease.war', 'WarCrawler'),
},
'kr36': {
'ai': ('crawlers.kr36.ai', 'AICrawler'),
'health': ('crawlers.kr36.health', 'HealthCrawler'),
},
'sina': {
'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
'gov': ('crawlers.sina.gov', 'SinaGovCrawler'),
},
'tencent': {
'auto': ('crawlers.tencent.auto', 'AutoCrawler'),
'war': ('crawlers.tencent.war', 'WarCrawler'),
'war_web': ('crawlers.tencent.war_web', 'WarWebCrawler'),
},
}
def init_logging():
"""初始化日志系统"""
Logger.get_logger("news-crawler")
def list_crawlers() -> List[str]:
"""列出所有可用的爬虫"""
crawlers = []
# 网易爬虫
netease_categories = config.get('sources.netease.categories', {})
for category in netease_categories.keys():
crawlers.append(f"netease:{category}")
# 36氪爬虫
kr36_categories = config.get('sources.kr36.categories', {})
for category in kr36_categories.keys():
crawlers.append(f"kr36:{category}")
# 新浪爬虫
sina_categories = config.get('sources.sina.categories', {})
for category in sina_categories.keys():
crawlers.append(f"sina:{category}")
# 腾讯爬虫
tencent_categories = config.get('sources.tencent.categories', {})
for category in tencent_categories.keys():
crawlers.append(f"tencent:{category}")
return crawlers
def get_crawler_class(source: str, category: str):
"""
动态导入爬虫类
Args:
source: 新闻源
category: 分类
Returns:
爬虫类
"""
if source not in CRAWLER_CLASSES:
raise ValueError(f"不支持的数据源: {source}")
if category not in CRAWLER_CLASSES[source]:
raise ValueError(f"不支持的分类: {source}:{category}")
module_name, class_name = CRAWLER_CLASSES[source][category]
# 动态导入模块
import importlib
module = importlib.import_module(f"src.{module_name}")
return getattr(module, class_name)
def run_crawler(source: str, category: str, max_articles: int = None) -> bool:
"""
运行指定爬虫
Args:
source: 新闻源
category: 分类
max_articles: 最大文章数
Returns:
是否成功
"""
logger = get_logger(__name__)
try:
# 动态导入爬虫类
crawler_class = get_crawler_class(source, category)
# 创建并运行爬虫
crawler = crawler_class(source, category)
print("创建并运行爬虫")
# 覆盖最大文章数
if max_articles:
crawler.max_articles = max_articles
articles = crawler.crawl()
if not articles:
logger.warning(f"未爬取到任何文章")
return False
# 转换为数据模型
news_list = [
NewsModel(
url=article.url,
title=article.title or "",
content=article.content,
category_id=article.category_id,
source=article.source,
publish_time=article.publish_time,
author=article.author,
)
for article in articles
if article.is_valid()
]
# 保存到数据库
repository = NewsRepository()
count = repository.save_news(news_list)
logger.info(f"任务完成,保存了 {count} 条新闻")
return count > 0
except Exception as e:
logger.error(f"运行爬虫失败: {e}", exc_info=True)
return False
def main():
"""主函数"""
parser = argparse.ArgumentParser(
description="新闻爬虫系统",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
示例:
%(prog)s --list # 列出所有可用爬虫
%(prog)s netease:tech # 爬取网易科技新闻
%(prog)s kr36:ai # 爬取36氪AI新闻
%(prog)s netease:tech --max 5 # 爬取5篇网易科技新闻
%(prog)s --all # 运行所有爬虫
"""
)
parser.add_argument(
'crawler',
nargs='?',
help='爬虫名称 (格式: source:category)'
)
parser.add_argument(
'--list',
action='store_true',
help='列出所有可用爬虫'
)
parser.add_argument(
'--all',
action='store_true',
help='运行所有爬虫'
)
parser.add_argument(
'--max',
type=int,
help='最大爬取文章数'
)
parser.add_argument(
'--debug',
action='store_true',
help='开启调试模式'
)
args = parser.parse_args()
# 初始化日志
init_logging()
logger = get_logger(__name__)
# 处理--list参数
if args.list:
print("可用的爬虫:")
for crawler in list_crawlers():
print(f" - {crawler}")
return 0
# 处理--all参数
if args.all:
logger.info("运行所有爬虫...")
crawlers = list_crawlers()
success_count = 0
for crawler_name in crawlers:
source, category = crawler_name.split(':')
logger.info(f"正在运行 {crawler_name}...")
if run_crawler(source, category, args.max):
success_count += 1
logger.info(f"完成: {success_count}/{len(crawlers)} 个爬虫成功")
return 0 if success_count == len(crawlers) else 1
# 处理单个爬虫
if not args.crawler:
parser.print_help()
return 1
try:
source, category = args.crawler.split(':')
except ValueError:
logger.error(f"爬虫名称格式错误,应为 'source:category'")
return 1
if run_crawler(source, category, args.max):
return 0
else:
return 1
if __name__ == "__main__":
sys.exit(main())