255 lines
6.7 KiB
Python
255 lines
6.7 KiB
Python
"""
|
|
命令行接口
|
|
提供统一的爬虫启动入口
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
from typing import List
|
|
|
|
import os
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from config.settings import config
|
|
from utils.logger import get_logger, Logger
|
|
from database.repository import NewsRepository
|
|
from database.models import NewsModel
|
|
|
|
|
|
# 爬虫类映射
|
|
CRAWLER_CLASSES = {
|
|
'netease': {
|
|
'entertainment': ('crawlers.netease.entertainment', 'EntertainmentCrawler'),
|
|
'tech': ('crawlers.netease.tech', 'TechCrawler'),
|
|
'sports': ('crawlers.netease.sports', 'SportsCrawler'),
|
|
'money': ('crawlers.netease.money', 'MoneyCrawler'),
|
|
'auto': ('crawlers.netease.auto', 'AutoCrawler'),
|
|
'gov': ('crawlers.netease.gov', 'GovCrawler'),
|
|
'health': ('crawlers.netease.health', 'HealthCrawler'),
|
|
'war': ('crawlers.netease.war', 'WarCrawler'),
|
|
},
|
|
'kr36': {
|
|
'ai': ('crawlers.kr36.ai', 'AICrawler'),
|
|
'health': ('crawlers.kr36.health', 'HealthCrawler'),
|
|
},
|
|
'sina': {
|
|
'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
|
|
'gov': ('crawlers.sina.gov', 'SinaGovCrawler'),
|
|
},
|
|
'tencent': {
|
|
'auto': ('crawlers.tencent.auto', 'AutoCrawler'),
|
|
'war': ('crawlers.tencent.war', 'WarCrawler'),
|
|
'war_web': ('crawlers.tencent.war_web', 'WarWebCrawler'),
|
|
},
|
|
}
|
|
|
|
|
|
def init_logging():
|
|
"""初始化日志系统"""
|
|
Logger.get_logger("news-crawler")
|
|
|
|
|
|
def list_crawlers() -> List[str]:
|
|
"""列出所有可用的爬虫"""
|
|
crawlers = []
|
|
|
|
# 网易爬虫
|
|
netease_categories = config.get('sources.netease.categories', {})
|
|
for category in netease_categories.keys():
|
|
crawlers.append(f"netease:{category}")
|
|
|
|
# 36氪爬虫
|
|
kr36_categories = config.get('sources.kr36.categories', {})
|
|
for category in kr36_categories.keys():
|
|
crawlers.append(f"kr36:{category}")
|
|
|
|
# 新浪爬虫
|
|
sina_categories = config.get('sources.sina.categories', {})
|
|
for category in sina_categories.keys():
|
|
crawlers.append(f"sina:{category}")
|
|
|
|
# 腾讯爬虫
|
|
tencent_categories = config.get('sources.tencent.categories', {})
|
|
for category in tencent_categories.keys():
|
|
crawlers.append(f"tencent:{category}")
|
|
|
|
return crawlers
|
|
|
|
|
|
def get_crawler_class(source: str, category: str):
|
|
"""
|
|
动态导入爬虫类
|
|
|
|
Args:
|
|
source: 新闻源
|
|
category: 分类
|
|
|
|
Returns:
|
|
爬虫类
|
|
"""
|
|
if source not in CRAWLER_CLASSES:
|
|
raise ValueError(f"不支持的数据源: {source}")
|
|
|
|
if category not in CRAWLER_CLASSES[source]:
|
|
raise ValueError(f"不支持的分类: {source}:{category}")
|
|
|
|
module_name, class_name = CRAWLER_CLASSES[source][category]
|
|
|
|
# 动态导入模块
|
|
import importlib
|
|
module = importlib.import_module(f"src.{module_name}")
|
|
return getattr(module, class_name)
|
|
|
|
|
|
def run_crawler(source: str, category: str, max_articles: int = None) -> bool:
|
|
"""
|
|
运行指定爬虫
|
|
|
|
Args:
|
|
source: 新闻源
|
|
category: 分类
|
|
max_articles: 最大文章数
|
|
|
|
Returns:
|
|
是否成功
|
|
"""
|
|
logger = get_logger(__name__)
|
|
|
|
try:
|
|
# 动态导入爬虫类
|
|
crawler_class = get_crawler_class(source, category)
|
|
|
|
# 创建并运行爬虫
|
|
crawler = crawler_class(source, category)
|
|
print("创建并运行爬虫")
|
|
# 覆盖最大文章数
|
|
if max_articles:
|
|
crawler.max_articles = max_articles
|
|
|
|
|
|
articles = crawler.crawl()
|
|
|
|
if not articles:
|
|
logger.warning(f"未爬取到任何文章")
|
|
return False
|
|
|
|
# 转换为数据模型
|
|
news_list = [
|
|
NewsModel(
|
|
url=article.url,
|
|
title=article.title or "",
|
|
content=article.content,
|
|
category_id=article.category_id,
|
|
source=article.source,
|
|
publish_time=article.publish_time,
|
|
author=article.author,
|
|
)
|
|
for article in articles
|
|
if article.is_valid()
|
|
]
|
|
|
|
# 保存到数据库
|
|
repository = NewsRepository()
|
|
count = repository.save_news(news_list)
|
|
|
|
logger.info(f"任务完成,保存了 {count} 条新闻")
|
|
return count > 0
|
|
|
|
except Exception as e:
|
|
logger.error(f"运行爬虫失败: {e}", exc_info=True)
|
|
return False
|
|
|
|
|
|
def main():
|
|
"""主函数"""
|
|
parser = argparse.ArgumentParser(
|
|
description="新闻爬虫系统",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
epilog="""
|
|
示例:
|
|
%(prog)s --list # 列出所有可用爬虫
|
|
%(prog)s netease:tech # 爬取网易科技新闻
|
|
%(prog)s kr36:ai # 爬取36氪AI新闻
|
|
%(prog)s netease:tech --max 5 # 爬取5篇网易科技新闻
|
|
%(prog)s --all # 运行所有爬虫
|
|
"""
|
|
)
|
|
|
|
parser.add_argument(
|
|
'crawler',
|
|
nargs='?',
|
|
help='爬虫名称 (格式: source:category)'
|
|
)
|
|
|
|
parser.add_argument(
|
|
'--list',
|
|
action='store_true',
|
|
help='列出所有可用爬虫'
|
|
)
|
|
|
|
parser.add_argument(
|
|
'--all',
|
|
action='store_true',
|
|
help='运行所有爬虫'
|
|
)
|
|
|
|
parser.add_argument(
|
|
'--max',
|
|
type=int,
|
|
help='最大爬取文章数'
|
|
)
|
|
|
|
parser.add_argument(
|
|
'--debug',
|
|
action='store_true',
|
|
help='开启调试模式'
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
# 初始化日志
|
|
init_logging()
|
|
logger = get_logger(__name__)
|
|
|
|
# 处理--list参数
|
|
if args.list:
|
|
print("可用的爬虫:")
|
|
for crawler in list_crawlers():
|
|
print(f" - {crawler}")
|
|
return 0
|
|
|
|
# 处理--all参数
|
|
if args.all:
|
|
logger.info("运行所有爬虫...")
|
|
crawlers = list_crawlers()
|
|
success_count = 0
|
|
|
|
for crawler_name in crawlers:
|
|
source, category = crawler_name.split(':')
|
|
logger.info(f"正在运行 {crawler_name}...")
|
|
if run_crawler(source, category, args.max):
|
|
success_count += 1
|
|
|
|
logger.info(f"完成: {success_count}/{len(crawlers)} 个爬虫成功")
|
|
return 0 if success_count == len(crawlers) else 1
|
|
|
|
# 处理单个爬虫
|
|
if not args.crawler:
|
|
parser.print_help()
|
|
return 1
|
|
|
|
try:
|
|
source, category = args.crawler.split(':')
|
|
except ValueError:
|
|
logger.error(f"爬虫名称格式错误,应为 'source:category'")
|
|
return 1
|
|
|
|
if run_crawler(source, category, args.max):
|
|
return 0
|
|
else:
|
|
return 1
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|