import sys
import os

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from datetime import datetime

from bs4 import BeautifulSoup

from base.parser_base import BaseParser
from base.crawler_base import Article
from utils.logger import get_logger
from utils.http_client import HttpClient


class SinaAutoParser(BaseParser):
    """Parser for Sina auto news article pages."""

    def __init__(self):
        self.logger = get_logger(__name__)
        self.http_client = HttpClient()

    def parse(self, url: str) -> Article:
        """
        Parse a Sina article detail page.

        Args:
            url: Article URL

        Returns:
            Article object
        """
        html = self.http_client.get(url)
        soup = BeautifulSoup(html, "lxml")

        # Article title
        article_title_tag = soup.select_one("div.main-content h1.main-title")
        article_title = article_title_tag.get_text(strip=True) if article_title_tag else "未知标题"

        # Publish time: normalize the formats Sina uses to "YYYY-MM-DD HH:MM:SS"
        def normalize_time(time_str: str) -> str:
            for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
                try:
                    dt = datetime.strptime(time_str, fmt)
                    return dt.strftime("%Y-%m-%d %H:%M:%S")
                except ValueError:
                    continue
            return time_str  # no format matched; return the original string

        time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
        publish_time = normalize_time(time_tag.get_text(strip=True)) if time_tag else "1949-01-01 12:00:00"

        # Article author / source
        author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
        author = author_tag.get_text(strip=True) if author_tag else "未知"

        # Article body paragraphs
        article_div = soup.select_one("div.main-content div.article")
        if not article_div:
            raise ValueError("无法找到文章内容")
        paragraphs = article_div.find_all("p")
        content = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))

        return Article(
            url=url,
            title=article_title,
            publish_time=publish_time,
            author=author,
            content=content,
            category_id=6,  # auto/car category ID
            source="sina",
        )
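

# Minimal usage sketch (illustrative, not part of the original module): assumes
# HttpClient.get(url) returns the page HTML as text and that Article exposes the
# fields passed to its constructor above. The URL is a placeholder, not a real
# article address.
if __name__ == "__main__":
    parser = SinaAutoParser()
    article = parser.parse("https://auto.sina.com.cn/news/example.shtml")
    print(article.title)
    print(article.publish_time, article.author)
    print(article.content[:200])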