# news-classifier/crawler-module/src/parsers/sina_parser.py
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from base.parser_base import BaseParser
from datetime import datetime
from typing import Optional
from base.crawler_base import Article
from bs4 import BeautifulSoup
from utils.logger import get_logger
from utils.http_client import HttpClient
class SinaAutoParser(BaseParser):
    """Parser for Sina auto-news (汽车) article detail pages."""

    # Publish-time layouts tried in order; first match wins.
    # "%Y年%m月%d日 %H:%M" is the canonical Sina layout (e.g. "2024年01月05日 10:30");
    # the other two are kept from the original implementation for compatibility.
    _TIME_FORMATS = (
        "%Y年%m月%d日 %H:%M",
        "%Y年%m月%d%H:%M",
        "%Y-%m-%d %H:%M:%S",
    )

    def __init__(self):
        self.logger = get_logger(__name__)
        self.http_client = HttpClient()

    @staticmethod
    def _normalize_time(time_str: str) -> str:
        """Normalize a raw publish-time string to "YYYY-MM-DD HH:MM:SS".

        Tries each known layout in ``_TIME_FORMATS``; when none matches the
        input is returned unchanged so the original value is never lost.
        """
        for fmt in SinaAutoParser._TIME_FORMATS:
            try:
                dt = datetime.strptime(time_str, fmt)
            except ValueError:
                # Narrowed from a bare except: only a format mismatch should
                # advance to the next layout, not e.g. KeyboardInterrupt.
                continue
            return dt.strftime("%Y-%m-%d %H:%M:%S")
        return time_str

    def parse(self, url: str) -> Article:
        """Parse a Sina article detail page into an ``Article``.

        Args:
            url: Article URL.

        Returns:
            Article populated with title, publish time, author, body text,
            the fixed auto-news category id and source "sina".

        Raises:
            ValueError: if the article body container cannot be found.
        """
        html = self.http_client.get(url)
        soup = BeautifulSoup(html, "lxml")

        # Title (fallback string means "unknown title").
        title_tag = soup.select_one("div.main-content h1.main-title")
        article_title = title_tag.get_text(strip=True) if title_tag else "未知标题"

        # Publish time; sentinel date used when the page carries none.
        time_tag = soup.select_one(
            "div.main-content div.top-bar-wrap div.date-source span.date"
        )
        publish_time = (
            self._normalize_time(time_tag.get_text(strip=True))
            if time_tag
            else "1949-01-01 12:00:00"
        )

        # Author / publishing source (fallback string means "unknown").
        author_tag = soup.select_one(
            "div.main-content div.top-bar-wrap div.date-source a"
        )
        author = author_tag.get_text(strip=True) if author_tag else "未知"

        # Body paragraphs, empty ones dropped.
        article_div = soup.select_one("div.main-content div.article")
        if not article_div:
            raise ValueError("无法找到文章内容")
        paragraphs = article_div.find_all("p")
        content = "\n".join(
            p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)
        )

        return Article(
            url=url,
            title=article_title,
            publish_time=publish_time,
            author=author,
            content=content,
            category_id=6,  # fixed category id for auto news
            source="sina",
        )