news-classifier/crawler-module/src/parsers/souhu_parser.py

"""
搜狐文章解析器
"""
from bs4 import BeautifulSoup
import os
import sys

# Make the crawler-module src/ directory importable so the base/ and utils/
# packages resolve regardless of the working directory.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from base.parser_base import BaseParser
from base.crawler_base import Article
from utils.http_client import HttpClient
from utils.logger import get_logger


class SouhuParser(BaseParser):
    """Sohu Focus real-estate article parser."""

    def __init__(self):
        self.logger = get_logger(__name__)
        self.http_client = HttpClient()

    def parse(self, url: str) -> Article:
        """
        Parse a Sohu Focus article detail page.

        Args:
            url: The article URL.

        Returns:
            The parsed Article.
        """
        html = self.http_client.get(url)
        soup = BeautifulSoup(html, "lxml")

        # Locate the article container; bail out with an empty Article if the
        # page does not match the expected layout.
        article_container = soup.select_one("#article-container")
        if not article_container:
            # Assumes get_logger returns a stdlib-style logging.Logger.
            self.logger.warning("Article container not found: %s", url)
            return Article(
                url=url,
                title=None,
                publish_time=None,
                author=None,
                content="",
            )

        # Extract the title
        title = None
        title_tag = article_container.select_one("div.text-title h1")
        if title_tag:
            title = title_tag.get_text(strip=True)

        # Extract the author
        author = None
        author_tag = article_container.select_one(".article-info .author a")
        if author_tag:
            author = author_tag.get_text(strip=True)

        # Extract the publish time
        publish_time = None
        time_tag = article_container.select_one(".article-info .author #news-time")
        if time_tag:
            publish_time = time_tag.get_text(strip=True)

        # Extract the body text
        content = ""
        content_tag = soup.select_one("article#mp-editor")
        if content_tag:
            # separator='\n\n' keeps a blank line between paragraphs
            content = content_tag.get_text(separator="\n\n", strip=True)

        return Article(
            url=url,
            title=title,
            publish_time=publish_time,
            author=author,
            content=content,
        )
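

# Minimal usage sketch. The URL below is a hypothetical placeholder, and this
# assumes HttpClient.get(url) returns the page HTML as a string and that
# Article exposes the fields populated above.
if __name__ == "__main__":
    parser = SouhuParser()
    article = parser.parse("https://www.focus.cn/example/article.html")
    print(article.title)
    print(article.author, article.publish_time)
    print(article.content[:200])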