""" 搜狐文章解析器 """ from bs4 import BeautifulSoup import re import sys import os sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from base.parser_base import BaseParser from base.crawler_base import Article from utils.http_client import HttpClient from utils.logger import get_logger class SouhuParser(BaseParser): """焦点房产网文章解析器""" def __init__(self): self.logger = get_logger(__name__) self.http_client = HttpClient() def parse(self, url: str) -> Article: """ 解析焦点房产网文章详情页 Args: url: 文章URL Returns: 文章对象 """ html = self.http_client.get(url) soup = BeautifulSoup(html, "lxml") # 找到文章容器 article_container = soup.select_one("#article-container") if not article_container: return Article( url=url, title=None, publish_time=None, author=None, content="", ) # 提取标题 title = None title_tag = article_container.select_one("div.text-title h1") if title_tag: title = title_tag.get_text(strip=True) # 提取作者 author = None author_tag = article_container.select_one(".article-info .author a") if author_tag: author = author_tag.get_text(strip=True) # 提取发布时间 publish_time = None time_tag = article_container.select_one(".article-info .author #news-time") if time_tag: publish_time = time_tag.get_text(strip=True) # 提取正文内容 content = "" content_tag = soup.select_one("article#mp-editor") if content_tag: # 使用 separator='\n\n' 让每个段落之间空一行 content = content_tag.get_text(separator='\n\n', strip=True) return Article( url=url, title=title, publish_time=publish_time, author=author, content=content, )