"""Sohu (搜狐) article parser."""
|
|
|
|
from bs4 import BeautifulSoup
|
|
import re
|
|
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from base.parser_base import BaseParser
|
|
from base.crawler_base import Article
|
|
from utils.http_client import HttpClient
|
|
from utils.logger import get_logger
|
|
|
|
|
|
class SouhuParser(BaseParser):
    """Parser for Sohu article detail pages.

    NOTE(review): the original docstring said "焦点房产网" (Focus real-estate
    network), contradicting the class/module name — assumed to be a
    copy-paste error; this class targets Sohu article pages.
    """

    def __init__(self):
        # Module-scoped logger plus a dedicated HTTP client for page fetches.
        self.logger = get_logger(__name__)
        self.http_client = HttpClient()

    @staticmethod
    def _text_or_none(tag):
        """Return the stripped text of *tag*, or None when *tag* is missing."""
        return tag.get_text(strip=True) if tag else None

    def parse(self, url: str) -> Article:
        """Parse a Sohu article detail page.

        Args:
            url: Article URL.

        Returns:
            Article populated from the page. When the expected container
            element is absent, an Article with empty/None fields is
            returned rather than raising.
        """
        html = self.http_client.get(url)
        soup = BeautifulSoup(html, "lxml")

        # Locate the article container; bail out with an empty shell if the
        # page layout is not the one we expect.
        article_container = soup.select_one("#article-container")
        if not article_container:
            return Article(
                url=url,
                title=None,
                publish_time=None,
                author=None,
                content="",
            )

        # Title / author / publish time all follow the same optional-tag
        # pattern, factored into _text_or_none.
        title = self._text_or_none(
            article_container.select_one("div.text-title h1")
        )
        author = self._text_or_none(
            article_container.select_one(".article-info .author a")
        )
        publish_time = self._text_or_none(
            article_container.select_one(".article-info .author #news-time")
        )

        # Body text lives in a separate element outside the header container.
        # separator='\n\n' keeps a blank line between paragraphs.
        content = ""
        content_tag = soup.select_one("article#mp-editor")
        if content_tag:
            content = content_tag.get_text(separator='\n\n', strip=True)

        return Article(
            url=url,
            title=title,
            publish_time=publish_time,
            author=author,
            content=content,
        )
|