news-classifier/crawler-module/src/parsers/souhu_parser.py

"""
搜狐文章解析器
"""
from bs4 import BeautifulSoup
import os
import sys

# Make the crawler-module src/ directory importable so the base/ and utils/
# packages resolve regardless of the working directory.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from base.parser_base import BaseParser
from base.crawler_base import Article
from utils.http_client import HttpClient
from utils.logger import get_logger


class SouhuParser(BaseParser):
    """Sohu Focus real-estate article parser."""

    def __init__(self):
        self.logger = get_logger(__name__)
        self.http_client = HttpClient()

    def parse(self, url: str) -> Article:
        """
        Parse a Sohu Focus article detail page.

        Args:
            url: The article URL.

        Returns:
            The parsed Article.
        """
        html = self.http_client.get(url)
        soup = BeautifulSoup(html, "lxml")

        # Locate the article container; bail out with an empty Article if the
        # page does not match the expected layout.
        article_container = soup.select_one("#article-container")
        if not article_container:
            # Assumes get_logger returns a stdlib-style logging.Logger.
            self.logger.warning("Article container not found: %s", url)
            return Article(
                url=url,
                title=None,
                publish_time=None,
                author=None,
                content="",
            )

        # Extract the title
        title = None
        title_tag = article_container.select_one("div.text-title h1")
        if title_tag:
            title = title_tag.get_text(strip=True)

        # Extract the author
        author = None
        author_tag = article_container.select_one(".article-info .author a")
        if author_tag:
            author = author_tag.get_text(strip=True)

        # Extract the publish time
        publish_time = None
        time_tag = article_container.select_one(".article-info .author #news-time")
        if time_tag:
            publish_time = time_tag.get_text(strip=True)

        # Extract the body text
        content = ""
        content_tag = soup.select_one("article#mp-editor")
        if content_tag:
            # separator='\n\n' keeps a blank line between paragraphs
            content = content_tag.get_text(separator="\n\n", strip=True)

        return Article(
            url=url,
            title=title,
            publish_time=publish_time,
            author=author,
            content=content,
        )
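

# Minimal usage sketch. The URL below is a hypothetical placeholder, and this
# assumes HttpClient.get(url) returns the page HTML as a string and that
# Article exposes the fields populated above.
if __name__ == "__main__":
    parser = SouhuParser()
    article = parser.parse("https://www.focus.cn/example/article.html")
    print(article.title)
    print(article.author, article.publish_time)
    print(article.content[:200])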