71 lines
2.4 KiB
Python
71 lines
2.4 KiB
Python
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
|
|
|
from base.parser_base import BaseParser
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
from base.crawler_base import Article
|
|
from bs4 import BeautifulSoup
|
|
from utils.logger import get_logger
|
|
from utils.http_client import HttpClient
|
|
|
|
|
|
class SinaAutoParser(BaseParser):
    """Parser for Sina auto news article detail pages."""

    # Timestamp layouts tried, in order, when normalizing the publish time.
    _TIME_FORMATS = ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S")
    # Canonical layout every recognized timestamp is converted to.
    _OUTPUT_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    def __init__(self):
        self.logger = get_logger(__name__)
        self.http_client = HttpClient()

    @staticmethod
    def _normalize_time(time_str: str) -> str:
        """Convert a publish-time string to ``YYYY-MM-DD HH:MM:SS``.

        Each layout in ``_TIME_FORMATS`` is tried in order.

        Args:
            time_str: Raw timestamp text taken from the page.

        Returns:
            The reformatted timestamp, or ``time_str`` unchanged when it
            matches none of the known layouts.
        """
        for fmt in SinaAutoParser._TIME_FORMATS:
            try:
                parsed = datetime.strptime(time_str, fmt)
            except ValueError:
                # strptime signals a layout mismatch with ValueError; only
                # that case should fall through to the next candidate.
                continue
            return parsed.strftime(SinaAutoParser._OUTPUT_TIME_FORMAT)
        return time_str  # No layout matched: keep the original text.

    def parse(self, url: str) -> Article:
        """Parse a Sina article detail page.

        Args:
            url: URL of the article.

        Returns:
            An ``Article`` built from the page. Title, publish time and
            author fall back to placeholder values when their tags are
            missing from the page.

        Raises:
            ValueError: If the article body container is not found.
        """
        html = self.http_client.get(url)
        soup = BeautifulSoup(html, "lxml")

        # Article title.
        title_tag = soup.select_one("div.main-content h1.main-title")
        article_title = title_tag.get_text(strip=True) if title_tag else "未知标题"

        # Publish time, normalized to a single layout when recognized.
        time_tag = soup.select_one(
            "div.main-content div.top-bar-wrap div.date-source span.date"
        )
        if time_tag:
            publish_time = self._normalize_time(time_tag.get_text(strip=True))
        else:
            publish_time = "1949-01-01 12:00:00"

        # Author: text of the first link inside the date/source bar.
        author_tag = soup.select_one(
            "div.main-content div.top-bar-wrap div.date-source a"
        )
        author = author_tag.get_text(strip=True) if author_tag else "未知"

        # Article body paragraphs.
        article_div = soup.select_one("div.main-content div.article")
        if not article_div:
            raise ValueError("无法找到文章内容")

        # Extract each paragraph's text once and drop the empty ones.
        paragraph_texts = (
            p.get_text(strip=True) for p in article_div.find_all('p')
        )
        content = '\n'.join(text for text in paragraph_texts if text)

        return Article(
            url=url,
            title=article_title,
            publish_time=publish_time,
            author=author,
            content=content,
            category_id=6,  # Auto category ID
            source="sina",
        )