# news-classifier/crawler-module/src/parsers/sina_parser.py
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from base.parser_base import BaseParser
from datetime import datetime
from typing import Optional
from base.crawler_base import Article
from bs4 import BeautifulSoup
from utils.logger import get_logger
from utils.http_client import HttpClient
class SinaAutoParser(BaseParser):
    """Parser for Sina auto-news (汽车) article detail pages."""

    # Publish-time layouts tried in order; first match wins.
    # "%Y年%m月%d日 %H:%M" is the canonical Sina layout (e.g. "2024年01月05日 10:30");
    # the other two are kept from the original implementation for compatibility.
    _TIME_FORMATS = (
        "%Y年%m月%d日 %H:%M",
        "%Y年%m月%d%H:%M",
        "%Y-%m-%d %H:%M:%S",
    )

    def __init__(self):
        self.logger = get_logger(__name__)
        self.http_client = HttpClient()

    @staticmethod
    def _normalize_time(time_str: str) -> str:
        """Normalize a raw publish-time string to "YYYY-MM-DD HH:MM:SS".

        Tries each known layout in ``_TIME_FORMATS``; when none matches the
        input is returned unchanged so the original value is never lost.
        """
        for fmt in SinaAutoParser._TIME_FORMATS:
            try:
                dt = datetime.strptime(time_str, fmt)
            except ValueError:
                # Narrowed from a bare except: only a format mismatch should
                # advance to the next layout, not e.g. KeyboardInterrupt.
                continue
            return dt.strftime("%Y-%m-%d %H:%M:%S")
        return time_str

    def parse(self, url: str) -> Article:
        """Parse a Sina article detail page into an ``Article``.

        Args:
            url: Article URL.

        Returns:
            Article populated with title, publish time, author, body text,
            the fixed auto-news category id and source "sina".

        Raises:
            ValueError: if the article body container cannot be found.
        """
        html = self.http_client.get(url)
        soup = BeautifulSoup(html, "lxml")

        # Title (fallback string means "unknown title").
        title_tag = soup.select_one("div.main-content h1.main-title")
        article_title = title_tag.get_text(strip=True) if title_tag else "未知标题"

        # Publish time; sentinel date used when the page carries none.
        time_tag = soup.select_one(
            "div.main-content div.top-bar-wrap div.date-source span.date"
        )
        publish_time = (
            self._normalize_time(time_tag.get_text(strip=True))
            if time_tag
            else "1949-01-01 12:00:00"
        )

        # Author / publishing source (fallback string means "unknown").
        author_tag = soup.select_one(
            "div.main-content div.top-bar-wrap div.date-source a"
        )
        author = author_tag.get_text(strip=True) if author_tag else "未知"

        # Body paragraphs, empty ones dropped.
        article_div = soup.select_one("div.main-content div.article")
        if not article_div:
            raise ValueError("无法找到文章内容")
        paragraphs = article_div.find_all("p")
        content = "\n".join(
            p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)
        )

        return Article(
            url=url,
            title=article_title,
            publish_time=publish_time,
            author=author,
            content=content,
            category_id=6,  # fixed category id for auto news
            source="sina",
        )