news-classifier/crawler-module/sina-auto.txt

This is the Sina Auto (auto.sina.com.cn) crawler code for scraping automobile-related news.
```python
import requests
from bs4 import BeautifulSoup
from datetime import datetime

URL = "https://auto.sina.com.cn/"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}


def normalize_time(time_str):
    """Normalize a date string to "%Y-%m-%d %H:%M:%S"; return it unchanged if no format matches."""
    for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
        try:
            dt = datetime.strptime(time_str, fmt)
            return dt.strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue
    return time_str


resp = requests.get(URL, headers=headers, timeout=10)
# resp.raise_for_status()
# resp.encoding = "utf-8"
# print(resp.text)

# During development, parse a saved snapshot of the list page instead of the live response.
with open("example/example-10.html", "r", encoding="utf-8") as f:
    html = f.read()
# soup = BeautifulSoup(resp.text, "lxml")
soup = BeautifulSoup(html, "lxml")

# Cards on the list page; each card links to one article detail page.
div_list = soup.select(
    "div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1"
)
for item in div_list:
    a = item.select_one("div.ty-card-l a")
    if a is None or not a.get("href"):
        continue  # Skip cards without a link
    href = a.get("href")
    # print(a.get('href'), a.get_text().strip())

    # Fetch the article detail page
    resp = requests.get(url=href, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding  # Let requests guess the page encoding
    soup = BeautifulSoup(resp.text, "lxml")

    # Article title
    article_title_tag = soup.select_one("div.main-content h1.main-title")
    if article_title_tag:
        article_title = article_title_tag.get_text(strip=True) or "未知标题"
    else:
        article_title = "未知标题"  # Fallback: "unknown title"
    # print("标题:", article_title)

    # Publish time
    time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
    if time_tag:  # Only normalize when the time tag exists
        publish_time = normalize_time(time_tag.get_text(strip=True))
    else:
        publish_time = "1949-01-01 12:00:00"  # Placeholder for a missing date
    # print(publish_time)

    # Author / source
    author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
    if author_tag:
        author = author_tag.get_text(strip=True)
    else:
        author = "未知"  # Fallback: "unknown"
    # print(author)

    # Article body paragraphs
    article_div = soup.select_one("div.main-content div.article")  # Main article container
    if not article_div:
        # print("Not an article detail page, skipping")
        continue  # Skip pages that are not article detail pages
    paragraphs = article_div.find_all("p")
    article_text = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
    # print("正文:\n", article_text)
```
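
The snippet above only holds the extracted fields in memory. Below is a minimal sketch of how each article could be persisted for the downstream classifier, assuming a JSON Lines output file; the file name `sina_auto_articles.jsonl`, the `save_article` helper, and the record layout are illustrative assumptions, not part of the original code.
```python
import json


def save_article(href, article_title, publish_time, author, article_text,
                 path="sina_auto_articles.jsonl"):
    """Hypothetical helper: append one JSON record per crawled article."""
    record = {
        "url": href,
        "title": article_title,
        "publish_time": publish_time,
        "author": author,
        "text": article_text,
    }
    with open(path, "a", encoding="utf-8") as out:
        # ensure_ascii=False keeps Chinese text readable in the output file
        out.write(json.dumps(record, ensure_ascii=False) + "\n")
```
A call such as `save_article(href, article_title, publish_time, author, article_text)` at the end of the loop body would accumulate one record per detail page.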