这是爬取新浪网汽车频道(auto.sina.com.cn)相关新闻的代码。
|
|
```python
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
# Landing page of Sina's auto channel.
URL = "https://auto.sina.com.cn/"

# Desktop-browser User-Agent so the site serves the regular HTML page.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# Fetch the live landing page (result currently unused below — the parse
# runs against a saved local copy while the selectors are being developed).
resp = requests.get(URL, headers=headers, timeout=10)
# resp.raise_for_status()
# resp.encoding = "utf-8"
# print(resp.text)

with open("example/example-10.html", "r", encoding="utf-8") as fh:
    page_html = fh.read()

# soup = BeautifulSoup(resp.text, "lxml")
soup = BeautifulSoup(page_html, "lxml")
|
|
# Select every news card on the channel landing page.
div_list = soup.select(
    "div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1"
)

for item in div_list:
    # Each card carries its headline link in the left-hand column.
    a = item.select_one("div.ty-card-l a")
    if a is None:
        continue  # card without an anchor — nothing to follow
    href = a.get("href")
    if not href:
        continue  # anchor without a usable href — skip
    # print(a.get('href'), a.get_text().strip())

    # Fetch the article detail page. Timeout added for consistency with
    # the landing-page request above (otherwise a stalled article page
    # would hang the whole crawl).
    resp = requests.get(url=href, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding  # let requests guess the encoding
    soup = BeautifulSoup(resp.text, "lxml")

    # --- Article title ---
    article_title_tag = soup.select_one("div.main-content h1.main-title")
    if article_title_tag:
        article_title = article_title_tag.get_text(strip=True)
        if not article_title:
            article_title = "未知标题"
    else:
        article_title = "未知标题"
    # print("标题:", article_title)
# --- Article publish time ---
# NOTE(review): this import and function are (re)defined on every loop
# iteration in the original script; consider hoisting both to the top
# of the file.
from datetime import datetime


def normalize_time(time_str):
    """Normalize a Sina timestamp to the "YYYY-MM-DD HH:MM:SS" form.

    Tries each known source format in turn. If none matches, the input
    string is returned unchanged.
    """
    for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
        try:
            dt = datetime.strptime(time_str, fmt)
        except ValueError:  # this format did not match — try the next one
            continue
        return dt.strftime("%Y-%m-%d %H:%M:%S")
    return time_str  # no known format matched: return the original string
|
|
|
|
time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
|
|
if time_tag: # 只有存在时间标签才进行格式化
|
|
publish_time = normalize_time(time_tag.get_text(strip=True))
|
|
else:
|
|
publish_time = "1949-01-01 12:00:00"
|
|
#print(publish_time)
|
|
|
|
# 获取文章作者
|
|
author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
|
|
if author_tag:
|
|
author = author_tag.get_text(strip=True)
|
|
else:
|
|
author = "未知"
|
|
# print(author)
|
|
# 获取文章正文段落
|
|
article_div = soup.select_one("div.main-content div.article") # 核心文章容器
|
|
if not article_div:
|
|
# print("不是文章详情页,跳过")
|
|
continue # 如果不是详情页就跳过
|
|
paragraphs = article_div.find_all('p')
|
|
article_text = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
|
|
# print("正文:\n", article_text)
|
|
|
|
|
|
```
|