这是新浪网关于爬取汽车相关新闻的代码 ```python import requests from bs4 import BeautifulSoup URL = "https://auto.sina.com.cn/" headers = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/120.0.0.0 Safari/537.36" ) } resp = requests.get(URL,headers=headers,timeout=10) # resp.raise_for_status() # resp.encoding = "utf-8" # print(resp.text) with open("example/example-10.html","r",encoding="utf-8") as f: html = f.read() # soup = BeautifulSoup(resp.text,"lxml") soup = BeautifulSoup(html,"lxml") div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1") for item in div_list: a = item.select_one("div.ty-card-l a") href = a.get("href") # print(a.get('href'),a.get_text().strip()) resp = requests.get(url=href,headers=headers) resp.encoding = resp.apparent_encoding # requests 会尝试猜测编码 soup = BeautifulSoup(resp.text,"lxml") # 获取文章标题 article_title_tag = soup.select_one("div.main-content h1.main-title") if article_title_tag: article_title = article_title_tag.get_text(strip=True) if not article_title: article_title = "未知标题" else: article_title = "未知标题" # print("标题:", article_title) # 获取文章发布时间 from datetime import datetime # 日期时间格式化函数 def normalize_time(time_str): for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"): try: dt = datetime.strptime(time_str, fmt) return dt.strftime("%Y-%m-%d %H:%M:%S") except: continue return time_str # 如果都不匹配,返回原字符串 time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date") if time_tag: # 只有存在时间标签才进行格式化 publish_time = normalize_time(time_tag.get_text(strip=True)) else: publish_time = "1949-01-01 12:00:00" #print(publish_time) # 获取文章作者 author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a") if author_tag: author = author_tag.get_text(strip=True) else: author = "未知" # print(author) # 获取文章正文段落 article_div = soup.select_one("div.main-content div.article") # 核心文章容器 if not article_div: # print("不是文章详情页,跳过") continue # 如果不是详情页就跳过 paragraphs = article_div.find_all('p') article_text = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)) # print("正文:\n", article_text) ```