news-classifier/crawler-module/sina-auto.txt

This is the Sina Auto (auto.sina.com.cn) crawler code for scraping automobile-related news.
```python
import requests
from bs4 import BeautifulSoup
from datetime import datetime

URL = "https://auto.sina.com.cn/"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}


def normalize_time(time_str):
    """Normalize a date string to "%Y-%m-%d %H:%M:%S"; return it unchanged if no format matches."""
    for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
        try:
            dt = datetime.strptime(time_str, fmt)
            return dt.strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue
    return time_str


resp = requests.get(URL, headers=headers, timeout=10)
# resp.raise_for_status()
# resp.encoding = "utf-8"
# print(resp.text)

# During development, parse a saved snapshot of the list page instead of the live response.
with open("example/example-10.html", "r", encoding="utf-8") as f:
    html = f.read()
# soup = BeautifulSoup(resp.text, "lxml")
soup = BeautifulSoup(html, "lxml")

# Cards on the list page; each card links to one article detail page.
div_list = soup.select(
    "div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1"
)
for item in div_list:
    a = item.select_one("div.ty-card-l a")
    if a is None or not a.get("href"):
        continue  # Skip cards without a link
    href = a.get("href")
    # print(a.get('href'), a.get_text().strip())

    # Fetch the article detail page
    resp = requests.get(url=href, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding  # Let requests guess the page encoding
    soup = BeautifulSoup(resp.text, "lxml")

    # Article title
    article_title_tag = soup.select_one("div.main-content h1.main-title")
    if article_title_tag:
        article_title = article_title_tag.get_text(strip=True) or "未知标题"
    else:
        article_title = "未知标题"  # Fallback: "unknown title"
    # print("标题:", article_title)

    # Publish time
    time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
    if time_tag:  # Only normalize when the time tag exists
        publish_time = normalize_time(time_tag.get_text(strip=True))
    else:
        publish_time = "1949-01-01 12:00:00"  # Placeholder for a missing date
    # print(publish_time)

    # Author / source
    author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
    if author_tag:
        author = author_tag.get_text(strip=True)
    else:
        author = "未知"  # Fallback: "unknown"
    # print(author)

    # Article body paragraphs
    article_div = soup.select_one("div.main-content div.article")  # Main article container
    if not article_div:
        # print("Not an article detail page, skipping")
        continue  # Skip pages that are not article detail pages
    paragraphs = article_div.find_all("p")
    article_text = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
    # print("正文:\n", article_text)
```
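
The snippet above only holds the extracted fields in memory. Below is a minimal sketch of how each article could be persisted for the downstream classifier, assuming a JSON Lines output file; the file name `sina_auto_articles.jsonl`, the `save_article` helper, and the record layout are illustrative assumptions, not part of the original code.
```python
import json


def save_article(href, article_title, publish_time, author, article_text,
                 path="sina_auto_articles.jsonl"):
    """Hypothetical helper: append one JSON record per crawled article."""
    record = {
        "url": href,
        "title": article_title,
        "publish_time": publish_time,
        "author": author,
        "text": article_text,
    }
    with open(path, "a", encoding="utf-8") as out:
        # ensure_ascii=False keeps Chinese text readable in the output file
        out.write(json.dumps(record, ensure_ascii=False) + "\n")
```
A call such as `save_article(href, article_title, publish_time, author, article_text)` at the end of the loop body would accumulate one record per detail page.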