33 lines
952 B
Plaintext
33 lines
952 B
Plaintext
|
|
这是用于爬取36kr健康相关新闻的代码。
|
|
```python
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import re
|
|
|
|
# Print the article links found in a 36kr search-result page for "health".
#
# The script requests the live search page, then parses a locally saved copy
# of the result page and prints one absolute article URL per list entry.

# Search-result page for the URL-encoded keyword "健康" (health).
URL = "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
# Site root that is prefixed to the hrefs taken from the result list.
TARGET_URL = "https://www.36kr.com"
# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

# Fetch the live page; raise_for_status() aborts the script on a 4xx/5xx reply.
resp = requests.get(URL, headers=headers, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"

# NOTE(review): the response fetched above is not parsed. The HTML that is
# actually parsed comes from the saved file below; to parse the live page
# instead, pass resp.text to BeautifulSoup.
with open("example/example-11.html", "r", encoding="utf-8") as f:
    html = f.read()

soup = BeautifulSoup(html, "lxml")
# One <li> per search result inside the result list.
li_list = soup.select(
    "div.kr-layout div.kr-layout-main div.kr-layout-content "
    "div.kr-search-result-list ul.kr-search-result-list-main > li"
)

for item in li_list:
    a = item.select_one("div.kr-shadow-content a")
    # select_one() returns None when the entry has no matching link; skip it
    # instead of crashing with AttributeError on a.get().
    if a is None:
        continue
    path = a.get("href")
    # get() returns None when the tag has no href attribute; skip it instead
    # of crashing with TypeError on the string concatenation.
    if path is None:
        continue
    href = TARGET_URL + path
    print(href)
|
|
|
|
```