This is a working script for scraping real-estate category news from Sohu News (the Sohu Focus site, house.focus.cn).
Note that the listing page is loaded dynamically, and paging requires clicking the "next page" button in the pagination navigation; the `--max` argument controls how many pages are fetched (see the Selenium sketch after the main code below).
This is the page's pagination navigation:
```html
```
```python
import requests
from bs4 import BeautifulSoup
URL = "https://house.focus.cn/zixun/"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}
resp = requests.get(URL, headers=headers, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"
def parser_article(href):
    # Fetch one article page and print its title, author, time, and body
    resp_article = requests.get(href, headers=headers, timeout=10)
    soup_article = BeautifulSoup(resp_article.text, "lxml")
    # Title
    title_tag = soup_article.select_one("#article-container div.text-title h1")
    title = title_tag.get_text(strip=True) if title_tag else ""
    # Author
    author_tag = soup_article.select_one("#article-container .article-info .author a")
    author = author_tag.get_text(strip=True) if author_tag else ""
    # Publication time
    time_tag = soup_article.select_one("#article-container .article-info .author #news-time")
    pub_time = time_tag.get_text(strip=True) if time_tag else ""
    # Body text
    content_tag = soup_article.select_one("article#mp-editor")
    if content_tag:
        # strip=True trims leading/trailing whitespace;
        # separator='\n\n' leaves a blank line between paragraphs
        content = content_tag.get_text(separator='\n\n', strip=True)
    else:
        content = "Body text not found"
    print(title, author, pub_time)
    print(content)
soup = BeautifulSoup(resp.text, "lxml")
div_list = soup.select('.TPLTextFeedItem, .TPLImageTextFeedItem')
for item in div_list:
    link_tag = item.select_one('a')
    if link_tag and link_tag.has_attr('href'):
        url = link_tag['href']
        # Handle possible protocol-relative links (//www...)
        if url.startswith('//'):
            url = 'https:' + url
        parser_article(url)
```
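The code above only fetches the first page with `requests`. For the dynamic loading and next-page clicking mentioned at the top, here is a minimal Selenium sketch that wires in the `--max` argument. The next-page selector (`a.next`) is a placeholder assumption, since the pagination markup is not reproduced above; swap in the real selector from the navigation HTML.

```python
# A minimal sketch, assuming Selenium with Chrome and a hypothetical
# next-page selector "a.next" (the real selector must come from the
# pagination navigation shown above).
import argparse

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def crawl(max_pages):
    driver = webdriver.Chrome()
    try:
        driver.get("https://house.focus.cn/zixun/")
        for page in range(max_pages):
            # Wait until at least one feed item is rendered by the JS
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ".TPLTextFeedItem, .TPLImageTextFeedItem")
                )
            )
            soup = BeautifulSoup(driver.page_source, "lxml")
            for item in soup.select(".TPLTextFeedItem, .TPLImageTextFeedItem"):
                link_tag = item.select_one("a")
                if link_tag and link_tag.has_attr("href"):
                    url = link_tag["href"]
                    if url.startswith("//"):
                        url = "https:" + url
                    parser_article(url)  # reuse the article parser above
            if page == max_pages - 1:
                break
            # "a.next" is a placeholder; replace with the real button selector
            next_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "a.next"))
            )
            # Click via JS to avoid overlay/scroll interception issues
            driver.execute_script("arguments[0].click()", next_btn)
    finally:
        driver.quit()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--max", type=int, default=1, help="number of pages to crawl")
    args = parser.parse_args()
    crawl(args.max)
```

Run it as, e.g., `python crawler.py --max 3` to crawl three pages. Waiting on `EC.element_to_be_clickable` before each click is what makes the dynamically loaded pagination reliable here.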