This is a working script for scraping real-estate category news from Sohu News (house.focus.cn).

Note that the listing is loaded dynamically, and pagination requires clicking the "next page" (下一页) button in the pagination navigation; the --max parameter decides how many pages to turn (a browser-automation sketch follows the pagination markup below).

This is the page's pagination navigation:
```html
<div data-v-4b61222c="" data-v-32183bf0="" class="Pagination"><div data-v-4b61222c="" class="pagination-content"><div data-v-4b61222c="" class="pagination-item pagination-item-pre">首页</div> <div data-v-4b61222c="" class="pagination-item pagination-item-pre">上一页</div> <div data-v-4b61222c="" class="pagination-item-content"><div data-v-4b61222c="" class="pagination-item pagination-item-0">1</div><div data-v-4b61222c="" class="pagination-item pagination-item-1">2</div> <div data-v-4b61222c="" class="pagination-item-point">···</div> <div data-v-4b61222c="" class="pagination-item pagination-item-0">4</div><div data-v-4b61222c="" class="pagination-item pagination-item-1 active-item">5</div><div data-v-4b61222c="" class="pagination-item pagination-item-2">6</div> <div data-v-4b61222c="" class="pagination-item-point">···</div> <div data-v-4b61222c="" class="pagination-item pagination-item-0">99</div><div data-v-4b61222c="" class="pagination-item pagination-item-1">100</div></div> <div data-v-4b61222c="" class="pagination-item pagination-item-next pagination-more">下一页</div></div></div>
```
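
Since the feed is rendered client-side, plain requests only sees the first server-rendered page. Below is a minimal sketch of the dynamic-loading/pagination half, assuming Selenium with Chrome; the `.pagination-item-next` selector comes from the markup above and the `--max` flag follows the description, while the fixed `time.sleep` wait is illustrative only:

```python
import argparse
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

parser = argparse.ArgumentParser()
parser.add_argument("--max", type=int, default=3, help="number of list pages to visit")
args = parser.parse_args()

driver = webdriver.Chrome()  # Selenium 4 resolves the Chrome driver automatically
driver.get("https://house.focus.cn/zixun/")

pages_html = []
for _ in range(args.max):
    time.sleep(2)  # crude wait for the client-side feed to render
    pages_html.append(driver.page_source)  # hand each page to BeautifulSoup later
    try:
        # The "下一页" (next page) button from the pagination markup above
        next_btn = driver.find_element(By.CSS_SELECTOR, ".Pagination .pagination-item-next")
    except NoSuchElementException:
        break  # no next button: last page reached
    next_btn.click()

driver.quit()
```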
```python
import requests
from bs4 import BeautifulSoup

URL = "https://house.focus.cn/zixun/"

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

resp = requests.get(URL, headers=headers, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"


def parser_article(href):
    """Fetch one article page and print its title, author, time, and body."""
    resp_article = requests.get(href, headers=headers, timeout=10)
    resp_article.raise_for_status()
    resp_article.encoding = "utf-8"
    soup_article = BeautifulSoup(resp_article.text, "lxml")

    # Title
    title_tag = soup_article.select_one("#article-container div.text-title h1")
    title = title_tag.get_text(strip=True) if title_tag else ""

    # Author
    author_tag = soup_article.select_one("#article-container .article-info .author a")
    author = author_tag.get_text(strip=True) if author_tag else ""

    # Publication time
    time_tag = soup_article.select_one("#article-container .article-info .author #news-time")
    pub_time = time_tag.get_text(strip=True) if time_tag else ""

    # Body text
    content_tag = soup_article.select_one("article#mp-editor")
    if content_tag:
        # strip=True trims surrounding whitespace;
        # separator='\n\n' leaves a blank line between paragraphs
        content = content_tag.get_text(separator='\n\n', strip=True)
    else:
        content = "Body content not found"

    print(title, author, pub_time)
    print(content)


# Parse the listing page fetched above
soup = BeautifulSoup(resp.text, "lxml")

# Both text-only and image+text feed items carry article links
div_list = soup.select('.TPLTextFeedItem, .TPLImageTextFeedItem')

for item in div_list:
    link_tag = item.select_one('a')
    if link_tag and link_tag.has_attr('href'):
        url = link_tag['href']
        # Handle protocol-relative links (//www...)
        if url.startswith('//'):
            url = 'https:' + url
        parser_article(url)
```
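
If the paginated pages matter, one way to wire the two halves together is to feed each page_source collected by the Selenium loop into the same parsing logic, reusing parser_article unchanged; the parse_listing helper below is hypothetical glue, not part of the original script:

```python
from bs4 import BeautifulSoup

def parse_listing(page_html):
    """Extract article links from one listing page and parse each article."""
    soup = BeautifulSoup(page_html, "lxml")
    for item in soup.select('.TPLTextFeedItem, .TPLImageTextFeedItem'):
        link_tag = item.select_one('a')
        if link_tag and link_tag.has_attr('href'):
            url = link_tag['href']
            if url.startswith('//'):
                url = 'https:' + url
            parser_article(url)

# pages_html comes from the Selenium sketch above
for page_html in pages_html:
    parse_listing(page_html)
```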