This is a working script for scraping real-estate category news from Sohu News (the Sohu Focus site, house.focus.cn).
Note that the listing page is loaded dynamically, and paging requires clicking the "next page" button in the pagination navigation; the `--max` argument controls how many pages are fetched (see the Selenium sketch after the main code below).
This is the page's pagination navigation:
```html
```
```python
import requests
from bs4 import BeautifulSoup
URL = "https://house.focus.cn/zixun/"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}
resp = requests.get(URL, headers=headers, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"
def parser_article(href):
    # Fetch one article page and print its title, author, time, and body
    resp_article = requests.get(href, headers=headers, timeout=10)
    soup_article = BeautifulSoup(resp_article.text, "lxml")
    # Title
    title_tag = soup_article.select_one("#article-container div.text-title h1")
    title = title_tag.get_text(strip=True) if title_tag else ""
    # Author
    author_tag = soup_article.select_one("#article-container .article-info .author a")
    author = author_tag.get_text(strip=True) if author_tag else ""
    # Publication time
    time_tag = soup_article.select_one("#article-container .article-info .author #news-time")
    pub_time = time_tag.get_text(strip=True) if time_tag else ""
    # Body text
    content_tag = soup_article.select_one("article#mp-editor")
    if content_tag:
        # strip=True trims leading/trailing whitespace;
        # separator='\n\n' leaves a blank line between paragraphs
        content = content_tag.get_text(separator='\n\n', strip=True)
    else:
        content = "Body text not found"
    print(title, author, pub_time)
    print(content)
soup = BeautifulSoup(resp.text, "lxml")
div_list = soup.select('.TPLTextFeedItem, .TPLImageTextFeedItem')
for item in div_list:
    link_tag = item.select_one('a')
    if link_tag and link_tag.has_attr('href'):
        url = link_tag['href']
        # Handle possible protocol-relative links (//www...)
        if url.startswith('//'):
            url = 'https:' + url
        parser_article(url)
```
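The code above only fetches the first page with `requests`. For the dynamic loading and next-page clicking mentioned at the top, here is a minimal Selenium sketch that wires in the `--max` argument. The next-page selector (`a.next`) is a placeholder assumption, since the pagination markup is not reproduced above; swap in the real selector from the navigation HTML.

```python
# A minimal sketch, assuming Selenium with Chrome and a hypothetical
# next-page selector "a.next" (the real selector must come from the
# pagination navigation shown above).
import argparse

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def crawl(max_pages):
    driver = webdriver.Chrome()
    try:
        driver.get("https://house.focus.cn/zixun/")
        for page in range(max_pages):
            # Wait until at least one feed item is rendered by the JS
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ".TPLTextFeedItem, .TPLImageTextFeedItem")
                )
            )
            soup = BeautifulSoup(driver.page_source, "lxml")
            for item in soup.select(".TPLTextFeedItem, .TPLImageTextFeedItem"):
                link_tag = item.select_one("a")
                if link_tag and link_tag.has_attr("href"):
                    url = link_tag["href"]
                    if url.startswith("//"):
                        url = "https:" + url
                    parser_article(url)  # reuse the article parser above
            if page == max_pages - 1:
                break
            # "a.next" is a placeholder; replace with the real button selector
            next_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "a.next"))
            )
            # Click via JS to avoid overlay/scroll interception issues
            driver.execute_script("arguments[0].click()", next_btn)
    finally:
        driver.quit()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--max", type=int, default=1, help="number of pages to crawl")
    args = parser.parse_args()
    crawl(args.max)
```

Run it as, e.g., `python crawler.py --max 3` to crawl three pages. Waiting on `EC.element_to_be_clickable` before each click is what makes the dynamically loaded pagination reliable here.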