This is a working crawler for the real-estate news feed on Sohu Focus (house.focus.cn). Note that the feed is loaded dynamically, and pagination works by clicking the next-page button in the pagination nav; the `--max` argument decides how many pages to flip through. The requests-based script below only covers the initially rendered first page; a browser-automation sketch for the paging part follows it. This is the page's pagination nav:

```html
```

```python
import requests
from bs4 import BeautifulSoup

URL = "https://house.focus.cn/zixun/"

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

resp = requests.get(URL, headers=headers, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"


def parser_article(href):
    """Fetch one article page and print its title, author, publish time, and body."""
    resp_article = requests.get(href, headers=headers, timeout=10)
    resp_article.encoding = "utf-8"
    soup_article = BeautifulSoup(resp_article.text, "lxml")

    # Title
    title_tag = soup_article.select_one("#article-container div.text-title h1")
    title = title_tag.get_text(strip=True) if title_tag else "title not found"

    # Author
    author_tag = soup_article.select_one("#article-container .article-info .author a")
    author = author_tag.get_text(strip=True) if author_tag else "author not found"

    # Publish time (named pub_time so it cannot shadow the time module)
    time_tag = soup_article.select_one("#article-container .article-info .author #news-time")
    pub_time = time_tag.get_text(strip=True) if time_tag else "time not found"

    # Body text
    content_tag = soup_article.select_one("article#mp-editor")
    if content_tag:
        # strip=True trims surrounding whitespace;
        # separator='\n\n' leaves a blank line between paragraphs
        content = content_tag.get_text(separator="\n\n", strip=True)
    else:
        content = "body text not found"

    print(title, author, pub_time)
    print(content)


# Parse the listing page that was fetched above (the original code passed an
# undefined `html` variable here; it must be the live response text).
soup = BeautifulSoup(resp.text, "lxml")
div_list = soup.select(".TPLTextFeedItem, .TPLImageTextFeedItem")

for item in div_list:
    link_tag = item.select_one("a")
    if link_tag and link_tag.has_attr("href"):
        url = link_tag["href"]
        # Handle protocol-relative links (//www...)
        if url.startswith("//"):
            url = "https:" + url
        parser_article(url)
```
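The requests snippet above only parses whatever markup the first response happens to contain; since the feed is rendered dynamically and paging requires clicking the next-page button, the actual crawl needs browser automation. Below is a minimal Selenium sketch, not the original author's code: the XPath matching an anchor containing 「下一页」 is an assumption (the real pagination markup was omitted above), `--max` is read as the number of feed pages to visit, and `parser_article` refers to the function defined in the script above.

```python
import argparse
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

URL = "https://house.focus.cn/zixun/"


def collect_links(page_source):
    """Pull article URLs out of the currently rendered feed markup."""
    soup = BeautifulSoup(page_source, "lxml")
    links = []
    for item in soup.select(".TPLTextFeedItem, .TPLImageTextFeedItem"):
        a = item.select_one("a")
        if a and a.has_attr("href"):
            href = a["href"]
            if href.startswith("//"):  # protocol-relative link
                href = "https:" + href
            links.append(href)
    return links


def main():
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--max", type=int, default=3,
                            help="number of feed pages to crawl")
    args = arg_parser.parse_args()

    options = Options()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(URL)
        for page in range(args.max):
            # Wait until the dynamically loaded feed items are present.
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, ".TPLTextFeedItem, .TPLImageTextFeedItem")
                )
            )
            for url in collect_links(driver.page_source):
                parser_article(url)  # detail-page parser from the script above
            if page == args.max - 1:
                break
            # Assumed selector: an anchor whose text contains 「下一页」.
            # Replace it once the real pagination markup is known.
            next_btn = driver.find_element(
                By.XPATH, "//a[contains(text(), '下一页')]"
            )
            driver.execute_script("arguments[0].click();", next_btn)
            time.sleep(2)  # give the next batch of results time to render
    finally:
        driver.quit()


if __name__ == "__main__":
    main()
```

Run it as, say, `python crawl_focus.py --max 5` to walk through five pages of the feed. Clicking via `execute_script` rather than a native `.click()` sidesteps overlays that can intercept the click on this kind of portal page.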