feat: increase the kr36 crawler's screen scroll count

shenjianZ 2026-01-14 20:52:17 +08:00
parent 3ce7683a42
commit 543ce5ec0a
8 changed files with 189 additions and 87 deletions

View File

@@ -34,9 +34,9 @@ selenium:
  headless: true
  log_level: 3
  window_size: "1920,1080"
-  page_load_timeout: 30
+  page_load_timeout: 60
  script_timeout: 30
-  implicit_wait: 10
+  implicit_wait: 20
  scroll_pause_time: 1.2
  max_scroll_times: 10
@@ -110,6 +110,11 @@ sources:
      category_id: 9
      name: "AI"
      css_selector: "div.kr-information-left"
+    health:
+      url: "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
+      category_id: 8
+      name: "健康"
+      css_selector: "div.kr-search-result-list"
  sina:
    base_url: "https://sina.com.cn"

View File

@@ -55,7 +55,7 @@ class SeleniumConfig:
    page_load_timeout: int = 30
    script_timeout: int = 30
    implicit_wait: int = 10
-    scroll_pause_time: float = 1.2
+    scroll_pause_time: float = 5
    max_scroll_times: int = 10
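The `selenium:` block in the YAML config above and the `SeleniumConfig` dataclass carry the same keys, so the loader presumably just maps one section onto the other. A minimal sketch of that mapping, assuming a PyYAML-based loader; the `load_selenium_config` helper, the `config.yaml` path, and the stand-in dataclass defaults are illustrative assumptions, not the project's actual code:

```python
from dataclasses import dataclass, fields
import yaml  # PyYAML


@dataclass
class SeleniumConfig:
    # Stand-in mirroring the fields visible in the diffs above.
    headless: bool = True
    log_level: int = 3
    window_size: str = "1920,1080"
    page_load_timeout: int = 30
    script_timeout: int = 30
    implicit_wait: int = 10
    scroll_pause_time: float = 5
    max_scroll_times: int = 10


def load_selenium_config(path: str = "config.yaml") -> SeleniumConfig:
    """Read the selenium: section of the YAML file into a SeleniumConfig."""
    with open(path, "r", encoding="utf-8") as f:
        raw = yaml.safe_load(f) or {}
    section = raw.get("selenium", {})
    # Ignore unknown keys so older config files keep loading.
    known = {f.name for f in fields(SeleniumConfig)}
    return SeleniumConfig(**{k: v for k, v in section.items() if k in known})


# cfg = load_selenium_config(); cfg.implicit_wait -> 20 with the updated config.yaml
```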

View File

@@ -0,0 +1,33 @@
This is the 36kr example script for crawling health-related news:
```python
import requests
from bs4 import BeautifulSoup

URL = "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
TARGET_URL = "https://www.36kr.com"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

resp = requests.get(URL, headers=headers, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"
# print(resp.text)

# Parse a locally saved copy of the rendered page instead of the live response.
with open("example/example-11.html", "r", encoding="utf-8") as f:
    html = f.read()

# soup = BeautifulSoup(resp.text, "lxml")
soup = BeautifulSoup(html, "lxml")
li_list = soup.select(
    "div.kr-layout div.kr-layout-main div.kr-layout-content "
    "div.kr-search-result-list ul.kr-search-result-list-main > li"
)
for item in li_list:
    a = item.select_one("div.kr-shadow-content a")
    if not a:
        continue
    href = TARGET_URL + a.get("href")
    print(href)
```
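The script above parses a saved `example/example-11.html` rather than `resp.text`; one way to produce such a rendered copy is to load the search page with Selenium and dump `page_source`. A rough sketch only: the plain `webdriver.Chrome` setup and the fixed sleep below are simplifications, not the project's `SeleniumDriver` wrapper.

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time

options = Options()
options.add_argument("--headless=new")
options.add_argument("--window-size=1920,1080")

driver = webdriver.Chrome(options=options)
try:
    driver.get("https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7")
    time.sleep(2)  # crude wait for the result list to render
    with open("example/example-11.html", "w", encoding="utf-8") as f:
        f.write(driver.page_source)
finally:
    driver.quit()
```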

View File

@@ -1,82 +0,0 @@
This is the Sina example script for crawling auto-related news:
```python
import requests
from bs4 import BeautifulSoup
from datetime import datetime

URL = "https://auto.sina.com.cn/"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}

resp = requests.get(URL, headers=headers, timeout=10)
# resp.raise_for_status()
# resp.encoding = "utf-8"
# print(resp.text)

# Parse a locally saved copy of the listing page instead of the live response.
with open("example/example-10.html", "r", encoding="utf-8") as f:
    html = f.read()

# soup = BeautifulSoup(resp.text, "lxml")
soup = BeautifulSoup(html, "lxml")


def normalize_time(time_str):
    """Normalize a publish date to the "%Y-%m-%d %H:%M:%S" format."""
    for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
        try:
            dt = datetime.strptime(time_str, fmt)
            return dt.strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue
    return time_str  # return the original string if no format matches


div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1")
for item in div_list:
    a = item.select_one("div.ty-card-l a")
    href = a.get("href")
    # print(a.get('href'), a.get_text().strip())
    resp = requests.get(url=href, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding  # let requests guess the encoding
    soup = BeautifulSoup(resp.text, "lxml")

    # Article title
    article_title_tag = soup.select_one("div.main-content h1.main-title")
    if article_title_tag:
        article_title = article_title_tag.get_text(strip=True)
        if not article_title:
            article_title = "未知标题"
    else:
        article_title = "未知标题"
    # print("title:", article_title)

    # Publish time
    time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
    if time_tag:  # only normalize when the date tag exists
        publish_time = normalize_time(time_tag.get_text(strip=True))
    else:
        publish_time = "1949-01-01 12:00:00"
    # print(publish_time)

    # Author
    author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
    if author_tag:
        author = author_tag.get_text(strip=True)
    else:
        author = "未知"
    # print(author)

    # Body paragraphs
    article_div = soup.select_one("div.main-content div.article")  # main article container
    if not article_div:
        # print("not an article detail page, skipping")
        continue  # skip pages that are not article detail pages
    paragraphs = article_div.find_all("p")
    article_text = "\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
    # print("body:\n", article_text)
```

View File

@@ -30,6 +30,7 @@ CRAWLER_CLASSES = {
    },
    'kr36': {
        'ai': ('crawlers.kr36.ai', 'AICrawler'),
+       'health': ('crawlers.kr36.health', 'HealthCrawler'),
    },
    'sina': {
        'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
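The registry maps a (source, category) pair to a (module path, class name) tuple, so the new health entry can presumably be resolved with a dynamic import. A hypothetical resolver sketch; the function name and call site are assumptions, and only the registry contents come from the diff:

```python
import importlib

CRAWLER_CLASSES = {
    'kr36': {
        'ai': ('crawlers.kr36.ai', 'AICrawler'),
        'health': ('crawlers.kr36.health', 'HealthCrawler'),
    },
    'sina': {
        'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
    },
}


def resolve_crawler(source: str, category: str):
    """Turn a registry entry into the crawler class via a dynamic import."""
    module_path, class_name = CRAWLER_CLASSES[source][category]
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


# resolve_crawler('kr36', 'health') would import crawlers.kr36.health
# and return the HealthCrawler class defined below.
```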

View File

@@ -18,6 +18,29 @@ class AICrawler(DynamicCrawler):
    ARTICLE_BASE_URL = "https://www.36kr.com/p/"

    def _fetch_page(self) -> str:
        """Fetch the page HTML by repeatedly clicking the 'load more' button."""
        if not self._driver:
            from utils.selenium_driver import SeleniumDriver
            self._driver = SeleniumDriver()
        try:
            # Load the page
            self._driver._driver.get(self.url)
            # Wait for the page to load
            import time
            time.sleep(2)
            # Click the 'load more' button several times
            self._driver.click_load_more("div.kr-loading-more-button", max_clicks=10)
            return self._driver._driver.page_source
        except Exception as e:
            self.logger.error(f"Failed to fetch page source: {self.url} - {e}")
            raise

    def _extract_article_urls(self, html: str) -> List[str]:
        """Extract article URLs from the HTML."""
        soup = BeautifulSoup(html, "lxml")
View File

@@ -0,0 +1,87 @@
"""
36Kr health news crawler
"""
from typing import List
from bs4 import BeautifulSoup
import sys
import os

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import DynamicCrawler, Article
from parsers.kr36_parser import Kr36Parser


class HealthCrawler(DynamicCrawler):
    """36Kr health news crawler"""

    ARTICLE_BASE_URL = "https://www.36kr.com"

    def _fetch_page(self) -> str:
        """Fetch the page HTML by repeatedly clicking the 'load more' button."""
        if not self._driver:
            from utils.selenium_driver import SeleniumDriver
            self._driver = SeleniumDriver()
        try:
            # Load the page
            self._driver._driver.get(self.url)
            # Wait for the page to load
            import time
            time.sleep(2)
            # Click the 'load more' button several times
            self._driver.click_load_more("div.kr-loading-more-button", max_clicks=10)
            return self._driver._driver.page_source
        except Exception as e:
            self.logger.error(f"Failed to fetch page source: {self.url} - {e}")
            raise

    def _extract_article_urls(self, html: str) -> List[str]:
        """Extract article URLs from the HTML."""
        soup = BeautifulSoup(html, "lxml")
        urls = []
        li_list = soup.select(
            "div.kr-layout div.kr-layout-main div.kr-layout-content "
            "div.kr-search-result-list ul.kr-search-result-list-main > li"
        )
        for item in li_list:
            a = item.select_one("div.kr-shadow-content a")
            if a:
                href = a.get("href")
                if href:
                    # Build the full article URL
                    full_url = self.ARTICLE_BASE_URL + href
                    urls.append(full_url)
        return urls

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """Fetch and parse article details."""
        articles = []
        parser = Kr36Parser()
        for i, url in enumerate(urls[:self.max_articles]):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "36kr"
                if not article.author:
                    article.author = "36氪"
                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
            except Exception as e:
                self.logger.error(f"Failed to parse article: {url} - {e}")
                continue
        return articles
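The selector chain in `_extract_article_urls` can be exercised on its own against a small hand-written fragment that mimics 36kr's search-result markup; the fragment below is made up for illustration.

```python
from bs4 import BeautifulSoup

SAMPLE = """
<div class="kr-layout"><div class="kr-layout-main"><div class="kr-layout-content">
  <div class="kr-search-result-list"><ul class="kr-search-result-list-main">
    <li><div class="kr-shadow-content"><a href="/p/1234567890">sample article</a></div></li>
  </ul></div>
</div></div></div>
"""

soup = BeautifulSoup(SAMPLE, "lxml")
li_list = soup.select(
    "div.kr-layout div.kr-layout-main div.kr-layout-content "
    "div.kr-search-result-list ul.kr-search-result-list-main > li"
)
urls = ["https://www.36kr.com" + li.select_one("div.kr-shadow-content a")["href"] for li in li_list]
print(urls)  # ['https://www.36kr.com/p/1234567890']
```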

View File

@@ -39,8 +39,8 @@ class SeleniumDriver:
        options.add_argument(f"--log-level={config.selenium.log_level}")
        # Headless mode
-        if config.selenium.headless:
-            options.add_argument("--headless=new")
+        # if config.selenium.headless:
+        #     options.add_argument("--headless=new")
        # Window size
        if config.selenium.window_size:
@@ -128,6 +128,41 @@ class SeleniumDriver:
        self.logger.debug(f"Scrolling finished, final height: {last_height}")

    def click_load_more(self, selector: str = "div.kr-loading-more-button", max_clicks: int = 10):
        """
        Click the 'load more' button repeatedly.

        Args:
            selector: CSS selector of the 'load more' button
            max_clicks: maximum number of clicks
        """
        if not self._driver:
            self._create_driver()
        click_count = 0
        for i in range(max_clicks):
            try:
                # Scroll to the bottom of the page a few times
                for j in range(3):
                    self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(0.5)
                # Click the 'load more' button (regardless of whether it is visible or clickable)
                load_more_btn = self._driver.find_element(By.CSS_SELECTOR, selector)
                load_more_btn.click()
                click_count += 1
                self.logger.debug(f"Clicked 'load more' button ({click_count})")
                # Wait for the new content to load
                time.sleep(config.selenium.scroll_pause_time)
            except Exception as e:
                self.logger.debug(f"Failed to click 'load more' button: {e}")
                break
        self.logger.info(f"'load more' button clicked {click_count} time(s) in total")

    def quit(self):
        """Quit the driver."""
        if self._driver: