feat: increase the number of screen scrolls for the kr36 crawler
parent 3ce7683a42
commit 543ce5ec0a
@@ -34,9 +34,9 @@ selenium:
   headless: true
   log_level: 3
   window_size: "1920,1080"
-  page_load_timeout: 30
+  page_load_timeout: 60
   script_timeout: 30
-  implicit_wait: 10
+  implicit_wait: 20
   scroll_pause_time: 1.2
   max_scroll_times: 10
@@ -110,6 +110,11 @@ sources:
       category_id: 9
       name: "AI"
       css_selector: "div.kr-information-left"
+    health:
+      url: "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
+      category_id: 8
+      name: "健康"
+      css_selector: "div.kr-search-result-list"

   sina:
     base_url: "https://sina.com.cn"
@@ -55,7 +55,7 @@ class SeleniumConfig:
    page_load_timeout: int = 30
    script_timeout: int = 30
    implicit_wait: int = 10
-   scroll_pause_time: float = 1.2
+   scroll_pause_time: float = 5
    max_scroll_times: int = 10
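The `selenium:` block from the first hunk maps one-to-one onto these `SeleniumConfig` fields. Below is a minimal loading sketch; the project's real config loader is not part of this diff, so the file name `config.yaml`, the PyYAML dependency, and the defaults of the fields not visible in the hunk are assumptions.

```python
# Hypothetical loader sketch -- the project's real config module is not shown in this commit.
from dataclasses import dataclass, fields
import yaml  # assumes PyYAML is installed


@dataclass
class SeleniumConfig:
    headless: bool = True           # assumed default, taken from the YAML above
    log_level: int = 3              # assumed default
    window_size: str = "1920,1080"  # assumed default
    page_load_timeout: int = 30
    script_timeout: int = 30
    implicit_wait: int = 10
    scroll_pause_time: float = 5
    max_scroll_times: int = 10


def load_selenium_config(path: str = "config.yaml") -> SeleniumConfig:
    """Read the selenium: block and keep only the keys the dataclass defines."""
    with open(path, "r", encoding="utf-8") as f:
        raw = yaml.safe_load(f) or {}
    block = raw.get("selenium", {}) or {}
    known = {f.name for f in fields(SeleniumConfig)}
    return SeleniumConfig(**{k: v for k, v in block.items() if k in known})
```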
@@ -0,0 +1,33 @@

This is the exploratory code for crawling health-related news from 36kr.
```python
import requests
from bs4 import BeautifulSoup
import re

URL = "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
TARGET_URL = "https://www.36kr.com"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}
resp = requests.get(URL, headers=headers, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"
# print(resp.text)
with open("example/example-11.html", "r", encoding="utf-8") as f:
    html = f.read()

# soup = BeautifulSoup(resp.text, "lxml")
soup = BeautifulSoup(html, "lxml")
li_list = soup.select("div.kr-layout div.kr-layout-main div.kr-layout-content div.kr-search-result-list ul.kr-search-result-list-main > li")

for item in li_list:
    a = item.select_one("div.kr-shadow-content a")
    href = TARGET_URL + a.get("href")
    print(href)

```
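Note that the 36Kr search page is rendered client-side, so the `requests.get` call above generally does not return the result list; that is why the script parses the saved `example/example-11.html` instead. A minimal sketch of fetching the rendered HTML with bare Selenium (assuming a local Chrome/chromedriver setup; the project's own `SeleniumDriver` wrapper is the production path) could look like this:

```python
# Sketch only: fetch the JavaScript-rendered search page with bare Selenium, then parse it.
import time

from bs4 import BeautifulSoup
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")      # same flag the project toggles via its config
driver = webdriver.Chrome(options=options)  # assumes Chrome + chromedriver are installed
try:
    driver.get("https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7")
    time.sleep(2)  # crude wait for the list to render, mirroring the crawler code
    soup = BeautifulSoup(driver.page_source, "lxml")
    for li in soup.select("ul.kr-search-result-list-main > li"):
        a = li.select_one("div.kr-shadow-content a")
        if a and a.get("href"):
            print("https://www.36kr.com" + a.get("href"))
finally:
    driver.quit()
```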
@@ -1,82 +0,0 @@

This is the exploratory code for crawling auto-related news from Sina.
```python
import requests
from bs4 import BeautifulSoup


URL = "https://auto.sina.com.cn/"

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}
resp = requests.get(URL, headers=headers, timeout=10)
# resp.raise_for_status()
# resp.encoding = "utf-8"
# print(resp.text)
with open("example/example-10.html", "r", encoding="utf-8") as f:
    html = f.read()

# soup = BeautifulSoup(resp.text, "lxml")
soup = BeautifulSoup(html, "lxml")
div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1")

for item in div_list:
    a = item.select_one("div.ty-card-l a")
    href = a.get("href")
    # print(a.get('href'), a.get_text().strip())

    resp = requests.get(url=href, headers=headers)
    resp.encoding = resp.apparent_encoding  # requests will try to guess the encoding
    soup = BeautifulSoup(resp.text, "lxml")
    # Article title
    article_title_tag = soup.select_one("div.main-content h1.main-title")
    if article_title_tag:
        article_title = article_title_tag.get_text(strip=True)
        if not article_title:
            article_title = "未知标题"
    else:
        article_title = "未知标题"
    # print("标题:", article_title)
    # Article publish time
    from datetime import datetime

    # Date/time normalization helper
    def normalize_time(time_str):
        for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
            try:
                dt = datetime.strptime(time_str, fmt)
                return dt.strftime("%Y-%m-%d %H:%M:%S")
            except:
                continue
        return time_str  # if no format matches, return the original string

    time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
    if time_tag:  # only normalize when the time tag exists
        publish_time = normalize_time(time_tag.get_text(strip=True))
    else:
        publish_time = "1949-01-01 12:00:00"
    # print(publish_time)

    # Article author
    author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
    if author_tag:
        author = author_tag.get_text(strip=True)
    else:
        author = "未知"
    # print(author)
    # Article body paragraphs
    article_div = soup.select_one("div.main-content div.article")  # main article container
    if not article_div:
        # print("不是文章详情页,跳过")
        continue  # skip pages that are not article detail pages
    paragraphs = article_div.find_all('p')
    article_text = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
    # print("正文:\n", article_text)


```
@@ -30,6 +30,7 @@ CRAWLER_CLASSES = {
    },
    'kr36': {
        'ai': ('crawlers.kr36.ai', 'AICrawler'),
+       'health': ('crawlers.kr36.health', 'HealthCrawler'),
    },
    'sina': {
        'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
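The registry maps a `(source, category)` pair to a `(module path, class name)` tuple. The factory that consumes it is not part of this diff; a plausible resolution sketch using only `importlib` (the helper name is hypothetical) would be:

```python
# Hypothetical sketch of resolving a CRAWLER_CLASSES entry at runtime.
import importlib


def resolve_crawler(source: str, category: str):
    """Lazily import the module and return the crawler class for (source, category)."""
    module_path, class_name = CRAWLER_CLASSES[source][category]
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


# e.g. resolve_crawler('kr36', 'health') -> crawlers.kr36.health.HealthCrawler
```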
@@ -18,6 +18,29 @@ class AICrawler(DynamicCrawler):

    ARTICLE_BASE_URL = "https://www.36kr.com/p/"

+    def _fetch_page(self) -> str:
+        """Fetch the page HTML by clicking the 'load more' button."""
+        if not self._driver:
+            from utils.selenium_driver import SeleniumDriver
+            self._driver = SeleniumDriver()
+
+        try:
+            # Load the page
+            self._driver._driver.get(self.url)
+
+            # Wait for the page to render
+            import time
+            time.sleep(2)
+
+            # Click the 'load more' button several times
+            self._driver.click_load_more("div.kr-loading-more-button", max_clicks=10)
+
+            return self._driver._driver.page_source
+
+        except Exception as e:
+            self.logger.error(f"获取页面源码失败: {self.url} - {e}")
+            raise
+
    def _extract_article_urls(self, html: str) -> List[str]:
        """Extract article URLs from the HTML"""
        soup = BeautifulSoup(html, "lxml")
@@ -0,0 +1,87 @@
"""
36Kr health news crawler
"""

from typing import List
from bs4 import BeautifulSoup

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import DynamicCrawler, Article
from parsers.kr36_parser import Kr36Parser


class HealthCrawler(DynamicCrawler):
    """36Kr health news crawler"""

    ARTICLE_BASE_URL = "https://www.36kr.com"

    def _fetch_page(self) -> str:
        """Fetch the page HTML by clicking the 'load more' button."""
        if not self._driver:
            from utils.selenium_driver import SeleniumDriver
            self._driver = SeleniumDriver()

        try:
            # Load the page
            self._driver._driver.get(self.url)

            # Wait for the page to render
            import time
            time.sleep(2)

            # Click the 'load more' button several times
            self._driver.click_load_more("div.kr-loading-more-button", max_clicks=10)

            return self._driver._driver.page_source

        except Exception as e:
            self.logger.error(f"获取页面源码失败: {self.url} - {e}")
            raise

    def _extract_article_urls(self, html: str) -> List[str]:
        """Extract article URLs from the HTML"""
        soup = BeautifulSoup(html, "lxml")
        urls = []

        li_list = soup.select(
            "div.kr-layout div.kr-layout-main div.kr-layout-content "
            "div.kr-search-result-list ul.kr-search-result-list-main > li"
        )

        for item in li_list:
            a = item.select_one("div.kr-shadow-content a")
            if a:
                href = a.get("href")
                if href:
                    # Build the full article URL
                    full_url = self.ARTICLE_BASE_URL + href
                    urls.append(full_url)

        return urls

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """Crawl article details"""
        articles = []
        parser = Kr36Parser()

        for i, url in enumerate(urls[:self.max_articles]):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "36kr"

                if not article.author:
                    article.author = "36氪"

                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")

            except Exception as e:
                self.logger.error(f"解析文章失败: {url} - {e}")
                continue

        return articles
@@ -39,8 +39,8 @@ class SeleniumDriver:
        options.add_argument(f"--log-level={config.selenium.log_level}")

        # Headless mode
-       if config.selenium.headless:
-           options.add_argument("--headless=new")
+       # if config.selenium.headless:
+       #     options.add_argument("--headless=new")

        # Window size
        if config.selenium.window_size:
@@ -128,6 +128,41 @@ class SeleniumDriver:

        self.logger.debug(f"滚动完成,最终高度: {last_height}")

+    def click_load_more(self, selector: str = "div.kr-loading-more-button", max_clicks: int = 10):
+        """
+        Click the 'load more' button repeatedly.
+
+        Args:
+            selector: CSS selector for the 'load more' button
+            max_clicks: maximum number of clicks
+        """
+        if not self._driver:
+            self._create_driver()
+
+        click_count = 0
+
+        for i in range(max_clicks):
+            try:
+                # Scroll to the bottom of the page several times
+                for j in range(3):
+                    self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                    time.sleep(0.5)
+
+                # Click the 'load more' button (regardless of whether it is visible or clickable)
+                load_more_btn = self._driver.find_element(By.CSS_SELECTOR, selector)
+                load_more_btn.click()
+                click_count += 1
+                self.logger.debug(f"点击加载更多按钮 {click_count} 次")
+
+                # Wait for the new content to load
+                time.sleep(config.selenium.scroll_pause_time)
+
+            except Exception as e:
+                self.logger.debug(f"点击加载更多按钮失败: {e}")
+                break
+
+        self.logger.info(f"加载更多按钮共点击 {click_count} 次")
+
    def quit(self):
        """Quit the driver"""
        if self._driver:
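`click_load_more` above clicks the button even when it is not reported as clickable and stops on the first failure. If stricter behaviour were ever wanted, Selenium's explicit waits could be used instead; the following is a standalone sketch, not part of this commit:

```python
# Optional variant (not in this commit): only click once the button is actually clickable.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def click_load_more_strict(driver, selector="div.kr-loading-more-button",
                           max_clicks=10, timeout=5):
    """Click the 'load more' button only while it keeps becoming clickable."""
    clicks = 0
    for _ in range(max_clicks):
        try:
            btn = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
            btn.click()
            clicks += 1
        except Exception:
            break  # button gone or never became clickable: stop loading more
    return clicks
```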