feat: increase the number of screen scrolls for the kr36 crawler
parent 3ce7683a42
commit 543ce5ec0a
@@ -34,9 +34,9 @@ selenium:
   headless: true
   log_level: 3
   window_size: "1920,1080"
-  page_load_timeout: 30
+  page_load_timeout: 60
   script_timeout: 30
-  implicit_wait: 10
+  implicit_wait: 20
   scroll_pause_time: 1.2
   max_scroll_times: 10
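The two raised values are driver timeouts: page_load_timeout caps how long a get() may take, and implicit_wait caps how long element lookups poll before failing. A minimal sketch of how these settings are typically applied to a Selenium Chrome driver (the project's SeleniumDriver wrapper is assumed to do the equivalent; its setup code is not part of this hunk):

```python
# Illustrative only: how the timeouts above map onto standard Selenium calls.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--window-size=1920,1080")

driver = webdriver.Chrome(options=options)
driver.set_page_load_timeout(60)  # page_load_timeout: abort get() after 60 s
driver.set_script_timeout(30)     # script_timeout: cap execute_async_script()
driver.implicitly_wait(20)        # implicit_wait: poll up to 20 s in find_element()
```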
@@ -110,6 +110,11 @@ sources:
       category_id: 9
       name: "AI"
       css_selector: "div.kr-information-left"
+    health:
+      url: "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
+      category_id: 8
+      name: "健康"
+      css_selector: "div.kr-search-result-list"
 
   sina:
     base_url: "https://sina.com.cn"
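The url added for the health source is the 36kr article-search page with its query URL-encoded; %E5%81%A5%E5%BA%B7 decodes to 健康 ("health"). A quick standard-library check (illustrative, not part of the commit):

```python
from urllib.parse import quote, unquote

assert unquote("%E5%81%A5%E5%BA%B7") == "健康"
assert quote("健康") == "%E5%81%A5%E5%BA%B7"
print(unquote("https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"))
# -> https://www.36kr.com/search/articles/健康
```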
@@ -55,7 +55,7 @@ class SeleniumConfig:
     page_load_timeout: int = 30
     script_timeout: int = 30
     implicit_wait: int = 10
-    scroll_pause_time: float = 1.2
+    scroll_pause_time: float = 5
     max_scroll_times: int = 10
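Note that the dataclass default for scroll_pause_time rises to 5 while the YAML above still sets 1.2; presumably the default only matters when the key is absent from the config file. A minimal sketch of how a selenium: block like the one above might be mapped onto this dataclass, assuming PyYAML and that the field names mirror the YAML keys (the project's real loader is not shown in this diff):

```python
# Hypothetical loader sketch; the field list mirrors the hunks above.
from dataclasses import dataclass
import yaml

@dataclass
class SeleniumConfig:
    headless: bool = True              # assumed default, taken from the YAML hunk
    log_level: int = 3                 # assumed default, taken from the YAML hunk
    window_size: str = "1920,1080"     # assumed default, taken from the YAML hunk
    page_load_timeout: int = 30
    script_timeout: int = 30
    implicit_wait: int = 10
    scroll_pause_time: float = 5
    max_scroll_times: int = 10

with open("config.yaml", "r", encoding="utf-8") as f:
    raw = yaml.safe_load(f)

# YAML values override the dataclass defaults when present.
selenium_cfg = SeleniumConfig(**raw.get("selenium", {}))
```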
@@ -0,0 +1,33 @@
+This is the 36kr code for crawling health-related news.
+```python
+import requests
+from bs4 import BeautifulSoup
+import re
+
+URL = "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
+TARGET_URL = "https://www.36kr.com"
+headers = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/120.0.0.0 Safari/537.36"
+    )
+}
+resp = requests.get(URL, headers=headers, timeout=10)
+resp.raise_for_status()
+resp.encoding = "utf-8"
+# print(resp.text)
+with open("example/example-11.html", "r", encoding="utf-8") as f:
+    html = f.read()
+
+# soup = BeautifulSoup(resp.text, "lxml")
+soup = BeautifulSoup(html, "lxml")
+li_list = soup.select("div.kr-layout div.kr-layout-main div.kr-layout-content div.kr-search-result-list ul.kr-search-result-list-main > li")
+
+for item in li_list:
+    a = item.select_one("div.kr-shadow-content a")
+    href = TARGET_URL + a.get("href")
+    print(href)
+
+```
@@ -1,82 +0,0 @@
-This is the Sina code for crawling auto-related news.
-```python
-import requests
-from bs4 import BeautifulSoup
-
-
-URL = "https://auto.sina.com.cn/"
-
-headers = {
-    "User-Agent": (
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-        "AppleWebKit/537.36 (KHTML, like Gecko) "
-        "Chrome/120.0.0.0 Safari/537.36"
-    )
-}
-resp = requests.get(URL, headers=headers, timeout=10)
-# resp.raise_for_status()
-# resp.encoding = "utf-8"
-# print(resp.text)
-with open("example/example-10.html", "r", encoding="utf-8") as f:
-    html = f.read()
-
-# soup = BeautifulSoup(resp.text, "lxml")
-soup = BeautifulSoup(html, "lxml")
-div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1")
-
-for item in div_list:
-    a = item.select_one("div.ty-card-l a")
-    href = a.get("href")
-    # print(a.get('href'), a.get_text().strip())
-
-    resp = requests.get(url=href, headers=headers)
-    resp.encoding = resp.apparent_encoding  # requests will try to guess the encoding
-    soup = BeautifulSoup(resp.text, "lxml")
-    # Get the article title
-    article_title_tag = soup.select_one("div.main-content h1.main-title")
-    if article_title_tag:
-        article_title = article_title_tag.get_text(strip=True)
-        if not article_title:
-            article_title = "未知标题"
-    else:
-        article_title = "未知标题"
-    # print("标题:", article_title)
-    # Get the article publish time
-    from datetime import datetime
-
-    # Date/time normalization helper
-    def normalize_time(time_str):
-        for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
-            try:
-                dt = datetime.strptime(time_str, fmt)
-                return dt.strftime("%Y-%m-%d %H:%M:%S")
-            except:
-                continue
-        return time_str  # return the original string if no format matches
-
-    time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
-    if time_tag:  # only normalize when the date tag exists
-        publish_time = normalize_time(time_tag.get_text(strip=True))
-    else:
-        publish_time = "1949-01-01 12:00:00"
-    # print(publish_time)
-
-    # Get the article author
-    author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
-    if author_tag:
-        author = author_tag.get_text(strip=True)
-    else:
-        author = "未知"
-    # print(author)
-    # Get the article body paragraphs
-    article_div = soup.select_one("div.main-content div.article")  # main article container
-    if not article_div:
-        # print("不是文章详情页,跳过")
-        continue  # skip pages that are not article detail pages
-    paragraphs = article_div.find_all('p')
-    article_text = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
-    # print("正文:\n", article_text)
-
-
-```
@@ -30,6 +30,7 @@ CRAWLER_CLASSES = {
     },
     'kr36': {
         'ai': ('crawlers.kr36.ai', 'AICrawler'),
+        'health': ('crawlers.kr36.health', 'HealthCrawler'),
     },
     'sina': {
         'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
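Each registry entry is a (module path, class name) pair rather than a class object, so crawlers can be imported lazily. A sketch of how such an entry might be resolved, assuming an importlib-based factory (the project's real factory code is not part of this diff):

```python
# Hypothetical resolver for CRAWLER_CLASSES entries; illustrative only.
import importlib

def get_crawler_class(source: str, category: str):
    module_path, class_name = CRAWLER_CLASSES[source][category]
    module = importlib.import_module(module_path)  # e.g. crawlers.kr36.health
    return getattr(module, class_name)             # e.g. HealthCrawler

# crawler_cls = get_crawler_class('kr36', 'health')
```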
@@ -18,6 +18,29 @@ class AICrawler(DynamicCrawler):
     ARTICLE_BASE_URL = "https://www.36kr.com/p/"
 
+    def _fetch_page(self) -> str:
+        """Fetch the page HTML by clicking the load-more button."""
+        if not self._driver:
+            from utils.selenium_driver import SeleniumDriver
+            self._driver = SeleniumDriver()
+
+        try:
+            # Load the page
+            self._driver._driver.get(self.url)
+
+            # Wait for the page to load
+            import time
+            time.sleep(2)
+
+            # Click the load-more button several times
+            self._driver.click_load_more("div.kr-loading-more-button", max_clicks=10)
+
+            return self._driver._driver.page_source
+
+        except Exception as e:
+            self.logger.error(f"获取页面源码失败: {self.url} - {e}")
+            raise
+
     def _extract_article_urls(self, html: str) -> List[str]:
         """Extract the list of article URLs from the HTML."""
         soup = BeautifulSoup(html, "lxml")
@@ -0,0 +1,87 @@
+"""
+36Kr health news crawler
+"""
+
+from typing import List
+from bs4 import BeautifulSoup
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from base.crawler_base import DynamicCrawler, Article
+from parsers.kr36_parser import Kr36Parser
+
+
+class HealthCrawler(DynamicCrawler):
+    """36Kr health news crawler"""
+
+    ARTICLE_BASE_URL = "https://www.36kr.com"
+
+    def _fetch_page(self) -> str:
+        """Fetch the page HTML by clicking the load-more button."""
+        if not self._driver:
+            from utils.selenium_driver import SeleniumDriver
+            self._driver = SeleniumDriver()
+
+        try:
+            # Load the page
+            self._driver._driver.get(self.url)
+
+            # Wait for the page to load
+            import time
+            time.sleep(2)
+
+            # Click the load-more button several times
+            self._driver.click_load_more("div.kr-loading-more-button", max_clicks=10)
+
+            return self._driver._driver.page_source
+
+        except Exception as e:
+            self.logger.error(f"获取页面源码失败: {self.url} - {e}")
+            raise
+
+    def _extract_article_urls(self, html: str) -> List[str]:
+        """Extract the list of article URLs from the HTML."""
+        soup = BeautifulSoup(html, "lxml")
+        urls = []
+
+        li_list = soup.select(
+            "div.kr-layout div.kr-layout-main div.kr-layout-content "
+            "div.kr-search-result-list ul.kr-search-result-list-main > li"
+        )
+
+        for item in li_list:
+            a = item.select_one("div.kr-shadow-content a")
+            if a:
+                href = a.get("href")
+                if href:
+                    # Build the full article URL
+                    full_url = self.ARTICLE_BASE_URL + href
+                    urls.append(full_url)
+
+        return urls
+
+    def _fetch_articles(self, urls: List[str]) -> List[Article]:
+        """Crawl the article details."""
+        articles = []
+        parser = Kr36Parser()
+
+        for i, url in enumerate(urls[:self.max_articles]):
+            try:
+                article = parser.parse(url)
+                article.category_id = self.category_id
+                article.source = "36kr"
+
+                if not article.author:
+                    article.author = "36氪"
+
+                if article.is_valid():
+                    articles.append(article)
+                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
+
+            except Exception as e:
+                self.logger.error(f"解析文章失败: {url} - {e}")
+                continue
+
+        return articles
@@ -39,8 +39,8 @@ class SeleniumDriver:
         options.add_argument(f"--log-level={config.selenium.log_level}")
 
         # Headless mode
-        if config.selenium.headless:
-            options.add_argument("--headless=new")
+        # if config.selenium.headless:
+        # options.add_argument("--headless=new")
 
         # Window size
         if config.selenium.window_size:
@@ -128,6 +128,41 @@ class SeleniumDriver:
 
         self.logger.debug(f"滚动完成,最终高度: {last_height}")
 
+    def click_load_more(self, selector: str = "div.kr-loading-more-button", max_clicks: int = 10):
+        """
+        Click the load-more button.
+
+        Args:
+            selector: CSS selector for the load-more button
+            max_clicks: maximum number of clicks
+        """
+        if not self._driver:
+            self._create_driver()
+
+        click_count = 0
+
+        for i in range(max_clicks):
+            try:
+                # Scroll to the bottom of the page several times
+                for j in range(3):
+                    self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                    time.sleep(0.5)
+
+                # Click the load-more button (whether or not it is visible/clickable)
+                load_more_btn = self._driver.find_element(By.CSS_SELECTOR, selector)
+                load_more_btn.click()
+                click_count += 1
+                self.logger.debug(f"点击加载更多按钮 {click_count} 次")
+
+                # Wait for the new content to load
+                time.sleep(config.selenium.scroll_pause_time)
+
+            except Exception as e:
+                self.logger.debug(f"点击加载更多按钮失败: {e}")
+                break
+
+        self.logger.info(f"加载更多按钮共点击 {click_count} 次")
+
     def quit(self):
         """Quit the driver."""
         if self._driver:
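click_load_more scrolls to the bottom and clicks the button blindly, stopping on the first exception. A hedged alternative sketch (not the committed code) that instead waits for the button to become clickable using Selenium's explicit waits:

```python
# Hypothetical variant using WebDriverWait; illustrative, not the project's method.
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def click_load_more_waiting(driver, selector="div.kr-loading-more-button",
                            max_clicks=10, pause=1.2):
    clicks = 0
    for _ in range(max_clicks):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            # Wait until the button is actually clickable before clicking it.
            btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
        except TimeoutException:
            break  # button gone or never clickable: nothing more to load
        btn.click()
        clicks += 1
        time.sleep(pause)  # give the newly loaded content time to render
    return clicks
```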