From 543ce5ec0a3c3090d0824717b38c888284ceb14a Mon Sep 17 00:00:00 2001
From: shenjianZ
Date: Wed, 14 Jan 2026 20:52:17 +0800
Subject: [PATCH] feat: increase the number of screen scrolls for the kr36
 crawler
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 crawler-module/config/config.yaml           |  9 ++-
 crawler-module/config/settings.py           |  2 +-
 crawler-module/kr36-health.txt              | 33 ++++++++
 crawler-module/sina-auto.txt                | 82 ------------------
 crawler-module/src/cli/main.py              |  1 +
 crawler-module/src/crawlers/kr36/ai.py      | 23 ++++++
 crawler-module/src/crawlers/kr36/health.py  | 87 +++++++++++++++++++++
 crawler-module/src/utils/selenium_driver.py | 39 ++++++++-
 8 files changed, 189 insertions(+), 87 deletions(-)
 create mode 100644 crawler-module/kr36-health.txt
 delete mode 100644 crawler-module/sina-auto.txt
 create mode 100644 crawler-module/src/crawlers/kr36/health.py

diff --git a/crawler-module/config/config.yaml b/crawler-module/config/config.yaml
index ff9ca0f..b01eb71 100644
--- a/crawler-module/config/config.yaml
+++ b/crawler-module/config/config.yaml
@@ -34,9 +34,9 @@ selenium:
   headless: true
   log_level: 3
   window_size: "1920,1080"
-  page_load_timeout: 30
+  page_load_timeout: 60
   script_timeout: 30
-  implicit_wait: 10
+  implicit_wait: 20
   scroll_pause_time: 1.2
   max_scroll_times: 10
 
@@ -110,6 +110,11 @@ sources:
       category_id: 9
       name: "AI"
       css_selector: "div.kr-information-left"
+    health:
+      url: "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
+      category_id: 8
+      name: "健康"
+      css_selector: "div.kr-search-result-list"
 
   sina:
     base_url: "https://sina.com.cn"
diff --git a/crawler-module/config/settings.py b/crawler-module/config/settings.py
index aba6a05..1ebeff4 100644
--- a/crawler-module/config/settings.py
+++ b/crawler-module/config/settings.py
@@ -55,7 +55,7 @@ class SeleniumConfig:
     page_load_timeout: int = 30
     script_timeout: int = 30
     implicit_wait: int = 10
-    scroll_pause_time: float = 1.2
+    scroll_pause_time: float = 5.0
     max_scroll_times: int = 10
 
 
diff --git a/crawler-module/kr36-health.txt b/crawler-module/kr36-health.txt
new file mode 100644
index 0000000..dda0b1b
--- /dev/null
+++ b/crawler-module/kr36-health.txt
@@ -0,0 +1,33 @@
+
+This is the 36kr scratch script for crawling health-related news
+```python
+import requests
+from bs4 import BeautifulSoup
+import re
+
+URL = "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
+TARGET_URL = "https://www.36kr.com"
+headers = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/120.0.0.0 Safari/537.36"
+    )
+}
+resp = requests.get(URL, headers=headers, timeout=10)
+resp.raise_for_status()
+resp.encoding = "utf-8"
+# print(resp.text)
+with open("example/example-11.html", "r", encoding="utf-8") as f:
+    html = f.read()
+
+# soup = BeautifulSoup(resp.text, "lxml")
+soup = BeautifulSoup(html, "lxml")
+li_list = soup.select("div.kr-layout div.kr-layout-main div.kr-layout-content div.kr-search-result-list ul.kr-search-result-list-main > li")
+
+for item in li_list:
+    a = item.select_one("div.kr-shadow-content a")
+    href = TARGET_URL + a.get("href")
+    print(href)
+
+```
\ No newline at end of file
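A note on the scratch script above: it parses a saved copy (example/example-11.html) because the 36kr search page is rendered client-side, so a plain requests fetch often comes back without the result list, and it builds links by string concatenation, which breaks if the site ever emits absolute URLs. Below is a minimal standalone sketch, not part of the patch, of the same extraction using urljoin plus order-preserving de-duplication; the shortened selector is the stable tail of the one used above.

```python
# Standalone sketch (assumes the selectors from the scratch script above).
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.36kr.com"
SEARCH_URL = "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}


def extract_article_urls(html: str) -> list[str]:
    """Pull article links out of a 36kr search-result page."""
    soup = BeautifulSoup(html, "lxml")
    items = soup.select(
        "div.kr-search-result-list ul.kr-search-result-list-main > li"
    )
    seen, urls = set(), []
    for item in items:
        a = item.select_one("div.kr-shadow-content a")
        if a is None or not a.get("href"):
            continue
        # urljoin copes with relative paths and absolute URLs alike,
        # unlike TARGET_URL + href
        url = urljoin(BASE_URL, a["href"])
        if url not in seen:
            seen.add(url)
            urls.append(url)
    return urls


if __name__ == "__main__":
    resp = requests.get(SEARCH_URL, headers=HEADERS, timeout=10)
    resp.raise_for_status()
    for url in extract_article_urls(resp.text):
        print(url)
```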
"User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/120.0.0.0 Safari/537.36" - ) -} -resp = requests.get(URL,headers=headers,timeout=10) -# resp.raise_for_status() -# resp.encoding = "utf-8" -# print(resp.text) -with open("example/example-10.html","r",encoding="utf-8") as f: - html = f.read() - -# soup = BeautifulSoup(resp.text,"lxml") -soup = BeautifulSoup(html,"lxml") -div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1") - -for item in div_list: - a = item.select_one("div.ty-card-l a") - href = a.get("href") - # print(a.get('href'),a.get_text().strip()) - - resp = requests.get(url=href,headers=headers) - resp.encoding = resp.apparent_encoding # requests 会尝试猜测编码 - soup = BeautifulSoup(resp.text,"lxml") - # 获取文章标题 - article_title_tag = soup.select_one("div.main-content h1.main-title") - if article_title_tag: - article_title = article_title_tag.get_text(strip=True) - if not article_title: - article_title = "未知标题" - else: - article_title = "未知标题" - # print("标题:", article_title) - # 获取文章发布时间 - from datetime import datetime - - # 日期时间格式化函数 - def normalize_time(time_str): - for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"): - try: - dt = datetime.strptime(time_str, fmt) - return dt.strftime("%Y-%m-%d %H:%M:%S") - except: - continue - return time_str # 如果都不匹配,返回原字符串 - - time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date") - if time_tag: # 只有存在时间标签才进行格式化 - publish_time = normalize_time(time_tag.get_text(strip=True)) - else: - publish_time = "1949-01-01 12:00:00" - #print(publish_time) - - # 获取文章作者 - author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a") - if author_tag: - author = author_tag.get_text(strip=True) - else: - author = "未知" - # print(author) - # 获取文章正文段落 - article_div = soup.select_one("div.main-content div.article") # 核心文章容器 - if not article_div: - # print("不是文章详情页,跳过") - continue # 如果不是详情页就跳过 - paragraphs = article_div.find_all('p') - article_text = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)) - # print("正文:\n", article_text) - - -``` diff --git a/crawler-module/src/cli/main.py b/crawler-module/src/cli/main.py index 2df0e96..6879683 100644 --- a/crawler-module/src/cli/main.py +++ b/crawler-module/src/cli/main.py @@ -30,6 +30,7 @@ CRAWLER_CLASSES = { }, 'kr36': { 'ai': ('crawlers.kr36.ai', 'AICrawler'), + 'health': ('crawlers.kr36.health', 'HealthCrawler'), }, 'sina': { 'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'), diff --git a/crawler-module/src/crawlers/kr36/ai.py b/crawler-module/src/crawlers/kr36/ai.py index dc154a3..2f14e41 100644 --- a/crawler-module/src/crawlers/kr36/ai.py +++ b/crawler-module/src/crawlers/kr36/ai.py @@ -18,6 +18,29 @@ class AICrawler(DynamicCrawler): ARTICLE_BASE_URL = "https://www.36kr.com/p/" + def _fetch_page(self) -> str: + """获取页面HTML - 点击加载更多按钮""" + if not self._driver: + from utils.selenium_driver import SeleniumDriver + self._driver = SeleniumDriver() + + try: + # 获取页面 + self._driver._driver.get(self.url) + + # 等待页面加载 + import time + time.sleep(2) + + # 点击加载更多按钮多次 + self._driver.click_load_more("div.kr-loading-more-button", max_clicks=10) + + return self._driver._driver.page_source + + except Exception as e: + self.logger.error(f"获取页面源码失败: {self.url} - {e}") + raise + def _extract_article_urls(self, html: str) -> List[str]: """从HTML中提取文章URL列表""" soup = BeautifulSoup(html, "lxml") diff --git 
diff --git a/crawler-module/src/crawlers/kr36/health.py b/crawler-module/src/crawlers/kr36/health.py
new file mode 100644
index 0000000..df610d2
--- /dev/null
+++ b/crawler-module/src/crawlers/kr36/health.py
@@ -0,0 +1,87 @@
+"""
+36Kr health news crawler
+"""
+
+from typing import List
+from bs4 import BeautifulSoup
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from base.crawler_base import DynamicCrawler, Article
+from parsers.kr36_parser import Kr36Parser
+
+
+class HealthCrawler(DynamicCrawler):
+    """36Kr health news crawler"""
+
+    ARTICLE_BASE_URL = "https://www.36kr.com"
+
+    def _fetch_page(self) -> str:
+        """Fetch the page HTML, clicking the load-more button"""
+        if not self._driver:
+            from utils.selenium_driver import SeleniumDriver
+            self._driver = SeleniumDriver()
+
+        try:
+            # load the page
+            self._driver._driver.get(self.url)
+
+            # wait for the initial render
+            import time
+            time.sleep(2)
+
+            # click the load-more button several times
+            self._driver.click_load_more("div.kr-loading-more-button", max_clicks=10)
+
+            return self._driver._driver.page_source
+
+        except Exception as e:
+            self.logger.error(f"Failed to fetch page source: {self.url} - {e}")
+            raise
+
+    def _extract_article_urls(self, html: str) -> List[str]:
+        """Extract article URLs from the HTML"""
+        soup = BeautifulSoup(html, "lxml")
+        urls = []
+
+        li_list = soup.select(
+            "div.kr-layout div.kr-layout-main div.kr-layout-content "
+            "div.kr-search-result-list ul.kr-search-result-list-main > li"
+        )
+
+        for item in li_list:
+            a = item.select_one("div.kr-shadow-content a")
+            if a:
+                href = a.get("href")
+                if href:
+                    # build the full article URL
+                    full_url = self.ARTICLE_BASE_URL + href
+                    urls.append(full_url)
+
+        return urls
+
+    def _fetch_articles(self, urls: List[str]) -> List[Article]:
+        """Fetch article details"""
+        articles = []
+        parser = Kr36Parser()
+
+        for i, url in enumerate(urls[:self.max_articles]):
+            try:
+                article = parser.parse(url)
+                article.category_id = self.category_id
+                article.source = "36kr"
+
+                if not article.author:
+                    article.author = "36氪"
+
+                if article.is_valid():
+                    articles.append(article)
+                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
+
+            except Exception as e:
+                self.logger.error(f"Failed to parse article: {url} - {e}")
+                continue
+
+        return articles
\ No newline at end of file
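`HealthCrawler._fetch_page` is a verbatim copy of the one added to `AICrawler` above. Below is a hypothetical refactor sketch: hoist the method into the shared `DynamicCrawler` base and parameterize it with class attributes. `LOAD_MORE_SELECTOR` and `MAX_LOAD_MORE_CLICKS` are invented names, and the base-class shape beyond `self.url`/`self.logger`/`self._driver` is assumed, not taken from the patch.

```python
import logging
import time


class DynamicCrawler:
    """Assumed shape of the shared base; only what _fetch_page touches."""

    LOAD_MORE_SELECTOR = "div.kr-loading-more-button"  # hypothetical hook
    MAX_LOAD_MORE_CLICKS = 10                          # hypothetical hook

    def __init__(self, url: str):
        self.url = url
        self.logger = logging.getLogger(self.__class__.__name__)
        self._driver = None

    def _fetch_page(self) -> str:
        """Fetch the page HTML, clicking the load-more button."""
        if not self._driver:
            from utils.selenium_driver import SeleniumDriver
            self._driver = SeleniumDriver()
        try:
            self._driver._driver.get(self.url)
            time.sleep(2)  # initial render; see the explicit-wait sketch above
            self._driver.click_load_more(
                self.LOAD_MORE_SELECTOR, max_clicks=self.MAX_LOAD_MORE_CLICKS
            )
            return self._driver._driver.page_source
        except Exception as e:
            self.logger.error(f"Failed to fetch page source: {self.url} - {e}")
            raise
```

Both subclasses could then drop their copies and override the attributes only if a source ever uses a different button.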
diff --git a/crawler-module/src/utils/selenium_driver.py b/crawler-module/src/utils/selenium_driver.py
index 62d1ee6..7a8c06e 100644
--- a/crawler-module/src/utils/selenium_driver.py
+++ b/crawler-module/src/utils/selenium_driver.py
@@ -39,8 +39,8 @@ class SeleniumDriver:
         options.add_argument(f"--log-level={config.selenium.log_level}")
 
         # headless mode
-        if config.selenium.headless:
-            options.add_argument("--headless=new")
+        # if config.selenium.headless:
+        #     options.add_argument("--headless=new")
 
         # window size
         if config.selenium.window_size:
@@ -128,6 +128,41 @@
 
         self.logger.debug(f"Scrolling finished, final height: {last_height}")
 
+    def click_load_more(self, selector: str = "div.kr-loading-more-button", max_clicks: int = 10):
+        """
+        Click the load-more button
+
+        Args:
+            selector: CSS selector for the load-more button
+            max_clicks: maximum number of clicks
+        """
+        if not self._driver:
+            self._create_driver()
+
+        click_count = 0
+
+        for i in range(max_clicks):
+            try:
+                # scroll to the bottom of the page a few times
+                for j in range(3):
+                    self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                    time.sleep(0.5)
+
+                # try to click the load-more button; any failure here ends
+                # the loop via the except below
+                load_more_btn = self._driver.find_element(By.CSS_SELECTOR, selector)
+                load_more_btn.click()
+                click_count += 1
+                self.logger.debug(f"Clicked load-more button {click_count} time(s)")
+
+                # wait for new content to load
+                time.sleep(config.selenium.scroll_pause_time)
+
+            except Exception as e:
+                self.logger.debug(f"Failed to click load-more button: {e}")
+                break
+
+        self.logger.info(f"Load-more button clicked {click_count} time(s) in total")
+
     def quit(self):
         """Quit the driver"""
         if self._driver:
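The `click_load_more` above clicks blind, so a sticky banner over the button surfaces as a generic exception and ends the loop the same way as reaching the last page. Below is a standalone sketch, not part of the patch, of a more defensive variant against a raw WebDriver: it treats a missing button as the normal end of the feed and falls back to a JavaScript click when the element is present but not clickable.

```python
import time

from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By


def click_load_more(driver, selector: str = "div.kr-loading-more-button",
                    max_clicks: int = 10, pause: float = 1.2) -> int:
    """Click the load-more button up to max_clicks times; return the count."""
    clicks = 0
    for _ in range(max_clicks):
        try:
            btn = driver.find_element(By.CSS_SELECTOR, selector)
        except NoSuchElementException:
            break  # button gone: last page reached
        try:
            # center the button first so fixed headers/footers don't cover it
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", btn)
            btn.click()
        except WebDriverException:
            # present but intercepted or not interactable: click via JavaScript
            driver.execute_script("arguments[0].click();", btn)
        clicks += 1
        time.sleep(pause)  # let the next batch render
    return clicks
```

One interaction worth noting: with `implicit_wait` raised to 20 in config.yaml, the final `find_element` blocks for the full wait before concluding the button is gone, so the higher value also slows down the normal exit path.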