feat: increase the number of screen scrolls for the kr36 crawler
parent 3ce7683a42
commit 543ce5ec0a
@@ -34,9 +34,9 @@ selenium:
   headless: true
   log_level: 3
   window_size: "1920,1080"
-  page_load_timeout: 30
+  page_load_timeout: 60
   script_timeout: 30
-  implicit_wait: 10
+  implicit_wait: 20
   scroll_pause_time: 1.2
   max_scroll_times: 10
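The two raised values are driver timeouts: page_load_timeout caps how long a get() may take, and implicit_wait caps how long element lookups poll before failing. A minimal sketch of how these settings are typically applied to a Selenium Chrome driver (the project's SeleniumDriver wrapper is assumed to do the equivalent; its setup code is not part of this hunk):

```python
# Illustrative only: how the timeouts above map onto standard Selenium calls.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--window-size=1920,1080")

driver = webdriver.Chrome(options=options)
driver.set_page_load_timeout(60)  # page_load_timeout: abort get() after 60 s
driver.set_script_timeout(30)     # script_timeout: cap execute_async_script()
driver.implicitly_wait(20)        # implicit_wait: poll up to 20 s in find_element()
```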
@@ -110,6 +110,11 @@ sources:
       category_id: 9
       name: "AI"
       css_selector: "div.kr-information-left"
+    health:
+      url: "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
+      category_id: 8
+      name: "健康"
+      css_selector: "div.kr-search-result-list"
 
   sina:
     base_url: "https://sina.com.cn"
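The url added for the health source is the 36kr article-search page with its query URL-encoded; %E5%81%A5%E5%BA%B7 decodes to 健康 ("health"). A quick standard-library check (illustrative, not part of the commit):

```python
from urllib.parse import quote, unquote

assert unquote("%E5%81%A5%E5%BA%B7") == "健康"
assert quote("健康") == "%E5%81%A5%E5%BA%B7"
print(unquote("https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"))
# -> https://www.36kr.com/search/articles/健康
```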
@@ -55,7 +55,7 @@ class SeleniumConfig:
     page_load_timeout: int = 30
     script_timeout: int = 30
     implicit_wait: int = 10
-    scroll_pause_time: float = 1.2
+    scroll_pause_time: float = 5
     max_scroll_times: int = 10
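Note that the dataclass default for scroll_pause_time rises to 5 while the YAML above still sets 1.2; presumably the default only matters when the key is absent from the config file. A minimal sketch of how a selenium: block like the one above might be mapped onto this dataclass, assuming PyYAML and that the field names mirror the YAML keys (the project's real loader is not shown in this diff):

```python
# Hypothetical loader sketch; the field list mirrors the hunks above.
from dataclasses import dataclass
import yaml

@dataclass
class SeleniumConfig:
    headless: bool = True              # assumed default, taken from the YAML hunk
    log_level: int = 3                 # assumed default, taken from the YAML hunk
    window_size: str = "1920,1080"     # assumed default, taken from the YAML hunk
    page_load_timeout: int = 30
    script_timeout: int = 30
    implicit_wait: int = 10
    scroll_pause_time: float = 5
    max_scroll_times: int = 10

with open("config.yaml", "r", encoding="utf-8") as f:
    raw = yaml.safe_load(f)

# YAML values override the dataclass defaults when present.
selenium_cfg = SeleniumConfig(**raw.get("selenium", {}))
```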
@@ -0,0 +1,33 @@
+This is the 36kr code for crawling health-related news.
+```python
+import requests
+from bs4 import BeautifulSoup
+import re
+
+URL = "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
+TARGET_URL = "https://www.36kr.com"
+headers = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/120.0.0.0 Safari/537.36"
+    )
+}
+resp = requests.get(URL, headers=headers, timeout=10)
+resp.raise_for_status()
+resp.encoding = "utf-8"
+# print(resp.text)
+with open("example/example-11.html", "r", encoding="utf-8") as f:
+    html = f.read()
+
+# soup = BeautifulSoup(resp.text, "lxml")
+soup = BeautifulSoup(html, "lxml")
+li_list = soup.select("div.kr-layout div.kr-layout-main div.kr-layout-content div.kr-search-result-list ul.kr-search-result-list-main > li")
+
+for item in li_list:
+    a = item.select_one("div.kr-shadow-content a")
+    href = TARGET_URL + a.get("href")
+    print(href)
+
+```
@@ -1,82 +0,0 @@
-This is the Sina code for crawling auto-related news.
-```python
-import requests
-from bs4 import BeautifulSoup
-
-
-URL = "https://auto.sina.com.cn/"
-
-headers = {
-    "User-Agent": (
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-        "AppleWebKit/537.36 (KHTML, like Gecko) "
-        "Chrome/120.0.0.0 Safari/537.36"
-    )
-}
-resp = requests.get(URL, headers=headers, timeout=10)
-# resp.raise_for_status()
-# resp.encoding = "utf-8"
-# print(resp.text)
-with open("example/example-10.html", "r", encoding="utf-8") as f:
-    html = f.read()
-
-# soup = BeautifulSoup(resp.text, "lxml")
-soup = BeautifulSoup(html, "lxml")
-div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1")
-
-for item in div_list:
-    a = item.select_one("div.ty-card-l a")
-    href = a.get("href")
-    # print(a.get('href'), a.get_text().strip())
-
-    resp = requests.get(url=href, headers=headers)
-    resp.encoding = resp.apparent_encoding  # requests will try to guess the encoding
-    soup = BeautifulSoup(resp.text, "lxml")
-    # Get the article title
-    article_title_tag = soup.select_one("div.main-content h1.main-title")
-    if article_title_tag:
-        article_title = article_title_tag.get_text(strip=True)
-        if not article_title:
-            article_title = "未知标题"
-    else:
-        article_title = "未知标题"
-    # print("标题:", article_title)
-    # Get the article publish time
-    from datetime import datetime
-
-    # Date/time normalization helper
-    def normalize_time(time_str):
-        for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
-            try:
-                dt = datetime.strptime(time_str, fmt)
-                return dt.strftime("%Y-%m-%d %H:%M:%S")
-            except:
-                continue
-        return time_str  # return the original string if no format matches
-
-    time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
-    if time_tag:  # only normalize when the date tag exists
-        publish_time = normalize_time(time_tag.get_text(strip=True))
-    else:
-        publish_time = "1949-01-01 12:00:00"
-    # print(publish_time)
-
-    # Get the article author
-    author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
-    if author_tag:
-        author = author_tag.get_text(strip=True)
-    else:
-        author = "未知"
-    # print(author)
-    # Get the article body paragraphs
-    article_div = soup.select_one("div.main-content div.article")  # main article container
-    if not article_div:
-        # print("不是文章详情页,跳过")
-        continue  # skip pages that are not article detail pages
-    paragraphs = article_div.find_all('p')
-    article_text = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
-    # print("正文:\n", article_text)
-
-
-```
@@ -30,6 +30,7 @@ CRAWLER_CLASSES = {
     },
     'kr36': {
         'ai': ('crawlers.kr36.ai', 'AICrawler'),
+        'health': ('crawlers.kr36.health', 'HealthCrawler'),
     },
     'sina': {
         'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
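Each registry entry is a (module path, class name) pair rather than a class object, so crawlers can be imported lazily. A sketch of how such an entry might be resolved, assuming an importlib-based factory (the project's real factory code is not part of this diff):

```python
# Hypothetical resolver for CRAWLER_CLASSES entries; illustrative only.
import importlib

def get_crawler_class(source: str, category: str):
    module_path, class_name = CRAWLER_CLASSES[source][category]
    module = importlib.import_module(module_path)  # e.g. crawlers.kr36.health
    return getattr(module, class_name)             # e.g. HealthCrawler

# crawler_cls = get_crawler_class('kr36', 'health')
```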
@@ -18,6 +18,29 @@ class AICrawler(DynamicCrawler):
     ARTICLE_BASE_URL = "https://www.36kr.com/p/"
 
+    def _fetch_page(self) -> str:
+        """Fetch the page HTML by clicking the load-more button."""
+        if not self._driver:
+            from utils.selenium_driver import SeleniumDriver
+            self._driver = SeleniumDriver()
+
+        try:
+            # Load the page
+            self._driver._driver.get(self.url)
+
+            # Wait for the page to load
+            import time
+            time.sleep(2)
+
+            # Click the load-more button several times
+            self._driver.click_load_more("div.kr-loading-more-button", max_clicks=10)
+
+            return self._driver._driver.page_source
+
+        except Exception as e:
+            self.logger.error(f"获取页面源码失败: {self.url} - {e}")
+            raise
+
     def _extract_article_urls(self, html: str) -> List[str]:
         """Extract the list of article URLs from the HTML."""
         soup = BeautifulSoup(html, "lxml")
@@ -0,0 +1,87 @@
+"""
+36Kr health news crawler
+"""
+
+from typing import List
+from bs4 import BeautifulSoup
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from base.crawler_base import DynamicCrawler, Article
+from parsers.kr36_parser import Kr36Parser
+
+
+class HealthCrawler(DynamicCrawler):
+    """36Kr health news crawler"""
+
+    ARTICLE_BASE_URL = "https://www.36kr.com"
+
+    def _fetch_page(self) -> str:
+        """Fetch the page HTML by clicking the load-more button."""
+        if not self._driver:
+            from utils.selenium_driver import SeleniumDriver
+            self._driver = SeleniumDriver()
+
+        try:
+            # Load the page
+            self._driver._driver.get(self.url)
+
+            # Wait for the page to load
+            import time
+            time.sleep(2)
+
+            # Click the load-more button several times
+            self._driver.click_load_more("div.kr-loading-more-button", max_clicks=10)
+
+            return self._driver._driver.page_source
+
+        except Exception as e:
+            self.logger.error(f"获取页面源码失败: {self.url} - {e}")
+            raise
+
+    def _extract_article_urls(self, html: str) -> List[str]:
+        """Extract the list of article URLs from the HTML."""
+        soup = BeautifulSoup(html, "lxml")
+        urls = []
+
+        li_list = soup.select(
+            "div.kr-layout div.kr-layout-main div.kr-layout-content "
+            "div.kr-search-result-list ul.kr-search-result-list-main > li"
+        )
+
+        for item in li_list:
+            a = item.select_one("div.kr-shadow-content a")
+            if a:
+                href = a.get("href")
+                if href:
+                    # Build the full article URL
+                    full_url = self.ARTICLE_BASE_URL + href
+                    urls.append(full_url)
+
+        return urls
+
+    def _fetch_articles(self, urls: List[str]) -> List[Article]:
+        """Crawl the article details."""
+        articles = []
+        parser = Kr36Parser()
+
+        for i, url in enumerate(urls[:self.max_articles]):
+            try:
+                article = parser.parse(url)
+                article.category_id = self.category_id
+                article.source = "36kr"
+
+                if not article.author:
+                    article.author = "36氪"
+
+                if article.is_valid():
+                    articles.append(article)
+                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
+
+            except Exception as e:
+                self.logger.error(f"解析文章失败: {url} - {e}")
+                continue
+
+        return articles
@@ -39,8 +39,8 @@ class SeleniumDriver:
         options.add_argument(f"--log-level={config.selenium.log_level}")
 
         # Headless mode
-        if config.selenium.headless:
-            options.add_argument("--headless=new")
+        # if config.selenium.headless:
+        # options.add_argument("--headless=new")
 
         # Window size
         if config.selenium.window_size:
@@ -128,6 +128,41 @@ class SeleniumDriver:
 
         self.logger.debug(f"滚动完成,最终高度: {last_height}")
 
+    def click_load_more(self, selector: str = "div.kr-loading-more-button", max_clicks: int = 10):
+        """
+        Click the load-more button.
+
+        Args:
+            selector: CSS selector for the load-more button
+            max_clicks: maximum number of clicks
+        """
+        if not self._driver:
+            self._create_driver()
+
+        click_count = 0
+
+        for i in range(max_clicks):
+            try:
+                # Scroll to the bottom of the page several times
+                for j in range(3):
+                    self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                    time.sleep(0.5)
+
+                # Click the load-more button (whether or not it is visible/clickable)
+                load_more_btn = self._driver.find_element(By.CSS_SELECTOR, selector)
+                load_more_btn.click()
+                click_count += 1
+                self.logger.debug(f"点击加载更多按钮 {click_count} 次")
+
+                # Wait for the new content to load
+                time.sleep(config.selenium.scroll_pause_time)
+
+            except Exception as e:
+                self.logger.debug(f"点击加载更多按钮失败: {e}")
+                break
+
+        self.logger.info(f"加载更多按钮共点击 {click_count} 次")
+
     def quit(self):
         """Quit the driver."""
         if self._driver:
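click_load_more scrolls to the bottom and clicks the button blindly, stopping on the first exception. A hedged alternative sketch (not the committed code) that instead waits for the button to become clickable using Selenium's explicit waits:

```python
# Hypothetical variant using WebDriverWait; illustrative, not the project's method.
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def click_load_more_waiting(driver, selector="div.kr-loading-more-button",
                            max_clicks=10, pause=1.2):
    clicks = 0
    for _ in range(max_clicks):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            # Wait until the button is actually clickable before clicking it.
            btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
        except TimeoutException:
            break  # button gone or never clickable: nothing more to load
        btn.click()
        clicks += 1
        time.sleep(pause)  # give the newly loaded content time to render
    return clicks
```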