feat: increase the number of screen scrolls for the kr36 crawler
parent 3ce7683a42
commit 543ce5ec0a
@@ -34,9 +34,9 @@ selenium:
   headless: true
   log_level: 3
   window_size: "1920,1080"
-  page_load_timeout: 30
+  page_load_timeout: 60
   script_timeout: 30
-  implicit_wait: 10
+  implicit_wait: 20
   scroll_pause_time: 1.2
   max_scroll_times: 10
@@ -110,6 +110,11 @@ sources:
       category_id: 9
       name: "AI"
       css_selector: "div.kr-information-left"
+    health:
+      url: "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
+      category_id: 8
+      name: "健康"
+      css_selector: "div.kr-search-result-list"

   sina:
     base_url: "https://sina.com.cn"
@@ -55,7 +55,7 @@ class SeleniumConfig:
    page_load_timeout: int = 30
    script_timeout: int = 30
    implicit_wait: int = 10
-   scroll_pause_time: float = 1.2
+   scroll_pause_time: float = 5
    max_scroll_times: int = 10
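The `selenium:` block from the first hunk maps one-to-one onto these `SeleniumConfig` fields. Below is a minimal loading sketch; the project's real config loader is not part of this diff, so the file name `config.yaml`, the PyYAML dependency, and the defaults of the fields not visible in the hunk are assumptions.

```python
# Hypothetical loader sketch -- the project's real config module is not shown in this commit.
from dataclasses import dataclass, fields
import yaml  # assumes PyYAML is installed


@dataclass
class SeleniumConfig:
    headless: bool = True           # assumed default, taken from the YAML above
    log_level: int = 3              # assumed default
    window_size: str = "1920,1080"  # assumed default
    page_load_timeout: int = 30
    script_timeout: int = 30
    implicit_wait: int = 10
    scroll_pause_time: float = 5
    max_scroll_times: int = 10


def load_selenium_config(path: str = "config.yaml") -> SeleniumConfig:
    """Read the selenium: block and keep only the keys the dataclass defines."""
    with open(path, "r", encoding="utf-8") as f:
        raw = yaml.safe_load(f) or {}
    block = raw.get("selenium", {}) or {}
    known = {f.name for f in fields(SeleniumConfig)}
    return SeleniumConfig(**{k: v for k, v in block.items() if k in known})
```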
@@ -0,0 +1,33 @@

This is the exploratory code for crawling health-related news from 36kr.
```python
import requests
from bs4 import BeautifulSoup
import re

URL = "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7"
TARGET_URL = "https://www.36kr.com"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}
resp = requests.get(URL, headers=headers, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"
# print(resp.text)
with open("example/example-11.html", "r", encoding="utf-8") as f:
    html = f.read()

# soup = BeautifulSoup(resp.text, "lxml")
soup = BeautifulSoup(html, "lxml")
li_list = soup.select("div.kr-layout div.kr-layout-main div.kr-layout-content div.kr-search-result-list ul.kr-search-result-list-main > li")

for item in li_list:
    a = item.select_one("div.kr-shadow-content a")
    href = TARGET_URL + a.get("href")
    print(href)

```
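Note that the 36Kr search page is rendered client-side, so the `requests.get` call above generally does not return the result list; that is why the script parses the saved `example/example-11.html` instead. A minimal sketch of fetching the rendered HTML with bare Selenium (assuming a local Chrome/chromedriver setup; the project's own `SeleniumDriver` wrapper is the production path) could look like this:

```python
# Sketch only: fetch the JavaScript-rendered search page with bare Selenium, then parse it.
import time

from bs4 import BeautifulSoup
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")      # same flag the project toggles via its config
driver = webdriver.Chrome(options=options)  # assumes Chrome + chromedriver are installed
try:
    driver.get("https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7")
    time.sleep(2)  # crude wait for the list to render, mirroring the crawler code
    soup = BeautifulSoup(driver.page_source, "lxml")
    for li in soup.select("ul.kr-search-result-list-main > li"):
        a = li.select_one("div.kr-shadow-content a")
        if a and a.get("href"):
            print("https://www.36kr.com" + a.get("href"))
finally:
    driver.quit()
```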
@@ -1,82 +0,0 @@

This is the exploratory code for crawling auto-related news from Sina.
```python
import requests
from bs4 import BeautifulSoup


URL = "https://auto.sina.com.cn/"

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}
resp = requests.get(URL, headers=headers, timeout=10)
# resp.raise_for_status()
# resp.encoding = "utf-8"
# print(resp.text)
with open("example/example-10.html", "r", encoding="utf-8") as f:
    html = f.read()

# soup = BeautifulSoup(resp.text, "lxml")
soup = BeautifulSoup(html, "lxml")
div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1")

for item in div_list:
    a = item.select_one("div.ty-card-l a")
    href = a.get("href")
    # print(a.get('href'), a.get_text().strip())

    resp = requests.get(url=href, headers=headers)
    resp.encoding = resp.apparent_encoding  # requests will try to guess the encoding
    soup = BeautifulSoup(resp.text, "lxml")
    # Article title
    article_title_tag = soup.select_one("div.main-content h1.main-title")
    if article_title_tag:
        article_title = article_title_tag.get_text(strip=True)
        if not article_title:
            article_title = "未知标题"
    else:
        article_title = "未知标题"
    # print("标题:", article_title)
    # Article publish time
    from datetime import datetime

    # Date/time normalization helper
    def normalize_time(time_str):
        for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
            try:
                dt = datetime.strptime(time_str, fmt)
                return dt.strftime("%Y-%m-%d %H:%M:%S")
            except:
                continue
        return time_str  # if no format matches, return the original string

    time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
    if time_tag:  # only normalize when the time tag exists
        publish_time = normalize_time(time_tag.get_text(strip=True))
    else:
        publish_time = "1949-01-01 12:00:00"
    # print(publish_time)

    # Article author
    author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
    if author_tag:
        author = author_tag.get_text(strip=True)
    else:
        author = "未知"
    # print(author)
    # Article body paragraphs
    article_div = soup.select_one("div.main-content div.article")  # main article container
    if not article_div:
        # print("不是文章详情页,跳过")
        continue  # skip pages that are not article detail pages
    paragraphs = article_div.find_all('p')
    article_text = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
    # print("正文:\n", article_text)


```
@@ -30,6 +30,7 @@ CRAWLER_CLASSES = {
    },
    'kr36': {
        'ai': ('crawlers.kr36.ai', 'AICrawler'),
+       'health': ('crawlers.kr36.health', 'HealthCrawler'),
    },
    'sina': {
        'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
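The registry maps a `(source, category)` pair to a `(module path, class name)` tuple. The factory that consumes it is not part of this diff; a plausible resolution sketch using only `importlib` (the helper name is hypothetical) would be:

```python
# Hypothetical sketch of resolving a CRAWLER_CLASSES entry at runtime.
import importlib


def resolve_crawler(source: str, category: str):
    """Lazily import the module and return the crawler class for (source, category)."""
    module_path, class_name = CRAWLER_CLASSES[source][category]
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


# e.g. resolve_crawler('kr36', 'health') -> crawlers.kr36.health.HealthCrawler
```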
@@ -18,6 +18,29 @@ class AICrawler(DynamicCrawler):

    ARTICLE_BASE_URL = "https://www.36kr.com/p/"

+    def _fetch_page(self) -> str:
+        """Fetch the page HTML by clicking the 'load more' button."""
+        if not self._driver:
+            from utils.selenium_driver import SeleniumDriver
+            self._driver = SeleniumDriver()
+
+        try:
+            # Load the page
+            self._driver._driver.get(self.url)
+
+            # Wait for the page to render
+            import time
+            time.sleep(2)
+
+            # Click the 'load more' button several times
+            self._driver.click_load_more("div.kr-loading-more-button", max_clicks=10)
+
+            return self._driver._driver.page_source
+
+        except Exception as e:
+            self.logger.error(f"获取页面源码失败: {self.url} - {e}")
+            raise
+
    def _extract_article_urls(self, html: str) -> List[str]:
        """Extract article URLs from the HTML"""
        soup = BeautifulSoup(html, "lxml")
@@ -0,0 +1,87 @@
"""
36Kr health news crawler
"""

from typing import List
from bs4 import BeautifulSoup

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import DynamicCrawler, Article
from parsers.kr36_parser import Kr36Parser


class HealthCrawler(DynamicCrawler):
    """36Kr health news crawler"""

    ARTICLE_BASE_URL = "https://www.36kr.com"

    def _fetch_page(self) -> str:
        """Fetch the page HTML by clicking the 'load more' button."""
        if not self._driver:
            from utils.selenium_driver import SeleniumDriver
            self._driver = SeleniumDriver()

        try:
            # Load the page
            self._driver._driver.get(self.url)

            # Wait for the page to render
            import time
            time.sleep(2)

            # Click the 'load more' button several times
            self._driver.click_load_more("div.kr-loading-more-button", max_clicks=10)

            return self._driver._driver.page_source

        except Exception as e:
            self.logger.error(f"获取页面源码失败: {self.url} - {e}")
            raise

    def _extract_article_urls(self, html: str) -> List[str]:
        """Extract article URLs from the HTML"""
        soup = BeautifulSoup(html, "lxml")
        urls = []

        li_list = soup.select(
            "div.kr-layout div.kr-layout-main div.kr-layout-content "
            "div.kr-search-result-list ul.kr-search-result-list-main > li"
        )

        for item in li_list:
            a = item.select_one("div.kr-shadow-content a")
            if a:
                href = a.get("href")
                if href:
                    # Build the full article URL
                    full_url = self.ARTICLE_BASE_URL + href
                    urls.append(full_url)

        return urls

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """Crawl article details"""
        articles = []
        parser = Kr36Parser()

        for i, url in enumerate(urls[:self.max_articles]):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "36kr"

                if not article.author:
                    article.author = "36氪"

                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")

            except Exception as e:
                self.logger.error(f"解析文章失败: {url} - {e}")
                continue

        return articles
@@ -39,8 +39,8 @@ class SeleniumDriver:
        options.add_argument(f"--log-level={config.selenium.log_level}")

        # Headless mode
-       if config.selenium.headless:
-           options.add_argument("--headless=new")
+       # if config.selenium.headless:
+       #     options.add_argument("--headless=new")

        # Window size
        if config.selenium.window_size:
@@ -128,6 +128,41 @@ class SeleniumDriver:

        self.logger.debug(f"滚动完成,最终高度: {last_height}")

+    def click_load_more(self, selector: str = "div.kr-loading-more-button", max_clicks: int = 10):
+        """
+        Click the 'load more' button repeatedly.
+
+        Args:
+            selector: CSS selector for the 'load more' button
+            max_clicks: maximum number of clicks
+        """
+        if not self._driver:
+            self._create_driver()
+
+        click_count = 0
+
+        for i in range(max_clicks):
+            try:
+                # Scroll to the bottom of the page several times
+                for j in range(3):
+                    self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                    time.sleep(0.5)
+
+                # Click the 'load more' button (regardless of whether it is visible or clickable)
+                load_more_btn = self._driver.find_element(By.CSS_SELECTOR, selector)
+                load_more_btn.click()
+                click_count += 1
+                self.logger.debug(f"点击加载更多按钮 {click_count} 次")
+
+                # Wait for the new content to load
+                time.sleep(config.selenium.scroll_pause_time)
+
+            except Exception as e:
+                self.logger.debug(f"点击加载更多按钮失败: {e}")
+                break
+
+        self.logger.info(f"加载更多按钮共点击 {click_count} 次")
+
    def quit(self):
        """Quit the driver"""
        if self._driver:
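`click_load_more` above clicks the button even when it is not reported as clickable and stops on the first failure. If stricter behaviour were ever wanted, Selenium's explicit waits could be used instead; the following is a standalone sketch, not part of this commit:

```python
# Optional variant (not in this commit): only click once the button is actually clickable.
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def click_load_more_strict(driver, selector="div.kr-loading-more-button",
                           max_clicks=10, timeout=5):
    """Click the 'load more' button only while it keeps becoming clickable."""
    clicks = 0
    for _ in range(max_clicks):
        try:
            btn = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, selector))
            )
            btn.click()
            clicks += 1
        except Exception:
            break  # button gone or never became clickable: stop loading more
    return clicks
```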