feat: add Tencent health, tech, and real-estate category crawlers

shenjianZ 2026-01-15 17:26:16 +08:00
parent 144c9e082f
commit 08c9950db5
8 changed files with 843 additions and 14 deletions

(New binary image file, 106 KiB; content not shown.)

View File

@ -1,5 +1,5 @@
<script setup lang="ts">
import { ref, onMounted } from 'vue'
import { ref, onMounted, nextTick } from 'vue'
import { categoryApi } from '@/api'
import type { CategoryDto, CategoryCreateDto } from '@/types/api'
import CategoryCard from '@/components/CategoryCard.vue'
@ -46,9 +46,20 @@ async function createCategory() {
try {
const data: CategoryCreateDto = { name: newCategoryName.value.trim() }
// Wait for the API response to complete
const newCategory = await categoryApi.create(data)
categories.value.push(newCategory)
newsStore.addCategory(newCategory)
// Make sure the API returned complete data
if (newCategory && newCategory.id) {
// Check whether the category already exists to avoid adding it twice
const exists = categories.value.some(cat => cat.id === newCategory.id)
if (!exists) {
categories.value.push(newCategory)
newsStore.addCategory(newCategory)
}
}
newCategoryName.value = ''
showCreateDialog.value = false
} catch (err: any) {
@ -80,12 +91,22 @@ async function deleteCategory(id: number) {
if (!category) return
// Check whether the category has news items
let confirmMessage = ''
if (category.newsCount && category.newsCount > 0) {
if (!confirm(`该分类下有 ${category.newsCount} 条新闻,确定要删除吗?`)) {
confirmMessage = `该分类下有 ${category.newsCount} 条新闻,确定要删除吗?`
} else {
confirmMessage = `确定要删除分类"${category.name}"吗?`
}
// Use the Tauri dialog if available
if (window.__TAURI__?.dialog) {
const result = await window.__TAURI__.dialog.ask(confirmMessage)
if (!result) {
return
}
} else {
if (!confirm(`确定要删除分类"${category.name}"吗?`)) {
// Fall back to the native confirm
if (!confirm(confirmMessage)) {
return
}
}
@ -100,10 +121,20 @@ async function deleteCategory(id: number) {
}
}
// Ref to the category name input
const categoryNameInput = ref<HTMLInputElement | null>(null)
// Open the create-category dialog
function openCreateDialog() {
newCategoryName.value = ''
showCreateDialog.value = true
// Focus the input after the DOM has updated
setTimeout(() => {
if (categoryNameInput.value) {
categoryNameInput.value.focus()
}
}, 100)
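// Note: nextTick is imported above, so `await nextTick()` followed by
// `categoryNameInput.value?.focus()` would also work; the 100 ms timeout just
// waits a little longer for the dialog to render.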
}
//
@ -195,9 +226,11 @@ onMounted(() => {
id="category-name"
v-model="newCategoryName"
type="text"
autocomplete="off"
placeholder="请输入分类名称"
class="form-input"
@keyup.enter="createCategory"
ref="categoryNameInput"
/>
</div>
</div>
@ -366,17 +399,17 @@ onMounted(() => {
display: flex;
align-items: center;
justify-content: center;
z-index: 100;
z-index: 2000;
padding: 1rem;
}
.dialog {
width: 100%;
max-width: 400px;
background: hsl(var(--popover));
border: 1px solid hsl(var(--border));
border-radius: calc(var(--radius) + 4px);
box-shadow: 0 25px 50px -12px rgb(0 0 0 / 0.25);
background: var(--popover);
border: 1px solid var(--border);
border-radius: var(--radius);
box-shadow: 0 10px 40px -10px rgb(0 0 0 / 0.2);
}
.dialog-header {
@ -432,12 +465,17 @@ onMounted(() => {
color: hsl(var(--foreground));
border: 1px solid hsl(var(--border));
border-radius: var(--radius);
transition: border-color 0.2s, box-shadow 0.2s;
}
.form-input:focus {
outline: none;
border-color: hsl(var(--ring));
box-shadow: 0 0 0 3px hsl(var(--ring) / 0.2);
border-color: #3b82f6; /* use blue instead of the ring variable */
box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.2);
border-width: 1px;
border-style: solid;
/* add a light blue shadow effect */
box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.2), 0 0 0 1px #3b82f6;
}
.dialog-footer {

View File

@ -0,0 +1,140 @@
# Summary of Issues Resolved
## 1. Modal overlay stacking issue
**Problem:**
On the category management page, the modal opened when creating a category did not cover the background content and was rendered underneath other elements.
**Solution:**
Raised the modal's z-index so that it renders above other elements:
```css
/* dialog overlay */
.dialog-overlay {
position: fixed;
top: 0;
left: 0;
right: 0;
bottom: 0;
background: rgb(0 0 0 / 0.5);
display: flex;
align-items: center;
justify-content: center;
z-index: 2000; /* raised z-index */
padding: 1rem;
}
```
**Root cause:**
- The header in MainLayout.vue uses `z-index: 1000`
- reka-ui's DialogOverlay component uses `z-50` (z-index: 50)
- Setting the modal's z-index to 2000 keeps it on top of both
## 2. Duplicate category shown after creation
**Problem:**
After creating a category, the category list showed a duplicate entry; the list was correct again after a refresh.
**Solution:**
Updated the `createCategory` function to add a duplicate check:
```typescript
// Create a category
async function createCategory() {
if (!newCategoryName.value.trim()) {
return
}
creating.value = true
error.value = null
try {
const data: CategoryCreateDto = { name: newCategoryName.value.trim() }
// Wait for the API response to complete
const newCategory = await categoryApi.create(data)
// Make sure the API returned complete data
if (newCategory && newCategory.id) {
// Check whether the category already exists to avoid adding it twice
const exists = categories.value.some(cat => cat.id === newCategory.id)
if (!exists) {
categories.value.push(newCategory)
newsStore.addCategory(newCategory)
}
}
newCategoryName.value = ''
showCreateDialog.value = false
} catch (err: any) {
error.value = err.message || '创建分类失败'
} finally {
creating.value = false
}
}
```
**Root cause:**
- The original code never checked whether the category already existed, so it could be added twice
- The `exists` check ensures a category with the same id is not pushed again
- Frontend state is now updated only after the API has responded with valid data
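If the same guard is needed elsewhere, the id check can be pulled into a small helper. A minimal sketch; the `upsertById` name is hypothetical and not part of the codebase:
```typescript
// Push an item into a list only if no element with the same id is present yet.
// Returns true when the item was actually added.
function upsertById<T extends { id: number }>(list: T[], item: T): boolean {
  if (list.some(existing => existing.id === item.id)) {
    return false
  }
  list.push(item)
  return true
}

// Possible usage inside createCategory:
// if (newCategory?.id && upsertById(categories.value, newCategory)) {
//   newsStore.addCategory(newCategory)
// }
```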
## 3. Confirm dialog permission issue under Tauri
**Problem:**
Deleting a category inside the Tauri environment threw "Uncaught (in promise) dialog.confirm not allowed".
**Solution:**
Changed the category deletion logic to use Tauri's dialog API:
```typescript
// Delete a category
async function deleteCategory(id: number) {
const category = categories.value.find(c => c.id === id)
if (!category) return
// Check whether the category has news items
let confirmMessage = ''
if (category.newsCount && category.newsCount > 0) {
confirmMessage = `该分类下有 ${category.newsCount} 条新闻,确定要删除吗?`
} else {
confirmMessage = `确定要删除分类"${category.name}"吗?`
}
// Use the Tauri dialog if available
if (window.__TAURI__?.dialog) {
const result = await window.__TAURI__.dialog.ask(confirmMessage)
if (!result) {
return
}
} else {
// Fall back to the native confirm
if (!confirm(confirmMessage)) {
return
}
}
try {
await categoryApi.delete(id)
categories.value = categories.value.filter(c => c.id !== id)
newsStore.removeCategory(id)
} catch (err: any) {
error.value = err.message || '删除分类失败'
await fetchCategories()
}
}
```
**Root cause:**
- A Tauri app must be granted explicit permission before the native `confirm` dialog can be used
- Tauri's `dialog.ask()` replaces the native `confirm`
- A fallback keeps the flow working outside of Tauri as well
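Since both branches may be needed in other views, the check can be wrapped once. A minimal sketch based on the code above; the inline `window.__TAURI__` typing and the `askConfirm` name are illustrative, not taken from the project:
```typescript
// Minimal typing for the injected global Tauri API (only the part used here).
declare global {
  interface Window {
    __TAURI__?: { dialog?: { ask: (message: string) => Promise<boolean> } }
  }
}

// Ask for confirmation through the Tauri dialog when it is available,
// otherwise fall back to the browser's native confirm().
export async function askConfirm(message: string): Promise<boolean> {
  if (window.__TAURI__?.dialog) {
    return await window.__TAURI__.dialog.ask(message)
  }
  return confirm(message)
}
```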
## Summary
These changes resolve the following issues:
1. The modal stacking-order issue
2. The category data synchronization issue
3. The dialog permission issue under Tauri
Together they keep the category management feature stable and correct.

View File

@ -132,10 +132,10 @@ sources:
css_selector: "a[href]"
tencent:
base_url: "https://new.qq.com"
base_url: "https://news.qq.com/"
categories:
auto:
url: "https://new.qq.com/auto"
url: "https://news.qq.com/ch/auto"
category_id: 6
name: "汽车"
css_selector: ""
@ -149,3 +149,18 @@ sources:
category_id: 5
name: "军事(网页版)"
css_selector: "div[id='channel-feed-area']"
health:
url: "https://news.qq.com/ch/health"
category_id: 8
name: "健康"
css_selector: ""
house:
url: "https://news.qq.com/ch/house/"
category_id: 10
name: "房产"
css_selector: ""
tech:
url: "https://news.qq.com/ch/tech"
category_id: 4
name: "科技"
css_selector: ""

View File

@ -40,6 +40,9 @@ CRAWLER_CLASSES = {
'auto': ('crawlers.tencent.auto', 'AutoCrawler'),
'war': ('crawlers.tencent.war', 'WarCrawler'),
'war_web': ('crawlers.tencent.war_web', 'WarWebCrawler'),
'health': ('crawlers.tencent.health', 'HealthCrawler'),
'house': ('crawlers.tencent.house', 'HouseCrawler'),
'tech': ('crawlers.tencent.tech', 'TechCrawler'),
},
}
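# Each entry maps a category key to a (module path, class name) pair; the crawler
# runner presumably resolves these lazily, e.g. via importlib, before instantiating.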

View File

@ -0,0 +1,211 @@
"""
腾讯健康新闻爬虫API版
使用腾讯新闻 API 接口获取数据性能更好
"""
import time
import random
import hashlib
from typing import List
import requests
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from base.crawler_base import StaticCrawler, Article
from parsers.tencent_parser import TencentParser
class HealthCrawler(StaticCrawler):
"""腾讯健康新闻爬虫API版"""
def __init__(self, source: str, category: str):
super().__init__(source, category)
# Tencent API configuration
self.api_url = "https://i.news.qq.com/web_feed/getPCList"
self.channel_id = "news_news_antip" # health channel
self.seen_ids = set()
self.item_count = 20 # a fixed 20 items per page
def _generate_trace_id(self):
"""生成trace_id"""
random_str = str(random.random()) + str(time.time())
return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12]
def crawl(self) -> List[Article]:
"""
执行爬取任务重写基类方法以支持API接口
Returns:
文章列表
"""
self.logger.info(f"开始爬取腾讯{self.category_name}新闻")
try:
# Generate a device id
device_id = self._generate_trace_id()
# Fetch the article URL list
article_urls = self._fetch_article_urls_from_api(device_id)
self.logger.info(f"找到 {len(article_urls)} 篇文章")
# Crawl the article details
articles = self._fetch_articles(article_urls)
self.logger.info(f"成功爬取 {len(articles)} 篇文章")
return articles
except Exception as e:
self.logger.error(f"爬取失败: {e}", exc_info=True)
return []
finally:
self._cleanup()
def _fetch_article_urls_from_api(self, device_id: str) -> List[str]:
"""
从API获取文章URL列表
Args:
device_id: 设备ID
Returns:
文章URL列表
"""
urls = []
# Work out how many feed pages to fetch based on max_articles
# 20 items per page, rounded up
import math
max_pages = math.ceil(self.max_articles / self.item_count)
self.logger.info(f"根据 max_articles={self.max_articles},计算需要抓取 {max_pages}")
for flush_num in range(max_pages):
payload = {
"base_req": {"from": "pc"},
"forward": "1",
"qimei36": device_id,
"device_id": device_id,
"flush_num": flush_num + 1,
"channel_id": self.channel_id,
"item_count": self.item_count,
"is_local_chlid": "0"
}
try:
headers = {
"User-Agent": self.http_client.session.headers.get("User-Agent"),
"Referer": "https://new.qq.com/",
"Origin": "https://new.qq.com",
"Content-Type": "application/json"
}
response = requests.post(
self.api_url,
headers=headers,
json=payload,
timeout=10
)
if response.status_code == 200:
data = response.json()
if data.get("code") == 0 and "data" in data:
news_list = data["data"]
if not news_list:
self.logger.info("没有更多数据了")
break
# Extract URLs
for item in news_list:
news_id = item.get("id")
# De-duplicate by article id
if news_id in self.seen_ids:
continue
self.seen_ids.add(news_id)
# Skip video news (articletype == "4")
article_type = item.get("articletype")
if article_type == "4":
continue
# Extract the article URL
url = item.get("link_info", {}).get("url")
if url:
urls.append(url)
# Stop early once enough articles have been collected
if len(urls) >= self.max_articles:
self.logger.info(f"已获取 {len(urls)} 篇文章,达到目标数量,停止抓取")
break
# Break out of the outer paging loop as well
if len(urls) >= self.max_articles:
break
else:
self.logger.warning(f"接口返回错误: {data.get('message')}")
else:
self.logger.warning(f"HTTP请求失败: {response.status_code}")
except Exception as e:
self.logger.error(f"获取API数据失败: {e}")
# Pause between requests to avoid hitting the API too fast
time.sleep(random.uniform(1, 2))
return urls
def _fetch_page(self) -> str:
"""
获取页面HTML腾讯爬虫不使用此方法
Returns:
空字符串
"""
return ""
def _extract_article_urls(self, html: str) -> List[str]:
"""
从HTML中提取文章URL列表腾讯爬虫不使用此方法
Args:
html: 页面HTML内容
Returns:
空列表
"""
return []
def _fetch_articles(self, urls: List[str]) -> List[Article]:
"""
爬取文章详情
Args:
urls: 文章URL列表
Returns:
文章列表
"""
articles = []
parser = TencentParser()
for i, url in enumerate(urls[:self.max_articles]):
try:
article = parser.parse(url)
article.category_id = self.category_id
article.source = "腾讯"
if not article.author:
article.author = "腾讯健康"
if article.is_valid():
articles.append(article)
self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
except Exception as e:
self.logger.error(f"解析文章失败: {url} - {e}")
continue
return articles

View File

@ -0,0 +1,211 @@
"""
腾讯房产新闻爬虫API版
使用腾讯新闻 API 接口获取数据性能更好
"""
import time
import random
import hashlib
from typing import List
import requests
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from base.crawler_base import StaticCrawler, Article
from parsers.tencent_parser import TencentParser
class HouseCrawler(StaticCrawler):
"""腾讯房产新闻爬虫API版"""
def __init__(self, source: str, category: str):
super().__init__(source, category)
# Tencent API configuration
self.api_url = "https://i.news.qq.com/web_feed/getPCList"
self.channel_id = "news_news_house" # real-estate channel
self.seen_ids = set()
self.item_count = 20 # a fixed 20 items per page
def _generate_trace_id(self):
"""生成trace_id"""
random_str = str(random.random()) + str(time.time())
return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12]
def crawl(self) -> List[Article]:
"""
执行爬取任务重写基类方法以支持API接口
Returns:
文章列表
"""
self.logger.info(f"开始爬取腾讯{self.category_name}新闻")
try:
# Generate a device id
device_id = self._generate_trace_id()
# Fetch the article URL list
article_urls = self._fetch_article_urls_from_api(device_id)
self.logger.info(f"找到 {len(article_urls)} 篇文章")
# Crawl the article details
articles = self._fetch_articles(article_urls)
self.logger.info(f"成功爬取 {len(articles)} 篇文章")
return articles
except Exception as e:
self.logger.error(f"爬取失败: {e}", exc_info=True)
return []
finally:
self._cleanup()
def _fetch_article_urls_from_api(self, device_id: str) -> List[str]:
"""
从API获取文章URL列表
Args:
device_id: 设备ID
Returns:
文章URL列表
"""
urls = []
# Work out how many feed pages to fetch based on max_articles
# 20 items per page, rounded up
import math
max_pages = math.ceil(self.max_articles / self.item_count)
self.logger.info(f"根据 max_articles={self.max_articles},计算需要抓取 {max_pages}")
for flush_num in range(max_pages):
payload = {
"base_req": {"from": "pc"},
"forward": "1",
"qimei36": device_id,
"device_id": device_id,
"flush_num": flush_num + 1,
"channel_id": self.channel_id,
"item_count": self.item_count,
"is_local_chlid": "0"
}
try:
headers = {
"User-Agent": self.http_client.session.headers.get("User-Agent"),
"Referer": "https://new.qq.com/",
"Origin": "https://new.qq.com",
"Content-Type": "application/json"
}
response = requests.post(
self.api_url,
headers=headers,
json=payload,
timeout=10
)
if response.status_code == 200:
data = response.json()
if data.get("code") == 0 and "data" in data:
news_list = data["data"]
if not news_list:
self.logger.info("没有更多数据了")
break
# Extract URLs
for item in news_list:
news_id = item.get("id")
# De-duplicate by article id
if news_id in self.seen_ids:
continue
self.seen_ids.add(news_id)
# Skip video news (articletype == "4")
article_type = item.get("articletype")
if article_type == "4":
continue
# Extract the article URL
url = item.get("link_info", {}).get("url")
if url:
urls.append(url)
# Stop early once enough articles have been collected
if len(urls) >= self.max_articles:
self.logger.info(f"已获取 {len(urls)} 篇文章,达到目标数量,停止抓取")
break
# Break out of the outer paging loop as well
if len(urls) >= self.max_articles:
break
else:
self.logger.warning(f"接口返回错误: {data.get('message')}")
else:
self.logger.warning(f"HTTP请求失败: {response.status_code}")
except Exception as e:
self.logger.error(f"获取API数据失败: {e}")
# Pause between requests to avoid hitting the API too fast
time.sleep(random.uniform(1, 2))
return urls
def _fetch_page(self) -> str:
"""
获取页面HTML腾讯爬虫不使用此方法
Returns:
空字符串
"""
return ""
def _extract_article_urls(self, html: str) -> List[str]:
"""
从HTML中提取文章URL列表腾讯爬虫不使用此方法
Args:
html: 页面HTML内容
Returns:
空列表
"""
return []
def _fetch_articles(self, urls: List[str]) -> List[Article]:
"""
爬取文章详情
Args:
urls: 文章URL列表
Returns:
文章列表
"""
articles = []
parser = TencentParser()
for i, url in enumerate(urls[:self.max_articles]):
try:
article = parser.parse(url)
article.category_id = self.category_id
article.source = "腾讯"
if not article.author:
article.author = "腾讯房产"
if article.is_valid():
articles.append(article)
self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
except Exception as e:
self.logger.error(f"解析文章失败: {url} - {e}")
continue
return articles

View File

@ -0,0 +1,211 @@
"""
腾讯科技新闻爬虫API版
使用腾讯新闻 API 接口获取数据性能更好
"""
import time
import random
import hashlib
from typing import List
import requests
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from base.crawler_base import StaticCrawler, Article
from parsers.tencent_parser import TencentParser
class TechCrawler(StaticCrawler):
"""腾讯科技新闻爬虫API版"""
def __init__(self, source: str, category: str):
super().__init__(source, category)
# Tencent API configuration
self.api_url = "https://i.news.qq.com/web_feed/getPCList"
self.channel_id = "news_news_tech" # tech channel
self.seen_ids = set()
self.item_count = 20 # a fixed 20 items per page
def _generate_trace_id(self):
"""生成trace_id"""
random_str = str(random.random()) + str(time.time())
return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12]
def crawl(self) -> List[Article]:
"""
执行爬取任务重写基类方法以支持API接口
Returns:
文章列表
"""
self.logger.info(f"开始爬取腾讯{self.category_name}新闻")
try:
# Generate a device id
device_id = self._generate_trace_id()
# Fetch the article URL list
article_urls = self._fetch_article_urls_from_api(device_id)
self.logger.info(f"找到 {len(article_urls)} 篇文章")
# Crawl the article details
articles = self._fetch_articles(article_urls)
self.logger.info(f"成功爬取 {len(articles)} 篇文章")
return articles
except Exception as e:
self.logger.error(f"爬取失败: {e}", exc_info=True)
return []
finally:
self._cleanup()
def _fetch_article_urls_from_api(self, device_id: str) -> List[str]:
"""
从API获取文章URL列表
Args:
device_id: 设备ID
Returns:
文章URL列表
"""
urls = []
# Work out how many feed pages to fetch based on max_articles
# 20 items per page, rounded up
import math
max_pages = math.ceil(self.max_articles / self.item_count)
self.logger.info(f"根据 max_articles={self.max_articles},计算需要抓取 {max_pages}")
for flush_num in range(max_pages):
payload = {
"base_req": {"from": "pc"},
"forward": "1",
"qimei36": device_id,
"device_id": device_id,
"flush_num": flush_num + 1,
"channel_id": self.channel_id,
"item_count": self.item_count,
"is_local_chlid": "0"
}
try:
headers = {
"User-Agent": self.http_client.session.headers.get("User-Agent"),
"Referer": "https://new.qq.com/",
"Origin": "https://new.qq.com",
"Content-Type": "application/json"
}
response = requests.post(
self.api_url,
headers=headers,
json=payload,
timeout=10
)
if response.status_code == 200:
data = response.json()
if data.get("code") == 0 and "data" in data:
news_list = data["data"]
if not news_list:
self.logger.info("没有更多数据了")
break
# Extract URLs
for item in news_list:
news_id = item.get("id")
# De-duplicate by article id
if news_id in self.seen_ids:
continue
self.seen_ids.add(news_id)
# Skip video news (articletype == "4")
article_type = item.get("articletype")
if article_type == "4":
continue
# Extract the article URL
url = item.get("link_info", {}).get("url")
if url:
urls.append(url)
# Stop early once enough articles have been collected
if len(urls) >= self.max_articles:
self.logger.info(f"已获取 {len(urls)} 篇文章,达到目标数量,停止抓取")
break
# Break out of the outer paging loop as well
if len(urls) >= self.max_articles:
break
else:
self.logger.warning(f"接口返回错误: {data.get('message')}")
else:
self.logger.warning(f"HTTP请求失败: {response.status_code}")
except Exception as e:
self.logger.error(f"获取API数据失败: {e}")
# Pause between requests to avoid hitting the API too fast
time.sleep(random.uniform(1, 2))
return urls
def _fetch_page(self) -> str:
"""
获取页面HTML腾讯爬虫不使用此方法
Returns:
空字符串
"""
return ""
def _extract_article_urls(self, html: str) -> List[str]:
"""
从HTML中提取文章URL列表腾讯爬虫不使用此方法
Args:
html: 页面HTML内容
Returns:
空列表
"""
return []
def _fetch_articles(self, urls: List[str]) -> List[Article]:
"""
爬取文章详情
Args:
urls: 文章URL列表
Returns:
文章列表
"""
articles = []
parser = TencentParser()
for i, url in enumerate(urls[:self.max_articles]):
try:
article = parser.parse(url)
article.category_id = self.category_id
article.source = "腾讯"
if not article.author:
article.author = "腾讯科技"
if article.is_valid():
articles.append(article)
self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
except Exception as e:
self.logger.error(f"解析文章失败: {url} - {e}")
continue
return articles