feat: add Tencent health, tech, and real-estate category crawlers

parent 144c9e082f
commit 08c9950db5

Binary file not shown.
After Width: | Height: | Size: 106 KiB
@@ -1,5 +1,5 @@
 <script setup lang="ts">
-import { ref, onMounted } from 'vue'
+import { ref, onMounted, nextTick } from 'vue'
 import { categoryApi } from '@/api'
 import type { CategoryDto, CategoryCreateDto } from '@/types/api'
 import CategoryCard from '@/components/CategoryCard.vue'
@@ -46,9 +46,20 @@ async function createCategory() {
 
   try {
     const data: CategoryCreateDto = { name: newCategoryName.value.trim() }
+
+    // Wait for the API response to complete
     const newCategory = await categoryApi.create(data)
-    categories.value.push(newCategory)
-    newsStore.addCategory(newCategory)
+
+    // Make sure the API returned complete data
+    if (newCategory && newCategory.id) {
+      // Check whether the category already exists to avoid adding it twice
+      const exists = categories.value.some(cat => cat.id === newCategory.id)
+      if (!exists) {
+        categories.value.push(newCategory)
+        newsStore.addCategory(newCategory)
+      }
+    }
+
     newCategoryName.value = ''
     showCreateDialog.value = false
   } catch (err: any) {
@@ -80,12 +91,22 @@ async function deleteCategory(id: number) {
   if (!category) return
 
+  // Check whether the category still has news items
+  let confirmMessage = ''
   if (category.newsCount && category.newsCount > 0) {
-    if (!confirm(`该分类下有 ${category.newsCount} 条新闻,确定要删除吗?`)) {
-      return
-    }
+    confirmMessage = `该分类下有 ${category.newsCount} 条新闻,确定要删除吗?`
   } else {
-    if (!confirm(`确定要删除分类"${category.name}"吗?`)) {
-      return
-    }
+    confirmMessage = `确定要删除分类"${category.name}"吗?`
   }
+
+  // Use the Tauri dialog when available
+  if (window.__TAURI__?.dialog) {
+    const result = await window.__TAURI__.dialog.ask(confirmMessage)
+    if (!result) {
+      return
+    }
+  } else {
+    // Fall back to the native confirm
+    if (!confirm(confirmMessage)) {
+      return
+    }
+  }
@@ -100,10 +121,20 @@ async function deleteCategory(id: number) {
   }
 }
 
+// Input element ref
+const categoryNameInput = ref<HTMLInputElement | null>(null)
+
 // Open the create dialog
 function openCreateDialog() {
   newCategoryName.value = ''
   showCreateDialog.value = true
+
+  // Delay focusing until the DOM has updated
+  setTimeout(() => {
+    if (categoryNameInput.value) {
+      categoryNameInput.value.focus()
+    }
+  }, 100)
 }
 
 // Close the create dialog
@@ -195,9 +226,11 @@ onMounted(() => {
             id="category-name"
             v-model="newCategoryName"
             type="text"
+            autocomplete="off"
             placeholder="请输入分类名称"
             class="form-input"
             @keyup.enter="createCategory"
+            ref="categoryNameInput"
           />
         </div>
       </div>
@@ -366,17 +399,17 @@ onMounted(() => {
   display: flex;
   align-items: center;
   justify-content: center;
-  z-index: 100;
+  z-index: 2000;
   padding: 1rem;
 }
 
 .dialog {
   width: 100%;
   max-width: 400px;
-  background: hsl(var(--popover));
-  border: 1px solid hsl(var(--border));
-  border-radius: calc(var(--radius) + 4px);
-  box-shadow: 0 25px 50px -12px rgb(0 0 0 / 0.25);
+  background: var(--popover);
+  border: 1px solid var(--border);
+  border-radius: var(--radius);
+  box-shadow: 0 10px 40px -10px rgb(0 0 0 / 0.2);
 }
 
 .dialog-header {
@@ -432,12 +465,17 @@ onMounted(() => {
   color: hsl(var(--foreground));
   border: 1px solid hsl(var(--border));
   border-radius: var(--radius);
+  transition: border-color 0.2s, box-shadow 0.2s;
 }
 
 .form-input:focus {
   outline: none;
-  border-color: hsl(var(--ring));
-  box-shadow: 0 0 0 3px hsl(var(--ring) / 0.2);
+  border-color: #3b82f6; /* use a fixed blue instead of the ring variable */
+  box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.2);
+  border-width: 1px;
+  border-style: solid;
+  /* add a light blue glow */
+  box-shadow: 0 0 0 3px rgba(59, 130, 246, 0.2), 0 0 0 1px #3b82f6;
 }
 
 .dialog-footer {
@@ -0,0 +1,140 @@
# Problem-Solving Summary

## 1. Modal overlay stacking issue

**Problem:**
In the category management view, the modal shown when creating a category did not cover the background content; it was rendered underneath background elements.

**Fix:**
Raised the modal's z-index so it stacks above everything else:

```css
/* Dialog overlay */
.dialog-overlay {
  position: fixed;
  top: 0;
  left: 0;
  right: 0;
  bottom: 0;
  background: rgb(0 0 0 / 0.5);
  display: flex;
  align-items: center;
  justify-content: center;
  z-index: 2000; /* raised z-index */
  padding: 1rem;
}
```

**Root cause:**
- The header in MainLayout.vue uses `z-index: 1000`
- reka-ui's DialogOverlay component uses `z-50` (z-index: 50)
- Setting the modal's z-index to 2000 keeps it on top of both

## 2. Duplicate categories after creation

**Problem:**
After creating a category, the category list showed the new category twice; a refresh made it display correctly again.

**Fix:**
Reworked the `createCategory` function to add a duplicate check:

```typescript
// Create a category
async function createCategory() {
  if (!newCategoryName.value.trim()) {
    return
  }

  creating.value = true
  error.value = null

  try {
    const data: CategoryCreateDto = { name: newCategoryName.value.trim() }

    // Wait for the API response to complete
    const newCategory = await categoryApi.create(data)

    // Make sure the API returned complete data
    if (newCategory && newCategory.id) {
      // Check whether the category already exists to avoid adding it twice
      const exists = categories.value.some(cat => cat.id === newCategory.id)
      if (!exists) {
        categories.value.push(newCategory)
        newsStore.addCategory(newCategory)
      }
    }

    newCategoryName.value = ''
    showCreateDialog.value = false
  } catch (err: any) {
    error.value = err.message || '创建分类失败'
  } finally {
    creating.value = false
  }
}
```

**Root cause:**
- The original code never checked whether the category was already in the list, so it was added twice
- The `exists` check ensures a category with the same id is only added once
- The frontend state is only updated after the API has fully responded with valid data

## 3. Confirm-dialog permission error under Tauri

**Problem:**
In the Tauri build, deleting a category raised "Uncaught (in promise) dialog.confirm not allowed".

**Fix:**
Reworked the delete logic to use Tauri's dialog API:

```typescript
// Delete a category
async function deleteCategory(id: number) {
  const category = categories.value.find(c => c.id === id)
  if (!category) return

  // Check whether the category still has news items
  let confirmMessage = ''
  if (category.newsCount && category.newsCount > 0) {
    confirmMessage = `该分类下有 ${category.newsCount} 条新闻,确定要删除吗?`
  } else {
    confirmMessage = `确定要删除分类"${category.name}"吗?`
  }

  // Use the Tauri dialog when available
  if (window.__TAURI__?.dialog) {
    const result = await window.__TAURI__.dialog.ask(confirmMessage)
    if (!result) {
      return
    }
  } else {
    // Fall back to the native confirm
    if (!confirm(confirmMessage)) {
      return
    }
  }

  try {
    await categoryApi.delete(id)
    categories.value = categories.value.filter(c => c.id !== id)
    newsStore.removeCategory(id)
  } catch (err: any) {
    error.value = err.message || '删除分类失败'
    await fetchCategories()
  }
}
```

**Root cause:**
- A Tauri app must be granted the dialog permission explicitly before the native `confirm` is allowed
- Tauri's `dialog.ask()` is used instead of the native `confirm`
- A fallback keeps the code working outside of Tauri

## Summary

These changes resolve:
1. The modal stacking (z-index) issue
2. The category state-sync (duplicate entry) issue
3. The dialog permission issue under Tauri

Together they keep category management stable and correct.
@@ -132,10 +132,10 @@ sources:
         css_selector: "a[href]"
 
   tencent:
-    base_url: "https://new.qq.com"
+    base_url: "https://news.qq.com/"
     categories:
       auto:
-        url: "https://new.qq.com/auto"
+        url: "https://news.qq.com/ch/auto"
         category_id: 6
         name: "汽车"
         css_selector: ""
@@ -149,3 +149,18 @@ sources:
         category_id: 5
         name: "军事(网页版)"
         css_selector: "div[id='channel-feed-area']"
+      health:
+        url: "https://news.qq.com/ch/health"
+        category_id: 8
+        name: "健康"
+        css_selector: ""
+      house:
+        url: "https://news.qq.com/ch/house/"
+        category_id: 10
+        name: "房产"
+        css_selector: ""
+      tech:
+        url: "https://news.qq.com/ch/tech"
+        category_id: 4
+        name: "科技"
+        css_selector: ""
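For orientation, each new entry carries the same fields as the existing ones: a channel `url`, a numeric `category_id`, a display `name`, and an (empty) `css_selector`. A minimal sketch of reading one of these entries with PyYAML; the file name `config.yaml` and the direct load are assumptions, since the project presumably goes through its own config module:

```python
import yaml  # PyYAML

# Hypothetical direct load of the sources config shown above.
with open("config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

health_cfg = config["sources"]["tencent"]["categories"]["health"]
print(health_cfg["url"])          # https://news.qq.com/ch/health
print(health_cfg["category_id"])  # 8
print(health_cfg["name"])         # 健康
```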
@@ -40,6 +40,9 @@ CRAWLER_CLASSES = {
         'auto': ('crawlers.tencent.auto', 'AutoCrawler'),
         'war': ('crawlers.tencent.war', 'WarCrawler'),
         'war_web': ('crawlers.tencent.war_web', 'WarWebCrawler'),
+        'health': ('crawlers.tencent.health', 'HealthCrawler'),
+        'house': ('crawlers.tencent.house', 'HouseCrawler'),
+        'tech': ('crawlers.tencent.tech', 'TechCrawler'),
     },
 }
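The registry maps each category key to a `(module path, class name)` pair. A minimal sketch of how such a registry is typically resolved at runtime with `importlib`; the `load_crawler` helper is hypothetical, and the real project may wire this up differently:

```python
import importlib

def load_crawler(registry: dict, source: str, name: str):
    """Hypothetical helper: instantiate a crawler from a CRAWLER_CLASSES-style registry."""
    module_path, class_name = registry[source][name]  # e.g. ('crawlers.tencent.health', 'HealthCrawler')
    module = importlib.import_module(module_path)     # imports crawlers.tencent.health
    crawler_cls = getattr(module, class_name)         # -> HealthCrawler
    return crawler_cls(source, name)                  # crawlers take (source, category)

# e.g. load_crawler(CRAWLER_CLASSES, 'tencent', 'health').crawl()
```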
@@ -0,0 +1,211 @@
"""
Tencent health news crawler (API version).
Fetches data through the Tencent News API, which performs better than page scraping.
"""

import time
import random
import hashlib
from typing import List
import requests

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import StaticCrawler, Article
from parsers.tencent_parser import TencentParser


class HealthCrawler(StaticCrawler):
    """Tencent health news crawler (API version)."""

    def __init__(self, source: str, category: str):
        super().__init__(source, category)

        # Tencent API configuration
        self.api_url = "https://i.news.qq.com/web_feed/getPCList"
        self.channel_id = "news_news_antip"  # health channel
        self.seen_ids = set()
        self.item_count = 20  # fixed 20 items per page

    def _generate_trace_id(self):
        """Generate a trace_id."""
        random_str = str(random.random()) + str(time.time())
        return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12]

    def crawl(self) -> List[Article]:
        """
        Run the crawl (overrides the base class method to use the API endpoint).

        Returns:
            List of articles.
        """
        self.logger.info(f"开始爬取腾讯{self.category_name}新闻")

        try:
            # Generate a device ID
            device_id = self._generate_trace_id()

            # Fetch the article URL list
            article_urls = self._fetch_article_urls_from_api(device_id)
            self.logger.info(f"找到 {len(article_urls)} 篇文章")

            # Fetch article details
            articles = self._fetch_articles(article_urls)

            self.logger.info(f"成功爬取 {len(articles)} 篇文章")
            return articles

        except Exception as e:
            self.logger.error(f"爬取失败: {e}", exc_info=True)
            return []
        finally:
            self._cleanup()

    def _fetch_article_urls_from_api(self, device_id: str) -> List[str]:
        """
        Fetch the list of article URLs from the API.

        Args:
            device_id: Device ID.

        Returns:
            List of article URLs.
        """
        urls = []

        # Work out how many pages to fetch from max_articles:
        # 20 items per page, rounded up.
        import math
        max_pages = math.ceil(self.max_articles / self.item_count)
        self.logger.info(f"根据 max_articles={self.max_articles},计算需要抓取 {max_pages} 页")

        for flush_num in range(max_pages):
            payload = {
                "base_req": {"from": "pc"},
                "forward": "1",
                "qimei36": device_id,
                "device_id": device_id,
                "flush_num": flush_num + 1,
                "channel_id": self.channel_id,
                "item_count": self.item_count,
                "is_local_chlid": "0"
            }

            try:
                headers = {
                    "User-Agent": self.http_client.session.headers.get("User-Agent"),
                    "Referer": "https://new.qq.com/",
                    "Origin": "https://new.qq.com",
                    "Content-Type": "application/json"
                }

                response = requests.post(
                    self.api_url,
                    headers=headers,
                    json=payload,
                    timeout=10
                )

                if response.status_code == 200:
                    data = response.json()
                    if data.get("code") == 0 and "data" in data:
                        news_list = data["data"]
                        if not news_list:
                            self.logger.info("没有更多数据了")
                            break

                        # Extract URLs
                        for item in news_list:
                            news_id = item.get("id")

                            # Skip duplicates
                            if news_id in self.seen_ids:
                                continue
                            self.seen_ids.add(news_id)

                            # Skip video news (articletype == "4")
                            article_type = item.get("articletype")
                            if article_type == "4":
                                continue

                            # Extract the URL
                            url = item.get("link_info", {}).get("url")
                            if url:
                                urls.append(url)

                            # Stop early once enough articles have been collected
                            if len(urls) >= self.max_articles:
                                self.logger.info(f"已获取 {len(urls)} 篇文章,达到目标数量,停止抓取")
                                break

                        # Also break out of the page loop if the target has been reached
                        if len(urls) >= self.max_articles:
                            break

                    else:
                        self.logger.warning(f"接口返回错误: {data.get('message')}")
                else:
                    self.logger.warning(f"HTTP请求失败: {response.status_code}")

            except Exception as e:
                self.logger.error(f"获取API数据失败: {e}")

            # Throttle to avoid hitting the API too fast
            time.sleep(random.uniform(1, 2))

        return urls

    def _fetch_page(self) -> str:
        """
        Fetch page HTML (not used by the Tencent crawlers).

        Returns:
            An empty string.
        """
        return ""

    def _extract_article_urls(self, html: str) -> List[str]:
        """
        Extract article URLs from HTML (not used by the Tencent crawlers).

        Args:
            html: Page HTML.

        Returns:
            An empty list.
        """
        return []

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """
        Fetch article details.

        Args:
            urls: List of article URLs.

        Returns:
            List of articles.
        """
        articles = []
        parser = TencentParser()

        for i, url in enumerate(urls[:self.max_articles]):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "腾讯"

                if not article.author:
                    article.author = "腾讯健康"

                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")

            except Exception as e:
                self.logger.error(f"解析文章失败: {url} - {e}")
                continue

        return articles
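A minimal sketch of running the new crawler on its own, assuming the "tencent"/"health" keys resolve against the YAML config above and that the base class can load its settings (max_articles, category_id, and so on) from that config; HouseCrawler and TechCrawler below are used the same way:

```python
from crawlers.tencent.health import HealthCrawler

# Hypothetical standalone run; in the project the crawler is normally
# instantiated through the CRAWLER_CLASSES registry instead.
crawler = HealthCrawler("tencent", "health")
articles = crawler.crawl()  # returns a list of Article objects
print(f"fetched {len(articles)} articles")
```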
@@ -0,0 +1,211 @@
"""
Tencent real-estate news crawler (API version).
Fetches data through the Tencent News API, which performs better than page scraping.
"""

import time
import random
import hashlib
from typing import List
import requests

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import StaticCrawler, Article
from parsers.tencent_parser import TencentParser


class HouseCrawler(StaticCrawler):
    """Tencent real-estate news crawler (API version)."""

    def __init__(self, source: str, category: str):
        super().__init__(source, category)

        # Tencent API configuration
        self.api_url = "https://i.news.qq.com/web_feed/getPCList"
        self.channel_id = "news_news_house"  # real-estate channel
        self.seen_ids = set()
        self.item_count = 20  # fixed 20 items per page

    def _generate_trace_id(self):
        """Generate a trace_id."""
        random_str = str(random.random()) + str(time.time())
        return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12]

    def crawl(self) -> List[Article]:
        """
        Run the crawl (overrides the base class method to use the API endpoint).

        Returns:
            List of articles.
        """
        self.logger.info(f"开始爬取腾讯{self.category_name}新闻")

        try:
            # Generate a device ID
            device_id = self._generate_trace_id()

            # Fetch the article URL list
            article_urls = self._fetch_article_urls_from_api(device_id)
            self.logger.info(f"找到 {len(article_urls)} 篇文章")

            # Fetch article details
            articles = self._fetch_articles(article_urls)

            self.logger.info(f"成功爬取 {len(articles)} 篇文章")
            return articles

        except Exception as e:
            self.logger.error(f"爬取失败: {e}", exc_info=True)
            return []
        finally:
            self._cleanup()

    def _fetch_article_urls_from_api(self, device_id: str) -> List[str]:
        """
        Fetch the list of article URLs from the API.

        Args:
            device_id: Device ID.

        Returns:
            List of article URLs.
        """
        urls = []

        # Work out how many pages to fetch from max_articles:
        # 20 items per page, rounded up.
        import math
        max_pages = math.ceil(self.max_articles / self.item_count)
        self.logger.info(f"根据 max_articles={self.max_articles},计算需要抓取 {max_pages} 页")

        for flush_num in range(max_pages):
            payload = {
                "base_req": {"from": "pc"},
                "forward": "1",
                "qimei36": device_id,
                "device_id": device_id,
                "flush_num": flush_num + 1,
                "channel_id": self.channel_id,
                "item_count": self.item_count,
                "is_local_chlid": "0"
            }

            try:
                headers = {
                    "User-Agent": self.http_client.session.headers.get("User-Agent"),
                    "Referer": "https://new.qq.com/",
                    "Origin": "https://new.qq.com",
                    "Content-Type": "application/json"
                }

                response = requests.post(
                    self.api_url,
                    headers=headers,
                    json=payload,
                    timeout=10
                )

                if response.status_code == 200:
                    data = response.json()
                    if data.get("code") == 0 and "data" in data:
                        news_list = data["data"]
                        if not news_list:
                            self.logger.info("没有更多数据了")
                            break

                        # Extract URLs
                        for item in news_list:
                            news_id = item.get("id")

                            # Skip duplicates
                            if news_id in self.seen_ids:
                                continue
                            self.seen_ids.add(news_id)

                            # Skip video news (articletype == "4")
                            article_type = item.get("articletype")
                            if article_type == "4":
                                continue

                            # Extract the URL
                            url = item.get("link_info", {}).get("url")
                            if url:
                                urls.append(url)

                            # Stop early once enough articles have been collected
                            if len(urls) >= self.max_articles:
                                self.logger.info(f"已获取 {len(urls)} 篇文章,达到目标数量,停止抓取")
                                break

                        # Also break out of the page loop if the target has been reached
                        if len(urls) >= self.max_articles:
                            break

                    else:
                        self.logger.warning(f"接口返回错误: {data.get('message')}")
                else:
                    self.logger.warning(f"HTTP请求失败: {response.status_code}")

            except Exception as e:
                self.logger.error(f"获取API数据失败: {e}")

            # Throttle to avoid hitting the API too fast
            time.sleep(random.uniform(1, 2))

        return urls

    def _fetch_page(self) -> str:
        """
        Fetch page HTML (not used by the Tencent crawlers).

        Returns:
            An empty string.
        """
        return ""

    def _extract_article_urls(self, html: str) -> List[str]:
        """
        Extract article URLs from HTML (not used by the Tencent crawlers).

        Args:
            html: Page HTML.

        Returns:
            An empty list.
        """
        return []

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """
        Fetch article details.

        Args:
            urls: List of article URLs.

        Returns:
            List of articles.
        """
        articles = []
        parser = TencentParser()

        for i, url in enumerate(urls[:self.max_articles]):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "腾讯"

                if not article.author:
                    article.author = "腾讯房产"

                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")

            except Exception as e:
                self.logger.error(f"解析文章失败: {url} - {e}")
                continue

        return articles
@@ -0,0 +1,211 @@
"""
Tencent tech news crawler (API version).
Fetches data through the Tencent News API, which performs better than page scraping.
"""

import time
import random
import hashlib
from typing import List
import requests

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import StaticCrawler, Article
from parsers.tencent_parser import TencentParser


class TechCrawler(StaticCrawler):
    """Tencent tech news crawler (API version)."""

    def __init__(self, source: str, category: str):
        super().__init__(source, category)

        # Tencent API configuration
        self.api_url = "https://i.news.qq.com/web_feed/getPCList"
        self.channel_id = "news_news_tech"  # tech channel
        self.seen_ids = set()
        self.item_count = 20  # fixed 20 items per page

    def _generate_trace_id(self):
        """Generate a trace_id."""
        random_str = str(random.random()) + str(time.time())
        return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12]

    def crawl(self) -> List[Article]:
        """
        Run the crawl (overrides the base class method to use the API endpoint).

        Returns:
            List of articles.
        """
        self.logger.info(f"开始爬取腾讯{self.category_name}新闻")

        try:
            # Generate a device ID
            device_id = self._generate_trace_id()

            # Fetch the article URL list
            article_urls = self._fetch_article_urls_from_api(device_id)
            self.logger.info(f"找到 {len(article_urls)} 篇文章")

            # Fetch article details
            articles = self._fetch_articles(article_urls)

            self.logger.info(f"成功爬取 {len(articles)} 篇文章")
            return articles

        except Exception as e:
            self.logger.error(f"爬取失败: {e}", exc_info=True)
            return []
        finally:
            self._cleanup()

    def _fetch_article_urls_from_api(self, device_id: str) -> List[str]:
        """
        Fetch the list of article URLs from the API.

        Args:
            device_id: Device ID.

        Returns:
            List of article URLs.
        """
        urls = []

        # Work out how many pages to fetch from max_articles:
        # 20 items per page, rounded up.
        import math
        max_pages = math.ceil(self.max_articles / self.item_count)
        self.logger.info(f"根据 max_articles={self.max_articles},计算需要抓取 {max_pages} 页")

        for flush_num in range(max_pages):
            payload = {
                "base_req": {"from": "pc"},
                "forward": "1",
                "qimei36": device_id,
                "device_id": device_id,
                "flush_num": flush_num + 1,
                "channel_id": self.channel_id,
                "item_count": self.item_count,
                "is_local_chlid": "0"
            }

            try:
                headers = {
                    "User-Agent": self.http_client.session.headers.get("User-Agent"),
                    "Referer": "https://new.qq.com/",
                    "Origin": "https://new.qq.com",
                    "Content-Type": "application/json"
                }

                response = requests.post(
                    self.api_url,
                    headers=headers,
                    json=payload,
                    timeout=10
                )

                if response.status_code == 200:
                    data = response.json()
                    if data.get("code") == 0 and "data" in data:
                        news_list = data["data"]
                        if not news_list:
                            self.logger.info("没有更多数据了")
                            break

                        # Extract URLs
                        for item in news_list:
                            news_id = item.get("id")

                            # Skip duplicates
                            if news_id in self.seen_ids:
                                continue
                            self.seen_ids.add(news_id)

                            # Skip video news (articletype == "4")
                            article_type = item.get("articletype")
                            if article_type == "4":
                                continue

                            # Extract the URL
                            url = item.get("link_info", {}).get("url")
                            if url:
                                urls.append(url)

                            # Stop early once enough articles have been collected
                            if len(urls) >= self.max_articles:
                                self.logger.info(f"已获取 {len(urls)} 篇文章,达到目标数量,停止抓取")
                                break

                        # Also break out of the page loop if the target has been reached
                        if len(urls) >= self.max_articles:
                            break

                    else:
                        self.logger.warning(f"接口返回错误: {data.get('message')}")
                else:
                    self.logger.warning(f"HTTP请求失败: {response.status_code}")

            except Exception as e:
                self.logger.error(f"获取API数据失败: {e}")

            # Throttle to avoid hitting the API too fast
            time.sleep(random.uniform(1, 2))

        return urls

    def _fetch_page(self) -> str:
        """
        Fetch page HTML (not used by the Tencent crawlers).

        Returns:
            An empty string.
        """
        return ""

    def _extract_article_urls(self, html: str) -> List[str]:
        """
        Extract article URLs from HTML (not used by the Tencent crawlers).

        Args:
            html: Page HTML.

        Returns:
            An empty list.
        """
        return []

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """
        Fetch article details.

        Args:
            urls: List of article URLs.

        Returns:
            List of articles.
        """
        articles = []
        parser = TencentParser()

        for i, url in enumerate(urls[:self.max_articles]):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "腾讯"

                if not article.author:
                    article.author = "腾讯科技"

                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")

            except Exception as e:
                self.logger.error(f"解析文章失败: {url} - {e}")
                continue

        return articles