79 lines
2.2 KiB
Python
79 lines
2.2 KiB
Python
"""
|
|
腾讯军事新闻爬虫(网页版)
|
|
使用 Selenium 动态加载页面,适用于网页抓取
|
|
"""
|
|
|
|
from typing import List
|
|
from bs4 import BeautifulSoup
|
|
|
|
import sys
|
|
import os
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
|
|
|
from base.crawler_base import DynamicCrawler, Article
|
|
from parsers.tencent_parser import TencentParser
|
|
|
|
|
|
class WarWebCrawler(DynamicCrawler):
    """Tencent military news crawler (web edition).

    Relies on the Selenium-based ``DynamicCrawler`` base class to render the
    channel page, extracts article URLs from the rendered HTML, then fetches
    and parses each article with ``TencentParser``.
    """

    def _extract_article_urls(self, html: str) -> List[str]:
        """Extract article URLs from the rendered channel-page HTML.

        Args:
            html: Rendered HTML of the military channel feed page.

        Returns:
            List of absolute article URLs (site-relative hrefs are resolved
            against ``https://news.qq.com``).
        """
        soup = BeautifulSoup(html, "lxml")
        urls: List[str] = []

        # Select feed items in the military channel.
        # dt-params*='article_type=0' filters out video-news entries.
        div_list = soup.select(
            "div[id='channel-feed-area'] div.channel-feed-list "
            "div.channel-feed-item[dt-params*='article_type=0']"
        )

        for div in div_list:
            article_link = div.select_one("a.article-title")
            if article_link:
                href = article_link.get('href')
                if href:
                    # Resolve site-relative paths to absolute URLs.
                    if href.startswith('/'):
                        href = f"https://news.qq.com{href}"
                    urls.append(href)

        return urls

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """Fetch and parse article details for the given URLs.

        Only the first ``self.max_articles`` URLs are processed. Articles
        that fail to parse or fail validation are skipped (best-effort
        crawling: errors are logged, never raised).

        Args:
            urls: Candidate article URLs.

        Returns:
            List of successfully parsed, valid ``Article`` objects.
        """
        articles: List[Article] = []
        parser = TencentParser()

        # Cap the batch once so the progress log uses the real total.
        # (Previously the log printed len(urls), which overstated the
        # denominator whenever urls exceeded max_articles.)
        batch = urls[:self.max_articles]

        for i, url in enumerate(batch):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "腾讯"

                # Fall back to a default byline when the page has none.
                if not article.author:
                    article.author = "腾讯军事"

                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(batch)}] {article.title}")

            except Exception as e:
                # Best-effort: log and skip articles that fail to parse.
                self.logger.error(f"解析文章失败: {url} - {e}")
                continue

        return articles