news-classifier/crawler-module/src/crawlers/tencent/war_web.py

"""
腾讯军事新闻爬虫(网页版)
使用 Selenium 动态加载页面,适用于网页抓取
"""
import os
import sys
from typing import List

from bs4 import BeautifulSoup

# Put the crawler module's src root on the path so the local imports below resolve.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import DynamicCrawler, Article
from parsers.tencent_parser import TencentParser

class WarWebCrawler(DynamicCrawler):
    """Tencent military news crawler (web version)."""

    def _extract_article_urls(self, html: str) -> List[str]:
        """
        Extract article URLs from the page HTML.

        Args:
            html: Page HTML content.

        Returns:
            List of article URLs.
        """
        soup = BeautifulSoup(html, "lxml")
        urls = []
        # Select the article entries in the military channel feed;
        # dt-params*='article_type=0' filters out video news.
        div_list = soup.select(
            "div[id='channel-feed-area'] div.channel-feed-list "
            "div.channel-feed-item[dt-params*='article_type=0']"
        )
        for div in div_list:
            article_link = div.select_one("a.article-title")
            if article_link:
                href = article_link.get('href')
                if href:
                    # Resolve relative paths against the site root.
                    if href.startswith('/'):
                        href = f"https://news.qq.com{href}"
                    urls.append(href)
        return urls
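
    # For reference, the selector above assumes feed markup roughly of this
    # shape (reconstructed from the selector itself; surrounding attributes
    # and nesting on the live page may differ):
    #
    #   <div id="channel-feed-area">
    #     <div class="channel-feed-list">
    #       <div class="channel-feed-item" dt-params="...article_type=0...">
    #         <a class="article-title" href="/...">Title</a>
    #       </div>
    #     </div>
    #   </div>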
    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """
        Fetch article details.

        Args:
            urls: List of article URLs.

        Returns:
            List of parsed articles.
        """
        articles = []
        parser = TencentParser()
        # Cap the workload at max_articles so the progress counter matches
        # what is actually fetched.
        capped = urls[:self.max_articles]
        for i, url in enumerate(capped):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "腾讯"  # stored metadata stays in Chinese, matching the rest of the pipeline
                if not article.author:
                    article.author = "腾讯军事"
                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(capped)}] {article.title}")
            except Exception as e:
                self.logger.error(f"Failed to parse article: {url} - {e}")
                continue
        return articles
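

# Minimal usage sketch, assuming the DynamicCrawler base class accepts these
# keyword arguments and exposes a run() entry point; neither is shown in this
# file, and the channel URL is illustrative.
if __name__ == "__main__":
    crawler = WarWebCrawler(
        url="https://news.qq.com/ch/milite",  # assumed military-channel URL
        category_id=3,                        # assumed category id for military news
        max_articles=20,
    )
    for article in crawler.run():  # run() is assumed from DynamicCrawler
        print(f"{article.title} - {article.author}")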