feat: add sina auto crawler

shenjianZ 2026-01-14 19:17:09 +08:00
parent 61a5b7d301
commit 2afdd698b2
6 changed files with 157 additions and 7 deletions

View File

@@ -110,3 +110,13 @@ sources:
        category_id: 9
        name: "AI"
        css_selector: "div.kr-information-left"

  sina:
    base_url: "https://sina.com.cn"
    categories:
      auto:
        url: "https://auto.sina.com.cn/"
        category_id: 6
        name: "汽车"
        css_selector: "div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1"
        detail_css_selector: "div.main-content"

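The new sina block mirrors the existing kr36 entry, so the generic lookups in the crawler registry further down can pick it up unchanged. A standalone sketch (assuming PyYAML is installed) that parses the added block and pulls out the fields the registry reads via config.get('sources.sina.categories', {}); this snippet is illustration only, not code from the commit:

# Standalone sketch: show the structure the registry's dotted lookups expect.
import yaml

snippet = """
sources:
  sina:
    base_url: "https://sina.com.cn"
    categories:
      auto:
        url: "https://auto.sina.com.cn/"
        category_id: 6
        name: "汽车"
        css_selector: "div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1"
        detail_css_selector: "div.main-content"
"""

auto_cfg = yaml.safe_load(snippet)["sources"]["sina"]["categories"]["auto"]
assert auto_cfg["category_id"] == 6
print(auto_cfg["url"])                   # https://auto.sina.com.cn/
print(auto_cfg["detail_css_selector"])   # div.main-content
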
View File

@@ -102,7 +102,6 @@ class BaseCrawler(ABC):
        try:
            # Fetch the listing page HTML
            html = self._fetch_page()
            # Parse the article list
            article_urls = self._extract_article_urls(html)
            self.logger.info(f"找到 {len(article_urls)} 篇文章")

View File

@@ -31,6 +31,9 @@ CRAWLER_CLASSES = {
    'kr36': {
        'ai': ('crawlers.kr36.ai', 'AICrawler'),
    },
    'sina': {
        'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
    },
}
@@ -53,6 +56,11 @@ def list_crawlers() -> List[str]:
    for category in kr36_categories.keys():
        crawlers.append(f"kr36:{category}")

    # Sina crawlers
    sina_categories = config.get('sources.sina.categories', {})
    for category in sina_categories.keys():
        crawlers.append(f"sina:{category}")

    return crawlers
@@ -101,7 +109,7 @@ def run_crawler(source: str, category: str, max_articles: int = None) -> bool:
    # Create and run the crawler
    crawler = crawler_class(source, category)
    print("创建并运行爬虫")

    # Override the maximum article count
    if max_articles:
        crawler.max_articles = max_articles

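With the registry entry and the category loop above, the new crawler becomes addressable as sina:auto. A minimal driver sketch follows; the registry module's real filename is not shown in this diff, so the import is a placeholder:

# Hypothetical driver; adjust the import to the project's actual registry module.
from crawler_registry import list_crawlers, run_crawler  # placeholder module name

print(list_crawlers())                      # expected to include "kr36:ai" and "sina:auto"

# Signature from the hunk above: run_crawler(source, category, max_articles=None) -> bool
ok = run_crawler("sina", "auto", max_articles=5)
print("crawl ok" if ok else "crawl failed")
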
View File

@@ -0,0 +1,60 @@
"""
Sina Auto news crawler
"""
from typing import List
from bs4 import BeautifulSoup
import sys
import os

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import DynamicCrawler, Article
from parsers.sina_parser import SinaAutoParser


class SinaAutoCrawler(DynamicCrawler):
    """Sina Auto news crawler"""

    def _extract_article_urls(self, html: str) -> List[str]:
        """Extract the list of article URLs from the listing page HTML"""
        soup = BeautifulSoup(html, "lxml")
        urls = []

        # Try several selectors in turn
        div_list = soup.select("div.cardlist-a__list div.ty-card.ty-card-type1")
        if not div_list:
            div_list = soup.select("div.news-list li.news-item")
        if not div_list:
            div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1")

        for item in div_list:
            a = item.select_one("a")
            if a and a.get("href"):
                urls.append(a.get("href"))

        return urls

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """Fetch and parse article details"""
        articles = []
        parser = SinaAutoParser()

        for i, url in enumerate(urls[:self.max_articles]):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "新浪"
                if not article.author:
                    article.author = "新浪汽车"

                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
            except Exception as e:
                self.logger.error(f"解析文章失败: {url} - {e}")
                continue

        return articles

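The selector fallback in _extract_article_urls can be exercised offline. The markup below is fabricated to mirror the first selector, so it is only a sketch of the extraction step; urljoin is added here as an optional guard against relative or protocol-relative hrefs, whereas the committed code appends hrefs exactly as found:

# Offline sketch: sample_html is made up to match the crawler's first selector;
# real Sina listing markup may differ.
from urllib.parse import urljoin
from bs4 import BeautifulSoup

sample_html = """
<div class="feed_card ty-feed-card-container">
  <div class="cardlist-a__list">
    <div class="ty-card ty-card-type1"><a href="//auto.sina.com.cn/news/2026-01-14/detail-abc.shtml">t1</a></div>
    <div class="ty-card ty-card-type1"><a href="https://auto.sina.com.cn/news/2026-01-14/detail-def.shtml">t2</a></div>
  </div>
</div>
"""

soup = BeautifulSoup(sample_html, "lxml")
cards = soup.select("div.cardlist-a__list div.ty-card.ty-card-type1")
# urljoin against the listing page resolves protocol-relative ("//...") and relative hrefs.
urls = [urljoin("https://auto.sina.com.cn/", a.get("href"))
        for a in (c.select_one("a") for c in cards) if a and a.get("href")]
print(urls)
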
View File

@@ -54,6 +54,7 @@ class NewsRepository:
        try:
            with db_pool.get_connection() as conn:
                cursor = conn.cursor()

                # Batch-query which URLs already exist
                if urls:
                    placeholders = ','.join(['%s'] * len(urls))
@@ -61,24 +62,25 @@
                    cursor.execute(check_sql, urls)
                    existing_urls = {row[0] for row in cursor.fetchall()}

-                   # Only insert records that do not already exist
+                   # Only insert records whose URL does not already exist
                    new_data = [item for item in data if item[0] not in existing_urls]

                    if not new_data:
                        self.logger.info(f"所有 {len(data)} 条新闻已存在,跳过插入")
                        return 0

-                   # Execute the insert
+                   # Execute the insert; INSERT IGNORE skips rows whose content_hash already exists
                    sql = """
-                       INSERT INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
+                       INSERT IGNORE INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                    """
                    cursor.executemany(sql, new_data)
                    conn.commit()

-                   inserted = len(new_data)
-                   self.logger.info(f"成功插入 {inserted} 条新新闻,{len(data) - inserted} 条已存在")
+                   # Get the number of rows actually inserted
+                   inserted = cursor.rowcount
+                   self.logger.info(f"成功插入 {inserted} 条新新闻,{len(new_data) - inserted} 条因内容重复被忽略")

                    return inserted

        except Exception as e:

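The change layers two duplicate checks: the URL pre-query filters rows already stored, and INSERT IGNORE lets MySQL drop rows whose content_hash collides with an existing article, for example the same story republished under a different URL. The hash itself is computed elsewhere in the project; the sketch below shows one plausible scheme, and both the hashing function and the UNIQUE key it relies on are assumptions, not code from this commit:

import hashlib

def content_hash(content: str) -> str:
    # Assumption: a stable digest of the normalized body text; the project's real
    # hashing code is not part of this diff.
    return hashlib.sha256(content.strip().encode("utf-8")).hexdigest()

# Two rows with different URLs but identical body text produce the same hash, so the
# second one is dropped by INSERT IGNORE, provided the news table carries a UNIQUE key
# on content_hash (assumed here, e.g.
#   ALTER TABLE news ADD UNIQUE KEY uq_news_content_hash (content_hash);
# ). cursor.rowcount then counts only the rows MySQL actually inserted, which is what
# the new log line reports.
row_a = ("https://auto.sina.com.cn/a.shtml", content_hash("车企发布了新车型。"))
row_b = ("https://auto.sina.com.cn/b.shtml", content_hash("车企发布了新车型。"))
print(row_a[1] == row_b[1])  # True -> only one of the two would survive the insert
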
View File

@@ -0,0 +1,71 @@
import sys
import os

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.parser_base import BaseParser
from datetime import datetime
from typing import Optional
from base.crawler_base import Article
from bs4 import BeautifulSoup
from utils.logger import get_logger
from utils.http_client import HttpClient


class SinaAutoParser(BaseParser):
    """Parser for Sina Auto news detail pages"""

    def __init__(self):
        self.logger = get_logger(__name__)
        self.http_client = HttpClient()

    def parse(self, url: str) -> Article:
        """
        Parse a Sina article detail page.

        Args:
            url: article URL

        Returns:
            the parsed Article
        """
        html = self.http_client.get(url)
        soup = BeautifulSoup(html, "lxml")

        # Article title
        article_title_tag = soup.select_one("div.main-content h1.main-title")
        article_title = article_title_tag.get_text(strip=True) if article_title_tag else "未知标题"

        # Publish time
        def normalize_time(time_str):
            for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
                try:
                    dt = datetime.strptime(time_str, fmt)
                    return dt.strftime("%Y-%m-%d %H:%M:%S")
                except ValueError:
                    continue
            return time_str  # No format matched; return the original string

        time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
        publish_time = normalize_time(time_tag.get_text(strip=True)) if time_tag else "1949-01-01 12:00:00"

        # Author
        author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
        author = author_tag.get_text(strip=True) if author_tag else "未知"

        # Body paragraphs
        article_div = soup.select_one("div.main-content div.article")
        if not article_div:
            raise ValueError("无法找到文章内容")

        paragraphs = article_div.find_all('p')
        content = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))

        return Article(
            url=url,
            title=article_title,
            publish_time=publish_time,
            author=author,
            content=content,
            category_id=6,  # Auto category ID
            source="sina"
        )
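
A quick smoke test for the parser, assuming the package layout implied by the sys.path manipulation above; the article URL is a placeholder, not one taken from this commit:

# Hypothetical smoke test; the URL below is a placeholder.
from parsers.sina_parser import SinaAutoParser

parser = SinaAutoParser()
article = parser.parse("https://auto.sina.com.cn/news/2026-01-14/example.shtml")

print(article.title)
print(article.publish_time)   # normalized to "%Y-%m-%d %H:%M:%S" when the page's date format is recognized
print(article.author)
print(len(article.content), "characters of body text")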