feat: add sina auto crawler
parent 61a5b7d301
commit 2afdd698b2
@@ -110,3 +110,13 @@ sources:
        category_id: 9
        name: "AI"
        css_selector: "div.kr-information-left"

  sina:
    base_url: "https://sina.com.cn"
    categories:
      auto:
        url: "https://auto.sina.com.cn/"
        category_id: 6
        name: "汽车"
        css_selector: "div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1"
        detail_css_selector: "div.main-content"
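The new sina block mirrors the existing kr36 entry and is read elsewhere in this commit through dotted config.get(...) lookups. A minimal sketch of that access pattern, assuming a plain YAML load (the config_get helper and the "config.yaml" file name are assumptions, not part of the commit):

# Hypothetical loader; only the dotted-key lookup style mirrors the calls in the diff.
import yaml

def config_get(data: dict, dotted_key: str, default=None):
    node = data
    for part in dotted_key.split("."):
        if not isinstance(node, dict) or part not in node:
            return default
        node = node[part]
    return node

with open("config.yaml", "r", encoding="utf-8") as f:   # file name is an assumption
    cfg = yaml.safe_load(f)

sina_auto = config_get(cfg, "sources.sina.categories.auto", {})
print(sina_auto.get("url"), sina_auto.get("category_id"))  # https://auto.sina.com.cn/ 6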
@@ -102,7 +102,6 @@ class BaseCrawler(ABC):
        try:
            # Fetch the page HTML
            html = self._fetch_page()

            # Parse the article list
            article_urls = self._extract_article_urls(html)
            self.logger.info(f"Found {len(article_urls)} articles")
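For orientation, this hunk sits inside the base class's crawl loop. A hedged reconstruction of the surrounding flow, assuming the abstract-method split that SinaAutoCrawler implements later in this diff (the class name BaseCrawlerSketch, the run wrapper, and the default max_articles are guesses, not taken from the commit):

# Hedged sketch only: the diff itself shows just the fetch-listing / extract-URLs step.
import logging
from abc import ABC, abstractmethod
from typing import List

class BaseCrawlerSketch(ABC):
    def __init__(self, source: str, category: str):
        self.source, self.category = source, category
        self.max_articles = 20                      # assumed default
        self.logger = logging.getLogger(f"{source}.{category}")

    @abstractmethod
    def _fetch_page(self) -> str: ...
    @abstractmethod
    def _extract_article_urls(self, html: str) -> List[str]: ...
    @abstractmethod
    def _fetch_articles(self, urls: List[str]) -> list: ...

    def run(self) -> bool:
        try:
            html = self._fetch_page()                        # fetch the listing page
            article_urls = self._extract_article_urls(html)  # subclass-specific parsing
            self.logger.info(f"Found {len(article_urls)} articles")
            articles = self._fetch_articles(article_urls)    # fetch and parse details
            return len(articles) > 0
        except Exception as e:
            self.logger.error(f"Crawl failed: {e}")
            return False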
@@ -31,6 +31,9 @@ CRAWLER_CLASSES = {
    'kr36': {
        'ai': ('crawlers.kr36.ai', 'AICrawler'),
    },
    'sina': {
        'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
    },
}
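CRAWLER_CLASSES maps source and category to a (module path, class name) pair, which is presumably resolved with a dynamic import roughly like the sketch below (the helper name get_crawler_class is an assumption; the project's actual lookup may differ):

import importlib

# Copy of the registry entries added above.
CRAWLER_CLASSES = {
    'kr36': {'ai': ('crawlers.kr36.ai', 'AICrawler')},
    'sina': {'auto': ('crawlers.sina.auto', 'SinaAutoCrawler')},
}

def get_crawler_class(source: str, category: str):
    # Hypothetical resolver for the (module_path, class_name) tuples.
    module_path, class_name = CRAWLER_CLASSES[source][category]
    module = importlib.import_module(module_path)   # e.g. crawlers.sina.auto
    return getattr(module, class_name)              # e.g. SinaAutoCrawler

# Requires the project package on sys.path.
crawler_class = get_crawler_class('sina', 'auto')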
@@ -53,6 +56,11 @@ def list_crawlers() -> List[str]:
    for category in kr36_categories.keys():
        crawlers.append(f"kr36:{category}")

    # Sina crawlers
    sina_categories = config.get('sources.sina.categories', {})
    for category in sina_categories.keys():
        crawlers.append(f"sina:{category}")

    return crawlers
@@ -101,7 +109,7 @@ def run_crawler(source: str, category: str, max_articles: int = None) -> bool:

    # Create and run the crawler
    crawler = crawler_class(source, category)

    print("Creating and running the crawler")
    # Override the maximum number of articles
    if max_articles:
        crawler.max_articles = max_articles
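Taken together with list_crawlers above, running the new crawler end to end should look roughly like this minimal driver (the module name crawler_manager used in the import is a placeholder, not from the diff):

# Placeholder module name; list_crawlers/run_crawler are the functions patched above.
from crawler_manager import list_crawlers, run_crawler

print(list_crawlers())                            # should now include 'sina:auto'
ok = run_crawler('sina', 'auto', max_articles=5)
print("crawl succeeded" if ok else "crawl failed")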
@@ -0,0 +1,60 @@
"""
Sina auto news crawler
"""

from typing import List
from bs4 import BeautifulSoup

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import DynamicCrawler, Article
from parsers.sina_parser import SinaAutoParser


class SinaAutoCrawler(DynamicCrawler):
    """Sina auto news crawler"""

    def _extract_article_urls(self, html: str) -> List[str]:
        """Extract the list of article URLs from the HTML"""
        soup = BeautifulSoup(html, "lxml")
        urls = []

        # Try different selectors
        div_list = soup.select("div.cardlist-a__list div.ty-card.ty-card-type1")
        if not div_list:
            div_list = soup.select("div.news-list li.news-item")
        if not div_list:
            div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1")

        for item in div_list:
            a = item.select_one("a")
            if a and a.get("href"):
                urls.append(a.get("href"))

        return urls

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """Fetch and parse article details"""
        articles = []
        parser = SinaAutoParser()

        for i, url in enumerate(urls[:self.max_articles]):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "新浪"

                if not article.author:
                    article.author = "新浪汽车"

                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")

            except Exception as e:
                self.logger.error(f"Failed to parse article: {url} - {e}")
                continue

        return articles
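A quick way to sanity-check the selector fallback in _extract_article_urls is to feed it a small HTML fragment; the markup below is an illustrative stand-in, not captured from auto.sina.com.cn:

from bs4 import BeautifulSoup

# Illustrative fragment only; real Sina markup is more elaborate.
sample_html = """
<div class="feed_card ty-feed-card-container">
  <div class="cardlist-a__list">
    <div class="ty-card ty-card-type1"><a href="https://auto.sina.com.cn/news/1.shtml">t1</a></div>
    <div class="ty-card ty-card-type1"><a href="https://auto.sina.com.cn/news/2.shtml">t2</a></div>
  </div>
</div>
"""

soup = BeautifulSoup(sample_html, "lxml")
cards = soup.select("div.cardlist-a__list div.ty-card.ty-card-type1")
print([c.select_one("a").get("href") for c in cards])
# ['https://auto.sina.com.cn/news/1.shtml', 'https://auto.sina.com.cn/news/2.shtml']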
@@ -54,6 +54,7 @@ class NewsRepository:
        try:
            with db_pool.get_connection() as conn:
                cursor = conn.cursor()

                # Batch-check which URLs already exist
                if urls:
                    placeholders = ','.join(['%s'] * len(urls))
@@ -61,24 +62,25 @@ class NewsRepository:
                    cursor.execute(check_sql, urls)
                    existing_urls = {row[0] for row in cursor.fetchall()}

-                    # Only insert records that do not already exist
+                    # Only insert records whose URL does not already exist
                    new_data = [item for item in data if item[0] not in existing_urls]

                    if not new_data:
                        self.logger.info(f"All {len(data)} news items already exist, skipping insert")
                        return 0

-                    # Perform the insert
+                    # Perform the insert; use INSERT IGNORE to skip records with a duplicate content_hash
                    sql = """
-                        INSERT INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
+                        INSERT IGNORE INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                    """

                    cursor.executemany(sql, new_data)
                    conn.commit()

-                    inserted = len(new_data)
-                    self.logger.info(f"Inserted {inserted} new news items, {len(data) - inserted} already existed")
+                    # Get the number of rows actually inserted
+                    inserted = cursor.rowcount
+                    self.logger.info(f"Inserted {inserted} new news items, {len(new_data) - inserted} ignored as duplicate content")
                    return inserted

            except Exception as e:
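INSERT IGNORE only skips rows when a uniqueness constraint is violated, and with the usual MySQL drivers cursor.rowcount after executemany reflects only the rows actually written, which is what the new log line reports. The updated comment implies a unique index on content_hash, and the URL pre-check implies one on url; a hedged sketch of the assumed table definition, not taken from the commit:

# Assumed schema only: column types and index names are guesses.
ASSUMED_NEWS_DDL = """
CREATE TABLE IF NOT EXISTS news (
    id           BIGINT AUTO_INCREMENT PRIMARY KEY,
    url          VARCHAR(512) NOT NULL,
    title        VARCHAR(255) NOT NULL,
    category_id  INT,
    publish_time DATETIME,
    author       VARCHAR(128),
    source       VARCHAR(64),
    content      MEDIUMTEXT,
    content_hash CHAR(64),
    UNIQUE KEY uk_url (url),
    UNIQUE KEY uk_content_hash (content_hash)
)
"""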
@@ -0,0 +1,71 @@
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.parser_base import BaseParser
from datetime import datetime
from typing import Optional
from base.crawler_base import Article
from bs4 import BeautifulSoup
from utils.logger import get_logger
from utils.http_client import HttpClient


class SinaAutoParser(BaseParser):
    """Parser for Sina auto news articles"""

    def __init__(self):
        self.logger = get_logger(__name__)
        self.http_client = HttpClient()

    def parse(self, url: str) -> Article:
        """
        Parse a Sina article detail page

        Args:
            url: the article URL

        Returns:
            the parsed Article object
        """
        html = self.http_client.get(url)
        soup = BeautifulSoup(html, "lxml")

        # Article title
        article_title_tag = soup.select_one("div.main-content h1.main-title")
        article_title = article_title_tag.get_text(strip=True) if article_title_tag else "未知标题"

        # Publish time
        def normalize_time(time_str):
            for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
                try:
                    dt = datetime.strptime(time_str, fmt)
                    return dt.strftime("%Y-%m-%d %H:%M:%S")
                except ValueError:
                    continue
            return time_str  # return the original string if no format matches

        time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
        publish_time = normalize_time(time_tag.get_text(strip=True)) if time_tag else "1949-01-01 12:00:00"

        # Author
        author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
        author = author_tag.get_text(strip=True) if author_tag else "未知"

        # Body paragraphs
        article_div = soup.select_one("div.main-content div.article")
        if not article_div:
            raise ValueError("Article content not found")

        paragraphs = article_div.find_all('p')
        content = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))

        return Article(
            url=url,
            title=article_title,
            publish_time=publish_time,
            author=author,
            content=content,
            category_id=6,  # auto category ID
            source="sina"
        )
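A minimal usage sketch of the new parser, assuming the project's HttpClient can fetch the page; the article URL below is a placeholder, not taken from the commit:

from parsers.sina_parser import SinaAutoParser

parser = SinaAutoParser()
article = parser.parse("https://auto.sina.com.cn/news/example.shtml")   # placeholder URL
print(article.title, article.publish_time, article.author)
# publish_time is normalized, e.g. "2024年05月01日 10:30" -> "2024-05-01 10:30:00"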