feat: add sina auto crawler

parent 61a5b7d301
commit 2afdd698b2
@@ -110,3 +110,13 @@ sources:
         category_id: 9
         name: "AI"
         css_selector: "div.kr-information-left"
+
+  sina:
+    base_url: "https://sina.com.cn"
+    categories:
+      auto:
+        url: "https://auto.sina.com.cn/"
+        category_id: 6
+        name: "汽车"
+        css_selector: "div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1"
+        detail_css_selector: "div.main-content"
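For reference, the new sources.sina block is consumed through the project's config object (config.get('sources.sina.categories', {}) in the hunks below). The sketch that follows is illustrative only: the inline YAML snippet and the dotted_get helper are stand-ins for the real config loader, which is not part of this diff.

import yaml  # PyYAML; an assumption about the loader behind the project's config object

CONFIG_YAML = """
sources:
  sina:
    base_url: "https://sina.com.cn"
    categories:
      auto:
        url: "https://auto.sina.com.cn/"
        category_id: 6
"""

def dotted_get(cfg: dict, path: str, default=None):
    """Illustrative stand-in for config.get('a.b.c', default)."""
    node = cfg
    for key in path.split('.'):
        if not isinstance(node, dict) or key not in node:
            return default
        node = node[key]
    return node

cfg = yaml.safe_load(CONFIG_YAML)
for category, settings in dotted_get(cfg, 'sources.sina.categories', {}).items():
    print(f"sina:{category} -> {settings['url']} (category_id={settings['category_id']})")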
@@ -102,7 +102,6 @@ class BaseCrawler(ABC):
         try:
             # Fetch the page HTML
             html = self._fetch_page()
-
             # Parse the article list
             article_urls = self._extract_article_urls(html)
             self.logger.info(f"找到 {len(article_urls)} 篇文章")
@@ -31,6 +31,9 @@ CRAWLER_CLASSES = {
     'kr36': {
         'ai': ('crawlers.kr36.ai', 'AICrawler'),
     },
+    'sina': {
+        'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
+    },
 }
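The registry maps each source/category pair to a (module path, class name) tuple. The code that resolves those tuples into classes is not shown in this diff; the sketch below is only one common way such a registry is consumed, and resolve_crawler_class is a hypothetical name.

import importlib

CRAWLER_CLASSES = {
    'kr36': {'ai': ('crawlers.kr36.ai', 'AICrawler')},
    'sina': {'auto': ('crawlers.sina.auto', 'SinaAutoCrawler')},
}

def resolve_crawler_class(source: str, category: str):
    """Import the registered module lazily and return the crawler class."""
    module_path, class_name = CRAWLER_CLASSES[source][category]
    module = importlib.import_module(module_path)  # e.g. crawlers.sina.auto
    return getattr(module, class_name)             # e.g. SinaAutoCrawler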
@@ -53,6 +56,11 @@ def list_crawlers() -> List[str]:
     for category in kr36_categories.keys():
         crawlers.append(f"kr36:{category}")
 
+    # Sina crawlers
+    sina_categories = config.get('sources.sina.categories', {})
+    for category in sina_categories.keys():
+        crawlers.append(f"sina:{category}")
+
     return crawlers
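With the sina block present in the config, list_crawlers() would report the new category alongside the existing one, for example:

>>> list_crawlers()
['kr36:ai', 'sina:auto']   # assuming only the categories shown in this diff are configured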
@@ -101,7 +109,7 @@ def run_crawler(source: str, category: str, max_articles: int = None) -> bool:
 
     # Create and run the crawler
     crawler = crawler_class(source, category)
-
+    print("创建并运行爬虫")
     # Override the maximum number of articles
     if max_articles:
         crawler.max_articles = max_articles
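Taken together with the registry and list_crawlers changes, the new source can be exercised through the existing entry point. A hedged usage example; only run_crawler's signature appears in this diff, and the module that defines it is not named here:

# Hypothetical call, assuming run_crawler is imported from the project's CLI module.
ok = run_crawler('sina', 'auto', max_articles=5)
print("crawl succeeded" if ok else "crawl failed")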
crawlers/sina/auto.py (new file)
@@ -0,0 +1,60 @@
+"""
+Sina auto news crawler.
+"""
+
+from typing import List
+from bs4 import BeautifulSoup
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from base.crawler_base import DynamicCrawler, Article
+from parsers.sina_parser import SinaAutoParser
+
+
+class SinaAutoCrawler(DynamicCrawler):
+    """Crawler for Sina auto news."""
+
+    def _extract_article_urls(self, html: str) -> List[str]:
+        """Extract the list of article URLs from the HTML."""
+        soup = BeautifulSoup(html, "lxml")
+        urls = []
+
+        # Try different selectors
+        div_list = soup.select("div.cardlist-a__list div.ty-card.ty-card-type1")
+        if not div_list:
+            div_list = soup.select("div.news-list li.news-item")
+        if not div_list:
+            div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1")
+
+        for item in div_list:
+            a = item.select_one("a")
+            if a and a.get("href"):
+                urls.append(a.get("href"))
+
+        return urls
+
+    def _fetch_articles(self, urls: List[str]) -> List[Article]:
+        """Fetch and parse the article detail pages."""
+        articles = []
+        parser = SinaAutoParser()
+
+        for i, url in enumerate(urls[:self.max_articles]):
+            try:
+                article = parser.parse(url)
+                article.category_id = self.category_id
+                article.source = "新浪"
+
+                if not article.author:
+                    article.author = "新浪汽车"
+
+                if article.is_valid():
+                    articles.append(article)
+                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
+
+            except Exception as e:
+                self.logger.error(f"解析文章失败: {url} - {e}")
+                continue
+
+        return articles
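_extract_article_urls appends href values exactly as they appear in the list markup. On Sina list pages these are typically absolute, but if relative or protocol-relative links show up they would need to be resolved against base_url; that handling is not part of this commit, and the sketch below (with a hypothetical absolutize helper) only illustrates the idea using the standard library:

from urllib.parse import urljoin

def absolutize(href: str, base_url: str = "https://auto.sina.com.cn/") -> str:
    """Resolve a raw href from the list page into an absolute URL."""
    if href.startswith("//"):          # protocol-relative, e.g. //auto.sina.com.cn/...
        return "https:" + href
    return urljoin(base_url, href)     # handles relative and already-absolute hrefs

print(absolutize("//auto.sina.com.cn/news/example.shtml"))
print(absolutize("/news/example.shtml"))
print(absolutize("https://auto.sina.com.cn/news/example.shtml"))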
@@ -54,6 +54,7 @@ class NewsRepository:
         try:
             with db_pool.get_connection() as conn:
                 cursor = conn.cursor()
+
                 # Batch-check which URLs already exist
                 if urls:
                     placeholders = ','.join(['%s'] * len(urls))
@@ -61,24 +62,25 @@ class NewsRepository:
                     cursor.execute(check_sql, urls)
                     existing_urls = {row[0] for row in cursor.fetchall()}
 
-                    # Only insert records that do not already exist
+                    # Only insert records whose URL does not already exist
                     new_data = [item for item in data if item[0] not in existing_urls]
 
                     if not new_data:
                         self.logger.info(f"所有 {len(data)} 条新闻已存在,跳过插入")
                         return 0
 
-                    # Execute the insert
+                    # Execute the insert; use INSERT IGNORE to skip rows whose content_hash already exists
                     sql = """
-                        INSERT INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
+                        INSERT IGNORE INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
                         VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                     """
 
                     cursor.executemany(sql, new_data)
                     conn.commit()
 
-                    inserted = len(new_data)
-                    self.logger.info(f"成功插入 {inserted} 条新新闻,{len(data) - inserted} 条已存在")
+                    # Get the number of rows actually inserted
+                    inserted = cursor.rowcount
+                    self.logger.info(f"成功插入 {inserted} 条新新闻,{len(new_data) - inserted} 条因内容重复被忽略")
                     return inserted
 
         except Exception as e:
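The switch to INSERT IGNORE plus cursor.rowcount assumes the news table enforces uniqueness on content_hash (and, given the URL pre-check, presumably on url as well); the schema itself is not part of this diff. A sketch of the assumed constraint, written the same way the repository embeds its SQL, with the rowcount behaviour noted for the common MySQL drivers:

# Assumption: INSERT IGNORE only skips rows that violate a UNIQUE/PRIMARY KEY
# constraint, so content-level dedup needs an index along these lines.
ASSUMED_DDL = """
    ALTER TABLE news
        ADD UNIQUE KEY uniq_url (url),
        ADD UNIQUE KEY uniq_content_hash (content_hash)
"""

# With such keys in place, an ignored duplicate counts as 0 affected rows, so after
# cursor.executemany(sql, new_data) the driver's cursor.rowcount reflects only the
# rows that were actually inserted, which is what the new log message reports.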
parsers/sina_parser.py (new file)
@@ -0,0 +1,71 @@
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from base.parser_base import BaseParser
+from datetime import datetime
+from typing import Optional
+from base.crawler_base import Article
+from bs4 import BeautifulSoup
+from utils.logger import get_logger
+from utils.http_client import HttpClient
+
+
+class SinaAutoParser(BaseParser):
+    """Parser for Sina auto news articles."""
+
+    def __init__(self):
+        self.logger = get_logger(__name__)
+        self.http_client = HttpClient()
+
+    def parse(self, url: str) -> Article:
+        """
+        Parse a Sina article detail page.
+
+        Args:
+            url: the article URL
+
+        Returns:
+            An Article object
+        """
+        html = self.http_client.get(url)
+        soup = BeautifulSoup(html, "lxml")
+
+        # Extract the article title
+        article_title_tag = soup.select_one("div.main-content h1.main-title")
+        article_title = article_title_tag.get_text(strip=True) if article_title_tag else "未知标题"
+
+        # Extract the publish time
+        def normalize_time(time_str):
+            for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
+                try:
+                    dt = datetime.strptime(time_str, fmt)
+                    return dt.strftime("%Y-%m-%d %H:%M:%S")
+                except:
+                    continue
+            return time_str  # return the original string if no format matches
+
+        time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
+        publish_time = normalize_time(time_tag.get_text(strip=True)) if time_tag else "1949-01-01 12:00:00"
+
+        # Extract the author
+        author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
+        author = author_tag.get_text(strip=True) if author_tag else "未知"
+
+        # Extract the body paragraphs
+        article_div = soup.select_one("div.main-content div.article")
+        if not article_div:
+            raise ValueError("无法找到文章内容")
+
+        paragraphs = article_div.find_all('p')
+        content = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
+
+        return Article(
+            url=url,
+            title=article_title,
+            publish_time=publish_time,
+            author=author,
+            content=content,
+            category_id=6,  # category ID for 汽车 (auto)
+            source="sina"
+        )
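The nested normalize_time helper converts Sina's Chinese date format ("%Y年%m月%d日 %H:%M") into the "%Y-%m-%d %H:%M:%S" form the repository stores. A self-contained check of the two accepted formats; the helper is reproduced here with a narrower except clause only so the example runs on its own:

from datetime import datetime

def normalize_time(time_str: str) -> str:
    """Standalone copy of the conversion used in SinaAutoParser.parse."""
    for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.strptime(time_str, fmt).strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue
    return time_str  # unknown formats pass through unchanged

print(normalize_time("2024年05月01日 08:30"))  # -> 2024-05-01 08:30:00
print(normalize_time("2024-05-01 08:30:00"))   # -> 2024-05-01 08:30:00
print(normalize_time("May 1, 2024"))           # unchanged fallback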