From 2afdd698b204377164108d948c6e8c69ad02dd1e Mon Sep 17 00:00:00 2001
From: shenjianZ
Date: Wed, 14 Jan 2026 19:17:09 +0800
Subject: [PATCH] feat: add sina auto crawler

---
 crawler-module/config/config.yaml         | 10 ++++
 crawler-module/src/base/crawler_base.py   |  1 -
 crawler-module/src/cli/main.py            | 10 +++-
 crawler-module/src/crawlers/sina/auto.py  | 60 +++++++++++++++++++
 crawler-module/src/database/repository.py | 12 ++--
 crawler-module/src/parsers/sina_parser.py | 71 +++++++++++++++++++++++
 6 files changed, 157 insertions(+), 7 deletions(-)
 create mode 100644 crawler-module/src/crawlers/sina/auto.py
 create mode 100644 crawler-module/src/parsers/sina_parser.py

diff --git a/crawler-module/config/config.yaml b/crawler-module/config/config.yaml
index 0868462..ff9ca0f 100644
--- a/crawler-module/config/config.yaml
+++ b/crawler-module/config/config.yaml
@@ -110,3 +110,13 @@ sources:
         category_id: 9
         name: "AI"
         css_selector: "div.kr-information-left"
+
+  sina:
+    base_url: "https://sina.com.cn"
+    categories:
+      auto:
+        url: "https://auto.sina.com.cn/"
+        category_id: 6
+        name: "Auto"
+        css_selector: "div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1"
+        detail_css_selector: "div.main-content"
diff --git a/crawler-module/src/base/crawler_base.py b/crawler-module/src/base/crawler_base.py
index dcca387..0d5760d 100644
--- a/crawler-module/src/base/crawler_base.py
+++ b/crawler-module/src/base/crawler_base.py
@@ -102,7 +102,6 @@ class BaseCrawler(ABC):
         try:
             # Fetch the page HTML
             html = self._fetch_page()
-
             # Parse the article list
             article_urls = self._extract_article_urls(html)
             self.logger.info(f"Found {len(article_urls)} articles")
diff --git a/crawler-module/src/cli/main.py b/crawler-module/src/cli/main.py
index e68653d..72d20f5 100644
--- a/crawler-module/src/cli/main.py
+++ b/crawler-module/src/cli/main.py
@@ -31,6 +31,9 @@ CRAWLER_CLASSES = {
     'kr36': {
         'ai': ('crawlers.kr36.ai', 'AICrawler'),
     },
+    'sina': {
+        'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
+    },
 }
 
 
@@ -53,6 +56,11 @@ def list_crawlers() -> List[str]:
     for category in kr36_categories.keys():
         crawlers.append(f"kr36:{category}")
 
+    # Sina crawlers
+    sina_categories = config.get('sources.sina.categories', {})
+    for category in sina_categories.keys():
+        crawlers.append(f"sina:{category}")
+
     return crawlers
 
 
@@ -101,7 +109,7 @@ def run_crawler(source: str, category: str, max_articles: int = None) -> bool:
 
     # Create and run the crawler
     crawler = crawler_class(source, category)
-    
+
     # Override the maximum article count
     if max_articles:
         crawler.max_articles = max_articles
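Each CRAWLER_CLASSES entry maps a source/category pair to a (module path, class name) tuple, so run_crawler can presumably import the crawler lazily rather than importing every crawler module up front. A minimal sketch of that lookup, assuming the registry layout above; the helper name resolve_crawler_class is hypothetical, not part of this patch:

    import importlib

    def resolve_crawler_class(source: str, category: str):
        # Look up the (module path, class name) tuple registered above,
        # e.g. ('crawlers.sina.auto', 'SinaAutoCrawler') for sina:auto
        module_path, class_name = CRAWLER_CLASSES[source][category]
        module = importlib.import_module(module_path)
        return getattr(module, class_name)

With that in place, resolve_crawler_class('sina', 'auto') yields SinaAutoCrawler, matching the crawler_class(source, category) call in run_crawler.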
diff --git a/crawler-module/src/crawlers/sina/auto.py b/crawler-module/src/crawlers/sina/auto.py
new file mode 100644
index 0000000..39f898e
--- /dev/null
+++ b/crawler-module/src/crawlers/sina/auto.py
@@ -0,0 +1,60 @@
+"""
+Sina Auto news crawler
+"""
+
+from typing import List
+from bs4 import BeautifulSoup
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from base.crawler_base import DynamicCrawler, Article
+from parsers.sina_parser import SinaAutoParser
+
+
+class SinaAutoCrawler(DynamicCrawler):
+    """Sina Auto news crawler"""
+
+    def _extract_article_urls(self, html: str) -> List[str]:
+        """Extract article URLs from the list-page HTML"""
+        soup = BeautifulSoup(html, "lxml")
+        urls = []
+
+        # Try selectors from most to least specific
+        div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1")
+        if not div_list:
+            div_list = soup.select("div.cardlist-a__list div.ty-card.ty-card-type1")
+        if not div_list:
+            div_list = soup.select("div.news-list li.news-item")
+
+        for item in div_list:
+            a = item.select_one("a")
+            if a and a.get("href"):
+                urls.append(a.get("href"))
+
+        return urls
+
+    def _fetch_articles(self, urls: List[str]) -> List[Article]:
+        """Fetch and parse article detail pages"""
+        articles = []
+        parser = SinaAutoParser()
+
+        for i, url in enumerate(urls[:self.max_articles]):
+            try:
+                article = parser.parse(url)
+                article.category_id = self.category_id
+                article.source = "Sina"
+
+                if not article.author:
+                    article.author = "Sina Auto"
+
+                if article.is_valid():
+                    articles.append(article)
+                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
+
+            except Exception as e:
+                self.logger.error(f"Failed to parse article: {url} - {e}")
+                continue
+
+        return articles
\ No newline at end of file
diff --git a/crawler-module/src/database/repository.py b/crawler-module/src/database/repository.py
index cc53798..9f82e9e 100644
--- a/crawler-module/src/database/repository.py
+++ b/crawler-module/src/database/repository.py
@@ -54,6 +54,7 @@ class NewsRepository:
         try:
             with db_pool.get_connection() as conn:
                 cursor = conn.cursor()
+
                 # Batch-check which URLs already exist
                 if urls:
                     placeholders = ','.join(['%s'] * len(urls))
@@ -61,24 +62,25 @@ class NewsRepository:
                     cursor.execute(check_sql, urls)
                     existing_urls = {row[0] for row in cursor.fetchall()}
 
-                # Only insert records that do not exist
+                # Only insert records whose URL is not stored yet
                 new_data = [item for item in data if item[0] not in existing_urls]
 
                 if not new_data:
                     self.logger.info(f"All {len(data)} articles already exist, skipping insert")
                     return 0
 
-                # Execute the insert
+                # Execute the insert; INSERT IGNORE skips rows with a duplicate content_hash
                 sql = """
-                    INSERT INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
+                    INSERT IGNORE INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                 """
                 cursor.executemany(sql, new_data)
                 conn.commit()
 
-                inserted = len(new_data)
-                self.logger.info(f"Inserted {inserted} new articles, {len(data) - inserted} already existed")
+                # Get the number of rows actually inserted
+                inserted = cursor.rowcount
+                self.logger.info(f"Inserted {inserted} new articles, {len(new_data) - inserted} skipped as duplicate content")
 
                 return inserted
 
         except Exception as e:
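Deduplication here is two-level: the SELECT filters out rows whose url is already stored, and INSERT IGNORE then silently drops rows whose content_hash collides with an existing article (assuming a unique index on that column), so cursor.rowcount reports only the rows actually inserted. A sketch of how such a hash might be produced upstream; hashlib and the whitespace normalization are assumptions, not code from this patch:

    import hashlib

    def content_hash(content: str) -> str:
        # Hash the whitespace-normalized body so trivially reformatted
        # reposts of the same article dedupe to a single row
        normalized = ''.join(content.split())
        return hashlib.sha256(normalized.encode('utf-8')).hexdigest()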
12:00:00" + + # 获取文章作者 + author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a") + author = author_tag.get_text(strip=True) if author_tag else "未知" + + # 获取文章正文段落 + article_div = soup.select_one("div.main-content div.article") + if not article_div: + raise ValueError("无法找到文章内容") + + paragraphs = article_div.find_all('p') + content = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)) + + return Article( + url=url, + title=article_title, + publish_time=publish_time, + author=author, + content=content, + category_id=6, # 汽车分类ID + source="sina" + ) \ No newline at end of file