feat: add sina auto crawler
parent 61a5b7d301
commit 2afdd698b2
@@ -110,3 +110,13 @@ sources:
        category_id: 9
        name: "AI"
        css_selector: "div.kr-information-left"

  sina:
    base_url: "https://sina.com.cn"
    categories:
      auto:
        url: "https://auto.sina.com.cn/"
        category_id: 6
        name: "汽车"
        css_selector: "div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1"
        detail_css_selector: "div.main-content"
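The new sina block mirrors the existing kr36 entry and is read elsewhere in this commit through dotted config.get(...) lookups. A minimal sketch of that access pattern, assuming a plain YAML load (the config_get helper and the "config.yaml" file name are assumptions, not part of the commit):

# Hypothetical loader; only the dotted-key lookup style mirrors the calls in the diff.
import yaml

def config_get(data: dict, dotted_key: str, default=None):
    node = data
    for part in dotted_key.split("."):
        if not isinstance(node, dict) or part not in node:
            return default
        node = node[part]
    return node

with open("config.yaml", "r", encoding="utf-8") as f:   # file name is an assumption
    cfg = yaml.safe_load(f)

sina_auto = config_get(cfg, "sources.sina.categories.auto", {})
print(sina_auto.get("url"), sina_auto.get("category_id"))  # https://auto.sina.com.cn/ 6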
@@ -102,7 +102,6 @@ class BaseCrawler(ABC):
        try:
            # Fetch the page HTML
            html = self._fetch_page()

            # Parse the article list
            article_urls = self._extract_article_urls(html)
            self.logger.info(f"Found {len(article_urls)} articles")
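For orientation, this hunk sits inside the base class's crawl loop. A hedged reconstruction of the surrounding flow, assuming the abstract-method split that SinaAutoCrawler implements later in this diff (the class name BaseCrawlerSketch, the run wrapper, and the default max_articles are guesses, not taken from the commit):

# Hedged sketch only: the diff itself shows just the fetch-listing / extract-URLs step.
import logging
from abc import ABC, abstractmethod
from typing import List

class BaseCrawlerSketch(ABC):
    def __init__(self, source: str, category: str):
        self.source, self.category = source, category
        self.max_articles = 20                      # assumed default
        self.logger = logging.getLogger(f"{source}.{category}")

    @abstractmethod
    def _fetch_page(self) -> str: ...
    @abstractmethod
    def _extract_article_urls(self, html: str) -> List[str]: ...
    @abstractmethod
    def _fetch_articles(self, urls: List[str]) -> list: ...

    def run(self) -> bool:
        try:
            html = self._fetch_page()                        # fetch the listing page
            article_urls = self._extract_article_urls(html)  # subclass-specific parsing
            self.logger.info(f"Found {len(article_urls)} articles")
            articles = self._fetch_articles(article_urls)    # fetch and parse details
            return len(articles) > 0
        except Exception as e:
            self.logger.error(f"Crawl failed: {e}")
            return False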
@@ -31,6 +31,9 @@ CRAWLER_CLASSES = {
    'kr36': {
        'ai': ('crawlers.kr36.ai', 'AICrawler'),
    },
    'sina': {
        'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
    },
}
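CRAWLER_CLASSES maps source and category to a (module path, class name) pair, which is presumably resolved with a dynamic import roughly like the sketch below (the helper name get_crawler_class is an assumption; the project's actual lookup may differ):

import importlib

# Copy of the registry entries added above.
CRAWLER_CLASSES = {
    'kr36': {'ai': ('crawlers.kr36.ai', 'AICrawler')},
    'sina': {'auto': ('crawlers.sina.auto', 'SinaAutoCrawler')},
}

def get_crawler_class(source: str, category: str):
    # Hypothetical resolver for the (module_path, class_name) tuples.
    module_path, class_name = CRAWLER_CLASSES[source][category]
    module = importlib.import_module(module_path)   # e.g. crawlers.sina.auto
    return getattr(module, class_name)              # e.g. SinaAutoCrawler

# Requires the project package on sys.path.
crawler_class = get_crawler_class('sina', 'auto')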
@@ -53,6 +56,11 @@ def list_crawlers() -> List[str]:
    for category in kr36_categories.keys():
        crawlers.append(f"kr36:{category}")

    # Sina crawlers
    sina_categories = config.get('sources.sina.categories', {})
    for category in sina_categories.keys():
        crawlers.append(f"sina:{category}")

    return crawlers
@@ -101,7 +109,7 @@ def run_crawler(source: str, category: str, max_articles: int = None) -> bool:

    # Create and run the crawler
    crawler = crawler_class(source, category)

    print("Creating and running the crawler")
    # Override the maximum number of articles
    if max_articles:
        crawler.max_articles = max_articles
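Taken together with list_crawlers above, running the new crawler end to end should look roughly like this minimal driver (the module name crawler_manager used in the import is a placeholder, not from the diff):

# Placeholder module name; list_crawlers/run_crawler are the functions patched above.
from crawler_manager import list_crawlers, run_crawler

print(list_crawlers())                            # should now include 'sina:auto'
ok = run_crawler('sina', 'auto', max_articles=5)
print("crawl succeeded" if ok else "crawl failed")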
@@ -0,0 +1,60 @@
"""
Sina auto news crawler
"""

from typing import List
from bs4 import BeautifulSoup

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import DynamicCrawler, Article
from parsers.sina_parser import SinaAutoParser


class SinaAutoCrawler(DynamicCrawler):
    """Sina auto news crawler"""

    def _extract_article_urls(self, html: str) -> List[str]:
        """Extract the list of article URLs from the HTML"""
        soup = BeautifulSoup(html, "lxml")
        urls = []

        # Try different selectors
        div_list = soup.select("div.cardlist-a__list div.ty-card.ty-card-type1")
        if not div_list:
            div_list = soup.select("div.news-list li.news-item")
        if not div_list:
            div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1")

        for item in div_list:
            a = item.select_one("a")
            if a and a.get("href"):
                urls.append(a.get("href"))

        return urls

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """Fetch and parse article details"""
        articles = []
        parser = SinaAutoParser()

        for i, url in enumerate(urls[:self.max_articles]):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "新浪"

                if not article.author:
                    article.author = "新浪汽车"

                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")

            except Exception as e:
                self.logger.error(f"Failed to parse article: {url} - {e}")
                continue

        return articles
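A quick way to sanity-check the selector fallback in _extract_article_urls is to feed it a small HTML fragment; the markup below is an illustrative stand-in, not captured from auto.sina.com.cn:

from bs4 import BeautifulSoup

# Illustrative fragment only; real Sina markup is more elaborate.
sample_html = """
<div class="feed_card ty-feed-card-container">
  <div class="cardlist-a__list">
    <div class="ty-card ty-card-type1"><a href="https://auto.sina.com.cn/news/1.shtml">t1</a></div>
    <div class="ty-card ty-card-type1"><a href="https://auto.sina.com.cn/news/2.shtml">t2</a></div>
  </div>
</div>
"""

soup = BeautifulSoup(sample_html, "lxml")
cards = soup.select("div.cardlist-a__list div.ty-card.ty-card-type1")
print([c.select_one("a").get("href") for c in cards])
# ['https://auto.sina.com.cn/news/1.shtml', 'https://auto.sina.com.cn/news/2.shtml']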
@@ -54,6 +54,7 @@ class NewsRepository:
        try:
            with db_pool.get_connection() as conn:
                cursor = conn.cursor()

                # Batch-check which URLs already exist
                if urls:
                    placeholders = ','.join(['%s'] * len(urls))
@@ -61,24 +62,25 @@ class NewsRepository:
                    cursor.execute(check_sql, urls)
                    existing_urls = {row[0] for row in cursor.fetchall()}

-                    # Only insert records that do not already exist
+                    # Only insert records whose URL does not already exist
                    new_data = [item for item in data if item[0] not in existing_urls]

                    if not new_data:
                        self.logger.info(f"All {len(data)} news items already exist, skipping insert")
                        return 0

-                    # Perform the insert
+                    # Perform the insert; use INSERT IGNORE to skip records with a duplicate content_hash
                    sql = """
-                        INSERT INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
+                        INSERT IGNORE INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                    """

                    cursor.executemany(sql, new_data)
                    conn.commit()

-                    inserted = len(new_data)
-                    self.logger.info(f"Inserted {inserted} new news items, {len(data) - inserted} already existed")
+                    # Get the number of rows actually inserted
+                    inserted = cursor.rowcount
+                    self.logger.info(f"Inserted {inserted} new news items, {len(new_data) - inserted} ignored as duplicate content")
                    return inserted

            except Exception as e:
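INSERT IGNORE only skips rows when a uniqueness constraint is violated, and with the usual MySQL drivers cursor.rowcount after executemany reflects only the rows actually written, which is what the new log line reports. The updated comment implies a unique index on content_hash, and the URL pre-check implies one on url; a hedged sketch of the assumed table definition, not taken from the commit:

# Assumed schema only: column types and index names are guesses.
ASSUMED_NEWS_DDL = """
CREATE TABLE IF NOT EXISTS news (
    id           BIGINT AUTO_INCREMENT PRIMARY KEY,
    url          VARCHAR(512) NOT NULL,
    title        VARCHAR(255) NOT NULL,
    category_id  INT,
    publish_time DATETIME,
    author       VARCHAR(128),
    source       VARCHAR(64),
    content      MEDIUMTEXT,
    content_hash CHAR(64),
    UNIQUE KEY uk_url (url),
    UNIQUE KEY uk_content_hash (content_hash)
)
"""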
@@ -0,0 +1,71 @@
import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.parser_base import BaseParser
from datetime import datetime
from typing import Optional
from base.crawler_base import Article
from bs4 import BeautifulSoup
from utils.logger import get_logger
from utils.http_client import HttpClient


class SinaAutoParser(BaseParser):
    """Parser for Sina auto news articles"""

    def __init__(self):
        self.logger = get_logger(__name__)
        self.http_client = HttpClient()

    def parse(self, url: str) -> Article:
        """
        Parse a Sina article detail page

        Args:
            url: the article URL

        Returns:
            the parsed Article object
        """
        html = self.http_client.get(url)
        soup = BeautifulSoup(html, "lxml")

        # Article title
        article_title_tag = soup.select_one("div.main-content h1.main-title")
        article_title = article_title_tag.get_text(strip=True) if article_title_tag else "未知标题"

        # Publish time
        def normalize_time(time_str):
            for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
                try:
                    dt = datetime.strptime(time_str, fmt)
                    return dt.strftime("%Y-%m-%d %H:%M:%S")
                except ValueError:
                    continue
            return time_str  # return the original string if no format matches

        time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
        publish_time = normalize_time(time_tag.get_text(strip=True)) if time_tag else "1949-01-01 12:00:00"

        # Author
        author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
        author = author_tag.get_text(strip=True) if author_tag else "未知"

        # Body paragraphs
        article_div = soup.select_one("div.main-content div.article")
        if not article_div:
            raise ValueError("Article content not found")

        paragraphs = article_div.find_all('p')
        content = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))

        return Article(
            url=url,
            title=article_title,
            publish_time=publish_time,
            author=author,
            content=content,
            category_id=6,  # auto category ID
            source="sina"
        )
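A minimal usage sketch of the new parser, assuming the project's HttpClient can fetch the page; the article URL below is a placeholder, not taken from the commit:

from parsers.sina_parser import SinaAutoParser

parser = SinaAutoParser()
article = parser.parse("https://auto.sina.com.cn/news/example.shtml")   # placeholder URL
print(article.title, article.publish_time, article.author)
# publish_time is normalized, e.g. "2024年05月01日 10:30" -> "2024-05-01 10:30:00"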