feat: add sina auto crawler

parent 61a5b7d301
commit 2afdd698b2
@@ -110,3 +110,13 @@ sources:
         category_id: 9
         name: "AI"
         css_selector: "div.kr-information-left"
+
+  sina:
+    base_url: "https://sina.com.cn"
+    categories:
+      auto:
+        url: "https://auto.sina.com.cn/"
+        category_id: 6
+        name: "汽车"
+        css_selector: "div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1"
+        detail_css_selector: "div.main-content"
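For reference, the new sources.sina block is consumed through the project's config object (config.get('sources.sina.categories', {}) in the hunks below). The sketch that follows is illustrative only: the inline YAML snippet and the dotted_get helper are stand-ins for the real config loader, which is not part of this diff.

import yaml  # PyYAML; an assumption about the loader behind the project's config object

CONFIG_YAML = """
sources:
  sina:
    base_url: "https://sina.com.cn"
    categories:
      auto:
        url: "https://auto.sina.com.cn/"
        category_id: 6
"""

def dotted_get(cfg: dict, path: str, default=None):
    """Illustrative stand-in for config.get('a.b.c', default)."""
    node = cfg
    for key in path.split('.'):
        if not isinstance(node, dict) or key not in node:
            return default
        node = node[key]
    return node

cfg = yaml.safe_load(CONFIG_YAML)
for category, settings in dotted_get(cfg, 'sources.sina.categories', {}).items():
    print(f"sina:{category} -> {settings['url']} (category_id={settings['category_id']})")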
@@ -102,7 +102,6 @@ class BaseCrawler(ABC):
         try:
             # Fetch the page HTML
             html = self._fetch_page()
-
             # Parse the article list
             article_urls = self._extract_article_urls(html)
             self.logger.info(f"找到 {len(article_urls)} 篇文章")
@@ -31,6 +31,9 @@ CRAWLER_CLASSES = {
     'kr36': {
         'ai': ('crawlers.kr36.ai', 'AICrawler'),
     },
+    'sina': {
+        'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
+    },
 }
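The registry maps each source/category pair to a (module path, class name) tuple. The code that resolves those tuples into classes is not shown in this diff; the sketch below is only one common way such a registry is consumed, and resolve_crawler_class is a hypothetical name.

import importlib

CRAWLER_CLASSES = {
    'kr36': {'ai': ('crawlers.kr36.ai', 'AICrawler')},
    'sina': {'auto': ('crawlers.sina.auto', 'SinaAutoCrawler')},
}

def resolve_crawler_class(source: str, category: str):
    """Import the registered module lazily and return the crawler class."""
    module_path, class_name = CRAWLER_CLASSES[source][category]
    module = importlib.import_module(module_path)  # e.g. crawlers.sina.auto
    return getattr(module, class_name)             # e.g. SinaAutoCrawler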
@@ -53,6 +56,11 @@ def list_crawlers() -> List[str]:
     for category in kr36_categories.keys():
         crawlers.append(f"kr36:{category}")
 
+    # Sina crawlers
+    sina_categories = config.get('sources.sina.categories', {})
+    for category in sina_categories.keys():
+        crawlers.append(f"sina:{category}")
+
     return crawlers
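With the sina block present in the config, list_crawlers() would report the new category alongside the existing one, for example:

>>> list_crawlers()
['kr36:ai', 'sina:auto']   # assuming only the categories shown in this diff are configured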
@@ -101,7 +109,7 @@ def run_crawler(source: str, category: str, max_articles: int = None) -> bool:
 
     # Create and run the crawler
     crawler = crawler_class(source, category)
-
+    print("创建并运行爬虫")
     # Override the maximum number of articles
     if max_articles:
         crawler.max_articles = max_articles
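Taken together with the registry and list_crawlers changes, the new source can be exercised through the existing entry point. A hedged usage example; only run_crawler's signature appears in this diff, and the module that defines it is not named here:

# Hypothetical call, assuming run_crawler is imported from the project's CLI module.
ok = run_crawler('sina', 'auto', max_articles=5)
print("crawl succeeded" if ok else "crawl failed")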
crawlers/sina/auto.py (new file)
@@ -0,0 +1,60 @@
+"""
+Sina auto news crawler.
+"""
+
+from typing import List
+from bs4 import BeautifulSoup
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from base.crawler_base import DynamicCrawler, Article
+from parsers.sina_parser import SinaAutoParser
+
+
+class SinaAutoCrawler(DynamicCrawler):
+    """Crawler for Sina auto news."""
+
+    def _extract_article_urls(self, html: str) -> List[str]:
+        """Extract the list of article URLs from the HTML."""
+        soup = BeautifulSoup(html, "lxml")
+        urls = []
+
+        # Try different selectors
+        div_list = soup.select("div.cardlist-a__list div.ty-card.ty-card-type1")
+        if not div_list:
+            div_list = soup.select("div.news-list li.news-item")
+        if not div_list:
+            div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1")
+
+        for item in div_list:
+            a = item.select_one("a")
+            if a and a.get("href"):
+                urls.append(a.get("href"))
+
+        return urls
+
+    def _fetch_articles(self, urls: List[str]) -> List[Article]:
+        """Fetch and parse the article detail pages."""
+        articles = []
+        parser = SinaAutoParser()
+
+        for i, url in enumerate(urls[:self.max_articles]):
+            try:
+                article = parser.parse(url)
+                article.category_id = self.category_id
+                article.source = "新浪"
+
+                if not article.author:
+                    article.author = "新浪汽车"
+
+                if article.is_valid():
+                    articles.append(article)
+                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
+
+            except Exception as e:
+                self.logger.error(f"解析文章失败: {url} - {e}")
+                continue
+
+        return articles
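_extract_article_urls appends href values exactly as they appear in the list markup. On Sina list pages these are typically absolute, but if relative or protocol-relative links show up they would need to be resolved against base_url; that handling is not part of this commit, and the sketch below (with a hypothetical absolutize helper) only illustrates the idea using the standard library:

from urllib.parse import urljoin

def absolutize(href: str, base_url: str = "https://auto.sina.com.cn/") -> str:
    """Resolve a raw href from the list page into an absolute URL."""
    if href.startswith("//"):          # protocol-relative, e.g. //auto.sina.com.cn/...
        return "https:" + href
    return urljoin(base_url, href)     # handles relative and already-absolute hrefs

print(absolutize("//auto.sina.com.cn/news/example.shtml"))
print(absolutize("/news/example.shtml"))
print(absolutize("https://auto.sina.com.cn/news/example.shtml"))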
@@ -54,6 +54,7 @@ class NewsRepository:
         try:
             with db_pool.get_connection() as conn:
                 cursor = conn.cursor()
+
                 # Batch-check which URLs already exist
                 if urls:
                     placeholders = ','.join(['%s'] * len(urls))
@@ -61,24 +62,25 @@ class NewsRepository:
                     cursor.execute(check_sql, urls)
                     existing_urls = {row[0] for row in cursor.fetchall()}
 
-                    # Only insert records that do not already exist
+                    # Only insert records whose URL does not already exist
                     new_data = [item for item in data if item[0] not in existing_urls]
 
                     if not new_data:
                         self.logger.info(f"所有 {len(data)} 条新闻已存在,跳过插入")
                         return 0
 
-                    # Execute the insert
+                    # Execute the insert; use INSERT IGNORE to skip rows whose content_hash already exists
                     sql = """
-                        INSERT INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
+                        INSERT IGNORE INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
                         VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                     """
 
                     cursor.executemany(sql, new_data)
                     conn.commit()
 
-                    inserted = len(new_data)
-                    self.logger.info(f"成功插入 {inserted} 条新新闻,{len(data) - inserted} 条已存在")
+                    # Get the number of rows actually inserted
+                    inserted = cursor.rowcount
+                    self.logger.info(f"成功插入 {inserted} 条新新闻,{len(new_data) - inserted} 条因内容重复被忽略")
                     return inserted
 
         except Exception as e:
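The switch to INSERT IGNORE plus cursor.rowcount assumes the news table enforces uniqueness on content_hash (and, given the URL pre-check, presumably on url as well); the schema itself is not part of this diff. A sketch of the assumed constraint, written the same way the repository embeds its SQL, with the rowcount behaviour noted for the common MySQL drivers:

# Assumption: INSERT IGNORE only skips rows that violate a UNIQUE/PRIMARY KEY
# constraint, so content-level dedup needs an index along these lines.
ASSUMED_DDL = """
    ALTER TABLE news
        ADD UNIQUE KEY uniq_url (url),
        ADD UNIQUE KEY uniq_content_hash (content_hash)
"""

# With such keys in place, an ignored duplicate counts as 0 affected rows, so after
# cursor.executemany(sql, new_data) the driver's cursor.rowcount reflects only the
# rows that were actually inserted, which is what the new log message reports.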
parsers/sina_parser.py (new file)
@@ -0,0 +1,71 @@
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from base.parser_base import BaseParser
+from datetime import datetime
+from typing import Optional
+from base.crawler_base import Article
+from bs4 import BeautifulSoup
+from utils.logger import get_logger
+from utils.http_client import HttpClient
+
+
+class SinaAutoParser(BaseParser):
+    """Parser for Sina auto news articles."""
+
+    def __init__(self):
+        self.logger = get_logger(__name__)
+        self.http_client = HttpClient()
+
+    def parse(self, url: str) -> Article:
+        """
+        Parse a Sina article detail page.
+
+        Args:
+            url: the article URL
+
+        Returns:
+            An Article object
+        """
+        html = self.http_client.get(url)
+        soup = BeautifulSoup(html, "lxml")
+
+        # Extract the article title
+        article_title_tag = soup.select_one("div.main-content h1.main-title")
+        article_title = article_title_tag.get_text(strip=True) if article_title_tag else "未知标题"
+
+        # Extract the publish time
+        def normalize_time(time_str):
+            for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
+                try:
+                    dt = datetime.strptime(time_str, fmt)
+                    return dt.strftime("%Y-%m-%d %H:%M:%S")
+                except:
+                    continue
+            return time_str  # return the original string if no format matches
+
+        time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date")
+        publish_time = normalize_time(time_tag.get_text(strip=True)) if time_tag else "1949-01-01 12:00:00"
+
+        # Extract the author
+        author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a")
+        author = author_tag.get_text(strip=True) if author_tag else "未知"
+
+        # Extract the body paragraphs
+        article_div = soup.select_one("div.main-content div.article")
+        if not article_div:
+            raise ValueError("无法找到文章内容")
+
+        paragraphs = article_div.find_all('p')
+        content = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
+
+        return Article(
+            url=url,
+            title=article_title,
+            publish_time=publish_time,
+            author=author,
+            content=content,
+            category_id=6,  # category ID for 汽车 (auto)
+            source="sina"
+        )
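The nested normalize_time helper converts Sina's Chinese date format ("%Y年%m月%d日 %H:%M") into the "%Y-%m-%d %H:%M:%S" form the repository stores. A self-contained check of the two accepted formats; the helper is reproduced here with a narrower except clause only so the example runs on its own:

from datetime import datetime

def normalize_time(time_str: str) -> str:
    """Standalone copy of the conversion used in SinaAutoParser.parse."""
    for fmt in ("%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M:%S"):
        try:
            return datetime.strptime(time_str, fmt).strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue
    return time_str  # unknown formats pass through unchanged

print(normalize_time("2024年05月01日 08:30"))  # -> 2024-05-01 08:30:00
print(normalize_time("2024-05-01 08:30:00"))   # -> 2024-05-01 08:30:00
print(normalize_time("May 1, 2024"))           # unchanged fallback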