From 2afdd698b204377164108d948c6e8c69ad02dd1e Mon Sep 17 00:00:00 2001
From: shenjianZ
Date: Wed, 14 Jan 2026 19:17:09 +0800
Subject: [PATCH] feat: add sina auto crawler

---
 crawler-module/config/config.yaml         | 10 ++++
 crawler-module/src/base/crawler_base.py   |  1 -
 crawler-module/src/cli/main.py            | 10 +++-
 crawler-module/src/crawlers/sina/auto.py  | 60 +++++++++++++++++++
 crawler-module/src/database/repository.py | 12 ++--
 crawler-module/src/parsers/sina_parser.py | 71 +++++++++++++++++++++++
 6 files changed, 157 insertions(+), 7 deletions(-)
 create mode 100644 crawler-module/src/crawlers/sina/auto.py
 create mode 100644 crawler-module/src/parsers/sina_parser.py

diff --git a/crawler-module/config/config.yaml b/crawler-module/config/config.yaml
index 0868462..ff9ca0f 100644
--- a/crawler-module/config/config.yaml
+++ b/crawler-module/config/config.yaml
@@ -110,3 +110,13 @@ sources:
         category_id: 9
         name: "AI"
         css_selector: "div.kr-information-left"
+
+  sina:
+    base_url: "https://sina.com.cn"
+    categories:
+      auto:
+        url: "https://auto.sina.com.cn/"
+        category_id: 6
+        name: "Auto"
+        css_selector: "div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1"
+        detail_css_selector: "div.main-content"
diff --git a/crawler-module/src/base/crawler_base.py b/crawler-module/src/base/crawler_base.py
index dcca387..0d5760d 100644
--- a/crawler-module/src/base/crawler_base.py
+++ b/crawler-module/src/base/crawler_base.py
@@ -102,7 +102,6 @@ class BaseCrawler(ABC):
         try:
             # Fetch the page HTML
             html = self._fetch_page()
-
             # Parse the article list
             article_urls = self._extract_article_urls(html)
             self.logger.info(f"Found {len(article_urls)} articles")
diff --git a/crawler-module/src/cli/main.py b/crawler-module/src/cli/main.py
index e68653d..72d20f5 100644
--- a/crawler-module/src/cli/main.py
+++ b/crawler-module/src/cli/main.py
@@ -31,6 +31,9 @@ CRAWLER_CLASSES = {
     'kr36': {
         'ai': ('crawlers.kr36.ai', 'AICrawler'),
     },
+    'sina': {
+        'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'),
+    },
 }
 
 
@@ -53,6 +56,11 @@ def list_crawlers() -> List[str]:
     for category in kr36_categories.keys():
         crawlers.append(f"kr36:{category}")
 
+    # Sina crawlers
+    sina_categories = config.get('sources.sina.categories', {})
+    for category in sina_categories.keys():
+        crawlers.append(f"sina:{category}")
+
     return crawlers
 
 
@@ -101,7 +109,7 @@ def run_crawler(source: str, category: str, max_articles: int = None) -> bool:
 
     # Create and run the crawler
     crawler = crawler_class(source, category)
-    
+
     # Override the maximum article count
     if max_articles:
         crawler.max_articles = max_articles
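Each CRAWLER_CLASSES entry maps a source/category pair to a (module path, class name) tuple, so run_crawler can presumably import the crawler lazily rather than importing every crawler module up front. A minimal sketch of that lookup, assuming the registry layout above; the helper name resolve_crawler_class is hypothetical, not part of this patch:

    import importlib

    def resolve_crawler_class(source: str, category: str):
        # Look up the (module path, class name) tuple registered above,
        # e.g. ('crawlers.sina.auto', 'SinaAutoCrawler') for sina:auto
        module_path, class_name = CRAWLER_CLASSES[source][category]
        module = importlib.import_module(module_path)
        return getattr(module, class_name)

With that in place, resolve_crawler_class('sina', 'auto') yields SinaAutoCrawler, matching the crawler_class(source, category) call in run_crawler.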
diff --git a/crawler-module/src/crawlers/sina/auto.py b/crawler-module/src/crawlers/sina/auto.py
new file mode 100644
index 0000000..39f898e
--- /dev/null
+++ b/crawler-module/src/crawlers/sina/auto.py
@@ -0,0 +1,60 @@
+"""
+Sina Auto news crawler
+"""
+
+from typing import List
+from bs4 import BeautifulSoup
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from base.crawler_base import DynamicCrawler, Article
+from parsers.sina_parser import SinaAutoParser
+
+
+class SinaAutoCrawler(DynamicCrawler):
+    """Sina Auto news crawler"""
+
+    def _extract_article_urls(self, html: str) -> List[str]:
+        """Extract article URLs from the list-page HTML"""
+        soup = BeautifulSoup(html, "lxml")
+        urls = []
+
+        # Try selectors from most to least specific
+        div_list = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1")
+        if not div_list:
+            div_list = soup.select("div.cardlist-a__list div.ty-card.ty-card-type1")
+        if not div_list:
+            div_list = soup.select("div.news-list li.news-item")
+
+        for item in div_list:
+            a = item.select_one("a")
+            if a and a.get("href"):
+                urls.append(a.get("href"))
+
+        return urls
+
+    def _fetch_articles(self, urls: List[str]) -> List[Article]:
+        """Fetch and parse article detail pages"""
+        articles = []
+        parser = SinaAutoParser()
+
+        for i, url in enumerate(urls[:self.max_articles]):
+            try:
+                article = parser.parse(url)
+                article.category_id = self.category_id
+                article.source = "Sina"
+
+                if not article.author:
+                    article.author = "Sina Auto"
+
+                if article.is_valid():
+                    articles.append(article)
+                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
+
+            except Exception as e:
+                self.logger.error(f"Failed to parse article: {url} - {e}")
+                continue
+
+        return articles
\ No newline at end of file
diff --git a/crawler-module/src/database/repository.py b/crawler-module/src/database/repository.py
index cc53798..9f82e9e 100644
--- a/crawler-module/src/database/repository.py
+++ b/crawler-module/src/database/repository.py
@@ -54,6 +54,7 @@ class NewsRepository:
         try:
             with db_pool.get_connection() as conn:
                 cursor = conn.cursor()
+
                 # Batch-check which URLs already exist
                 if urls:
                     placeholders = ','.join(['%s'] * len(urls))
@@ -61,24 +62,25 @@ class NewsRepository:
                     cursor.execute(check_sql, urls)
                     existing_urls = {row[0] for row in cursor.fetchall()}
 
-                # Only insert records that do not exist
+                # Only insert records whose URL is not stored yet
                 new_data = [item for item in data if item[0] not in existing_urls]
 
                 if not new_data:
                     self.logger.info(f"All {len(data)} articles already exist, skipping insert")
                     return 0
 
-                # Execute the insert
+                # Execute the insert; INSERT IGNORE skips rows with a duplicate content_hash
                 sql = """
-                    INSERT INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
+                    INSERT IGNORE INTO news (url, title, category_id, publish_time, author, source, content, content_hash)
                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                 """
                 cursor.executemany(sql, new_data)
                 conn.commit()
 
-                inserted = len(new_data)
-                self.logger.info(f"Inserted {inserted} new articles, {len(data) - inserted} already existed")
+                # Get the number of rows actually inserted
+                inserted = cursor.rowcount
+                self.logger.info(f"Inserted {inserted} new articles, {len(new_data) - inserted} skipped as duplicate content")
 
                 return inserted
 
         except Exception as e:
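Deduplication here is two-level: the SELECT filters out rows whose url is already stored, and INSERT IGNORE then silently drops rows whose content_hash collides with an existing article (assuming a unique index on that column), so cursor.rowcount reports only the rows actually inserted. A sketch of how such a hash might be produced upstream; hashlib and the whitespace normalization are assumptions, not code from this patch:

    import hashlib

    def content_hash(content: str) -> str:
        # Hash the whitespace-normalized body so trivially reformatted
        # reposts of the same article dedupe to a single row
        normalized = ''.join(content.split())
        return hashlib.sha256(normalized.encode('utf-8')).hexdigest()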
12:00:00" + + # 获取文章作者 + author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a") + author = author_tag.get_text(strip=True) if author_tag else "未知" + + # 获取文章正文段落 + article_div = soup.select_one("div.main-content div.article") + if not article_div: + raise ValueError("无法找到文章内容") + + paragraphs = article_div.find_all('p') + content = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)) + + return Article( + url=url, + title=article_title, + publish_time=publish_time, + author=author, + content=content, + category_id=6, # 汽车分类ID + source="sina" + ) \ No newline at end of file