diff --git a/.claude/settings.local.json b/.claude/settings.local.json index c94aa74..72dbbae 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -10,7 +10,9 @@ "Bash(Select-Object:*)", "Bash(powershell:*)", "Bash(python:*)", - "Bash(move:*)" + "Bash(move:*)", + "Bash(srcdir:*)", + "Bash(cd:*)" ] } } diff --git a/backend/src/main/resources/application-prod.yaml b/backend/src/main/resources/application-prod.yaml index 214fab6..d89014b 100644 --- a/backend/src/main/resources/application-prod.yaml +++ b/backend/src/main/resources/application-prod.yaml @@ -1,8 +1,8 @@ spring: datasource: - url: jdbc:mysql://${DB_HOST:43.143.145.172}:${DB_PORT:3306}/${DB_NAME:db_spring_1}?useSSL=false&serverTimezone=UTC&characterEncoding=UTF-8 + url: jdbc:mysql://${DB_HOST:localhost}:${DB_PORT:3306}/${DB_NAME:news}?useSSL=false&serverTimezone=UTC&characterEncoding=UTF-8 username: ${DB_USER:root} - password: ${DB_PASS:kyff145972} + password: ${DB_PASS:root} driver-class-name: com.mysql.cj.jdbc.Driver jpa: diff --git a/docs/模块开发任务清单.md b/docs/模块开发任务清单.md index e92a1c4..7351c52 100644 --- a/docs/模块开发任务清单.md +++ b/docs/模块开发任务清单.md @@ -16,1219 +16,13 @@ ## 1. 爬虫模块 (Python) -### 模块目录结构 -``` -crawler-module/ -├── src/ -│ ├── __init__.py -│ ├── base/ # 基础爬虫框架 -│ │ ├── __init__.py -│ │ ├── base_crawler.py # 爬虫基类 -│ │ ├── http_client.py # HTTP客户端封装 -│ │ └── proxy_pool.py # 代理池(可选) -│ ├── parsers/ # 解析器 -│ │ ├── __init__.py -│ │ ├── base_parser.py # 解析器基类 -│ │ ├── sina_parser.py # 新浪新闻解析器 -│ │ ├── sohu_parser.py # 搜狐新闻解析器 -│ │ └── ifeng_parser.py # 凤凰网解析器 -│ ├── cleaners/ # 数据清洗 -│ │ ├── __init__.py -│ │ ├── text_cleaner.py # 文本清洗 -│ │ └── deduplicator.py # 去重处理 -│ ├── storage/ # 存储层 -│ │ ├── __init__.py -│ │ ├── database.py # 数据库操作 -│ │ └── storage_factory.py # 存储工厂 -│ ├── utils/ # 工具类 -│ │ ├── __init__.py -│ │ ├── user_agent.py # User-Agent池 -│ │ └── date_parser.py # 日期解析 -│ └── crawler.py # 爬虫主入口 -├── config/ -│ ├── __init__.py -│ ├── settings.py # 配置文件 -│ └── sources.json # 数据源配置 -├── requirements.txt -└── main.py -``` - -### 1.1 需要完成的具体文件 - -#### 任务 1.1.1: `config/settings.py` - 爬虫配置文件 - -```python -""" -爬虫模块配置文件 -""" - -import os -from typing import List - -class CrawlerConfig: - """爬虫配置类""" - - # 项目根目录 - BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - - # 数据库配置 - DB_HOST = os.getenv('DB_HOST', 'localhost') - DB_PORT = int(os.getenv('DB_PORT', 3306)) - DB_NAME = os.getenv('DB_NAME', 'news_classifier') - DB_USER = os.getenv('DB_USER', 'root') - DB_PASSWORD = os.getenv('DB_PASSWORD', '') - - # 爬虫配置 - CONCURRENT_REQUESTS = 5 # 并发请求数 - DOWNLOAD_DELAY = 1 # 下载延迟(秒) - REQUEST_TIMEOUT = 10 # 请求超时(秒) - MAX_RETRIES = 3 # 最大重试次数 - - # User-Agent配置 - USER_AGENT_LIST = [ - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36', - ] - - # 代理配置(可选) - PROXY_ENABLED = False - PROXY_LIST = [] - - # 日志配置 - LOG_LEVEL = 'INFO' - LOG_FILE = os.path.join(BASE_DIR, 'logs', 'crawler.log') - - # 存储配置 - BATCH_INSERT_SIZE = 100 # 批量插入大小 -``` - -#### 任务 1.1.2: `config/sources.json` - 数据源配置 - -```json -{ - "sources": [ - { - "name": "新浪新闻", - "code": "sina", - "enabled": true, - "base_url": "https://news.sina.com.cn", - "list_url_template": "https://news.sina.com.cn/{category}/index.shtml", - "categories": [ - {"code": "POLITICS", "url_key": "pol"}, - {"code": "FINANCE", "url_key": "finance"}, - {"code": "TECHNOLOGY", "url_key": "tech"}, - {"code": "SPORTS", "url_key": "sports"}, - {"code": "ENTERTAINMENT", 
"url_key": "ent"} - ], - "parser": "sina_parser.SinaNewsParser" - }, - { - "name": "搜狐新闻", - "code": "sohu", - "enabled": true, - "base_url": "https://www.sohu.com", - "list_url_template": "https://www.sohu.com/{category}", - "categories": [ - {"code": "POLITICS", "url_key": "politics"}, - {"code": "FINANCE", "url_key": "business"}, - {"code": "TECHNOLOGY", "url_key": "tech"}, - {"code": "SPORTS", "url_key": "sports"} - ], - "parser": "sohu_parser.SohuNewsParser" - } - ] -} -``` - -#### 任务 1.1.3: `base/base_crawler.py` - 爬虫基类 - -```python -""" -爬虫基类 -""" - -from abc import ABC, abstractmethod -from typing import List, Dict, Any -import logging -import time -import random - -logger = logging.getLogger(__name__) - - -class BaseCrawler(ABC): - """爬虫基类""" - - def __init__(self, config: Dict[str, Any]): - self.config = config - self.name = self.__class__.__name__ - logger.info(f"初始化爬虫: {self.name}") - - @abstractmethod - def fetch_news_list(self, category: str, page: int = 1) -> List[Dict[str, Any]]: - """ - 获取新闻列表 - :param category: 新闻类别 - :param page: 页码 - :return: 新闻URL列表 - """ - pass - - @abstractmethod - def fetch_news_detail(self, url: str) -> Dict[str, Any]: - """ - 获取新闻详情 - :param url: 新闻URL - :return: 新闻详情字典 - """ - pass - - def crawl(self, categories: List[str], max_pages: int = 5) -> List[Dict[str, Any]]: - """ - 执行爬取任务 - :param categories: 类别列表 - :param max_pages: 最大页数 - :return: 爬取的新闻列表 - """ - results = [] - for category in categories: - for page in range(1, max_pages + 1): - try: - news_list = self.fetch_news_list(category, page) - for news_url in news_list: - try: - detail = self.fetch_news_detail(news_url) - if detail: - results.append(detail) - except Exception as e: - logger.error(f"解析新闻详情失败: {news_url}, 错误: {e}") - - # 随机延迟,避免请求过快 - time.sleep(random.uniform(1, 3)) - except Exception as e: - logger.error(f"爬取失败: category={category}, page={page}, 错误: {e}") - - return results -``` - -#### 任务 1.1.4: `parsers/base_parser.py` - 解析器基类 - -```python -""" -解析器基类 -""" - -from abc import ABC, abstractmethod -from typing import Dict, Any, Optional -from datetime import datetime - - -class BaseParser(ABC): - """新闻解析器基类""" - - @abstractmethod - def parse_news_list(self, html: str) -> list[str]: - """ - 解析新闻列表页,获取新闻URL - :param html: HTML内容 - :return: 新闻URL列表 - """ - pass - - @abstractmethod - def parse_news_detail(self, html: str, url: str) -> Optional[Dict[str, Any]]: - """ - 解析新闻详情页 - :param html: HTML内容 - :param url: 新闻URL - :return: 解析后的新闻字典 - """ - pass - - def clean_html(self, html: str) -> str: - """清理HTML标签""" - from bs4 import BeautifulSoup - soup = BeautifulSoup(html, 'html.parser') - return soup.get_text(separator=' ', strip=True) - - def parse_publish_time(self, time_str: str) -> Optional[datetime]: - """解析发布时间""" - # 实现时间解析逻辑 - pass -``` - -#### 任务 1.1.5: `parsers/sina_parser.py` - 新浪新闻解析器 - -```python -""" -新浪新闻解析器 -""" - -from typing import Dict, Any, Optional -from bs4 import BeautifulSoup -import requests -from .base_parser import BaseParser - - -class SinaNewsParser(BaseParser): - """新浪新闻解析器""" - - def __init__(self): - self.base_url = "https://news.sina.com.cn" - - def parse_news_list(self, html: str) -> list[str]: - """解析新浪新闻列表""" - soup = BeautifulSoup(html, 'html.parser') - urls = [] - - # 根据新浪新闻的实际HTML结构解析 - for item in soup.select('.news-item'): - link = item.select_one('a') - if link and link.get('href'): - urls.append(link['href']) - - return urls - - def parse_news_detail(self, html: str, url: str) -> Optional[Dict[str, Any]]: - """解析新浪新闻详情""" - soup = 
BeautifulSoup(html, 'html.parser') - - # 提取标题 - title = soup.select_one('h1.main-title') - title = title.get_text(strip=True) if title else '' - - # 提取内容 - content = soup.select_one('.article-content') - content = self.clean_html(str(content)) if content else '' - - # 提取来源 - source = soup.select_one('.source') - source = source.get_text(strip=True) if source else '新浪新闻' - - # 提取发布时间 - publish_time = soup.select_one('.date') - publish_time = publish_time.get_text(strip=True) if publish_time else None - - # 提取作者 - author = soup.select_one('.author') - author = author.get_text(strip=True) if author else '' - - return { - 'title': title, - 'content': content, - 'summary': content[:200] if content else '', - 'source': source, - 'source_url': url, - 'author': author, - 'publish_time': self.parse_publish_time(publish_time) if publish_time else None - } -``` - -#### 任务 1.1.6: `cleaners/text_cleaner.py` - 文本清洗 - -```python -""" -文本清洗工具 -""" - -import re -from typing import List - - -class TextCleaner: - """文本清洗器""" - - # 无效字符模式 - INVALID_CHARS = r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]' - - # 无意义词(停用词) - STOP_WORDS = set([ - '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', - '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去' - ]) - - @classmethod - def clean_title(cls, title: str) -> str: - """清洗标题""" - if not title: - return '' - # 移除无效字符 - title = re.sub(cls.INVALID_CHARS, '', title) - # 移除多余空格 - title = ' '.join(title.split()) - return title.strip() - - @classmethod - def clean_content(cls, content: str) -> str: - """清洗内容""" - if not content: - return '' - # 移除HTML标签 - content = re.sub(r'<[^>]+>', '', content) - # 移除无效字符 - content = re.sub(cls.INVALID_CHARS, '', content) - # 移除多余空白 - content = ' '.join(content.split()) - # 移除过短段落 - paragraphs = content.split('。') - paragraphs = [p.strip() for p in paragraphs if len(p.strip()) > 10] - return '。'.join(paragraphs) - - @classmethod - def extract_summary(cls, content: str, max_length: int = 200) -> str: - """提取摘要""" - if not content: - return '' - # 取前N个字符作为摘要 - summary = content[:max_length] - # 确保在句子边界截断 - last_period = summary.rfind('。') - if last_period > max_length * 0.7: - return summary[:last_period + 1] - return summary + '...' 
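
# ---- 补充用法示例(编辑者添加的示意草图,仅演示上述 TextCleaner 的调用方式,非原任务清单内容)----
if __name__ == '__main__':
    raw = '<p>这是一段  带有多余空白和HTML标签的示例正文。这句话足够长,不会被过短段落的过滤规则去掉。</p>'
    cleaned_title = TextCleaner.clean_title('  示例 标题  ')
    cleaned_content = TextCleaner.clean_content(raw)
    print(cleaned_title)
    print(TextCleaner.extract_summary(cleaned_content, max_length=50))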
-``` - -#### 任务 1.1.7: `cleaners/deduplicator.py` - 去重处理 - -```python -""" -新闻去重处理 -""" - -import hashlib -from typing import Set, Dict, Any - - -class NewsDeduplicator: - """新闻去重器""" - - def __init__(self): - self.seen_hashes: Set[str] = set() - self.seen_urls: Set[str] = set() - - def compute_hash(self, title: str, content: str) -> str: - """计算新闻内容的哈希值""" - text = f"{title}|{content[:500]}" # 使用标题和前500字符 - return hashlib.md5(text.encode('utf-8')).hexdigest() - - def is_duplicate(self, news: Dict[str, Any]) -> bool: - """ - 判断是否重复 - :param news: 新闻字典 - :return: True表示重复 - """ - # URL去重 - if news.get('source_url') in self.seen_urls: - return True - - # 内容去重 - content_hash = self.compute_hash( - news.get('title', ''), - news.get('content', '') - ) - - if content_hash in self.seen_hashes: - return True - - # 记录 - self.seen_urls.add(news.get('source_url', '')) - self.seen_hashes.add(content_hash) - - return False - - def deduplicate_batch(self, news_list: list[Dict[str, Any]]) -> list[Dict[str, Any]]: - """批量去重""" - return [news for news in news_list if not self.is_duplicate(news)] -``` - -#### 任务 1.1.8: `storage/database.py` - 数据库存储 - -```python -""" -数据库存储层 -""" - -import pymysql -from typing import List, Dict, Any, Optional -from datetime import datetime -import logging - -logger = logging.getLogger(__name__) - - -class NewsStorage: - """新闻数据库存储""" - - def __init__(self, config: Dict[str, Any]): - self.config = config - self.connection = None - self.connect() - - def connect(self): - """连接数据库""" - try: - self.connection = pymysql.connect( - host=self.config.get('DB_HOST', 'localhost'), - port=self.config.get('DB_PORT', 3306), - user=self.config.get('DB_USER', 'root'), - password=self.config.get('DB_PASSWORD', ''), - database=self.config.get('DB_NAME', 'news_classifier'), - charset='utf8mb4', - cursorclass=pymysql.cursors.DictCursor - ) - logger.info("数据库连接成功") - except Exception as e: - logger.error(f"数据库连接失败: {e}") - raise - - def close(self): - """关闭连接""" - if self.connection: - self.connection.close() - - def save_news(self, news: Dict[str, Any], category_code: str) -> Optional[int]: - """ - 保存单条新闻 - :return: 插入的新闻ID - """ - try: - with self.connection.cursor() as cursor: - sql = """ - INSERT INTO news (title, content, summary, source, source_url, - author, category_code, publish_time, status) - VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) - """ - cursor.execute(sql, ( - news.get('title'), - news.get('content'), - news.get('summary'), - news.get('source'), - news.get('source_url'), - news.get('author'), - category_code, - news.get('publish_time'), - 1 # status: 已发布 - )) - self.connection.commit() - return cursor.lastrowid - except Exception as e: - logger.error(f"保存新闻失败: {e}") - self.connection.rollback() - return None - - def batch_save_news(self, news_list: List[Dict[str, Any]], category_code: str) -> int: - """ - 批量保存新闻 - :return: 成功保存的数量 - """ - count = 0 - for news in news_list: - if self.save_news(news, category_code): - count += 1 - return count - - def news_exists(self, source_url: str) -> bool: - """检查新闻是否已存在""" - try: - with self.connection.cursor() as cursor: - sql = "SELECT id FROM news WHERE source_url = %s LIMIT 1" - cursor.execute(sql, (source_url,)) - return cursor.fetchone() is not None - except Exception as e: - logger.error(f"检查新闻存在性失败: {e}") - return False -``` - -#### 任务 1.1.9: `crawler.py` - 爬虫主入口 - -```python -""" -爬虫主程序入口 -""" - -import argparse -import logging -from config.settings import CrawlerConfig -from storage.database import NewsStorage -from 
cleaners.text_cleaner import TextCleaner -from cleaners.deduplicator import NewsDeduplicator -from parsers.sina_parser import SinaNewsParser -from parsers.sohu_parser import SohuNewsParser - - -def setup_logging(): - """配置日志""" - logging.basicConfig( - level=logging.INFO, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - handlers=[ - logging.FileHandler('logs/crawler.log', encoding='utf-8'), - logging.StreamHandler() - ] - ) - - -def main(): - """主函数""" - parser = argparse.ArgumentParser(description='新闻爬虫') - parser.add_argument('--source', type=str, help='数据源代码') - parser.add_argument('--category', type=str, help='新闻类别') - parser.add_argument('--pages', type=int, default=5, help='爬取页数') - - args = parser.parse_args() - - setup_logging() - logger = logging.getLogger(__name__) - - # 初始化组件 - storage = NewsStorage(CrawlerConfig.__dict__) - cleaner = TextCleaner() - deduplicator = NewsDeduplicator() - - # 选择解析器 - parser_map = { - 'sina': SinaNewsParser(), - 'sohu': SohuNewsParser() - } - - selected_parser = parser_map.get(args.source) - if not selected_parser: - logger.error(f"不支持的数据源: {args.source}") - return - - logger.info(f"开始爬取: source={args.source}, category={args.category}, pages={args.pages}") - - # 执行爬取 - # ... 具体爬取逻辑 - - logger.info("爬取完成") - - -if __name__ == '__main__': - main() -``` - -#### 任务 1.1.10: `requirements.txt` - 依赖文件 - -```txt -# 爬虫模块依赖 -requests>=2.31.0 -beautifulsoup4>=4.12.0 -lxml>=4.9.0 -pymysql>=1.1.0 -python-dotenv>=1.0.0 -``` - ---- ## 2. 后端服务模块 (Spring Boot) ### 模块目录结构 -``` -backend/src/main/java/com/newsclassifier/ -├── controller/ # 控制器层 -│ ├── AuthController.java -│ ├── NewsController.java -│ ├── CategoryController.java -│ ├── ClassifierController.java -│ └── AdminController.java -├── service/ # 服务层 -│ ├── AuthService.java -│ ├── NewsService.java -│ ├── CategoryService.java -│ ├── ClassifierService.java -│ └── impl/ -│ ├── AuthServiceImpl.java -│ ├── NewsServiceImpl.java -│ └── ClassifierServiceImpl.java -├── mapper/ # MyBatis Mapper -│ ├── UserMapper.java -│ ├── NewsMapper.java -│ └── CategoryMapper.java -├── entity/ # 实体类 -│ ├── User.java (已完成) -│ ├── News.java (已完成) -│ └── NewsCategory.java -├── dto/ # 数据传输对象 -│ ├── LoginDTO.java -│ ├── RegisterDTO.java -│ ├── NewsQueryDTO.java -│ ├── ClassificationResultDTO.java -│ └── PageResult.java -├── vo/ # 视图对象 -│ ├── UserVO.java -│ ├── NewsVO.java -│ └── CategoryVO.java -├── common/ # 公共类 -│ ├── Result.java (已完成) -│ ├── PageRequest.java -│ └── PageResponse.java -├── config/ # 配置类 -│ ├── SecurityConfig.java -│ ├── CorsConfig.java -│ ├── MyBatisConfig.java -│ └── AsyncConfig.java (已完成) -├── security/ # 安全认证 -│ ├── JwtTokenProvider.java -│ ├── JwtAuthenticationFilter.java -│ ├── UserDetailsServiceImpl.java -│ └── PasswordEncoder.java -├── classifier/ # 文本分类器 -│ ├── IClassifier.java (已完成) -│ ├── ClassificationResult.java -│ ├── TraditionalMLClassifier.java (已完成) -│ ├── BERTClassifier.java -│ └── HybridClassifier.java (已完成) -├── exception/ # 异常处理 -│ ├── GlobalExceptionHandler.java -│ ├── BusinessException.java -│ └── ErrorCode.java -├── util/ # 工具类 -│ ├── JwtUtil.java -│ ├── DateUtil.java -│ └── ValidationUtil.java -└── NewsClassifierApplication.java (已完成) -``` -### 2.1 需要完成的具体文件 - -#### 任务 2.1.1: `security/JwtTokenProvider.java` - JWT令牌提供者 - -```java -package com.newsclassifier.security; - -import io.jsonwebtoken.*; -import io.jsonwebtoken.security.Keys; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.security.core.Authentication; -import 
org.springframework.stereotype.Component; - -import javax.crypto.SecretKey; -import java.util.Date; - -/** - * JWT令牌提供者 - */ -@Component -public class JwtTokenProvider { - - @Value("${jwt.secret}") - private String jwtSecret; - - @Value("${jwt.expiration:86400000}") // 默认24小时 - private long jwtExpiration; - - private SecretKey getSigningKey() { - return Keys.hmacShaKeyFor(jwtSecret.getBytes()); - } - - /** - * 生成JWT令牌 - */ - public String generateToken(Authentication authentication) { - String username = authentication.getName(); - Date now = new Date(); - Date expiryDate = new Date(now.getTime() + jwtExpiration); - - return Jwts.builder() - .subject(username) - .issuedAt(now) - .expiration(expiryDate) - .signWith(getSigningKey()) - .compact(); - } - - /** - * 从令牌获取用户名 - */ - public String getUsernameFromToken(String token) { - Claims claims = Jwts.parser() - .verifyWith(getSigningKey()) - .build() - .parseSignedClaims(token) - .getPayload(); - return claims.getSubject(); - } - - /** - * 验证令牌 - */ - public boolean validateToken(String token) { - try { - Jwts.parser() - .verifyWith(getSigningKey()) - .build() - .parseSignedClaims(token); - return true; - } catch (JwtException ex) { - // 日志记录 - } - return false; - } -} -``` - -#### 任务 2.1.2: `security/SecurityConfig.java` - 安全配置 - -```java -package com.newsclassifier.config; - -import com.newsclassifier.security.JwtAuthenticationFilter; -import org.springframework.context.annotation.Bean; -import org.springframework.context.annotation.Configuration; -import org.springframework.security.config.annotation.web.builders.HttpSecurity; -import org.springframework.security.config.annotation.web.configuration.EnableWebSecurity; -import org.springframework.security.config.http.SessionCreationPolicy; -import org.springframework.security.crypto.bcrypt.BCryptPasswordEncoder; -import org.springframework.security.crypto.password.PasswordEncoder; -import org.springframework.security.web.SecurityFilterChain; -import org.springframework.security.web.authentication.UsernamePasswordAuthenticationFilter; - -/** - * Spring Security配置 - */ -@Configuration -@EnableWebSecurity -public class SecurityConfig { - - private final JwtAuthenticationFilter jwtAuthenticationFilter; - - public SecurityConfig(JwtAuthenticationFilter jwtAuthenticationFilter) { - this.jwtAuthenticationFilter = jwtAuthenticationFilter; - } - - @Bean - public SecurityFilterChain securityFilterChain(HttpSecurity http) throws Exception { - http - .csrf(csrf -> csrf.disable()) - .sessionManagement(session -> - session.sessionCreationPolicy(SessionCreationPolicy.STATELESS) - ) - .authorizeHttpRequests(auth -> auth - .requestMatchers("/api/auth/**").permitAll() - .requestMatchers("/api/doc.html", "/api/swagger/**").permitAll() - .anyRequest().authenticated() - ) - .addFilterBefore(jwtAuthenticationFilter, UsernamePasswordAuthenticationFilter.class); - - return http.build(); - } - - @Bean - public PasswordEncoder passwordEncoder() { - return new BCryptPasswordEncoder(); - } -} -``` - -#### 任务 2.1.3: `controller/AuthController.java` - 认证控制器 - -```java -package com.newsclassifier.controller; - -import com.newsclassifier.common.Result; -import com.newsclassifier.dto.LoginDTO; -import com.newsclassifier.dto.RegisterDTO; -import com.newsclassifier.service.AuthService; -import com.newsclassifier.vo.UserVO; -import io.swagger.v3.oas.annotations.Operation; -import io.swagger.v3.oas.annotations.tags.Tag; -import jakarta.validation.Valid; -import lombok.RequiredArgsConstructor; -import 
org.springframework.web.bind.annotation.*; - -/** - * 认证控制器 - */ -@Tag(name = "认证接口") -@RestController -@RequestMapping("/api/auth") -@RequiredArgsConstructor -public class AuthController { - - private final AuthService authService; - - @Operation(summary = "用户登录") - @PostMapping("/login") - public Result login(@Valid @RequestBody LoginDTO loginDTO) { - UserVO userVO = authService.login(loginDTO); - return Result.success(userVO); - } - - @Operation(summary = "用户注册") - @PostMapping("/register") - public Result register(@Valid @RequestBody RegisterDTO registerDTO) { - authService.register(registerDTO); - return Result.success(); - } - - @Operation(summary = "刷新令牌") - @PostMapping("/refresh") - public Result refreshToken(@RequestHeader("Authorization") String token) { - String newToken = authService.refreshToken(token); - return Result.success(newToken); - } - - @Operation(summary = "用户登出") - @PostMapping("/logout") - public Result logout() { - return Result.success(); - } -} -``` - -#### 任务 2.1.4: `controller/NewsController.java` - 新闻控制器 - -```java -package com.newsclassifier.controller; - -import com.newsclassifier.common.PageResponse; -import com.newsclassifier.common.Result; -import com.newsclassifier.dto.NewsQueryDTO; -import com.newsclassifier.service.NewsService; -import com.newsclassifier.vo.NewsVO; -import io.swagger.v3.oas.annotations.Operation; -import io.swagger.v3.oas.annotations.tags.Tag; -import lombok.RequiredArgsConstructor; -import org.springframework.web.bind.annotation.*; - -/** - * 新闻控制器 - */ -@Tag(name = "新闻接口") -@RestController -@RequestMapping("/api/news") -@RequiredArgsConstructor -public class NewsController { - - private final NewsService newsService; - - @Operation(summary = "分页查询新闻") - @GetMapping("/page") - public Result> getNewsPage(NewsQueryDTO queryDTO) { - PageResponse page = newsService.getNewsPage(queryDTO); - return Result.success(page); - } - - @Operation(summary = "获取新闻详情") - @GetMapping("/{id}") - public Result getNewsDetail(@PathVariable Long id) { - NewsVO newsVO = newsService.getNewsDetail(id); - return Result.success(newsVO); - } - - @Operation(summary = "搜索新闻") - @GetMapping("/search") - public Result> searchNews( - @RequestParam String keyword, - @RequestParam(defaultValue = "1") Integer page, - @RequestParam(defaultValue = "20") Integer size - ) { - PageResponse result = newsService.searchNews(keyword, page, size); - return Result.success(result); - } - - @Operation(summary = "手动分类新闻") - @PostMapping("/{id}/classify") - public Result manualClassify( - @PathVariable Long id, - @RequestParam Long categoryId - ) { - newsService.manualClassify(id, categoryId); - return Result.success(); - } -} -``` - -#### 任务 2.1.5: `controller/ClassifierController.java` - 分类控制器 - -```java -package com.newsclassifier.controller; - -import com.newsclassifier.common.Result; -import com.newsclassifier.dto.ClassifyRequestDTO; -import com.newsclassifier.service.ClassifierService; -import io.swagger.v3.oas.annotations.Operation; -import io.swagger.v3.oas.annotations.tags.Tag; -import lombok.RequiredArgsConstructor; -import org.springframework.web.bind.annotation.*; - -/** - * 文本分类控制器 - */ -@Tag(name = "分类接口") -@RestController -@RequestMapping("/api/classifier") -@RequiredArgsConstructor -public class ClassifierController { - - private final ClassifierService classifierService; - - @Operation(summary = "对单条新闻进行分类") - @PostMapping("/classify") - public Result classify(@RequestBody ClassifyRequestDTO request) { - ClassificationResultDTO result = classifierService.classify( - 
request.getTitle(), - request.getContent(), - request.getMode() - ); - return Result.success(result); - } - - @Operation(summary = "批量分类") - @PostMapping("/batch-classify") - public Result batchClassify( - @RequestBody BatchClassifyRequestDTO request - ) { - BatchClassifyResultDTO result = classifierService.batchClassify( - request.getNewsIds(), - request.getMode() - ); - return Result.success(result); - } - - @Operation(summary = "获取分类器状态") - @GetMapping("/status") - public Result getStatus() { - ClassifierStatusDTO status = classifierService.getStatus(); - return Result.success(status); - } -} -``` - -#### 任务 2.1.6: `service/impl/NewsServiceImpl.java` - 新闻服务实现 - -```java -package com.newsclassifier.service.impl; - -import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper; -import com.baomidou.mybatisplus.extension.plugins.pagination.Page; -import com.newsclassifier.common.PageResponse; -import com.newsclassifier.dto.NewsQueryDTO; -import com.newsclassifier.entity.News; -import com.newsclassifier.mapper.NewsMapper; -import com.newsclassifier.service.NewsService; -import com.newsclassifier.vo.NewsVO; -import lombok.RequiredArgsConstructor; -import org.springframework.stereotype.Service; - -/** - * 新闻服务实现 - */ -@Service -@RequiredArgsConstructor -public class NewsServiceImpl implements NewsService { - - private final NewsMapper newsMapper; - - @Override - public PageResponse getNewsPage(NewsQueryDTO queryDTO) { - Page page = new Page<>(queryDTO.getPage(), queryDTO.getSize()); - - LambdaQueryWrapper wrapper = new LambdaQueryWrapper() - .eq(queryDTO.getCategoryId() != null, News::getCategoryId, queryDTO.getCategoryId()) - .eq(queryDTO.getCategoryCode() != null, News::getCategoryCode, queryDTO.getCategoryCode()) - .eq(queryDTO.getStatus() != null, News::getStatus, queryDTO.getStatus()) - .like(queryDTO.getKeyword() != null, News::getTitle, queryDTO.getKeyword()) - .orderByDesc(News::getPublishTime); - - Page resultPage = newsMapper.selectPage(page, wrapper); - - // 转换为VO - List voList = resultPage.getRecords().stream() - .map(this::convertToVO) - .collect(Collectors.toList()); - - return PageResponse.of(resultPage.getTotal(), voList); - } - - @Override - public NewsVO getNewsDetail(Long id) { - News news = newsMapper.selectById(id); - if (news == null) { - throw new BusinessException(ErrorCode.NEWS_NOT_FOUND); - } - - // 增加浏览次数 - newsMapper.addViewCount(id); - - return convertToVO(news); - } - - private NewsVO convertToVO(News news) { - // 实现Entity到VO的转换 - return new NewsVO(); - } -} -``` - -#### 任务 2.1.7: `classifier/ClassificationResult.java` - 分类结果类 - -```java -package com.newsclassifier.classifier; - -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.Data; -import lombok.NoArgsConstructor; - -import java.math.BigDecimal; - -/** - * 分类结果 - */ -@Data -@Builder -@NoArgsConstructor -@AllArgsConstructor -public class ClassificationResult { - - /** - * 分类代码 - */ - private String categoryCode; - - /** - * 分类名称 - */ - private String categoryName; - - /** - * 置信度 0-1 - */ - private BigDecimal confidence; - - /** - * 分类器类型 - */ - private String classifierType; - - /** - * 各类别概率分布 - */ - private java.util.Map probabilities; - - /** - * 耗时(毫秒) - */ - private Long duration; -} -``` - -#### 任务 2.1.8: `classifier/BERTClassifier.java` - BERT分类器 - -```java -package com.newsclassifier.classifier; - -import lombok.extern.slf4j.Slf4j; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.stereotype.Component; - -import 
java.io.BufferedReader; -import java.io.InputStreamReader; -import java.math.BigDecimal; -import java.util.HashMap; -import java.util.Map; - -/** - * BERT文本分类器 - * 通过调用Python服务实现 - */ -@Slf4j -@Component -public class BERTClassifier implements IClassifier { - - @Value("${classifier.bert.service-url:http://localhost:5000/api/predict}") - private String bertServiceUrl; - - @Value("${classifier.bert.timeout:5000}") - private int timeout; - - @Override - public ClassificationResult classify(String title, String content) { - long startTime = System.currentTimeMillis(); - - try { - // 调用Python BERT服务 - String result = callBERTService(title, content); - // 解析结果 - return parseResult(result); - } catch (Exception e) { - log.error("BERT分类失败", e); - // 返回默认结果或降级处理 - return getDefaultResult(); - } finally { - long duration = System.currentTimeMillis() - startTime; - log.info("BERT分类耗时: {}ms", duration); - } - } - - @Override - public String getType() { - return "DL"; - } - - private String callBERTService(String title, String content) { - // 使用HttpClient调用Python服务 - // 实现HTTP请求逻辑 - return ""; - } - - private ClassificationResult parseResult(String jsonResponse) { - // 解析JSON响应 - return ClassificationResult.builder() - .categoryCode("TECHNOLOGY") - .categoryName("科技") - .confidence(new BigDecimal("0.95")) - .classifierType("DL") - .build(); - } -} -``` - -#### 任务 2.1.9: `exception/GlobalExceptionHandler.java` - 全局异常处理 - -```java -package com.newsclassifier.exception; - -import com.newsclassifier.common.Result; -import lombok.extern.slf4j.Slf4j; -import org.springframework.http.HttpStatus; -import org.springframework.validation.BindException; -import org.springframework.web.bind.annotation.ExceptionHandler; -import org.springframework.web.bind.annotation.ResponseStatus; -import org.springframework.web.bind.annotation.RestControllerAdvice; - -/** - * 全局异常处理器 - */ -@Slf4j -@RestControllerAdvice -public class GlobalExceptionHandler { - - @ExceptionHandler(BusinessException.class) - @ResponseStatus(HttpStatus.OK) - public Result handleBusinessException(BusinessException e) { - log.error("业务异常: {}", e.getMessage()); - return Result.error(e.getErrorCode(), e.getMessage()); - } - - @ExceptionHandler(BindException.class) - @ResponseStatus(HttpStatus.BAD_REQUEST) - public Result handleBindException(BindException e) { - String message = e.getBindingResult().getAllErrors().get(0).getDefaultMessage(); - return Result.error(400, message); - } - - @ExceptionHandler(Exception.class) - @ResponseStatus(HttpStatus.INTERNAL_SERVER_ERROR) - public Result handleException(Exception e) { - log.error("系统异常", e); - return Result.error(500, "系统内部错误"); - } -} ``` #### 任务 2.1.10: `application.yml` - 应用配置文件 diff --git a/docs/爬虫模块说明.md b/docs/爬虫模块说明.md deleted file mode 100644 index 768276e..0000000 --- a/docs/爬虫模块说明.md +++ /dev/null @@ -1,287 +0,0 @@ -# 新闻爬虫模块使用说明 - -## 模块概述 - -爬虫模块负责从各大新闻网站自动抓取新闻数据,经过清洗、去重、分类后存储到数据库。 - -## 文件结构 - -``` -backend/src/main/java/com/newsclassifier/ -├── config/ -│ ├── CrawlerConfig.java # 爬虫配置类(RestTemplate) -│ ├── CrawlerProperties.java # 爬虫属性配置 -│ └── AsyncConfig.java # 异步任务配置 -├── controller/ -│ └── CrawlerController.java # 爬虫API控制器 -├── service/ -│ ├── CrawlerService.java # 爬虫服务接口 -│ └── impl/ -│ └── CrawlerServiceImpl.java # 爬虫服务实现 -├── crawler/ -│ ├── HtmlParser.java # HTML解析器 -│ ├── DataCleaner.java # 数据清洗工具 -│ └── DuplicationService.java # 去重服务 -├── scheduler/ -│ └── CrawlerScheduledTasks.java # 定时任务 -└── dto/ - ├── CrawledNewsDTO.java # 爬取的新闻数据 - └── CrawlerReportDTO.java # 爬虫报告 -``` - -## 
核心功能 - -### 1. HTML解析器 (HtmlParser) - -**功能**: 使用Jsoup解析HTML,提取新闻标题、内容、链接等信息 - -**方法**: -- `parseNewsList()`: 解析新闻列表页面 -- `parseNewsDetail()`: 解析新闻详情页 -- `parseDateTime()`: 解析各种格式的日期时间 - -### 2. 数据清洗工具 (DataCleaner) - -**功能**: 清洗和规范化爬取的新闻数据 - -**清洗内容**: -- 去除HTML标签 -- 去除特殊控制字符 -- 规范化空白字符 -- 生成摘要(前200字) -- 验证数据完整性 - -### 3. 去重服务 (DuplicationService) - -**功能**: 检查并过滤重复新闻 - -**去重方式**: -- URL去重: 检查`source_url`是否已存在 -- 标题去重: 检查`title`是否已存在 -- 相似度计算: Levenshtein距离算法 - -### 4. 爬虫服务 (CrawlerService) - -**功能**: 协调整个爬虫流程 - -**流程**: -``` -定时任务触发 → 获取新闻源 → 并行爬取 → 解析HTML → 数据清洗 -→ 去重检查 → 文本分类 → 保存数据库 → 更新缓存 → 生成报告 -``` - -## 配置说明 - -### application.yml 配置 - -```yaml -crawler: - # 是否启用爬虫 - enabled: true - - # Cron表达式 (每30分钟执行一次) - cron: "0 */30 * * * ?" - - # User-Agent - user-agent: "Mozilla/5.0 ..." - - # 连接超时(毫秒) - connect-timeout: 10000 - - # 读取超时(毫秒) - read-timeout: 30000 - - # 新闻源配置 - sources: - - name: 新闻源名称 - url: https://example.com/news - enabled: true - encoding: UTF-8 - delay: 2000 # 请求间隔(毫秒) - selector: - container: ".news-list" # 列表容器选择器 - title: ".title" # 标题选择器 - link: "a" # 链接选择器 - content: ".content" # 内容选择器 - publish-time: ".time" # 时间选择器 - author: ".author" # 作者选择器 - source: ".source" # 来源选择器 -``` - -### 新闻源配置步骤 - -1. **确定新闻源**: 选择要爬取的新闻网站 - -2. **分析页面结构**: 使用浏览器开发者工具查看HTML结构 - -3. **编写CSS选择器**: - ```html - -
-   <div class="news-list">
-     <div class="news-item">
-       <a class="title" href="https://example.com/news/1">新闻标题</a>
-       <div class="content">新闻内容摘要</div>
-       <span class="time">2024-12-24 10:00</span>
-     </div>
-   </div>
- ``` - - ```yaml - selector: - container: ".news-list .news-item" - title: ".title" - link: "a" - content: ".content" - publish-time: ".time" - ``` - -4. **测试验证**: 使用手动触发接口测试 - -## API接口 - -### 1. 手动触发爬虫 - -```http -POST /api/crawler/execute -Authorization: Bearer {token} -``` - -**响应**: -```json -{ - "code": 200, - "message": "爬虫任务执行完成", - "data": { - "startTime": "2025-12-24T10:00:00", - "endTime": "2025-12-24T10:05:00", - "duration": 300000, - "totalSuccess": 50, - "totalFailed": 2, - "totalSkipped": 5, - "sourceStatsMap": { - "36kr": { - "sourceName": "36kr", - "successCount": 30, - "failedCount": 1, - "skippedCount": 2 - } - } - } -} -``` - -### 2. 从指定新闻源爬取 - -```http -POST /api/crawler/crawl/{sourceName} -Authorization: Bearer {token} -``` - -### 3. 获取爬虫配置 - -```http -GET /api/crawler/config -Authorization: Bearer {token} -``` - -### 4. 更新爬虫状态 - -```http -PUT /api/crawler/status?enabled=true -Authorization: Bearer {token} -``` - -## 定时任务 - -默认配置为每30分钟执行一次,可在`application.yml`中修改: - -```yaml -crawler: - # Cron表达式格式: 秒 分 时 日 月 周 - cron: "0 */30 * * * ?" # 每30分钟 - # cron: "0 0 */2 * * ?" # 每2小时 - # cron: "0 0 8,12,18 * * ?" # 每天8点、12点、18点 -``` - -## 日志输出 - -``` -2025-12-24 10:00:00 [crawler-async-1] INFO - 开始执行爬虫任务... -2025-12-24 10:00:01 [crawler-async-1] INFO - 启用的新闻源数量: 2 -2025-12-24 10:00:02 [crawler-async-1] INFO - 开始爬取新闻源: 36kr -2025-12-24 10:00:03 [crawler-async-1] DEBUG - 从 36kr 解析到 30 条新闻 -2025-12-24 10:00:04 [crawler-async-1] DEBUG - 成功保存新闻: xxxxx -2025-12-24 10:00:05 [crawler-async-1] INFO - 新闻源 36kr 爬取完成 - 成功: 28, 失败: 1, 跳过: 1 -2025-12-24 10:05:00 [crawler-async-1] INFO - 爬虫任务完成 - 成功: 50, 失败: 2, 跳过: 5, 耗时: 300000ms -``` - -## 常见问题 - -### 1. 爬取失败怎么办? - -检查以下几点: -- 目标网站是否可访问 -- CSS选择器是否正确 -- 是否需要添加请求头(Referer、Cookie) -- 是否有反爬措施(需要调整User-Agent或延迟) - -### 2. 如何调试CSS选择器? - -使用浏览器开发者工具: -1. F12打开开发者工具 -2. 使用Ctrl+Shift+C选择元素 -3. 右键 → Copy → Copy selector - -### 3. 如何添加新的新闻源? - -在`application.yml`的`crawler.sources`中添加配置: - -```yaml -- name: 新新闻源 - url: https://new-source.com - enabled: true - delay: 2000 - selector: - title: ".news-title" - link: "a" - content: ".news-content" -``` - -### 4. 爬虫影响性能怎么办? - -- 调整`delay`参数增加请求间隔 -- 减少`enabled`的新闻源数量 -- 调整线程池大小(AsyncConfig.java) - -## 注意事项 - -1. **遵守robots.txt**: 检查目标网站的爬取规则 -2. **合理设置延迟**: 避免对目标网站造成压力 -3. **注意版权**: 爬取的内容仅供学习使用 -4. **定期维护**: 网站结构变化时需要更新CSS选择器 - -## 扩展功能 - -### 1. 支持更多解析方式 - -当前使用CSS选择器,可以扩展支持: -- XPath -- 正则表达式 -- 自定义解析规则 - -### 2. 增加反爬策略 - -- 随机User-Agent -- 代理IP池 -- Cookie池 -- 验证码识别 - -### 3. 分布式爬虫 - -使用消息队列实现多实例协同爬取 - ---- - -**作者**: 张俊恒 -**版本**: v1.0 -**更新日期**: 2025-12-24
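
补充示例(编辑者附注):被删除的《爬虫模块说明》提到 DuplicationService 通过 Levenshtein 距离做标题相似度去重。下面是一个独立的 Python 示意草图,用标准库 difflib 的 SequenceMatcher 比率来近似标题相似度;它不是仓库中 Java 版 DuplicationService 的实际实现,阈值 0.85 也只是假设值。

```python
"""标题相似度去重的示意草图(仅作说明,非项目实际实现)。"""
from difflib import SequenceMatcher
from typing import List


def is_similar(title_a: str, title_b: str, threshold: float = 0.85) -> bool:
    """用 SequenceMatcher 比率近似 Levenshtein 相似度;threshold 为假设的阈值。"""
    return SequenceMatcher(None, title_a, title_b).ratio() >= threshold


def filter_similar_titles(titles: List[str], threshold: float = 0.85) -> List[str]:
    """按顺序遍历,只保留与已保留标题都不相似的标题。"""
    kept: List[str] = []
    for title in titles:
        if not any(is_similar(title, seen, threshold) for seen in kept):
            kept.append(title)
    return kept


if __name__ == "__main__":
    demo = ["新闻标题一", "新闻标题一!", "完全不同的另一条新闻"]
    print(filter_similar_titles(demo))  # 预期保留第 1 条和第 3 条
```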