diff --git a/.gitignore b/.gitignore
index 0b12d8c..5b341e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -74,3 +74,5 @@ logs/
 # Temporary
 *.tmp
 *.temp
+
+ml-module/models/
\ No newline at end of file
diff --git a/crawler-module/config/config.yaml b/crawler-module/config/config.yaml
index b01eb71..094c890 100644
--- a/crawler-module/config/config.yaml
+++ b/crawler-module/config/config.yaml
@@ -125,3 +125,27 @@ sources:
         name: "汽车"
         css_selector: "div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1"
         detail_css_selector: "div.main-content"
+      gov:
+        url: "https://gov.sina.com.cn/"
+        category_id: 7
+        name: "政务"
+        css_selector: "a[href]"
+
+  tencent:
+    base_url: "https://new.qq.com"
+    categories:
+      auto:
+        url: "https://new.qq.com/auto"
+        category_id: 6
+        name: "汽车"
+        css_selector: ""
+      war:
+        url: "https://news.qq.com/ch/milite"
+        category_id: 5
+        name: "军事"
+        css_selector: ""
+      war_web:
+        url: "https://news.qq.com/ch/milite"
+        category_id: 5
+        name: "军事(网页版)"
+        css_selector: "div[id='channel-feed-area']"
diff --git a/crawler-module/docs/bs4语法.md b/crawler-module/docs/bs4语法.md
new file mode 100644
index 0000000..9c18953
--- /dev/null
+++ b/crawler-module/docs/bs4语法.md
@@ -0,0 +1,12 @@
+
+
+
+CSS 属性选择器常用运算符速查表
+| 运算符 | 含义 | 示例 |
+| ---- | ------ | ----------------- |
+| `=` | 完全等于 | `[id="content"]` |
+| `*=` | **包含** | `[id*="content"]` |
+| `^=` | 以…开头 | `[id^="content"]` |
+| `$=` | 以…结尾 | `[id$="content"]` |
+| `~=` | 单词匹配 | `[class~="item"]` |
+| `\|=` | 前缀匹配 | `[lang\|="en"]` |
diff --git a/crawler-module/docs/添加新爬虫指南.md b/crawler-module/docs/添加新爬虫指南.md
new file mode 100644
index 0000000..ba4ec27
--- /dev/null
+++ b/crawler-module/docs/添加新爬虫指南.md
@@ -0,0 +1,929 @@
+# 新闻爬虫系统 - 添加新爬虫实现指南
+
+## 目录
+1. [项目架构概述](#项目架构概述)
+2. [添加新爬虫的完整流程](#添加新爬虫的完整流程)
+3. [详细实现步骤](#详细实现步骤)
+4. [示例代码](#示例代码)
+5. [常见问题](#常见问题)
+
+---
+
+## 项目架构概述
+
+### 核心组件
+
+```
+crawler-module/
+├── src/
+│   ├── base/                    # 基类层
+│   │   ├── crawler_base.py      # 爬虫基类
+│   │   └── parser_base.py       # 解析器基类
+│   ├── crawlers/                # 爬虫实现层
+│   │   ├── netease/             # 网易爬虫
+│   │   ├── kr36/                # 36氪爬虫
+│   │   └── sina/                # 新浪爬虫
+│   ├── parsers/                 # 解析器层
+│   │   ├── netease_parser.py
+│   │   ├── kr36_parser.py
+│   │   └── sina_parser.py
+│   ├── utils/                   # 工具层
+│   │   ├── http_client.py       # HTTP客户端
+│   │   ├── selenium_driver.py   # Selenium驱动
+│   │   └── logger.py            # 日志工具
+│   ├── database/                # 数据层
+│   │   ├── models.py            # 数据模型
+│   │   ├── repository.py        # 数据访问
+│   │   └── connection.py        # 数据库连接
+│   └── cli/                     # CLI入口
+│       └── main.py              # 命令行接口
+└── config/
+    ├── config.yaml              # 配置文件
+    └── settings.py              # 配置加载器
+```
+
+### 架构设计模式
+
+1. **基类继承模式**: 所有爬虫继承 `DynamicCrawler` 或 `StaticCrawler`
+2. **解析器分离模式**: 爬虫负责抓取URL列表,解析器负责解析详情页
+3. **配置驱动模式**: 通过 YAML 配置文件管理爬虫参数
+4. **工厂模式**: CLI 通过动态导入创建爬虫实例
+
+---
+
+## 添加新爬虫的完整流程
+
+### 步骤概览
+
+```
+1. 分析目标网站
+   ↓
+2. 创建爬虫类文件
+   ↓
+3. 创建解析器类文件
+   ↓
+4. 更新配置文件
+   ↓
+5. 注册爬虫到CLI
+   ↓
+6. 
测试和调试 +``` + +--- + +## 详细实现步骤 + +### 步骤 1: 分析目标网站 + +在编写代码之前,需要分析目标网站的以下信息: + +#### 1.1 确定网站类型 +- **静态网站**: 内容直接在 HTML 中,使用 `StaticCrawler` +- **动态网站**: 内容通过 JavaScript 加载,使用 `DynamicCrawler` + +#### 1.2 确定关键信息 +- 列表页 URL +- 文章 URL 提取规则(CSS 选择器) +- 文章详情页结构 +- 标题、时间、作者、正文的选择器 + +#### 1.3 确定分类信息 +- 分类名称(如:科技、娱乐、财经) +- 分类 ID(需与数据库一致) +- 分类代码(如:tech, entertainment, finance) + +--- + +### 步骤 2: 创建爬虫类文件 + +#### 2.1 创建目录结构 + +假设要添加一个名为 `example` 的网站,分类为 `tech`: + +```bash +# 创建网站目录 +mkdir src/crawlers/example + +# 创建 __init__.py +touch src/crawlers/example/__init__.py +``` + +#### 2.2 编写爬虫类 + +创建文件 `src/crawlers/example/tech.py`: + +```python +""" +Example 科技新闻爬虫 +""" + +from typing import List +from bs4 import BeautifulSoup + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from base.crawler_base import DynamicCrawler, Article +from parsers.example_parser import ExampleParser + + +class TechCrawler(DynamicCrawler): + """Example 科技新闻爬虫""" + + def _extract_article_urls(self, html: str) -> List[str]: + """ + 从HTML中提取文章URL列表 + + Args: + html: 页面HTML内容 + + Returns: + 文章URL列表 + """ + soup = BeautifulSoup(html, "lxml") + urls = [] + + # 根据实际网站结构编写选择器 + news_items = soup.select("div.news-list div.news-item") + + for item in news_items: + article_link = item.select_one("a.title") + if article_link: + href = article_link.get('href') + if href: + # 处理相对路径 + if href.startswith('/'): + href = f"https://www.example.com{href}" + urls.append(href) + + return urls + + def _fetch_articles(self, urls: List[str]) -> List[Article]: + """ + 爬取文章详情 + + Args: + urls: 文章URL列表 + + Returns: + 文章列表 + """ + articles = [] + parser = ExampleParser() + + for i, url in enumerate(urls[:self.max_articles]): + try: + article = parser.parse(url) + article.category_id = self.category_id + article.source = "Example" + + if not article.author: + article.author = "Example科技" + + if article.is_valid(): + articles.append(article) + self.logger.info(f"[{i+1}/{len(urls)}] {article.title}") + + except Exception as e: + self.logger.error(f"解析文章失败: {url} - {e}") + continue + + return articles +``` + +#### 2.3 爬虫类说明 + +**继承基类选择**: +- `DynamicCrawler`: 使用 Selenium,适合动态网站 +- `StaticCrawler`: 使用 requests,适合静态网站 + +**必须实现的方法**: +- `_extract_article_urls(html)`: 从列表页提取文章 URL +- `_fetch_articles(urls)`: 爬取每篇文章的详情 + +**可用的属性**: +- `self.url`: 列表页 URL +- `self.category_id`: 分类 ID +- `self.category_name`: 分类名称 +- `self.css_selector`: 等待加载的 CSS 选择器 +- `self.max_articles`: 最大文章数 +- `self.http_client`: HTTP 客户端(StaticCrawler) +- `self.driver`: Selenium 驱动(DynamicCrawler) +- `self.logger`: 日志记录器 + +--- + +### 步骤 3: 创建解析器类文件 + +#### 3.1 创建解析器文件 + +创建文件 `src/parsers/example_parser.py`: + +```python +""" +Example 文章解析器 +""" + +import re +from bs4 import BeautifulSoup + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from base.parser_base import BaseParser +from base.crawler_base import Article +from utils.http_client import HttpClient +from utils.logger import get_logger + + +class ExampleParser(BaseParser): + """Example 文章解析器""" + + def __init__(self): + self.logger = get_logger(__name__) + self.http_client = HttpClient() + + def parse(self, url: str) -> Article: + """ + 解析文章详情页 + + Args: + url: 文章URL + + Returns: + 文章对象 + """ + # 获取页面 HTML + html = self.http_client.get(url) + soup = BeautifulSoup(html, "lxml") + + # 提取标题 + title = None + title_tag = soup.select_one("h1.article-title") + if title_tag: + title = 
title_tag.get_text(strip=True) + + # 提取发布时间 + publish_time = None + time_tag = soup.select_one("div.article-info span.publish-time") + if time_tag: + time_text = time_tag.get_text(strip=True) + # 标准化时间格式 + time_match = re.search(r"\d{4}-\d{2}-\d{2}", time_text) + if time_match: + publish_time = time_match.group() + + # 提取作者 + author = None + author_tag = soup.select_one("div.article-info span.author") + if author_tag: + author = author_tag.get_text(strip=True) + + # 提取正文内容 + content_lines = [] + article_body = soup.select_one("div.article-content") + + if article_body: + # 移除不需要的标签 + for tag in article_body.select("script, style, iframe, .ad"): + tag.decompose() + + # 提取段落 + for p in article_body.find_all("p"): + text = p.get_text(strip=True) + if text: + content_lines.append(text) + + content = '\n'.join(content_lines) + + return Article( + url=url, + title=title, + publish_time=publish_time, + author=author, + content=content, + ) +``` + +#### 3.2 解析器类说明 + +**必须实现的方法**: +- `parse(url)`: 解析文章详情页,返回 Article 对象 + +**Article 对象字段**: +- `url`: 文章 URL(必需) +- `title`: 文章标题(必需) +- `content`: 文章内容(必需) +- `publish_time`: 发布时间(可选) +- `author`: 作者(可选) +- `category_id`: 分类 ID(由爬虫设置) +- `source`: 新闻源(由爬虫设置) + +**可用的工具**: +- `self.http_client`: HTTP 客户端 +- `self.logger`: 日志记录器 + +--- + +### 步骤 4: 更新配置文件 + +编辑 `config/config.yaml`,在 `sources` 节点下添加新网站配置: + +```yaml +sources: + # ... 其他网站配置 ... + + example: + base_url: "https://www.example.com" + categories: + tech: + url: "https://www.example.com/tech" + category_id: 4 + name: "科技" + css_selector: "div.news-list" # 列表页等待加载的选择器 + # 可以添加更多分类 + entertainment: + url: "https://www.example.com/entertainment" + category_id: 1 + name: "娱乐" + css_selector: "div.news-list" +``` + +#### 配置项说明 + +| 配置项 | 说明 | 示例 | +|--------|------|------| +| `base_url` | 网站基础 URL | `https://www.example.com` | +| `url` | 列表页 URL | `https://www.example.com/tech` | +| `category_id` | 分类 ID(需与数据库一致) | `4` | +| `name` | 分类名称 | `科技` | +| `css_selector` | 列表页等待加载的选择器 | `div.news-list` | + +#### 分类 ID 对照表 + +根据项目文档,分类 ID 如下: + +| ID | 分类名称 | 代码 | +|----|----------|------| +| 1 | 娱乐 | entertainment | +| 2 | 体育 | sports | +| 3 | 财经 | finance | +| 4 | 科技 | tech | +| 5 | 军事 | war | +| 6 | 汽车 | auto | +| 7 | 政务 | gov | +| 8 | 健康 | health | +| 9 | AI | ai | +| 10 | 教育 | education | + +--- + +### 步骤 5: 注册爬虫到 CLI + +编辑 `src/cli/main.py`,在 `CRAWLER_CLASSES` 字典中添加新爬虫: + +```python +CRAWLER_CLASSES = { + # ... 其他爬虫配置 ... 
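+    # 结构为 '网站代码': {'分类代码': ('模块路径(相对 src 目录)', '爬虫类名')},CLI 依据该映射动态导入爬虫类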
+ + 'example': { + 'tech': ('crawlers.example.tech', 'TechCrawler'), + 'entertainment': ('crawlers.example.entertainment', 'EntertainmentCrawler'), + # 可以添加更多分类 + }, +} +``` + +#### 注册格式说明 + +```python +'网站代码': { + '分类代码': ('爬虫模块路径', '爬虫类名'), +} +``` + +**示例**: +- `'example'`: 网站代码(对应配置文件中的 sources.example) +- `'tech'`: 分类代码(对应配置文件中的 categories.tech) +- `'crawlers.example.tech'`: 模块路径(相对于 src 目录) +- `'TechCrawler'`: 爬虫类名 + +--- + +### 步骤 6: 测试和调试 + +#### 6.1 运行单个爬虫 + +```bash +# 进入项目目录 +cd D:\tmp\write\news-classifier\crawler-module + +# 运行新爬虫 +python -m src.cli.main example:tech + +# 限制爬取数量 +python -m src.cli.main example:tech --max 3 +``` + +#### 6.2 列出所有爬虫 + +```bash +python -m src.cli.main --list +``` + +应该能看到新添加的爬虫: +``` +可用的爬虫: + - netease:entertainment + - netease:tech + - kr36:ai + - example:tech + - example:entertainment +``` + +#### 6.3 查看日志 + +```bash +# 日志文件位置 +type logs\crawler.log +``` + +#### 6.4 调试技巧 + +**开启调试模式**: +```bash +python -m src.cli.main example:tech --debug +``` + +**手动测试解析器**: +```python +from parsers.example_parser import ExampleParser + +parser = ExampleParser() +article = parser.parse("https://www.example.com/article/123") +print(article.title) +print(article.content) +``` + +**手动测试爬虫**: +```python +from crawlers.example.tech import TechCrawler + +crawler = TechCrawler('example', 'tech') +crawler.max_articles = 3 +articles = crawler.crawl() + +for article in articles: + print(article.title) +``` + +--- + +## 示例代码 + +### 完整示例:添加新浪娱乐爬虫 + +假设我们要为新浪网站添加娱乐分类爬虫: + +#### 1. 创建爬虫类 + +文件:`src/crawlers/sina/entertainment.py` + +```python +""" +新浪娱乐新闻爬虫 +""" + +from typing import List +from bs4 import BeautifulSoup + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from base.crawler_base import DynamicCrawler, Article +from parsers.sina_parser import SinaEntertainmentParser + + +class EntertainmentCrawler(DynamicCrawler): + """新浪娱乐新闻爬虫""" + + def _extract_article_urls(self, html: str) -> List[str]: + """从HTML中提取文章URL列表""" + soup = BeautifulSoup(html, "lxml") + urls = [] + + # 新浪娱乐列表页选择器 + news_items = soup.select("div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1") + + for item in news_items: + article_link = item.select_one("a") + if article_link: + href = article_link.get('href') + if href: + urls.append(href) + + return urls + + def _fetch_articles(self, urls: List[str]) -> List[Article]: + """爬取文章详情""" + articles = [] + parser = SinaEntertainmentParser() + + for i, url in enumerate(urls[:self.max_articles]): + try: + article = parser.parse(url) + article.category_id = self.category_id + article.source = "新浪" + + if not article.author: + article.author = "新浪娱乐" + + if article.is_valid(): + articles.append(article) + self.logger.info(f"[{i+1}/{len(urls)}] {article.title}") + + except Exception as e: + self.logger.error(f"解析文章失败: {url} - {e}") + continue + + return articles +``` + +#### 2. 
创建解析器类 + +文件:`src/parsers/sina_parser.py`(在文件末尾添加) + +```python +class SinaEntertainmentParser(BaseParser): + """新浪网娱乐新闻解析器""" + + def __init__(self): + self.logger = get_logger(__name__) + self.http_client = HttpClient() + + def parse(self, url: str) -> Article: + """解析新浪网文章详情页""" + html = self.http_client.get(url) + soup = BeautifulSoup(html, "lxml") + + # 获取文章标题 + article_title_tag = soup.select_one("div.main-content h1.main-title") + article_title = article_title_tag.get_text(strip=True) if article_title_tag else "未知标题" + + # 获取文章发布时间 + time_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source span.date") + publish_time = time_tag.get_text(strip=True) if time_tag else "1949-01-01 12:00:00" + + # 获取文章作者 + author_tag = soup.select_one("div.main-content div.top-bar-wrap div.date-source a") + author = author_tag.get_text(strip=True) if author_tag else "未知" + + # 获取文章正文段落 + article_div = soup.select_one("div.main-content div.article") + if not article_div: + raise ValueError("无法找到文章内容") + + paragraphs = article_div.find_all('p') + content = '\n'.join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)) + + return Article( + url=url, + title=article_title, + publish_time=publish_time, + author=author, + content=content, + ) +``` + +#### 3. 更新配置文件 + +文件:`config/config.yaml` + +```yaml +sina: + base_url: "https://sina.com.cn" + categories: + auto: + url: "https://auto.sina.com.cn/" + category_id: 6 + name: "汽车" + css_selector: "div.feed_card.ty-feed-card-container div.cardlist-a__list div.ty-card.ty-card-type1" + detail_css_selector: "div.main-content" + gov: + url: "https://gov.sina.com.cn/" + category_id: 7 + name: "政务" + css_selector: "a[href]" + entertainment: # 新增 + url: "https://ent.sina.com.cn/" + category_id: 1 + name: "娱乐" + css_selector: "div.feed_card.ty-feed-card-container div.cardlist-a__list" +``` + +#### 4. 注册爬虫到 CLI + +文件:`src/cli/main.py` + +```python +CRAWLER_CLASSES = { + # ... 其他配置 ... + + 'sina': { + 'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'), + 'gov': ('crawlers.sina.gov', 'SinaGovCrawler'), + 'entertainment': ('crawlers.sina.entertainment', 'EntertainmentCrawler'), # 新增 + }, +} +``` + +#### 5. 测试运行 + +```bash +# 运行爬虫 +python -m src.cli.main sina:entertainment + +# 限制数量测试 +python -m src.cli.main sina:entertainment --max 3 +``` + +--- + +## 常见问题 + +### Q1: 如何确定使用 DynamicCrawler 还是 StaticCrawler? + +**判断方法**: +1. 使用浏览器查看网页源代码(Ctrl+U) +2. 如果源代码中包含完整的文章列表和内容,使用 `StaticCrawler` +3. 如果源代码中内容很少,内容通过 JavaScript 动态加载,使用 `DynamicCrawler` + +**示例**: +- 网易新闻:列表页需要滚动加载 → `DynamicCrawler` +- 简单的博客网站:内容直接在 HTML 中 → `StaticCrawler` + +### Q2: 如何找到正确的 CSS 选择器? + +**方法 1: 使用浏览器开发者工具** +1. 按 F12 打开开发者工具 +2. 使用元素选择器(Ctrl+Shift+C)点击目标元素 +3. 在 Elements 面板中,右键点击元素 → Copy → Copy selector + +**方法 2: 使用 BeautifulSoup 测试** +```python +from bs4 import BeautifulSoup + +html = """...""" +soup = BeautifulSoup(html, "lxml") +elements = soup.select("div.news-list a") +print(len(elements)) +``` + +### Q3: 爬虫运行失败,如何调试? + +**步骤**: +1. 查看日志文件:`logs\crawler.log` +2. 开启调试模式:`python -m src.cli.main example:tech --debug` +3. 手动测试 URL 是否可访问 +4. 检查 CSS 选择器是否正确 +5. 检查网站是否有反爬机制(如需要登录、验证码) + +**常见错误**: +- `未找到新闻列表`: CSS 选择器错误 +- `解析文章失败`: URL 格式错误或网站结构变化 +- `HTTP请求失败`: 网络问题或被反爬 + +### Q4: 如何处理相对路径的 URL? 
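+
+比较省事的做法是用标准库的 `urllib.parse.urljoin` 自动拼接相对路径(下面仅为示意,`https://www.example.com/tech/` 是假设的列表页地址):
+
+```python
+from urllib.parse import urljoin
+
+# urljoin 能正确处理 "/a/b"、"../a"、"//host/path" 以及完整 URL 等各种写法
+href = urljoin("https://www.example.com/tech/", article_link.get('href'))
+```
+
+如果想手动区分各种情况,也可以按下面的方式判断: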
+ +```python +href = article_link.get('href') + +if href.startswith('/'): + # 相对路径,拼接基础 URL + base_url = "https://www.example.com" + href = base_url + href +elif href.startswith('http'): + # 绝对路径,直接使用 + pass +else: + # 其他情况,拼接当前页面的基础路径 + href = "https://www.example.com/" + href +``` + +### Q5: 如何处理时间格式不一致? + +```python +import re +from datetime import datetime + +def normalize_time(time_str): + """标准化时间格式""" + # 定义多种时间格式 + formats = [ + "%Y年%m月%d日 %H:%M", + "%Y-%m-%d %H:%M:%S", + "%Y/%m/%d %H:%M", + "%Y.%m.%d %H:%M", + ] + + for fmt in formats: + try: + dt = datetime.strptime(time_str, fmt) + return dt.strftime("%Y-%m-%d %H:%M:%S") + except: + continue + + # 如果都不匹配,返回默认值 + return "1949-01-01 12:00:00" +``` + +### Q6: 如何提取干净的正文内容? + +```python +# 移除不需要的标签 +for tag in article_body.select("script, style, iframe, .ad, .comment"): + tag.decompose() + +# 提取段落 +content_lines = [] +for p in article_body.find_all("p"): + text = p.get_text(strip=True) + if text and len(text) > 10: # 过滤太短的段落 + content_lines.append(text) + +content = '\n'.join(content_lines) +``` + +### Q7: 如何处理文章重复? + +系统自动处理重复: +1. 通过 URL 去重 +2. 通过内容哈希(content_hash)去重 +3. 使用 `INSERT IGNORE` 语句避免重复插入 + +**查看重复数据**: +```sql +SELECT url, COUNT(*) as count +FROM news +GROUP BY url +HAVING count > 1; +``` + +### Q8: 如何批量运行所有爬虫? + +```bash +# 运行所有爬虫 +python -m src.cli.main --all + +# 限制每个爬虫的数量 +python -m src.cli.main --all --max 5 +``` + +### Q9: 如何修改最大爬取数量? + +**方法 1: 命令行参数** +```bash +python -m src.cli.main example:tech --max 20 +``` + +**方法 2: 配置文件** +编辑 `config/config.yaml`: +```yaml +crawlers: + max_articles: 20 # 修改全局默认值 +``` + +### Q10: 爬虫运行很慢,如何优化? + +**优化策略**: +1. 减少 `max_articles` 数量 +2. 调整 `selenium.scroll_pause_time`(滚动暂停时间) +3. 减少 `selenium.max_scroll_times`(最大滚动次数) +4. 使用 `StaticCrawler` 代替 `DynamicCrawler`(如果可能) + +**配置示例**: +```yaml +selenium: + scroll_pause_time: 0.5 # 减少暂停时间 + max_scroll_times: 3 # 减少滚动次数 +``` + +--- + +## 附录 + +### A. 数据库表结构 + +```sql +CREATE TABLE `news` ( + `id` int NOT NULL AUTO_INCREMENT, + `url` varchar(500) NOT NULL COMMENT '文章URL', + `title` varchar(500) NOT NULL COMMENT '文章标题', + `content` text COMMENT '文章内容', + `category_id` int NOT NULL COMMENT '分类ID', + `publish_time` varchar(50) DEFAULT NULL COMMENT '发布时间', + `author` varchar(100) DEFAULT NULL COMMENT '作者', + `source` varchar(50) DEFAULT NULL COMMENT '新闻源', + `content_hash` varchar(64) DEFAULT NULL COMMENT '内容哈希', + `created_at` datetime DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间', + PRIMARY KEY (`id`), + UNIQUE KEY `url` (`url`), + KEY `content_hash` (`content_hash`), + KEY `category_id` (`category_id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='新闻表'; +``` + +### B. 分类表结构 + +```sql +CREATE TABLE `news_category` ( + `id` int NOT NULL AUTO_INCREMENT, + `name` varchar(50) NOT NULL COMMENT '分类名称', + `code` varchar(50) NOT NULL COMMENT '分类代码', + `description` varchar(200) DEFAULT NULL COMMENT '描述', + `sort_order` int DEFAULT 0 COMMENT '排序', + PRIMARY KEY (`id`), + UNIQUE KEY `code` (`code`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='新闻分类表'; +``` + +### C. 常用 CSS 选择器示例 + +```python +# 通过 ID 选择 +soup.select_one("#article-title") + +# 通过 class 选择 +soup.select_one(".article-title") +soup.select_one("div.article-title") + +# 通过属性选择 +soup.select_one("a[href^='/article/']") + +# 组合选择 +soup.select_one("div.news-list div.news-item a.title") + +# 多层级选择 +soup.select_one("div.main-content > div.article > p") + +# 伪类选择 +soup.select_one("ul.news-list li:first-child a") +``` + +### D. 
BeautifulSoup 常用方法 + +```python +# 获取文本 +element.get_text(strip=True) + +# 获取属性 +element.get('href') +element.get('class') + +# 查找单个元素 +soup.select_one("div.title") +soup.find("div", class_="title") + +# 查找多个元素 +soup.select("div.news-item") +soup.find_all("div", class_="news-item") + +# 父元素和子元素 +parent = element.parent +children = element.children +``` + +### E. 项目依赖 + +查看 `requirements.txt`: +``` +requests>=2.31.0 +beautifulsoup4>=4.12.0 +lxml>=4.9.0 +selenium>=4.15.0 +PyYAML>=6.0 +``` + +--- + +## 总结 + +添加新爬虫的核心步骤: + +1. ✅ 分析目标网站结构 +2. ✅ 创建爬虫类(继承 `DynamicCrawler` 或 `StaticCrawler`) +3. ✅ 创建解析器类(继承 `BaseParser`) +4. ✅ 更新配置文件(`config.yaml`) +5. ✅ 注册爬虫到 CLI(`src/cli/main.py`) +6. ✅ 测试运行 + +遵循本指南,您可以为新闻爬虫系统添加任意数量的新网站和分类爬虫。 + +--- + +**文档版本**: 1.0 +**最后更新**: 2026-01-15 +**维护者**: 新闻爬虫项目组 \ No newline at end of file diff --git a/crawler-module/kr36-health.txt b/crawler-module/kr36-health.txt deleted file mode 100644 index dda0b1b..0000000 --- a/crawler-module/kr36-health.txt +++ /dev/null @@ -1,33 +0,0 @@ - -这是36kr关于爬取健康相关新闻的代码 -```python -import requests -from bs4 import BeautifulSoup -import re - -URL = "https://www.36kr.com/search/articles/%E5%81%A5%E5%BA%B7" -TARGET_URL = "https://www.36kr.com" -headers = { - "User-Agent": ( - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/120.0.0.0 Safari/537.36" - ) -} -resp = requests.get(URL,headers=headers,timeout=10) -resp.raise_for_status() -resp.encoding = "utf-8" -# print(resp.text) -with open("example/example-11.html","r",encoding="utf-8") as f: - html = f.read() - -# soup = BeautifulSoup(resp.text,"lxml") -soup = BeautifulSoup(html,"lxml") -li_list = soup.select("div.kr-layout div.kr-layout-main div.kr-layout-content div.kr-search-result-list ul.kr-search-result-list-main > li") - -for item in li_list: - a = item.select_one("div.kr-shadow-content a") - href = TARGET_URL+ a.get("href") - print(href) - -``` \ No newline at end of file diff --git a/crawler-module/src/cli/main.py b/crawler-module/src/cli/main.py index 6879683..fc21bcc 100644 --- a/crawler-module/src/cli/main.py +++ b/crawler-module/src/cli/main.py @@ -34,6 +34,12 @@ CRAWLER_CLASSES = { }, 'sina': { 'auto': ('crawlers.sina.auto', 'SinaAutoCrawler'), + 'gov': ('crawlers.sina.gov', 'SinaGovCrawler'), + }, + 'tencent': { + 'auto': ('crawlers.tencent.auto', 'AutoCrawler'), + 'war': ('crawlers.tencent.war', 'WarCrawler'), + 'war_web': ('crawlers.tencent.war_web', 'WarWebCrawler'), }, } @@ -62,6 +68,11 @@ def list_crawlers() -> List[str]: for category in sina_categories.keys(): crawlers.append(f"sina:{category}") + # 腾讯爬虫 + tencent_categories = config.get('sources.tencent.categories', {}) + for category in tencent_categories.keys(): + crawlers.append(f"tencent:{category}") + return crawlers diff --git a/crawler-module/src/crawlers/sina/gov.py b/crawler-module/src/crawlers/sina/gov.py new file mode 100644 index 0000000..f362783 --- /dev/null +++ b/crawler-module/src/crawlers/sina/gov.py @@ -0,0 +1,68 @@ +""" +新浪政务新闻爬虫 +""" + +from typing import List +from bs4 import BeautifulSoup +import re + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from base.crawler_base import StaticCrawler, Article +from parsers.sina_parser import SinaAutoParser + + +class SinaGovCrawler(StaticCrawler): + """新浪政务新闻爬虫""" + + def _extract_article_urls(self, html: str) -> List[str]: + """从HTML中提取文章URL列表""" + soup = BeautifulSoup(html, "lxml") + urls = [] + + # 遍历所有带href的a标签 + for a in 
soup.select("a[href]"): + href = a.get("href") + + if not href: + continue + + # 补全 // 开头的链接 + if href.startswith("//"): + href = "https:" + href + + # 正则匹配真正的新闻正文页 + # 格式: https://news.sina.com.cn/xxx/2024-01-14/doc-xxxxxxxxx.shtml + if re.search(r"^https://news\.sina\.com\.cn/.+/\d{4}-\d{2}-\d{2}/doc-.*\.shtml$", href): + title = a.get_text(strip=True) + if title: # 有标题,这才是新闻 + urls.append(href) + + # 去重 + return list(dict.fromkeys(urls)) + + def _fetch_articles(self, urls: List[str]) -> List[Article]: + """爬取文章详情""" + articles = [] + parser = SinaAutoParser() + + for i, url in enumerate(urls[:self.max_articles]): + try: + article = parser.parse(url) + article.category_id = self.category_id + article.source = "新浪" + + if not article.author: + article.author = "新浪政务" + + if article.is_valid(): + articles.append(article) + self.logger.info(f"[{i+1}/{len(urls)}] {article.title}") + + except Exception as e: + self.logger.error(f"解析文章失败: {url} - {e}") + continue + + return articles \ No newline at end of file diff --git a/crawler-module/src/crawlers/tencent/__init__.py b/crawler-module/src/crawlers/tencent/__init__.py new file mode 100644 index 0000000..eafbfe2 --- /dev/null +++ b/crawler-module/src/crawlers/tencent/__init__.py @@ -0,0 +1,2 @@ + +# 腾讯新闻爬虫 diff --git a/crawler-module/src/crawlers/tencent/auto.py b/crawler-module/src/crawlers/tencent/auto.py new file mode 100644 index 0000000..4f803a5 --- /dev/null +++ b/crawler-module/src/crawlers/tencent/auto.py @@ -0,0 +1,210 @@ +""" +腾讯汽车新闻爬虫 +""" + +import time +import random +import hashlib +from typing import List +import requests + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from base.crawler_base import StaticCrawler, Article +from parsers.tencent_parser import TencentParser + + +class AutoCrawler(StaticCrawler): + """腾讯汽车新闻爬虫""" + + def __init__(self, source: str, category: str): + super().__init__(source, category) + + # 腾讯API配置 + self.api_url = "https://i.news.qq.com/web_feed/getPCList" + self.channel_id = "news_news_auto" # 汽车频道 + self.seen_ids = set() + self.item_count = 20 # 每页固定请求20条 + + def _generate_trace_id(self): + """生成trace_id""" + random_str = str(random.random()) + str(time.time()) + return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12] + + def crawl(self) -> List[Article]: + """ + 执行爬取任务(重写基类方法以支持API接口) + + Returns: + 文章列表 + """ + self.logger.info(f"开始爬取腾讯{self.category_name}新闻") + + try: + # 生成设备ID + device_id = self._generate_trace_id() + + # 获取文章URL列表 + article_urls = self._fetch_article_urls_from_api(device_id) + self.logger.info(f"找到 {len(article_urls)} 篇文章") + + # 爬取文章详情 + articles = self._fetch_articles(article_urls) + + self.logger.info(f"成功爬取 {len(articles)} 篇文章") + return articles + + except Exception as e: + self.logger.error(f"爬取失败: {e}", exc_info=True) + return [] + finally: + self._cleanup() + + def _fetch_article_urls_from_api(self, device_id: str) -> List[str]: + """ + 从API获取文章URL列表 + + Args: + device_id: 设备ID + + Returns: + 文章URL列表 + """ + urls = [] + + # 根据 max_articles 动态计算需要抓取的页数 + # 每页20条,向上取整 + import math + max_pages = math.ceil(self.max_articles / self.item_count) + self.logger.info(f"根据 max_articles={self.max_articles},计算需要抓取 {max_pages} 页") + + for flush_num in range(max_pages): + payload = { + "base_req": {"from": "pc"}, + "forward": "1", + "qimei36": device_id, + "device_id": device_id, + "flush_num": flush_num + 1, + "channel_id": self.channel_id, + "item_count": self.item_count, + "is_local_chlid": "0" 
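+                # flush_num 从 1 开始递增,相当于页码;item_count 为每页条数;整个字典原样作为请求体发给 getPCList 接口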
+ } + + try: + headers = { + "User-Agent": self.http_client.session.headers.get("User-Agent"), + "Referer": "https://new.qq.com/", + "Origin": "https://new.qq.com", + "Content-Type": "application/json" + } + + response = requests.post( + self.api_url, + headers=headers, + json=payload, + timeout=10 + ) + + if response.status_code == 200: + data = response.json() + if data.get("code") == 0 and "data" in data: + news_list = data["data"] + if not news_list: + self.logger.info("没有更多数据了") + break + + # 提取URL + for item in news_list: + news_id = item.get("id") + + # 去重 + if news_id in self.seen_ids: + continue + self.seen_ids.add(news_id) + + # 过滤视频新闻(articletype == "4") + article_type = item.get("articletype") + if article_type == "4": + continue + + # 提取URL + url = item.get("link_info", {}).get("url") + if url: + urls.append(url) + + # 如果已经获取到足够的文章数量,提前终止 + if len(urls) >= self.max_articles: + self.logger.info(f"已获取 {len(urls)} 篇文章,达到目标数量,停止抓取") + break + + # 如果外层循环也需要终止 + if len(urls) >= self.max_articles: + break + + else: + self.logger.warning(f"接口返回错误: {data.get('message')}") + else: + self.logger.warning(f"HTTP请求失败: {response.status_code}") + + except Exception as e: + self.logger.error(f"获取API数据失败: {e}") + + # 延迟,避免请求过快 + time.sleep(random.uniform(1, 2)) + + return urls + + def _fetch_page(self) -> str: + """ + 获取页面HTML(腾讯爬虫不使用此方法) + + Returns: + 空字符串 + """ + return "" + + def _extract_article_urls(self, html: str) -> List[str]: + """ + 从HTML中提取文章URL列表(腾讯爬虫不使用此方法) + + Args: + html: 页面HTML内容 + + Returns: + 空列表 + """ + return [] + + def _fetch_articles(self, urls: List[str]) -> List[Article]: + """ + 爬取文章详情 + + Args: + urls: 文章URL列表 + + Returns: + 文章列表 + """ + articles = [] + parser = TencentParser() + + for i, url in enumerate(urls[:self.max_articles]): + try: + article = parser.parse(url) + article.category_id = self.category_id + article.source = "腾讯" + + if not article.author: + article.author = "腾讯汽车" + + if article.is_valid(): + articles.append(article) + self.logger.info(f"[{i+1}/{len(urls)}] {article.title}") + + except Exception as e: + self.logger.error(f"解析文章失败: {url} - {e}") + continue + + return articles \ No newline at end of file diff --git a/crawler-module/src/crawlers/tencent/war.py b/crawler-module/src/crawlers/tencent/war.py new file mode 100644 index 0000000..494e83f --- /dev/null +++ b/crawler-module/src/crawlers/tencent/war.py @@ -0,0 +1,211 @@ +""" +腾讯军事新闻爬虫(API版) +使用腾讯新闻 API 接口获取数据,性能更好 +""" + +import time +import random +import hashlib +from typing import List +import requests + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from base.crawler_base import StaticCrawler, Article +from parsers.tencent_parser import TencentParser + + +class WarCrawler(StaticCrawler): + """腾讯军事新闻爬虫(API版)""" + + def __init__(self, source: str, category: str): + super().__init__(source, category) + + # 腾讯API配置 + self.api_url = "https://i.news.qq.com/web_feed/getPCList" + self.channel_id = "news_news_mil" # 军事频道 + self.seen_ids = set() + self.item_count = 20 # 每页固定请求20条 + + def _generate_trace_id(self): + """生成trace_id""" + random_str = str(random.random()) + str(time.time()) + return "0_" + hashlib.md5(random_str.encode()).hexdigest()[:12] + + def crawl(self) -> List[Article]: + """ + 执行爬取任务(重写基类方法以支持API接口) + + Returns: + 文章列表 + """ + self.logger.info(f"开始爬取腾讯{self.category_name}新闻") + + try: + # 生成设备ID + device_id = self._generate_trace_id() + + # 获取文章URL列表 + article_urls = 
self._fetch_article_urls_from_api(device_id) + self.logger.info(f"找到 {len(article_urls)} 篇文章") + + # 爬取文章详情 + articles = self._fetch_articles(article_urls) + + self.logger.info(f"成功爬取 {len(articles)} 篇文章") + return articles + + except Exception as e: + self.logger.error(f"爬取失败: {e}", exc_info=True) + return [] + finally: + self._cleanup() + + def _fetch_article_urls_from_api(self, device_id: str) -> List[str]: + """ + 从API获取文章URL列表 + + Args: + device_id: 设备ID + + Returns: + 文章URL列表 + """ + urls = [] + + # 根据 max_articles 动态计算需要抓取的页数 + # 每页20条,向上取整 + import math + max_pages = math.ceil(self.max_articles / self.item_count) + self.logger.info(f"根据 max_articles={self.max_articles},计算需要抓取 {max_pages} 页") + + for flush_num in range(max_pages): + payload = { + "base_req": {"from": "pc"}, + "forward": "1", + "qimei36": device_id, + "device_id": device_id, + "flush_num": flush_num + 1, + "channel_id": self.channel_id, + "item_count": self.item_count, + "is_local_chlid": "0" + } + + try: + headers = { + "User-Agent": self.http_client.session.headers.get("User-Agent"), + "Referer": "https://new.qq.com/", + "Origin": "https://new.qq.com", + "Content-Type": "application/json" + } + + response = requests.post( + self.api_url, + headers=headers, + json=payload, + timeout=10 + ) + + if response.status_code == 200: + data = response.json() + if data.get("code") == 0 and "data" in data: + news_list = data["data"] + if not news_list: + self.logger.info("没有更多数据了") + break + + # 提取URL + for item in news_list: + news_id = item.get("id") + + # 去重 + if news_id in self.seen_ids: + continue + self.seen_ids.add(news_id) + + # 过滤视频新闻(articletype == "4") + article_type = item.get("articletype") + if article_type == "4": + continue + + # 提取URL + url = item.get("link_info", {}).get("url") + if url: + urls.append(url) + + # 如果已经获取到足够的文章数量,提前终止 + if len(urls) >= self.max_articles: + self.logger.info(f"已获取 {len(urls)} 篇文章,达到目标数量,停止抓取") + break + + # 如果外层循环也需要终止 + if len(urls) >= self.max_articles: + break + + else: + self.logger.warning(f"接口返回错误: {data.get('message')}") + else: + self.logger.warning(f"HTTP请求失败: {response.status_code}") + + except Exception as e: + self.logger.error(f"获取API数据失败: {e}") + + # 延迟,避免请求过快 + time.sleep(random.uniform(1, 2)) + + return urls + + def _fetch_page(self) -> str: + """ + 获取页面HTML(腾讯爬虫不使用此方法) + + Returns: + 空字符串 + """ + return "" + + def _extract_article_urls(self, html: str) -> List[str]: + """ + 从HTML中提取文章URL列表(腾讯爬虫不使用此方法) + + Args: + html: 页面HTML内容 + + Returns: + 空列表 + """ + return [] + + def _fetch_articles(self, urls: List[str]) -> List[Article]: + """ + 爬取文章详情 + + Args: + urls: 文章URL列表 + + Returns: + 文章列表 + """ + articles = [] + parser = TencentParser() + + for i, url in enumerate(urls[:self.max_articles]): + try: + article = parser.parse(url) + article.category_id = self.category_id + article.source = "腾讯" + + if not article.author: + article.author = "腾讯军事" + + if article.is_valid(): + articles.append(article) + self.logger.info(f"[{i+1}/{len(urls)}] {article.title}") + + except Exception as e: + self.logger.error(f"解析文章失败: {url} - {e}") + continue + + return articles \ No newline at end of file diff --git a/crawler-module/src/crawlers/tencent/war_web.py b/crawler-module/src/crawlers/tencent/war_web.py new file mode 100644 index 0000000..ddf7c75 --- /dev/null +++ b/crawler-module/src/crawlers/tencent/war_web.py @@ -0,0 +1,79 @@ +""" +腾讯军事新闻爬虫(网页版) +使用 Selenium 动态加载页面,适用于网页抓取 +""" + +from typing import List +from bs4 import BeautifulSoup + +import sys +import os 
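+# 将 src 目录加入模块搜索路径,便于以脚本方式运行时直接导入 base、parsers 等包(与其他爬虫文件的写法一致)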
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + +from base.crawler_base import DynamicCrawler, Article +from parsers.tencent_parser import TencentParser + + +class WarWebCrawler(DynamicCrawler): + """腾讯军事新闻爬虫(网页版)""" + + def _extract_article_urls(self, html: str) -> List[str]: + """ + 从HTML中提取文章URL列表 + + Args: + html: 页面HTML内容 + + Returns: + 文章URL列表 + """ + soup = BeautifulSoup(html, "lxml") + urls = [] + + # 选择军事频道的文章列表 + # dt-params*='article_type=0' 过滤掉视频新闻 + div_list = soup.select("div[id='channel-feed-area'] div.channel-feed-list div.channel-feed-item[dt-params*='article_type=0']") + + for div in div_list: + article_link = div.select_one("a.article-title") + if article_link: + href = article_link.get('href') + if href: + # 处理相对路径 + if href.startswith('/'): + href = f"https://news.qq.com{href}" + urls.append(href) + + return urls + + def _fetch_articles(self, urls: List[str]) -> List[Article]: + """ + 爬取文章详情 + + Args: + urls: 文章URL列表 + + Returns: + 文章列表 + """ + articles = [] + parser = TencentParser() + + for i, url in enumerate(urls[:self.max_articles]): + try: + article = parser.parse(url) + article.category_id = self.category_id + article.source = "腾讯" + + if not article.author: + article.author = "腾讯军事" + + if article.is_valid(): + articles.append(article) + self.logger.info(f"[{i+1}/{len(urls)}] {article.title}") + + except Exception as e: + self.logger.error(f"解析文章失败: {url} - {e}") + continue + + return articles \ No newline at end of file diff --git a/crawler-module/src/parsers/tencent_parser.py b/crawler-module/src/parsers/tencent_parser.py new file mode 100644 index 0000000..0619dd3 --- /dev/null +++ b/crawler-module/src/parsers/tencent_parser.py @@ -0,0 +1,79 @@ +""" +腾讯新闻文章解析器 +""" + +from bs4 import BeautifulSoup + +import sys +import os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from base.parser_base import BaseParser +from base.crawler_base import Article +from utils.http_client import HttpClient +from utils.logger import get_logger + + +class TencentParser(BaseParser): + """腾讯新闻文章解析器""" + + def __init__(self): + self.logger = get_logger(__name__) + self.http_client = HttpClient() + + def parse(self, url: str) -> Article: + """ + 解析腾讯新闻文章详情页 + + Args: + url: 文章URL + + Returns: + 文章对象 + """ + # 获取页面HTML + html = self.http_client.get(url) + soup = BeautifulSoup(html, "lxml") + + # 提取标题 + title = None + title_tag = soup.select_one("div.content-left div.content-article > h1") + if title_tag: + title = title_tag.get_text(strip=True) + + # 提取作者 + author = None + author_tag = soup.select_one("div.content-left div.content-article div.article-author div.media-info a p") + if author_tag: + author = author_tag.get_text(strip=True) + + # 提取发布时间 + publish_time = None + time_tag = soup.select_one("div.content-left div.content-article div.article-author div.media-info p.media-meta > span") + if time_tag: + publish_time = time_tag.get_text(strip=True) + + # 提取正文内容 + content_lines = [] + content_tag = soup.select_one("div.content-left div.content-article #article-content div.rich_media_content") + + if content_tag: + # 提取段落,跳过只包含图片的段落 + for p in content_tag.find_all("p"): + # 跳过只包含图片的 p + if p.find("img"): + continue + + text = p.get_text(strip=True) + if text: + content_lines.append(text) + + content = '\n'.join(content_lines) + + return Article( + url=url, + title=title, + publish_time=publish_time, + author=author, + content=content, + ) \ No newline at end of file diff --git 
a/crawler-module/tencent-war.txt b/crawler-module/tencent-war.txt new file mode 100644 index 0000000..d312d39 --- /dev/null +++ b/crawler-module/tencent-war.txt @@ -0,0 +1,31 @@ + +这是关于腾讯新闻网爬取军事分类新闻的一个可行的代码 +需要注意的是腾讯新闻解析文章详情的代码是通用的,这里没有给出(使用tencent_parser.py即可) +注意这里需要使用到动态加载(继承DynamicCrawler,并且无需重写_fetch_page()) +```python +import requests +from bs4 import BeautifulSoup + + +URL = "https://news.qq.com/ch/milite" +headers = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36" + ) +} +resp = requests.get(URL,headers=headers,timeout=10) +resp.raise_for_status() +resp.encoding = "utf-8" +# print(resp.text) +# with open("example/example-13.html","r",encoding="utf-8") as f: +# html = f.read() + +soup = BeautifulSoup(resp.text,"lxml") +# soup = BeautifulSoup(html,"lxml") +div_list = soup.select("div[id='channel-feed-area'] div.channel-feed-list div.channel-feed-item[dt-params*='article_type=0']") +for div in div_list: + href = div.select_one("a.article-title").get("href") + print(href) +``` \ No newline at end of file diff --git a/ml-module/models/traditional/nb_classifier.pkl b/ml-module/models/traditional/nb_classifier.pkl deleted file mode 100644 index 8a30c75..0000000 Binary files a/ml-module/models/traditional/nb_classifier.pkl and /dev/null differ diff --git a/ml-module/models/traditional/nb_vectorizer.pkl b/ml-module/models/traditional/nb_vectorizer.pkl deleted file mode 100644 index c05705d..0000000 Binary files a/ml-module/models/traditional/nb_vectorizer.pkl and /dev/null differ diff --git a/ml-module/src/traditional/train_model.py b/ml-module/src/traditional/train_model.py index 7615eab..88e610c 100644 --- a/ml-module/src/traditional/train_model.py +++ b/ml-module/src/traditional/train_model.py @@ -232,8 +232,28 @@ if __name__ == '__main__': classifier.save_model('../../models/traditional') # 测试预测 - test_title = "华为发布新款折叠屏手机" - test_content = "华为今天正式发布了新一代折叠屏手机,搭载最新麒麟芯片..." 
+ test_title = "高效办成一件事”" + test_content = """ + 国务院办公厅12日对外发布《“高效办成一件事”2026年度第一批重点事项清单》,并印发通知指出,统筹线上和线下政务服务渠道,因地制宜推进新一批重点事项落实落细,持续优化已推出重点事项服务,推动政务服务从“能办”向“好办、易办”转变。 + +此次清单包含13项内容,既涵盖科技型企业创新政策扶持、知识产权保护、举办体育赛事活动等赋能发展的“大事”,也包含育儿补贴申领、灵活就业参保等贴近生活的“小事”,更有外籍来华人员办理电话卡、海船开航等服务开放的“新事”。可以说,清单聚焦的都是市场主体和群众关切,和我们每一个人都息息相关。 + +去年7月,国办印发关于健全“高效办成一件事”重点事项常态化推进机制的意见,要求加强重点事项清单管理,推动重点事项常态化实施,拓展“高效办成一件事”应用领域。 + +让政务服务好办、易办,背后涉及的改革并不轻松。“一件事”往往横跨多个部门、多个系统,牵涉数据共享、流程重塑和权责厘清。正因如此,“高效办成一件事”并不是简单做减法,而是对行政体系协同能力的一次系统性考验:能否打破条块分割,能否让规则围绕真实需求变动,能否把制度设计真正落到操作层面。 + +在这个过程中,很多地方都做出了积极努力。比如北京市推进个人创业“一件事”时,按照实际诉求进行流程优化再造,最终实现“一表申请、一次办结”,还能用一张表单同时办理创业担保贷款和一次性创业补贴两项业务;上海由市政府办公厅统筹,各类“一件事”部门形成多个工作小组推动“一件事”落地见实效。 + +这些改革从群众和企业的呼声出发,直面办事过程中的堵点与不便,通过打破部门壁垒、推动协同和数据共享,将分散环节重新整合,让服务真正落到具体场景中,体现出可感知的效率提升,也让办好、易办“一件事”逐渐从制度设计走向实际运行。 + +截至目前,国家层面“高效办成一件事”重点事项清单已推出五批共55项,便民惠企清单持续扩容,政务服务改革红利持续释放:流程被压缩、材料被整合,时间成本不断降低。 + +这一改革最终通向的,是办事效率和群众满意度的持续提升。对企业而言,是政策兑现和经营活动更可预期;对个人而言,是办事体验更稳定、更省心。每一次分散的便利叠加起来,改革的价值就不再停留在抽象的表述中,而是转化为日常生活和经济运行中看得见、摸得着的获得感;也是在为经济社会的发展,从细微处巩固基础、积蓄动能。 + +政务服务改革永远在路上。从更长的时间维度看,“高效办成一件事”所指向的,不只是推动某一件、某一批事项的完成,而是要带动政府治理能力整体提升,助力高质量发展。 + +随着社会需求变化、新业态不断出现,新的“堵点”“断点”还可能不断出现,改革也必须随之滚动推进、动态优化。只有保持这种不断拆解问题、不断修补细节的耐心,政务服务才能持续进阶,在回应现实需求中走向更加成熟与稳健。 + """ result = classifier.predict(test_title, test_content) print("\n测试预测结果:", result) else: diff --git a/ml-module/src/utils/data_loader.py b/ml-module/src/utils/data_loader.py index 1ba76ba..ff61e24 100644 --- a/ml-module/src/utils/data_loader.py +++ b/ml-module/src/utils/data_loader.py @@ -154,7 +154,7 @@ if __name__ == '__main__': loader = DataLoader(db_url) # 从数据库加载数据并保存到本地 - data = loader.update_local_data(limit=800) + data = loader.update_local_data(limit=2000) if data is not None: print(f"成功加载数据,共 {len(data)} 条记录")