diff --git a/crawler-module/config/config.yaml b/crawler-module/config/config.yaml
index bb1ad19..605eceb 100644
--- a/crawler-module/config/config.yaml
+++ b/crawler-module/config/config.yaml
@@ -31,7 +31,7 @@ http:
# Selenium配置
selenium:
- headless: true
+ headless: false
log_level: 3
window_size: "1920,1080"
page_load_timeout: 60
@@ -180,3 +180,12 @@ sources:
name: "AI"
css_selector: ""
# 注意:此分类通过搜索接口获取数据,而非正常的分类列表接口
+
+ souhu:
+ base_url: "https://news.sohu.com/"
+ categories:
+ house:
+ url: "https://house.focus.cn/zixun/"
+ category_id: 10
+ name: "房产"
+ css_selector: ".TPLTextFeedItem, .TPLImageTextFeedItem"
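+        # Note: the listing loads dynamically and is paginated by clicking the "next page" button (see crawlers/souhu/house.py)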
diff --git a/crawler-module/souhu-house.txt b/crawler-module/souhu-house.txt
new file mode 100644
index 0000000..4897358
--- /dev/null
+++ b/crawler-module/souhu-house.txt
@@ -0,0 +1,65 @@
+
+This is working reference code for crawling the real-estate category of Sohu news (the listing lives on Sohu Focus, house.focus.cn).
+Note that the page is loaded dynamically, and pagination requires clicking the "next page" button in the navigation; the number of page turns is determined by the --max argument.
+This is the page's pagination navigation:
+```html
+
+```
+```python
+import requests
+from bs4 import BeautifulSoup
+
+
+URL = "https://house.focus.cn/zixun/"
+headers = {
+ "User-Agent": (
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/120.0.0.0 Safari/537.36"
+ )
+}
+resp = requests.get(URL,headers=headers,timeout=10)
+resp.raise_for_status()
+resp.encoding = "utf-8"
+# with open("example/example-14.html", "r",encoding="utf-8") as f:
+# html = f.read()
+def parser_article(href):
+    resp_article = requests.get(href, headers=headers, timeout=10)
+    soup_article = BeautifulSoup(resp_article.text,"lxml")
+    # print(resp_article.text)
+    # Title
+    title_tag = soup_article.select_one("#article-container div.text-title h1")
+    title = title_tag.get_text(strip=True)
+    # Author
+    author_tag = soup_article.select_one("#article-container .article-info .author a")
+    author = author_tag.get_text(strip=True)
+    # Publish time
+    time_tag = soup_article.select_one("#article-container .article-info .author #news-time")
+    time = time_tag.get_text(strip=True)
+    # Body text
+    content_tag = soup_article.select_one("article#mp-editor")
+    if content_tag:
+        # strip=True removes leading/trailing whitespace
+        # separator='\n\n' keeps a blank line between paragraphs
+        content = content_tag.get_text(separator='\n\n', strip=True)
+    else:
+        content = "Article body not found"
+ print(title,author,time)
+ print(content)
+
+soup = BeautifulSoup(resp.text,"lxml")
+# soup = BeautifulSoup(html,"lxml")  # use this instead when parsing the saved example file above
+div_list = soup.select('.TPLTextFeedItem, .TPLImageTextFeedItem')
+# print(div_list)
+for item in div_list:
+ link_tag = item.select_one('a')
+ if link_tag and link_tag.has_attr('href'):
+ url = link_tag['href']
+        # Handle possible protocol-relative links (//www...)
+ if url.startswith('//'):
+ url = 'https:' + url
+ parser_article(url)
+
+
+
+```
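+
+The requests-based snippet above only covers the first listing page; the next-page clicking described at the top is not shown. A minimal Selenium sketch of that pagination loop (the button selector comes from the crawler code; the waits and the roughly-10-articles-per-page figure are assumptions) could look like:
+```python
+import math
+import time
+
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+
+MAX_ARTICLES = 30  # would come from the --max argument
+NEXT_BTN_SELECTOR = "div.pagination-item-next.pagination-more"
+
+driver = webdriver.Chrome()
+driver.get("https://house.focus.cn/zixun/")
+time.sleep(2)  # let the dynamic feed render
+
+# Each listing page holds roughly 10 articles, so turn pages until enough have been seen.
+for _ in range(math.ceil(MAX_ARTICLES / 10) - 1):
+    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+    time.sleep(0.5)
+    buttons = driver.find_elements(By.CSS_SELECTOR, NEXT_BTN_SELECTOR)
+    if not buttons or "disabled" in (buttons[0].get_attribute("class") or ""):
+        break  # no next button, or it is disabled: last page reached
+    buttons[0].click()
+    time.sleep(2)  # wait for the next page to load
+    # parse driver.page_source with the same .TPLTextFeedItem / .TPLImageTextFeedItem selectors here
+
+driver.quit()
+```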
\ No newline at end of file
diff --git a/crawler-module/src/cli/main.py b/crawler-module/src/cli/main.py
index 5978946..d933995 100644
--- a/crawler-module/src/cli/main.py
+++ b/crawler-module/src/cli/main.py
@@ -49,6 +49,9 @@ CRAWLER_CLASSES = {
'finance': ('crawlers.tencent.finance', 'FinanceCrawler'),
'ai': ('crawlers.tencent.ai', 'SearchAICrawler'),
},
+ 'souhu': {
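+        # (module path, class name), same format as the Tencent entries above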
+ 'house': ('crawlers.souhu.house', 'HouseCrawler'),
+ },
}
@@ -97,6 +100,11 @@ def list_crawlers() -> List[str]:
for category in tencent_categories.keys():
crawlers.append(f"tencent:{category}")
+    # Sohu crawlers
+ souhu_categories = config.get('sources.souhu.categories', {})
+ for category in souhu_categories.keys():
+ crawlers.append(f"souhu:{category}")
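+    # e.g. with the souhu config above this appends "souhu:house"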
+
return crawlers
diff --git a/crawler-module/src/crawlers/souhu/__init__.py b/crawler-module/src/crawlers/souhu/__init__.py
new file mode 100644
index 0000000..28af9ac
--- /dev/null
+++ b/crawler-module/src/crawlers/souhu/__init__.py
@@ -0,0 +1,3 @@
+"""
+Sohu news crawler module
+"""
diff --git a/crawler-module/src/crawlers/souhu/house.py b/crawler-module/src/crawlers/souhu/house.py
new file mode 100644
index 0000000..243833f
--- /dev/null
+++ b/crawler-module/src/crawlers/souhu/house.py
@@ -0,0 +1,177 @@
+"""
+Real-estate news crawler for Sohu Focus (house.focus.cn)
+"""
+
+from typing import List, Set
+from bs4 import BeautifulSoup
+import time
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from base.crawler_base import DynamicCrawler, Article
+from parsers.souhu_parser import SouhuParser
+
+
+class HouseCrawler(DynamicCrawler):
+ """焦点房产网新闻爬虫"""
+
+ def crawl(self) -> List[Article]:
+ """
+        Override crawl() to support paginated navigation
+
+        The pagination control navigates to a new page, so article URLs have to be collected after every page turn
+        """
+        self.logger.info(f"Start crawling {self.category_name} news: {self.url}")
+
+ try:
+ if not self._driver:
+ from utils.selenium_driver import SeleniumDriver
+ self._driver = SeleniumDriver()
+
+            # Load the first listing page
+ self._driver._driver.get(self.url)
+ time.sleep(2)
+
+            # Collect all URLs (a set deduplicates them automatically)
+            all_urls: Set[str] = set()
+
+            # Collect URLs from the first page
+            first_page_urls = self._collect_urls_from_current_page()
+            all_urls.update(first_page_urls)
+            self.logger.info(f"Page 1: found {len(first_page_urls)} articles")
+
+            # Work out how many page turns are needed
+ import math
+            # Roughly 10 articles per listing page
+ max_clicks = math.ceil(self.max_articles / 10)
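+            # e.g. max_articles=25 -> ceil(25 / 10) = 3 page turns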
+
+ self.logger.info(f"计划翻页 {max_clicks} 次以获取约 {self.max_articles} 篇文章(每页10篇)")
+
+            # Turn pages and collect URLs
+ for page_num in range(1, max_clicks + 1):
+ try:
+                    # Scroll to the bottom so the button is visible
+ self._driver._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+ time.sleep(0.5)
+
+                    # Locate the next-page button
+ from selenium.webdriver.common.by import By
+ try:
+ next_btn = self._driver._driver.find_element(By.CSS_SELECTOR, "div.pagination-item-next.pagination-more")
+                    except Exception:
+                        self.logger.info("Next-page button not found; probably on the last page")
+ break
+
+                    # Check whether the button is disabled
+ btn_class = next_btn.get_attribute("class") or ""
+ if "disabled" in btn_class or next_btn.get_attribute("disabled"):
+ self.logger.info(f"下一页按钮已禁用,停止翻页")
+ break
+
+                    # Click the button
+                    next_btn.click()
+                    time.sleep(2)  # Wait for the new page to load
+
+                    # Collect URLs from the current page
+ current_page_urls = self._collect_urls_from_current_page()
+ new_urls_count = len(current_page_urls - all_urls)
+ all_urls.update(current_page_urls)
+
+ self.logger.info(f"第 {page_num + 1} 页:找到 {len(current_page_urls)} 篇文章(新增 {new_urls_count} 篇),总计 {len(all_urls)} 篇")
+
+                    # Stop paging once enough articles have been collected
+                    if len(all_urls) >= self.max_articles:
+                        self.logger.info(f"Collected enough articles ({len(all_urls)} >= {self.max_articles}); stop paging")
+ break
+
+ except Exception as e:
+ self.logger.info(f"翻页出错: {e},停止翻页")
+ break
+
+            # Convert to a list
+            article_urls = list(all_urls)
+            self.logger.info(f"Found {len(article_urls)} articles in total")
+
+            # Fetch article details
+ articles = self._fetch_articles(article_urls)
+
+ self.logger.info(f"成功爬取 {len(articles)} 篇文章")
+ return articles
+
+ except Exception as e:
+ self.logger.error(f"爬取失败: {e}", exc_info=True)
+ return []
+ finally:
+ self._cleanup()
+
+ def _collect_urls_from_current_page(self) -> Set[str]:
+ """
+        Collect article URLs from the current page
+
+        Returns:
+            Set of article URLs found on the current page
+ """
+ page_source = self._driver._driver.page_source
+ soup = BeautifulSoup(page_source, "lxml")
+ urls = set()
+
+        # Select all article items, restricted to the current tab's content
+ div_list = soup.select(
+ "#Tab > div:first-child div.TPLTextFeedItem, "
+ "#Tab > div:first-child div.TPLImageTextFeedItem"
+ )
+
+ for item in div_list:
+ link_tag = item.select_one('a')
+ if link_tag and link_tag.has_attr('href'):
+ url = link_tag['href']
+
+                # Handle protocol-relative links (//www...)
+ if url.startswith('//'):
+ url = 'https:' + url
+
+                # Handle relative paths
+ elif url.startswith('/'):
+ url = 'https://house.focus.cn' + url
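+                    # e.g. "/zixun/xyz.html" -> "https://house.focus.cn/zixun/xyz.html" (illustrative path)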
+
+ if url:
+ urls.add(url)
+
+ return urls
+
+ def _fetch_page(self) -> str:
+ """重写基类方法,返回空字符串(实际逻辑在 crawl() 中)"""
+ return ""
+
+ def _extract_article_urls(self, html: str) -> List[str]:
+ """重写基类方法,返回空列表(实际逻辑在 crawl() 中)"""
+ return []
+
+ def _fetch_articles(self, urls: List[str]) -> List[Article]:
+ """
+        Fetch the article details for the collected URLs
+ """
+ articles = []
+ parser = SouhuParser()
+
+ for i, url in enumerate(urls[:self.max_articles]):
+ try:
+ article = parser.parse(url)
+ article.category_id = self.category_id
+ article.source = "焦点房产网"
+
+                # Set a default author
+ if not article.author:
+ article.author = "焦点房产网"
+
+ if article.is_valid():
+ articles.append(article)
+ self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
+
+ except Exception as e:
+ self.logger.error(f"解析文章失败: {url} - {e}")
+ continue
+
+ return articles
diff --git a/crawler-module/src/parsers/souhu_parser.py b/crawler-module/src/parsers/souhu_parser.py
new file mode 100644
index 0000000..f448615
--- /dev/null
+++ b/crawler-module/src/parsers/souhu_parser.py
@@ -0,0 +1,82 @@
+"""
+Sohu article parser
+"""
+
+from bs4 import BeautifulSoup
+import re
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from base.parser_base import BaseParser
+from base.crawler_base import Article
+from utils.http_client import HttpClient
+from utils.logger import get_logger
+
+
+class SouhuParser(BaseParser):
+ """焦点房产网文章解析器"""
+
+ def __init__(self):
+ self.logger = get_logger(__name__)
+ self.http_client = HttpClient()
+
+ def parse(self, url: str) -> Article:
+ """
+        Parse a Sohu Focus article detail page
+
+        Args:
+            url: Article URL
+
+        Returns:
+            Article object
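+
+        Example (assumed usage):
+            parser = SouhuParser()
+            article = parser.parse(article_url)  # article_url comes from the listing page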
+ """
+ html = self.http_client.get(url)
+ soup = BeautifulSoup(html, "lxml")
+
+        # Locate the article container
+ article_container = soup.select_one("#article-container")
+
+ if not article_container:
+ return Article(
+ url=url,
+ title=None,
+ publish_time=None,
+ author=None,
+ content="",
+ )
+
+        # Extract the title
+ title = None
+ title_tag = article_container.select_one("div.text-title h1")
+ if title_tag:
+ title = title_tag.get_text(strip=True)
+
+        # Extract the author
+ author = None
+ author_tag = article_container.select_one(".article-info .author a")
+ if author_tag:
+ author = author_tag.get_text(strip=True)
+
+        # Extract the publish time
+ publish_time = None
+ time_tag = article_container.select_one(".article-info .author #news-time")
+ if time_tag:
+ publish_time = time_tag.get_text(strip=True)
+
+        # Extract the body text
+ content = ""
+ content_tag = soup.select_one("article#mp-editor")
+
+ if content_tag:
+            # separator='\n\n' keeps a blank line between paragraphs
+ content = content_tag.get_text(separator='\n\n', strip=True)
+
+ return Article(
+ url=url,
+ title=title,
+ publish_time=publish_time,
+ author=author,
+ content=content,
+ )
diff --git a/crawler-module/src/utils/selenium_driver.py b/crawler-module/src/utils/selenium_driver.py
index b6dfd4f..144d977 100644
--- a/crawler-module/src/utils/selenium_driver.py
+++ b/crawler-module/src/utils/selenium_driver.py
@@ -163,6 +163,54 @@ class SeleniumDriver:
self.logger.info(f"加载更多按钮共点击 {click_count} 次")
+ def click_next_page(self, selector: str = "div.pagination-item-next.pagination-more", max_clicks: int = 10):
+ """
+        Click the next-page button to paginate
+
+        Unlike click_load_more, this method is dedicated to paginated navigation
+
+        Args:
+            selector: CSS selector for the next-page button
+            max_clicks: Maximum number of clicks
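+
+        Example (assumed usage; navigation mirrors HouseCrawler.crawl()):
+            driver = SeleniumDriver()
+            driver._driver.get("https://house.focus.cn/zixun/")
+            driver.click_next_page(max_clicks=3)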
+ """
+ if not self._driver:
+ self._create_driver()
+
+ click_count = 0
+
+ for i in range(max_clicks):
+ try:
+                # Scroll to the bottom of the page first so the button is visible
+ self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+ time.sleep(0.5)
+
+                # Locate the next-page button
+ try:
+ next_btn = self._driver.find_element(By.CSS_SELECTOR, selector)
+                except Exception:
+                    self.logger.info("Next-page button not found, probably on the last page; stop paging")
+ break
+
+                # Check whether the button is clickable (look for a disabled class or attribute)
+ btn_class = next_btn.get_attribute("class") or ""
+ if "disabled" in btn_class or next_btn.get_attribute("disabled"):
+ self.logger.info(f"下一页按钮已禁用,停止翻页")
+ break
+
+                # Click the button
+ next_btn.click()
+ click_count += 1
+ self.logger.debug(f"成功点击下一页按钮 {click_count} 次")
+
+                # Wait for the new page to load
+                time.sleep(config.selenium.scroll_pause_time * 2)  # use a longer wait for a full page navigation
+
+ except Exception as e:
+ self.logger.info(f"点击下一页按钮时出错: {e},停止翻页")
+ break
+
+ self.logger.info(f"下一页按钮共点击 {click_count} 次")
+
def quit(self):
"""退出驱动"""
if self._driver:
diff --git a/crawler-module/tencent-war.txt b/crawler-module/tencent-war.txt
deleted file mode 100644
index d312d39..0000000
--- a/crawler-module/tencent-war.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-
-这是关于腾讯新闻网爬取军事分类新闻的一个可行的代码
-需要注意的是腾讯新闻解析文章详情的代码是通用的,这里没有给出(使用tencent_parser.py即可)
-注意这里需要使用到动态加载(继承DynamicCrawler,并且无需重写_fetch_page())
-```python
-import requests
-from bs4 import BeautifulSoup
-
-
-URL = "https://news.qq.com/ch/milite"
-headers = {
- "User-Agent": (
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
- "AppleWebKit/537.36 (KHTML, like Gecko) "
- "Chrome/120.0.0.0 Safari/537.36"
- )
-}
-resp = requests.get(URL,headers=headers,timeout=10)
-resp.raise_for_status()
-resp.encoding = "utf-8"
-# print(resp.text)
-# with open("example/example-13.html","r",encoding="utf-8") as f:
-# html = f.read()
-
-soup = BeautifulSoup(resp.text,"lxml")
-# soup = BeautifulSoup(html,"lxml")
-div_list = soup.select("div[id='channel-feed-area'] div.channel-feed-list div.channel-feed-item[dt-params*='article_type=0']")
-for div in div_list:
- href = div.select_one("a.article-title").get("href")
- print(href)
-```
\ No newline at end of file