feat: add souhu house crawler
parent 05b67d5cbc
commit d02e8d65e0
@ -31,7 +31,7 @@ http:
 
 # Selenium configuration
 selenium:
-  headless: true
+  headless: false
   log_level: 3
   window_size: "1920,1080"
   page_load_timeout: 60

@ -180,3 +180,12 @@ sources:
       name: "AI"
       css_selector: ""
       # Note: this category fetches its data through the search API rather than the normal category-list API
+
+  souhu:
+    base_url: "https://news.sohu.com/"
+    categories:
+      house:
+        url: "https://house.focus.cn/zixun/"
+        category_id: 10
+        name: "房产"
+        css_selector: ".TPLTextFeedItem, .TPLImageTextFeedItem"

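For orientation, the new `sources.souhu.categories` block is consumed through the same dotted-key lookup that `list_crawlers` uses further down in this commit. A minimal, self-contained sketch of that lookup (the `dotted_get` helper is illustrative, not project code):

```python
# Illustrative only: mimics config.get('sources.souhu.categories', {}) as used in list_crawlers.
from functools import reduce

CONFIG = {
    "sources": {
        "souhu": {
            "base_url": "https://news.sohu.com/",
            "categories": {
                "house": {
                    "url": "https://house.focus.cn/zixun/",
                    "category_id": 10,
                    "name": "房产",
                    "css_selector": ".TPLTextFeedItem, .TPLImageTextFeedItem",
                },
            },
        },
    },
}

def dotted_get(cfg, dotted_key, default=None):
    """Resolve 'a.b.c'-style keys against a nested dict."""
    try:
        return reduce(lambda d, k: d[k], dotted_key.split("."), cfg)
    except (KeyError, TypeError):
        return default

for category in dotted_get(CONFIG, "sources.souhu.categories", {}):
    print(f"souhu:{category}")  # -> souhu:house
```
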
@ -0,0 +1,65 @@

This is working example code for crawling real-estate (house) category news from Sohu News.

Note that dynamic loading is required here, and pagination is done by clicking the "next page" button in the pagination navigation; the --max argument determines how many times to turn the page.

This is the page's pagination navigation:

```html
<div data-v-4b61222c="" data-v-32183bf0="" class="Pagination"><div data-v-4b61222c="" class="pagination-content"><div data-v-4b61222c="" class="pagination-item pagination-item-pre">首页</div> <div data-v-4b61222c="" class="pagination-item pagination-item-pre">上一页</div> <div data-v-4b61222c="" class="pagination-item-content"><div data-v-4b61222c="" class="pagination-item pagination-item-0">1</div><div data-v-4b61222c="" class="pagination-item pagination-item-1">2</div> <div data-v-4b61222c="" class="pagination-item-point">···</div> <div data-v-4b61222c="" class="pagination-item pagination-item-0">4</div><div data-v-4b61222c="" class="pagination-item pagination-item-1 active-item">5</div><div data-v-4b61222c="" class="pagination-item pagination-item-2">6</div> <div data-v-4b61222c="" class="pagination-item-point">···</div> <div data-v-4b61222c="" class="pagination-item pagination-item-0">99</div><div data-v-4b61222c="" class="pagination-item pagination-item-1">100</div></div> <div data-v-4b61222c="" class="pagination-item pagination-item-next pagination-more">下一页</div></div></div>
```
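
Because the feed is rendered client-side and the navigation above jumps to a new page, the listing URLs have to be collected through a browser driver rather than plain requests. A minimal Selenium sketch of the click loop (standalone and illustrative; `max_pages` stands in for the value derived from `--max`, and the real implementation is `HouseCrawler.crawl()` further down in this commit):

```python
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

max_pages = 5  # stand-in for the page count derived from --max
driver = webdriver.Chrome()
try:
    driver.get("https://house.focus.cn/zixun/")
    time.sleep(2)
    for _ in range(max_pages - 1):
        # The button sits at the bottom of the feed, so scroll it into view first.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.5)
        buttons = driver.find_elements(By.CSS_SELECTOR, "div.pagination-item-next.pagination-more")
        if not buttons:
            break  # no "下一页" button left: assume this is the last page
        buttons[0].click()
        time.sleep(2)  # wait for the next page of the feed to render
        # ...collect .TPLTextFeedItem / .TPLImageTextFeedItem links from driver.page_source here...
finally:
    driver.quit()
```

The per-article fields can then be parsed statically, as in the original example below:
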
```python
import requests
from bs4 import BeautifulSoup


URL = "https://house.focus.cn/zixun/"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}
resp = requests.get(URL, headers=headers, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"
# To work from a saved copy of the listing page instead:
# with open("example/example-14.html", "r", encoding="utf-8") as f:
#     html = f.read()


def parser_article(href):
    """Fetch one article page and print its title, author, publish time and body."""
    resp_article = requests.get(href, headers=headers, timeout=10)
    soup_article = BeautifulSoup(resp_article.text, "lxml")

    # Title
    title_tag = soup_article.select_one("#article-container div.text-title h1")
    title = title_tag.get_text(strip=True)

    # Author
    author_tag = soup_article.select_one("#article-container .article-info .author a")
    author = author_tag.get_text(strip=True)

    # Publish time
    time_tag = soup_article.select_one("#article-container .article-info .author #news-time")
    time = time_tag.get_text(strip=True)

    # Body text
    content_tag = soup_article.select_one("article#mp-editor")
    if content_tag:
        # strip=True trims surrounding whitespace;
        # separator='\n\n' leaves a blank line between paragraphs
        content = content_tag.get_text(separator='\n\n', strip=True)
    else:
        content = "Body content not found"

    print(title, author, time)
    print(content)


soup = BeautifulSoup(resp.text, "lxml")
# soup = BeautifulSoup(html, "lxml")  # when parsing the saved copy instead
div_list = soup.select('.TPLTextFeedItem, .TPLImageTextFeedItem')
# print(div_list)
for item in div_list:
    link_tag = item.select_one('a')
    if link_tag and link_tag.has_attr('href'):
        url = link_tag['href']
        # Handle protocol-relative links (//www...)
        if url.startswith('//'):
            url = 'https:' + url
        parser_article(url)
```

@ -49,6 +49,9 @@ CRAWLER_CLASSES = {
         'finance': ('crawlers.tencent.finance', 'FinanceCrawler'),
         'ai': ('crawlers.tencent.ai', 'SearchAICrawler'),
     },
+    'souhu': {
+        'house': ('crawlers.souhu.house', 'HouseCrawler'),
+    },
 }
 
 
@ -97,6 +100,11 @@ def list_crawlers() -> List[str]:
     for category in tencent_categories.keys():
         crawlers.append(f"tencent:{category}")
 
+    # Sohu crawlers
+    souhu_categories = config.get('sources.souhu.categories', {})
+    for category in souhu_categories.keys():
+        crawlers.append(f"souhu:{category}")
+
     return crawlers
 
 
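The registry stores `(module path, class name)` pairs rather than imported classes, which points at lazy resolution at lookup time. A minimal sketch of how such an entry can be turned into a class (illustrative; the project's actual factory function is not shown in this diff):

```python
# Illustrative only: resolve a (module path, class name) registry entry lazily.
import importlib

CRAWLER_CLASSES = {
    'souhu': {
        'house': ('crawlers.souhu.house', 'HouseCrawler'),
    },
}

def resolve(source: str, category: str):
    module_path, class_name = CRAWLER_CLASSES[source][category]
    module = importlib.import_module(module_path)  # imports crawlers.souhu.house on demand
    return getattr(module, class_name)

# crawler_cls = resolve('souhu', 'house')  # -> HouseCrawler
```
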
@ -0,0 +1,3 @@
"""
Sohu news crawler module
"""

@ -0,0 +1,177 @@
"""
Focus (焦点房产网) real-estate news crawler
"""

from typing import List, Set
from bs4 import BeautifulSoup
import time

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import DynamicCrawler, Article
from parsers.souhu_parser import SouhuParser


class HouseCrawler(DynamicCrawler):
    """Focus (焦点房产网) real-estate news crawler"""

    def crawl(self) -> List[Article]:
        """
        Override crawl() to support pagination navigation.

        The pagination navigation jumps to a new page, so URLs must be
        collected again after every page turn.
        """
        self.logger.info(f"Start crawling {self.category_name} news: {self.url}")

        try:
            if not self._driver:
                from utils.selenium_driver import SeleniumDriver
                self._driver = SeleniumDriver()

            # Load the first page
            self._driver._driver.get(self.url)
            time.sleep(2)

            # Collect all URLs (a set deduplicates automatically)
            all_urls: Set[str] = set()

            # Collect the URLs on the first page
            first_page_urls = self._collect_urls_from_current_page()
            all_urls.update(first_page_urls)
            self.logger.info(f"Page 1: found {len(first_page_urls)} articles")

            # Work out how many times to click "next page"
            import math
            # Roughly 10 articles per page, so derive the click count from max_articles
            max_clicks = math.ceil(self.max_articles / 10)
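            # Worked example: max_articles = 25 -> ceil(25 / 10) = 3 clicks.
            # Page 1 is already collected above, so up to 4 pages (~40 items) are
            # visited; the ">= max_articles" check below usually stops paging earlier.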

            self.logger.info(f"Planning {max_clicks} page turns to collect about {self.max_articles} articles (~10 per page)")

            # Turn pages and collect URLs
            for page_num in range(1, max_clicks + 1):
                try:
                    # Scroll to the bottom so the button is visible
                    self._driver._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(0.5)

                    # Find the "next page" button
                    from selenium.webdriver.common.by import By
                    try:
                        next_btn = self._driver._driver.find_element(By.CSS_SELECTOR, "div.pagination-item-next.pagination-more")
                    except Exception:
                        self.logger.info("Next-page button not found, probably already on the last page")
                        break

                    # Check whether the button is disabled
                    btn_class = next_btn.get_attribute("class") or ""
                    if "disabled" in btn_class or next_btn.get_attribute("disabled"):
                        self.logger.info("Next-page button is disabled, stop paging")
                        break

                    # Click the button
                    next_btn.click()
                    time.sleep(2)  # wait for the new page to load

                    # Collect the URLs on the current page
                    current_page_urls = self._collect_urls_from_current_page()
                    new_urls_count = len(current_page_urls - all_urls)
                    all_urls.update(current_page_urls)

                    self.logger.info(f"Page {page_num + 1}: found {len(current_page_urls)} articles ({new_urls_count} new), {len(all_urls)} in total")

                    # Stop paging once enough articles have been collected
                    if len(all_urls) >= self.max_articles:
                        self.logger.info(f"Collected enough articles ({len(all_urls)} >= {self.max_articles}), stop paging")
                        break

                except Exception as e:
                    self.logger.info(f"Paging failed: {e}, stop paging")
                    break

            # Convert to a list
            article_urls = list(all_urls)
            self.logger.info(f"Found {len(article_urls)} articles in total")

            # Crawl the article details
            articles = self._fetch_articles(article_urls)

            self.logger.info(f"Successfully crawled {len(articles)} articles")
            return articles

        except Exception as e:
            self.logger.error(f"Crawl failed: {e}", exc_info=True)
            return []
        finally:
            self._cleanup()

    def _collect_urls_from_current_page(self) -> Set[str]:
        """
        Collect article URLs from the current page.

        Returns:
            Set of article URLs found on the current page
        """
        page_source = self._driver._driver.page_source
        soup = BeautifulSoup(page_source, "lxml")
        urls = set()

        # Select all article items, restricted to the currently active tab
        div_list = soup.select(
            "#Tab > div:first-child div.TPLTextFeedItem, "
            "#Tab > div:first-child div.TPLImageTextFeedItem"
        )

        for item in div_list:
            link_tag = item.select_one('a')
            if link_tag and link_tag.has_attr('href'):
                url = link_tag['href']

                # Handle protocol-relative links (//www...)
                if url.startswith('//'):
                    url = 'https:' + url

                # Handle relative paths
                elif url.startswith('/'):
                    url = 'https://house.focus.cn' + url

                if url:
                    urls.add(url)

        return urls

    def _fetch_page(self) -> str:
        """Override of the base-class method; returns an empty string (the real logic lives in crawl())."""
        return ""

    def _extract_article_urls(self, html: str) -> List[str]:
        """Override of the base-class method; returns an empty list (the real logic lives in crawl())."""
        return []

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """
        Fetch the article details.
        """
        articles = []
        parser = SouhuParser()

        for i, url in enumerate(urls[:self.max_articles]):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "焦点房产网"

                # Default author
                if not article.author:
                    article.author = "焦点房产网"

                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")

            except Exception as e:
                self.logger.error(f"Failed to parse article: {url} - {e}")
                continue

        return articles

@ -0,0 +1,82 @@
"""
Sohu article parser
"""

from bs4 import BeautifulSoup
import re

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from base.parser_base import BaseParser
from base.crawler_base import Article
from utils.http_client import HttpClient
from utils.logger import get_logger


class SouhuParser(BaseParser):
    """Focus (焦点房产网) article parser"""

    def __init__(self):
        self.logger = get_logger(__name__)
        self.http_client = HttpClient()

    def parse(self, url: str) -> Article:
        """
        Parse a Focus (焦点房产网) article detail page.

        Args:
            url: article URL

        Returns:
            Article object
        """
        html = self.http_client.get(url)
        soup = BeautifulSoup(html, "lxml")

        # Locate the article container
        article_container = soup.select_one("#article-container")

        if not article_container:
            return Article(
                url=url,
                title=None,
                publish_time=None,
                author=None,
                content="",
            )

        # Extract the title
        title = None
        title_tag = article_container.select_one("div.text-title h1")
        if title_tag:
            title = title_tag.get_text(strip=True)

        # Extract the author
        author = None
        author_tag = article_container.select_one(".article-info .author a")
        if author_tag:
            author = author_tag.get_text(strip=True)

        # Extract the publish time
        publish_time = None
        time_tag = article_container.select_one(".article-info .author #news-time")
        if time_tag:
            publish_time = time_tag.get_text(strip=True)

        # Extract the body text
        content = ""
        content_tag = soup.select_one("article#mp-editor")

        if content_tag:
            # separator='\n\n' leaves a blank line between paragraphs
            content = content_tag.get_text(separator='\n\n', strip=True)

        return Article(
            url=url,
            title=title,
            publish_time=publish_time,
            author=author,
            content=content,
        )

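For a quick standalone check of the parser, the way `HouseCrawler._fetch_articles` drives it suggests usage along these lines (the article URL is a placeholder):

```python
from parsers.souhu_parser import SouhuParser

parser = SouhuParser()
article = parser.parse("https://www.focus.cn/zixun/<article-id>.html")  # placeholder URL
print(article.title, article.author, article.publish_time)
print(article.content[:200])
```
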
@ -163,6 +163,54 @@ class SeleniumDriver:
 
         self.logger.info(f"Load-more button clicked {click_count} times in total")
 
+    def click_next_page(self, selector: str = "div.pagination-item-next.pagination-more", max_clicks: int = 10):
+        """
+        Click the "next page" button to paginate.
+
+        Unlike click_load_more, this method is specifically for pagination navigation.
+
+        Args:
+            selector: CSS selector of the next-page button
+            max_clicks: maximum number of clicks
+        """
+        if not self._driver:
+            self._create_driver()
+
+        click_count = 0
+
+        for i in range(max_clicks):
+            try:
+                # Scroll to the bottom of the page first so the button is visible
+                self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                time.sleep(0.5)
+
+                # Find the next-page button
+                try:
+                    next_btn = self._driver.find_element(By.CSS_SELECTOR, selector)
+                except Exception:
+                    self.logger.info("Next-page button not found, probably on the last page; stop paging")
+                    break
+
+                # Check whether the button is clickable (disabled class or attribute)
+                btn_class = next_btn.get_attribute("class") or ""
+                if "disabled" in btn_class or next_btn.get_attribute("disabled"):
+                    self.logger.info("Next-page button is disabled, stop paging")
+                    break
+
+                # Click the button
+                next_btn.click()
+                click_count += 1
+                self.logger.debug(f"Clicked the next-page button {click_count} times")
+
+                # Wait for the new page to load
+                time.sleep(config.selenium.scroll_pause_time * 2)  # extra wait for the full page swap
+
+            except Exception as e:
+                self.logger.info(f"Error while clicking the next-page button: {e}, stop paging")
+                break
+
+        self.logger.info(f"Next-page button clicked {click_count} times in total")
+
     def quit(self):
         """Quit the driver"""
         if self._driver:

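A rough sketch of exercising the new helper directly (hypothetical usage that pokes at the wrapper's internals the same way `HouseCrawler` does; not part of the commit):

```python
from utils.selenium_driver import SeleniumDriver

sd = SeleniumDriver()
sd._create_driver()                          # internal, used here only to get a live driver
sd._driver.get("https://house.focus.cn/zixun/")
sd.click_next_page(max_clicks=3)             # clicks "下一页" at most 3 times
html = sd._driver.page_source                # feed HTML after pagination
sd.quit()
```
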
@ -1,31 +0,0 @@
This is working example code for crawling military-category news from Tencent News.

Note that Tencent's article-detail parsing is shared code and is not shown here (just use tencent_parser.py).

Note that dynamic loading is required here (inherit from DynamicCrawler; there is no need to override _fetch_page()).

```python
import requests
from bs4 import BeautifulSoup


URL = "https://news.qq.com/ch/milite"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}
resp = requests.get(URL, headers=headers, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"
# print(resp.text)
# with open("example/example-13.html", "r", encoding="utf-8") as f:
#     html = f.read()

soup = BeautifulSoup(resp.text, "lxml")
# soup = BeautifulSoup(html, "lxml")
div_list = soup.select("div[id='channel-feed-area'] div.channel-feed-list div.channel-feed-item[dt-params*='article_type=0']")
for div in div_list:
    href = div.select_one("a.article-title").get("href")
    print(href)
```