From d02e8d65e01392528282a63d61ef62296760a3ca Mon Sep 17 00:00:00 2001
From: shenjianZ
Date: Sun, 18 Jan 2026 11:22:50 +0800
Subject: [PATCH] feat: add souhu house crawler

---
 crawler-module/config/config.yaml             |  11 +-
 crawler-module/souhu-house.txt                |  65 +++++++
 crawler-module/src/cli/main.py                |   8 +
 crawler-module/src/crawlers/souhu/__init__.py |   3 +
 crawler-module/src/crawlers/souhu/house.py    | 177 ++++++++++++++++++
 crawler-module/src/parsers/souhu_parser.py    |  82 ++++++++
 crawler-module/src/utils/selenium_driver.py   |  48 +++++
 crawler-module/tencent-war.txt                |  31 ---
 8 files changed, 393 insertions(+), 32 deletions(-)
 create mode 100644 crawler-module/souhu-house.txt
 create mode 100644 crawler-module/src/crawlers/souhu/__init__.py
 create mode 100644 crawler-module/src/crawlers/souhu/house.py
 create mode 100644 crawler-module/src/parsers/souhu_parser.py
 delete mode 100644 crawler-module/tencent-war.txt

diff --git a/crawler-module/config/config.yaml b/crawler-module/config/config.yaml
index bb1ad19..605eceb 100644
--- a/crawler-module/config/config.yaml
+++ b/crawler-module/config/config.yaml
@@ -31,7 +31,7 @@ http:
 
 # Selenium配置
 selenium:
-  headless: true
+  headless: false
   log_level: 3
   window_size: "1920,1080"
   page_load_timeout: 60
@@ -180,3 +180,12 @@ sources:
         name: "AI"
         css_selector: ""
         # 注意:此分类通过搜索接口获取数据,而非正常的分类列表接口
+
+  souhu:
+    base_url: "https://news.sohu.com/"
+    categories:
+      house:
+        url: "https://house.focus.cn/zixun/"
+        category_id: 10
+        name: "房产"
+        css_selector: ".TPLTextFeedItem, .TPLImageTextFeedItem"
diff --git a/crawler-module/souhu-house.txt b/crawler-module/souhu-house.txt
new file mode 100644
index 0000000..4897358
--- /dev/null
+++ b/crawler-module/souhu-house.txt
@@ -0,0 +1,65 @@
+
+这是关于搜狐新闻网爬取房产分类新闻的一个可行的代码
+注意这里需要使用到动态加载,并且需要点击翻页导航的下一页按钮进行分页,根据 --max 的参数决定翻页的次数
+这是页面的翻页导航
+```html
+
+```
+```python
+import requests
+from bs4 import BeautifulSoup
+
+
+URL = "https://house.focus.cn/zixun/"
+headers = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/120.0.0.0 Safari/537.36"
+    )
+}
+resp = requests.get(URL,headers=headers,timeout=10)
+resp.raise_for_status()
+resp.encoding = "utf-8"
+# with open("example/example-14.html", "r",encoding="utf-8") as f:
+#     html = f.read()
+def parser_article(href):
+    resp_article = requests.get(href, headers=headers)
+    soup_article = BeautifulSoup(resp_article.text,"lxml")
+    # print(resp.text)
+    # 标题
+    title_tag = soup_article.select_one("#article-container div.text-title h1")
+    title = title_tag.get_text(strip=True)
+    # 作者
+    author_tag = soup_article.select_one("#article-container .article-info .author a")
+    author = author_tag.get_text(strip=True)
+    # 发布时间
+    time_tag = soup_article.select_one("#article-container .article-info .author #news-time")
+    time = time_tag.get_text(strip=True)
+    # 文本内容
+    content_tag = soup_article.select_one("article#mp-editor")
+    if content_tag:
+        # strip=True 去除首尾空白
+        # separator='\n\n' 让每个段落之间空一行
+        content = content_tag.get_text(separator='\n\n', strip=True)
+    else:
+        content = "未找到正文内容"
+    print(title,author,time)
+    print(content)
+
+soup = BeautifulSoup(resp.text,"lxml")
+# soup = BeautifulSoup(html,"lxml")
+div_list = soup.select('.TPLTextFeedItem, .TPLImageTextFeedItem')
+# print(div_list)
+for item in div_list:
+    link_tag = item.select_one('a')
+    if link_tag and link_tag.has_attr('href'):
+        url = link_tag['href']
+        # 处理可能的相对协议链接 (//www...)
+        if url.startswith('//'):
+            url = 'https:' + url
+        parser_article(url)
+
+
+
+```
\ No newline at end of file
diff --git a/crawler-module/src/cli/main.py b/crawler-module/src/cli/main.py
index 5978946..d933995 100644
--- a/crawler-module/src/cli/main.py
+++ b/crawler-module/src/cli/main.py
@@ -49,6 +49,9 @@ CRAWLER_CLASSES = {
         'finance': ('crawlers.tencent.finance', 'FinanceCrawler'),
         'ai': ('crawlers.tencent.ai', 'SearchAICrawler'),
     },
+    'souhu': {
+        'house': ('crawlers.souhu.house', 'HouseCrawler'),
+    },
 }
 
 
@@ -97,6 +100,11 @@ def list_crawlers() -> List[str]:
     for category in tencent_categories.keys():
         crawlers.append(f"tencent:{category}")
 
+    # 搜狐爬虫
+    souhu_categories = config.get('sources.souhu.categories', {})
+    for category in souhu_categories.keys():
+        crawlers.append(f"souhu:{category}")
+
     return crawlers
 
 
diff --git a/crawler-module/src/crawlers/souhu/__init__.py b/crawler-module/src/crawlers/souhu/__init__.py
new file mode 100644
index 0000000..28af9ac
--- /dev/null
+++ b/crawler-module/src/crawlers/souhu/__init__.py
@@ -0,0 +1,3 @@
+"""
+搜狐新闻爬虫模块
+"""
diff --git a/crawler-module/src/crawlers/souhu/house.py b/crawler-module/src/crawlers/souhu/house.py
new file mode 100644
index 0000000..243833f
--- /dev/null
+++ b/crawler-module/src/crawlers/souhu/house.py
@@ -0,0 +1,177 @@
+"""
+焦点房产网新闻爬虫
+"""
+
+from typing import List, Set
+from bs4 import BeautifulSoup
+import time
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+from base.crawler_base import DynamicCrawler, Article
+from parsers.souhu_parser import SouhuParser
+
+
+class HouseCrawler(DynamicCrawler):
+    """焦点房产网新闻爬虫"""
+
+    def crawl(self) -> List[Article]:
+        """
+        重写 crawl 方法以支持分页导航
+
+        分页导航会跳转到新页面,需要在每次翻页后收集 URL
+        """
+        self.logger.info(f"开始爬取 {self.category_name} 新闻: {self.url}")
+
+        try:
+            if not self._driver:
+                from utils.selenium_driver import SeleniumDriver
+                self._driver = SeleniumDriver()
+
+            # 获取首页
+            self._driver._driver.get(self.url)
+            time.sleep(2)
+
+            # 收集所有 URL(使用集合自动去重)
+            all_urls: Set[str] = set()
+
+            # 收集第一页的 URL
+            first_page_urls = self._collect_urls_from_current_page()
+            all_urls.update(first_page_urls)
+            self.logger.info(f"第 1 页:找到 {len(first_page_urls)} 篇文章")
+
+            # 计算需要翻页的次数
+            import math
+            # 每页约10篇文章,准确计算翻页次数
+            max_clicks = math.ceil(self.max_articles / 10)
+
+            self.logger.info(f"计划翻页 {max_clicks} 次以获取约 {self.max_articles} 篇文章(每页10篇)")
+
+            # 执行翻页并收集 URL
+            for page_num in range(1, max_clicks + 1):
+                try:
+                    # 滚动到底部确保按钮可见
+                    self._driver._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                    time.sleep(0.5)
+
+                    # 查找下一页按钮
+                    from selenium.webdriver.common.by import By
+                    try:
+                        next_btn = self._driver._driver.find_element(By.CSS_SELECTOR, "div.pagination-item-next.pagination-more")
+                    except:
+                        self.logger.info(f"找不到下一页按钮,可能已到最后一页")
+                        break
+
+                    # 检查按钮是否禁用
+                    btn_class = next_btn.get_attribute("class") or ""
+                    if "disabled" in btn_class or next_btn.get_attribute("disabled"):
+                        self.logger.info(f"下一页按钮已禁用,停止翻页")
+                        break
+
+                    # 点击按钮
+                    next_btn.click()
+                    time.sleep(2)  # 等待新页面加载
+
+                    # 收集当前页的 URL
+                    current_page_urls = self._collect_urls_from_current_page()
+                    new_urls_count = len(current_page_urls - all_urls)
+                    all_urls.update(current_page_urls)
+
+                    self.logger.info(f"第 {page_num + 1} 页:找到 {len(current_page_urls)} 篇文章(新增 {new_urls_count} 篇),总计 {len(all_urls)} 篇")
+
+                    # 如果已经收集到足够的文章,停止翻页
+                    if len(all_urls) >= self.max_articles:
+                        self.logger.info(f"已收集到足够的文章({len(all_urls)} >= {self.max_articles}),停止翻页")
+                        break
+
+                except Exception as e:
+                    self.logger.info(f"翻页出错: {e},停止翻页")
+                    break
+
+            # 转换为列表
+            article_urls = list(all_urls)
+            self.logger.info(f"总共找到 {len(article_urls)} 篇文章")
+
+            # 爬取文章详情
+            articles = self._fetch_articles(article_urls)
+
+            self.logger.info(f"成功爬取 {len(articles)} 篇文章")
+            return articles
+
+        except Exception as e:
+            self.logger.error(f"爬取失败: {e}", exc_info=True)
+            return []
+        finally:
+            self._cleanup()
+
+    def _collect_urls_from_current_page(self) -> Set[str]:
+        """
+        从当前页面收集文章URL
+
+        Returns:
+            当前页面的文章URL集合
+        """
+        page_source = self._driver._driver.page_source
+        soup = BeautifulSoup(page_source, "lxml")
+        urls = set()
+
+        # 选择所有文章项 - 只选择当前Tab下的内容
+        div_list = soup.select(
+            "#Tab > div:first-child div.TPLTextFeedItem, "
+            "#Tab > div:first-child div.TPLImageTextFeedItem"
+        )
+
+        for item in div_list:
+            link_tag = item.select_one('a')
+            if link_tag and link_tag.has_attr('href'):
+                url = link_tag['href']
+
+                # 处理相对协议链接 (//www...)
+                if url.startswith('//'):
+                    url = 'https:' + url
+
+                # 处理相对路径
+                elif url.startswith('/'):
+                    url = 'https://house.focus.cn' + url
+
+                if url:
+                    urls.add(url)
+
+        return urls
+
+    def _fetch_page(self) -> str:
+        """重写基类方法,返回空字符串(实际逻辑在 crawl() 中)"""
+        return ""
+
+    def _extract_article_urls(self, html: str) -> List[str]:
+        """重写基类方法,返回空列表(实际逻辑在 crawl() 中)"""
+        return []
+
+    def _fetch_articles(self, urls: List[str]) -> List[Article]:
+        """
+        爬取文章详情
+        """
+        articles = []
+        parser = SouhuParser()
+
+        for i, url in enumerate(urls[:self.max_articles]):
+            try:
+                article = parser.parse(url)
+                article.category_id = self.category_id
+                article.source = "焦点房产网"
+
+                # 设置默认作者
+                if not article.author:
+                    article.author = "焦点房产网"
+
+                if article.is_valid():
+                    articles.append(article)
+                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")
+
+            except Exception as e:
+                self.logger.error(f"解析文章失败: {url} - {e}")
+                continue
+
+        return articles
diff --git a/crawler-module/src/parsers/souhu_parser.py b/crawler-module/src/parsers/souhu_parser.py
new file mode 100644
index 0000000..f448615
--- /dev/null
+++ b/crawler-module/src/parsers/souhu_parser.py
@@ -0,0 +1,82 @@
+"""
+搜狐文章解析器
+"""
+
+from bs4 import BeautifulSoup
+import re
+
+import sys
+import os
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from base.parser_base import BaseParser
+from base.crawler_base import Article
+from utils.http_client import HttpClient
+from utils.logger import get_logger
+
+
+class SouhuParser(BaseParser):
+    """焦点房产网文章解析器"""
+
+    def __init__(self):
+        self.logger = get_logger(__name__)
+        self.http_client = HttpClient()
+
+    def parse(self, url: str) -> Article:
+        """
+        解析焦点房产网文章详情页
+
+        Args:
+            url: 文章URL
+
+        Returns:
+            文章对象
+        """
+        html = self.http_client.get(url)
+        soup = BeautifulSoup(html, "lxml")
+
+        # 找到文章容器
+        article_container = soup.select_one("#article-container")
+
+        if not article_container:
+            return Article(
+                url=url,
+                title=None,
+                publish_time=None,
+                author=None,
+                content="",
+            )
+
+        # 提取标题
+        title = None
+        title_tag = article_container.select_one("div.text-title h1")
+        if title_tag:
+            title = title_tag.get_text(strip=True)
+
+        # 提取作者
+        author = None
+        author_tag = article_container.select_one(".article-info .author a")
+        if author_tag:
+            author = author_tag.get_text(strip=True)
+
+        # 提取发布时间
+        publish_time = None
+        time_tag = article_container.select_one(".article-info .author #news-time")
+        if time_tag:
+            publish_time = time_tag.get_text(strip=True)
+
+        # 提取正文内容
+        content = ""
+        content_tag = soup.select_one("article#mp-editor")
+
+        if content_tag:
+            # 使用 separator='\n\n' 让每个段落之间空一行
+            content = content_tag.get_text(separator='\n\n', strip=True)
+
+        return Article(
+            url=url,
+            title=title,
+            publish_time=publish_time,
+            author=author,
+            content=content,
+        )
diff --git a/crawler-module/src/utils/selenium_driver.py b/crawler-module/src/utils/selenium_driver.py
index b6dfd4f..144d977 100644
--- a/crawler-module/src/utils/selenium_driver.py
+++ b/crawler-module/src/utils/selenium_driver.py
@@ -163,6 +163,54 @@ class SeleniumDriver:
 
         self.logger.info(f"加载更多按钮共点击 {click_count} 次")
 
+    def click_next_page(self, selector: str = "div.pagination-item-next.pagination-more", max_clicks: int = 10):
+        """
+        点击下一页按钮实现翻页
+
+        与 click_load_more 不同,此方法专门用于分页导航
+
+        Args:
+            selector: 下一页按钮的CSS选择器
+            max_clicks: 最大点击次数
+        """
+        if not self._driver:
+            self._create_driver()
+
+        click_count = 0
+
+        for i in range(max_clicks):
+            try:
+                # 先滚动到页面底部,确保按钮可见
+                self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                time.sleep(0.5)
+
+                # 查找下一页按钮
+                try:
+                    next_btn = self._driver.find_element(By.CSS_SELECTOR, selector)
+                except Exception as e:
+                    self.logger.info(f"找不到下一页按钮,可能已到最后一页,停止翻页")
+                    break
+
+                # 检查按钮是否可点击(检查是否有disabled类或属性)
+                btn_class = next_btn.get_attribute("class") or ""
+                if "disabled" in btn_class or next_btn.get_attribute("disabled"):
+                    self.logger.info(f"下一页按钮已禁用,停止翻页")
+                    break
+
+                # 点击按钮
+                next_btn.click()
+                click_count += 1
+                self.logger.debug(f"成功点击下一页按钮 {click_count} 次")
+
+                # 等待新页面加载
+                time.sleep(config.selenium.scroll_pause_time * 2)  # 增加等待时间
+
+            except Exception as e:
+                self.logger.info(f"点击下一页按钮时出错: {e},停止翻页")
+                break
+
+        self.logger.info(f"下一页按钮共点击 {click_count} 次")
+
     def quit(self):
         """退出驱动"""
         if self._driver:
diff --git a/crawler-module/tencent-war.txt b/crawler-module/tencent-war.txt
deleted file mode 100644
index d312d39..0000000
--- a/crawler-module/tencent-war.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-
-这是关于腾讯新闻网爬取军事分类新闻的一个可行的代码
-需要注意的是腾讯新闻解析文章详情的代码是通用的,这里没有给出(使用tencent_parser.py即可)
-注意这里需要使用到动态加载(继承DynamicCrawler,并且无需重写_fetch_page())
-```python
-import requests
-from bs4 import BeautifulSoup
-
-
-URL = "https://news.qq.com/ch/milite"
-headers = {
-    "User-Agent": (
-        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-        "AppleWebKit/537.36 (KHTML, like Gecko) "
-        "Chrome/120.0.0.0 Safari/537.36"
-    )
-}
-resp = requests.get(URL,headers=headers,timeout=10)
-resp.raise_for_status()
-resp.encoding = "utf-8"
-# print(resp.text)
-# with open("example/example-13.html","r",encoding="utf-8") as f:
-#     html = f.read()
-
-soup = BeautifulSoup(resp.text,"lxml")
-# soup = BeautifulSoup(html,"lxml")
-div_list = soup.select("div[id='channel-feed-area'] div.channel-feed-list div.channel-feed-item[dt-params*='article_type=0']")
-for div in div_list:
-    href = div.select_one("a.article-title").get("href")
-    print(href)
-```
\ No newline at end of file
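
The parts of this patch most likely to break when the site markup changes are the listing selectors and the next-page button handling in `HouseCrawler.crawl()` / `_collect_urls_from_current_page()`. The sketch below distills that logic into a standalone script for quick manual verification; it is not part of the patch, and it assumes a local Chrome install with Selenium 4.6+ (which manages chromedriver itself) plus `beautifulsoup4` and `lxml`.

```python
# Standalone smoke test for the selectors used by HouseCrawler (not part of the patch).
# Assumes: Chrome installed, selenium>=4.6, beautifulsoup4, lxml.
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

LIST_URL = "https://house.focus.cn/zixun/"                      # from config.yaml
NEXT_SELECTOR = "div.pagination-item-next.pagination-more"      # from click_next_page()


def collect_urls(page_source: str) -> set:
    """Collect article URLs from the first tab, mirroring _collect_urls_from_current_page()."""
    soup = BeautifulSoup(page_source, "lxml")
    items = soup.select(
        "#Tab > div:first-child div.TPLTextFeedItem, "
        "#Tab > div:first-child div.TPLImageTextFeedItem"
    )
    urls = set()
    for item in items:
        link = item.select_one("a")
        if link and link.has_attr("href"):
            href = link["href"]
            if href.startswith("//"):        # protocol-relative link
                href = "https:" + href
            elif href.startswith("/"):       # site-relative link
                href = "https://house.focus.cn" + href
            urls.add(href)
    return urls


driver = webdriver.Chrome()
try:
    driver.get(LIST_URL)
    time.sleep(2)

    all_urls = collect_urls(driver.page_source)
    print(f"page 1: {len(all_urls)} articles")

    for page in range(2, 4):  # click "next page" twice as a quick check
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.5)
        try:
            next_btn = driver.find_element(By.CSS_SELECTOR, NEXT_SELECTOR)
        except Exception:
            print("no next-page button found; stopping")
            break
        if "disabled" in (next_btn.get_attribute("class") or ""):
            print("next-page button disabled; stopping")
            break
        next_btn.click()
        time.sleep(2)
        page_urls = collect_urls(driver.page_source)
        all_urls |= page_urls
        print(f"page {page}: {len(page_urls)} articles, {len(all_urls)} total")
finally:
    driver.quit()
```

With the patch applied, the crawler is registered as `souhu:house` in `CRAWLER_CLASSES` and reported by `list_crawlers()`; the `--max` value mentioned in `souhu-house.txt` corresponds to `max_articles`, so `crawl()` plans `math.ceil(max_articles / 10)` next-page clicks (about 10 articles per page) and stops early once enough URLs have been collected.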