feat: add souhu house crawler
parent 05b67d5cbc
commit d02e8d65e0
@ -31,7 +31,7 @@ http:
 
 # Selenium configuration
 selenium:
-  headless: true
+  headless: false
   log_level: 3
   window_size: "1920,1080"
   page_load_timeout: 60

@ -180,3 +180,12 @@ sources:
       name: "AI"
       css_selector: ""
       # Note: this category fetches its data through the search API rather than the normal category-list API
+
+  souhu:
+    base_url: "https://news.sohu.com/"
+    categories:
+      house:
+        url: "https://house.focus.cn/zixun/"
+        category_id: 10
+        name: "房产"
+        css_selector: ".TPLTextFeedItem, .TPLImageTextFeedItem"

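For orientation, the new `sources.souhu.categories` block is consumed through the same dotted-key lookup that `list_crawlers` uses further down in this commit. A minimal, self-contained sketch of that lookup (the `dotted_get` helper is illustrative, not project code):

```python
# Illustrative only: mimics config.get('sources.souhu.categories', {}) as used in list_crawlers.
from functools import reduce

CONFIG = {
    "sources": {
        "souhu": {
            "base_url": "https://news.sohu.com/",
            "categories": {
                "house": {
                    "url": "https://house.focus.cn/zixun/",
                    "category_id": 10,
                    "name": "房产",
                    "css_selector": ".TPLTextFeedItem, .TPLImageTextFeedItem",
                },
            },
        },
    },
}

def dotted_get(cfg, dotted_key, default=None):
    """Resolve 'a.b.c'-style keys against a nested dict."""
    try:
        return reduce(lambda d, k: d[k], dotted_key.split("."), cfg)
    except (KeyError, TypeError):
        return default

for category in dotted_get(CONFIG, "sources.souhu.categories", {}):
    print(f"souhu:{category}")  # -> souhu:house
```
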
@ -0,0 +1,65 @@

This is working example code for crawling real-estate (house) category news from Sohu News.

Note that dynamic loading is required here, and pagination is done by clicking the "next page" button in the pagination navigation; the --max argument determines how many times to turn the page.

This is the page's pagination navigation:

```html
<div data-v-4b61222c="" data-v-32183bf0="" class="Pagination"><div data-v-4b61222c="" class="pagination-content"><div data-v-4b61222c="" class="pagination-item pagination-item-pre">首页</div> <div data-v-4b61222c="" class="pagination-item pagination-item-pre">上一页</div> <div data-v-4b61222c="" class="pagination-item-content"><div data-v-4b61222c="" class="pagination-item pagination-item-0">1</div><div data-v-4b61222c="" class="pagination-item pagination-item-1">2</div> <div data-v-4b61222c="" class="pagination-item-point">···</div> <div data-v-4b61222c="" class="pagination-item pagination-item-0">4</div><div data-v-4b61222c="" class="pagination-item pagination-item-1 active-item">5</div><div data-v-4b61222c="" class="pagination-item pagination-item-2">6</div> <div data-v-4b61222c="" class="pagination-item-point">···</div> <div data-v-4b61222c="" class="pagination-item pagination-item-0">99</div><div data-v-4b61222c="" class="pagination-item pagination-item-1">100</div></div> <div data-v-4b61222c="" class="pagination-item pagination-item-next pagination-more">下一页</div></div></div>
```
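
Because the feed is rendered client-side and the navigation above jumps to a new page, the listing URLs have to be collected through a browser driver rather than plain requests. A minimal Selenium sketch of the click loop (standalone and illustrative; `max_pages` stands in for the value derived from `--max`, and the real implementation is `HouseCrawler.crawl()` further down in this commit):

```python
import time

from selenium import webdriver
from selenium.webdriver.common.by import By

max_pages = 5  # stand-in for the page count derived from --max
driver = webdriver.Chrome()
try:
    driver.get("https://house.focus.cn/zixun/")
    time.sleep(2)
    for _ in range(max_pages - 1):
        # The button sits at the bottom of the feed, so scroll it into view first.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.5)
        buttons = driver.find_elements(By.CSS_SELECTOR, "div.pagination-item-next.pagination-more")
        if not buttons:
            break  # no "下一页" button left: assume this is the last page
        buttons[0].click()
        time.sleep(2)  # wait for the next page of the feed to render
        # ...collect .TPLTextFeedItem / .TPLImageTextFeedItem links from driver.page_source here...
finally:
    driver.quit()
```

The per-article fields can then be parsed statically, as in the original example below:
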
```python
import requests
from bs4 import BeautifulSoup


URL = "https://house.focus.cn/zixun/"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}
resp = requests.get(URL, headers=headers, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"
# To work from a saved copy of the listing page instead:
# with open("example/example-14.html", "r", encoding="utf-8") as f:
#     html = f.read()


def parser_article(href):
    """Fetch one article page and print its title, author, publish time and body."""
    resp_article = requests.get(href, headers=headers, timeout=10)
    soup_article = BeautifulSoup(resp_article.text, "lxml")

    # Title
    title_tag = soup_article.select_one("#article-container div.text-title h1")
    title = title_tag.get_text(strip=True)

    # Author
    author_tag = soup_article.select_one("#article-container .article-info .author a")
    author = author_tag.get_text(strip=True)

    # Publish time
    time_tag = soup_article.select_one("#article-container .article-info .author #news-time")
    time = time_tag.get_text(strip=True)

    # Body text
    content_tag = soup_article.select_one("article#mp-editor")
    if content_tag:
        # strip=True trims surrounding whitespace;
        # separator='\n\n' leaves a blank line between paragraphs
        content = content_tag.get_text(separator='\n\n', strip=True)
    else:
        content = "Body content not found"

    print(title, author, time)
    print(content)


soup = BeautifulSoup(resp.text, "lxml")
# soup = BeautifulSoup(html, "lxml")  # when parsing the saved copy instead
div_list = soup.select('.TPLTextFeedItem, .TPLImageTextFeedItem')
# print(div_list)
for item in div_list:
    link_tag = item.select_one('a')
    if link_tag and link_tag.has_attr('href'):
        url = link_tag['href']
        # Handle protocol-relative links (//www...)
        if url.startswith('//'):
            url = 'https:' + url
        parser_article(url)
```

@ -49,6 +49,9 @@ CRAWLER_CLASSES = {
         'finance': ('crawlers.tencent.finance', 'FinanceCrawler'),
         'ai': ('crawlers.tencent.ai', 'SearchAICrawler'),
     },
+    'souhu': {
+        'house': ('crawlers.souhu.house', 'HouseCrawler'),
+    },
 }
 
 
@ -97,6 +100,11 @@ def list_crawlers() -> List[str]:
     for category in tencent_categories.keys():
         crawlers.append(f"tencent:{category}")
 
+    # Sohu crawlers
+    souhu_categories = config.get('sources.souhu.categories', {})
+    for category in souhu_categories.keys():
+        crawlers.append(f"souhu:{category}")
+
     return crawlers
 
 
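The registry stores `(module path, class name)` pairs rather than imported classes, which points at lazy resolution at lookup time. A minimal sketch of how such an entry can be turned into a class (illustrative; the project's actual factory function is not shown in this diff):

```python
# Illustrative only: resolve a (module path, class name) registry entry lazily.
import importlib

CRAWLER_CLASSES = {
    'souhu': {
        'house': ('crawlers.souhu.house', 'HouseCrawler'),
    },
}

def resolve(source: str, category: str):
    module_path, class_name = CRAWLER_CLASSES[source][category]
    module = importlib.import_module(module_path)  # imports crawlers.souhu.house on demand
    return getattr(module, class_name)

# crawler_cls = resolve('souhu', 'house')  # -> HouseCrawler
```
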
@ -0,0 +1,3 @@
"""
Sohu news crawler module
"""

@ -0,0 +1,177 @@
"""
Focus (焦点房产网) real-estate news crawler
"""

from typing import List, Set
from bs4 import BeautifulSoup
import time

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from base.crawler_base import DynamicCrawler, Article
from parsers.souhu_parser import SouhuParser


class HouseCrawler(DynamicCrawler):
    """Focus (焦点房产网) real-estate news crawler"""

    def crawl(self) -> List[Article]:
        """
        Override crawl() to support pagination navigation.

        The pagination navigation jumps to a new page, so URLs must be
        collected again after every page turn.
        """
        self.logger.info(f"Start crawling {self.category_name} news: {self.url}")

        try:
            if not self._driver:
                from utils.selenium_driver import SeleniumDriver
                self._driver = SeleniumDriver()

            # Load the first page
            self._driver._driver.get(self.url)
            time.sleep(2)

            # Collect all URLs (a set deduplicates automatically)
            all_urls: Set[str] = set()

            # Collect the URLs on the first page
            first_page_urls = self._collect_urls_from_current_page()
            all_urls.update(first_page_urls)
            self.logger.info(f"Page 1: found {len(first_page_urls)} articles")

            # Work out how many times to click "next page"
            import math
            # Roughly 10 articles per page, so derive the click count from max_articles
            max_clicks = math.ceil(self.max_articles / 10)
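            # Worked example: max_articles = 25 -> ceil(25 / 10) = 3 clicks.
            # Page 1 is already collected above, so up to 4 pages (~40 items) are
            # visited; the ">= max_articles" check below usually stops paging earlier.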

            self.logger.info(f"Planning {max_clicks} page turns to collect about {self.max_articles} articles (~10 per page)")

            # Turn pages and collect URLs
            for page_num in range(1, max_clicks + 1):
                try:
                    # Scroll to the bottom so the button is visible
                    self._driver._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(0.5)

                    # Find the "next page" button
                    from selenium.webdriver.common.by import By
                    try:
                        next_btn = self._driver._driver.find_element(By.CSS_SELECTOR, "div.pagination-item-next.pagination-more")
                    except Exception:
                        self.logger.info("Next-page button not found, probably already on the last page")
                        break

                    # Check whether the button is disabled
                    btn_class = next_btn.get_attribute("class") or ""
                    if "disabled" in btn_class or next_btn.get_attribute("disabled"):
                        self.logger.info("Next-page button is disabled, stop paging")
                        break

                    # Click the button
                    next_btn.click()
                    time.sleep(2)  # wait for the new page to load

                    # Collect the URLs on the current page
                    current_page_urls = self._collect_urls_from_current_page()
                    new_urls_count = len(current_page_urls - all_urls)
                    all_urls.update(current_page_urls)

                    self.logger.info(f"Page {page_num + 1}: found {len(current_page_urls)} articles ({new_urls_count} new), {len(all_urls)} in total")

                    # Stop paging once enough articles have been collected
                    if len(all_urls) >= self.max_articles:
                        self.logger.info(f"Collected enough articles ({len(all_urls)} >= {self.max_articles}), stop paging")
                        break

                except Exception as e:
                    self.logger.info(f"Paging failed: {e}, stop paging")
                    break

            # Convert to a list
            article_urls = list(all_urls)
            self.logger.info(f"Found {len(article_urls)} articles in total")

            # Crawl the article details
            articles = self._fetch_articles(article_urls)

            self.logger.info(f"Successfully crawled {len(articles)} articles")
            return articles

        except Exception as e:
            self.logger.error(f"Crawl failed: {e}", exc_info=True)
            return []
        finally:
            self._cleanup()

    def _collect_urls_from_current_page(self) -> Set[str]:
        """
        Collect article URLs from the current page.

        Returns:
            Set of article URLs found on the current page
        """
        page_source = self._driver._driver.page_source
        soup = BeautifulSoup(page_source, "lxml")
        urls = set()

        # Select all article items, restricted to the currently active tab
        div_list = soup.select(
            "#Tab > div:first-child div.TPLTextFeedItem, "
            "#Tab > div:first-child div.TPLImageTextFeedItem"
        )

        for item in div_list:
            link_tag = item.select_one('a')
            if link_tag and link_tag.has_attr('href'):
                url = link_tag['href']

                # Handle protocol-relative links (//www...)
                if url.startswith('//'):
                    url = 'https:' + url

                # Handle relative paths
                elif url.startswith('/'):
                    url = 'https://house.focus.cn' + url

                if url:
                    urls.add(url)

        return urls

    def _fetch_page(self) -> str:
        """Override of the base-class method; returns an empty string (the real logic lives in crawl())."""
        return ""

    def _extract_article_urls(self, html: str) -> List[str]:
        """Override of the base-class method; returns an empty list (the real logic lives in crawl())."""
        return []

    def _fetch_articles(self, urls: List[str]) -> List[Article]:
        """
        Fetch the article details.
        """
        articles = []
        parser = SouhuParser()

        for i, url in enumerate(urls[:self.max_articles]):
            try:
                article = parser.parse(url)
                article.category_id = self.category_id
                article.source = "焦点房产网"

                # Default author
                if not article.author:
                    article.author = "焦点房产网"

                if article.is_valid():
                    articles.append(article)
                    self.logger.info(f"[{i+1}/{len(urls)}] {article.title}")

            except Exception as e:
                self.logger.error(f"Failed to parse article: {url} - {e}")
                continue

        return articles

@ -0,0 +1,82 @@
"""
Sohu article parser
"""

from bs4 import BeautifulSoup
import re

import sys
import os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from base.parser_base import BaseParser
from base.crawler_base import Article
from utils.http_client import HttpClient
from utils.logger import get_logger


class SouhuParser(BaseParser):
    """Focus (焦点房产网) article parser"""

    def __init__(self):
        self.logger = get_logger(__name__)
        self.http_client = HttpClient()

    def parse(self, url: str) -> Article:
        """
        Parse a Focus (焦点房产网) article detail page.

        Args:
            url: article URL

        Returns:
            Article object
        """
        html = self.http_client.get(url)
        soup = BeautifulSoup(html, "lxml")

        # Locate the article container
        article_container = soup.select_one("#article-container")

        if not article_container:
            return Article(
                url=url,
                title=None,
                publish_time=None,
                author=None,
                content="",
            )

        # Extract the title
        title = None
        title_tag = article_container.select_one("div.text-title h1")
        if title_tag:
            title = title_tag.get_text(strip=True)

        # Extract the author
        author = None
        author_tag = article_container.select_one(".article-info .author a")
        if author_tag:
            author = author_tag.get_text(strip=True)

        # Extract the publish time
        publish_time = None
        time_tag = article_container.select_one(".article-info .author #news-time")
        if time_tag:
            publish_time = time_tag.get_text(strip=True)

        # Extract the body text
        content = ""
        content_tag = soup.select_one("article#mp-editor")

        if content_tag:
            # separator='\n\n' leaves a blank line between paragraphs
            content = content_tag.get_text(separator='\n\n', strip=True)

        return Article(
            url=url,
            title=title,
            publish_time=publish_time,
            author=author,
            content=content,
        )

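For a quick standalone check of the parser, the way `HouseCrawler._fetch_articles` drives it suggests usage along these lines (the article URL is a placeholder):

```python
from parsers.souhu_parser import SouhuParser

parser = SouhuParser()
article = parser.parse("https://www.focus.cn/zixun/<article-id>.html")  # placeholder URL
print(article.title, article.author, article.publish_time)
print(article.content[:200])
```
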
@ -163,6 +163,54 @@ class SeleniumDriver:
 
         self.logger.info(f"Load-more button clicked {click_count} times in total")
 
+    def click_next_page(self, selector: str = "div.pagination-item-next.pagination-more", max_clicks: int = 10):
+        """
+        Click the "next page" button to paginate.
+
+        Unlike click_load_more, this method is specifically for pagination navigation.
+
+        Args:
+            selector: CSS selector of the next-page button
+            max_clicks: maximum number of clicks
+        """
+        if not self._driver:
+            self._create_driver()
+
+        click_count = 0
+
+        for i in range(max_clicks):
+            try:
+                # Scroll to the bottom of the page first so the button is visible
+                self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                time.sleep(0.5)
+
+                # Find the next-page button
+                try:
+                    next_btn = self._driver.find_element(By.CSS_SELECTOR, selector)
+                except Exception:
+                    self.logger.info("Next-page button not found, probably on the last page; stop paging")
+                    break
+
+                # Check whether the button is clickable (disabled class or attribute)
+                btn_class = next_btn.get_attribute("class") or ""
+                if "disabled" in btn_class or next_btn.get_attribute("disabled"):
+                    self.logger.info("Next-page button is disabled, stop paging")
+                    break
+
+                # Click the button
+                next_btn.click()
+                click_count += 1
+                self.logger.debug(f"Clicked the next-page button {click_count} times")
+
+                # Wait for the new page to load
+                time.sleep(config.selenium.scroll_pause_time * 2)  # extra wait for the full page swap
+
+            except Exception as e:
+                self.logger.info(f"Error while clicking the next-page button: {e}, stop paging")
+                break
+
+        self.logger.info(f"Next-page button clicked {click_count} times in total")
+
     def quit(self):
         """Quit the driver"""
         if self._driver:

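A rough sketch of exercising the new helper directly (hypothetical usage that pokes at the wrapper's internals the same way `HouseCrawler` does; not part of the commit):

```python
from utils.selenium_driver import SeleniumDriver

sd = SeleniumDriver()
sd._create_driver()                          # internal, used here only to get a live driver
sd._driver.get("https://house.focus.cn/zixun/")
sd.click_next_page(max_clicks=3)             # clicks "下一页" at most 3 times
html = sd._driver.page_source                # feed HTML after pagination
sd.quit()
```
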
@ -1,31 +0,0 @@
This is working example code for crawling military-category news from Tencent News.

Note that Tencent's article-detail parsing is shared code and is not shown here (just use tencent_parser.py).

Note that dynamic loading is required here (inherit from DynamicCrawler; there is no need to override _fetch_page()).

```python
import requests
from bs4 import BeautifulSoup


URL = "https://news.qq.com/ch/milite"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
}
resp = requests.get(URL, headers=headers, timeout=10)
resp.raise_for_status()
resp.encoding = "utf-8"
# print(resp.text)
# with open("example/example-13.html", "r", encoding="utf-8") as f:
#     html = f.read()

soup = BeautifulSoup(resp.text, "lxml")
# soup = BeautifulSoup(html, "lxml")
div_list = soup.select("div[id='channel-feed-area'] div.channel-feed-list div.channel-feed-item[dt-params*='article_type=0']")
for div in div_list:
    href = div.select_one("a.article-title").get("href")
    print(href)
```