129 lines
4.3 KiB
Markdown
129 lines
4.3 KiB
Markdown
---
|
||
title: 集成学习
|
||
tags: ensemble-learning
|
||
categories: machinelearning
|
||
abbrlink: 8816
|
||
date: 2025-01-25 15:12:08
|
||
cover: /img/machinelearning/ensemble-learning.png
|
||
top_img: /img/site01.jpg
|
||
---
|
||
|
||
### Bagging
|
||
|
||
### 随机森林
|
||
> `Random Forest`(随机森林)就是 `Bagging + Decision Tree`:用 Bagging 的方式训练多棵决策树,并且每次分裂时只在随机抽取的一部分特征中选择最优划分。
|
||
```python
|
||
import seaborn as sns
|
||
import pandas as pd
|
||
import numpy as np
|
||
from sklearn.model_selection import train_test_split,GridSearchCV
|
||
from sklearn.feature_extraction import DictVectorizer
|
||
from sklearn.ensemble import RandomForestClassifier
|
||
# Random-forest classification of Titanic survival, tuned with a grid search.

# 1. Load the data - the Titanic dataset fetched through seaborn
titanic = sns.load_dataset('titanic')
missing_age_count = titanic['age'].isna().sum()
# Uncomment to see how many ages are missing:
# print(f"缺失的 age 数量: {missing_age_count}")

# 2. Basic data preparation
# 2.1 Choose the feature columns and the target column
X = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']

# 2.2 Split BEFORE imputing, so the fill value below is computed from the
#     training rows only and no test-set information leaks into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

# 2.3 Fill missing ages with the training-set mean.
#     fillna returns new frames, so nothing is assigned into a slice of
#     `titanic` (the in-place .loc assignment raised SettingWithCopyWarning).
train_age_mean = X_train['age'].mean()
X_train = X_train.fillna({'age': train_age_mean})
X_test = X_test.fillna({'age': train_age_mean})

# 3. Feature engineering (dictionary feature extraction)
#    Each row becomes a dict; the vectorizer is fitted on the training rows
#    and the same fitted mapping is reused for the test rows.
transfer = DictVectorizer()
X_train = transfer.fit_transform(X_train.to_dict(orient="records"))
X_test = transfer.transform(X_test.to_dict(orient="records"))

# 4. Machine learning - random forest tuned by 3-fold grid search
#    random_state makes the forest reproducible, matching the seeded split.
rf = RandomForestClassifier(random_state=22)
param_grid = {"n_estimators": [100, 120, 300], "max_depth": [3, 7, 11]}
# Named grid_search rather than `gc` to avoid shadowing the stdlib gc module.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_test)
print(f"模型的测试集的预测值:{y_pred}")
ret = grid_search.score(X_test, y_test)
print(f"最佳模型在测试集上的评分:{ret}")
print(f"最佳模型的参数:{grid_search.best_estimator_}")
# best_score_ is the mean cross-validated score of the best estimator,
# not a score on the training set, so the label says so.
print(f"最佳模型在交叉验证中的平均评分:{grid_search.best_score_}")
# Dense view of the vectorized test features, for inspection.
print(X_test.toarray())
|
||
```
|
||

|
||
|
||
### otto案例
|
||
```python
|
||
import pandas as pd
|
||
import numpy as np
|
||
import matplotlib.pyplot as plt
|
||
from imblearn.under_sampling import RandomUnderSampler
|
||
from sklearn.model_selection import train_test_split
|
||
from sklearn.preprocessing import LabelEncoder
|
||
from sklearn.ensemble import RandomForestClassifier
|
||
from sklearn.metrics import log_loss
|
||
from sklearn.preprocessing import OneHotEncoder
|
||
import seaborn as sns  # placed next to the other imports instead of mid-script

# Random-forest classification of a multi-class "target" column, scored with
# multi-class log loss.

# 1. Load the data
data = pd.read_csv('./data/train.csv')

# Plot how many samples each target class has. hue is set to the same column
# as x so that the palette colours the bars; the redundant legend is hidden.
sns.countplot(data=data, x='target', hue='target', palette="Set2", legend=False)
plt.show()

# 2. Basic data preparation
# 2.1 Features are every column except "id" and "target"; the label is "target"
x = data.drop(["id", "target"], axis=1)
y = data['target']

# 2.2 Balance the classes with random under-sampling
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=0)
x_resampled, y_resampled = undersampler.fit_resample(x, y)

# Uncomment to inspect the class distribution after under-sampling:
# print(f"欠采样后训练集中的类别分布:\n{y_resampled.value_counts()}")

# 2.3 Convert the class labels to integers
le = LabelEncoder()
y_resampled = le.fit_transform(y_resampled)

# 2.4 Split into training and test sets.
#     stratify keeps every class in both splits in the same proportion and
#     random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_resampled,
    y_resampled,
    test_size=0.2,
    stratify=y_resampled,
    random_state=0,
)

# 3. Machine learning
#    oob_score=True is kept from the original; the out-of-bag estimate is
#    available as rf.oob_score_ after fitting.
rf = RandomForestClassifier(oob_score=True, random_state=0)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(f"预测值:{y_pred}")
print(f"评分:{rf.score(x_test, y_test)}")

# 4. Model evaluation (multi-class log loss)
#    For a binary target the same steps apply unchanged:
#    clip predict_proba's output, then call log_loss.

# Predicted class probabilities; columns follow the order of rf.classes_.
y_pred_prob = rf.predict_proba(x_test)

# Clip the probabilities into [eps, 1 - eps] so log(0) can never occur.
eps = 1e-15
y_pred_prob = np.clip(y_pred_prob, eps, 1 - eps)

# Pass the integer labels directly and name the classes explicitly. A one-hot
# encoder fitted on y_test alone would produce fewer columns than
# predict_proba whenever a class is absent from the test split.
loss = log_loss(y_test, y_pred_prob, labels=rf.classes_, normalize=True)
print(f"Log Loss: {loss}")
|
||
|
||
```
|
||
 |