add ensemble learning details
This commit is contained in:
@@ -169,6 +169,8 @@ graph.view(output_path) # 打开图像,path为保存路径,不需要加后
|
||||
|
||||
[Webgraphviz](http://webgraphviz.com/),这个网站可以将`tree.dot`文件的内容生成对应的可视化树
|
||||
|
||||
|
||||
#### 回归决策树与线性回归的对比
|
||||
```python
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
127
source/_posts/machinelearning/ensemblelearning.md
Normal file
127
source/_posts/machinelearning/ensemblelearning.md
Normal file
@@ -0,0 +1,127 @@
|
||||
---
|
||||
title: 集成学习
|
||||
tags: ensemble-learning
|
||||
categories: machinelearning
|
||||
abbrlink: 8816
|
||||
date: 2025-01-25 15:12:08
|
||||
---
|
||||
|
||||
### Bagging
|
||||
|
||||
### 随机森林
|
||||
> `Random Forest` 就是 `Bagging + Decision Tree`
|
||||
```python
|
||||
import seaborn as sns
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split,GridSearchCV
|
||||
from sklearn.feature_extraction import DictVectorizer
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
# Random forest on the Titanic data: impute age, vectorise the features,
# grid-search the forest size/depth, then report test and CV scores.

# 1. Load the data: the Titanic dataset shipped with seaborn
titanic = sns.load_dataset('titanic')
# Number of rows without a recorded age (the reason for the imputation in 2.2)
missing_age_count = titanic['age'].isna().sum()
# print(f"缺失的 age 数量: {missing_age_count}")

# 2. Basic preprocessing
# 2.1 Pick the feature columns and the target column.
# .copy() makes X an independent frame, so filling 'age' below cannot raise
# SettingWithCopyWarning or silently write into a temporary view of `titanic`.
X = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']

# 2.2 Missing values: replace missing ages with the mean age
X['age'] = X['age'].fillna(X['age'].mean())

# 2.3 Split into training and test sets (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

# 3. Feature engineering (dict feature extraction).
# Each row becomes a {column: value} dict; the vectoriser is fitted on the
# training rows only and then re-used unchanged on the test rows.
X_train = X_train.to_dict(orient="records")
X_test = X_test.to_dict(orient="records")
transfer = DictVectorizer()
X_train = transfer.fit_transform(X_train)
X_test = transfer.transform(X_test)

# 4. Model: random forest tuned with a 3-fold grid search.
# random_state pins the bootstrap/feature sampling so reruns give the same scores.
rf = RandomForestClassifier(random_state=22)
param_grid = {"n_estimators": [100, 120, 300], "max_depth": [3, 7, 11]}
gc = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)
gc.fit(X_train, y_train)

y_pred = gc.predict(X_test)
print(f"模型的测试集的预测值:{y_pred}")

ret = gc.score(X_test, y_test)
print(f"最佳模型在测试集上的评分:{ret}")
print(f"最佳模型的参数:{gc.best_estimator_}")
# best_score_ is the mean cross-validated score of the best parameter
# combination (measured on the held-out folds), not a training-set score.
print(f"最佳模型的交叉验证平均评分:{gc.best_score_}")
# DictVectorizer returns a sparse matrix; densify it only for display
print(X_test.toarray())
|
||||
```
|
||||

|
||||
|
||||
### otto案例
|
||||
```python
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from imblearn.under_sampling import RandomUnderSampler
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.metrics import log_loss
|
||||
from sklearn.preprocessing import OneHotEncoder
|
||||
# Multi-class random forest on ./data/train.csv: balance the classes by
# under-sampling, train a forest, then report accuracy, OOB score and log loss.
import seaborn as sns

# 1. Load the data set
data = pd.read_csv('./data/train.csv')

# Plot how many samples each target class has.
# hue='target' together with legend=False lets the palette colour each class
# without drawing a redundant legend.
sns.countplot(data=data, x='target', hue='target', palette="Set2", legend=False)
plt.show()

# 2. Basic preprocessing
# 2.1 Separate features from the target ("id" carries no signal)
x = data.drop(["id", "target"], axis=1)
y = data['target']

# 2.2 Balance the classes with random under-sampling
# NOTE(review): resampling happens before the train/test split, so the test
# set is balanced too and does not reflect the raw class distribution.
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=0)
x_resampled, y_resampled = undersampler.fit_resample(x, y)

# Inspect the class distribution after under-sampling
# print(f"欠采样后训练集中的类别分布:\n{y_resampled.value_counts()}")

# 2.3 Convert the string labels to integer codes
le = LabelEncoder()
y_resampled = le.fit_transform(y_resampled)

# 2.4 Split into training and test sets.
# stratify keeps every class present in both parts; random_state makes the
# split (and therefore the reported scores) repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_resampled,
    y_resampled,
    test_size=0.2,
    stratify=y_resampled,
    random_state=0,
)

# 3. Model
rf = RandomForestClassifier(oob_score=True, random_state=0)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(f"预测值:{y_pred}")
print(f"评分:{rf.score(x_test,y_test)}")
# oob_score=True was requested above, so report the out-of-bag estimate too
print(f"OOB Score: {rf.oob_score_}")

# 4. Model evaluation (multi-class log loss)
# For a binary target the same call works unchanged: pass the true labels and
# the predict_proba output to log_loss.

# Predicted class probabilities, one column per entry of rf.classes_
y_pred_prob = rf.predict_proba(x_test)

# Clip probabilities into [eps, 1 - eps] so that log(0) can never occur
eps = 1e-15
y_pred_prob = np.clip(y_pred_prob, eps, 1 - eps)

# Pass the integer labels directly and name the column order explicitly.
# One-hot encoding y_test with an encoder fitted on y_test alone would yield
# fewer columns than predict_proba whenever a class is absent from the test set.
loss = log_loss(y_test, y_pred_prob, labels=rf.classes_, normalize=True)
print(f"Log Loss: {loss}")
|
||||
|
||||
```
|
||||

|
||||
Reference in New Issue
Block a user