Bagging

随机森林

随机森林(Random Forest)= Bagging + 决策树(Decision Tree)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
# 1. Load the Titanic dataset through seaborn.
titanic = sns.load_dataset('titanic')
# Number of rows with no recorded age; kept for ad-hoc inspection.
missing_age_count = titanic['age'].isna().sum()

# 2. Basic data preparation
# 2.1 Select the feature columns and the target column.
X = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']

# 2.2 Split BEFORE imputing, so the test rows cannot influence the statistic
#     used to fill the gaps (avoids train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

# 2.3 Fill missing ages with the mean age of the training rows only.
#     `assign` returns new frames, so nothing is written through a slice of
#     `titanic` (the pattern that triggers SettingWithCopyWarning).
train_age_mean = X_train['age'].mean()
X_train = X_train.assign(age=X_train['age'].fillna(train_age_mean))
X_test = X_test.assign(age=X_test['age'].fillna(train_age_mean))

# 3. Feature engineering: turn each row into a dict and vectorise it.
#    The vectoriser is fitted on the training rows and reused for the test rows.
X_train = X_train.to_dict(orient="records")
X_test = X_test.to_dict(orient="records")
transfer = DictVectorizer()
X_train = transfer.fit_transform(X_train)
X_test = transfer.transform(X_test)

# 4. Random forest tuned by an exhaustive grid search with 3-fold CV.
rf = RandomForestClassifier()
param_grid = {"n_estimators": [100, 120, 300], "max_depth": [3, 7, 11]}
gc = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)
gc.fit(X_train, y_train)

y_pred = gc.predict(X_test)
print(f"模型的测试集的预测值:{y_pred}")
ret = gc.score(X_test, y_test)
print(f"最佳模型在测试集上的评分:{ret}")
# best_params_ lists every tuned value; the estimator repr hides values that
# happen to equal the defaults.
print(f"最佳模型的参数:{gc.best_params_}")
# best_score_ is the mean cross-validated score of the best candidate, not a
# score on the training set, so the message says so.
print(f"最佳模型的交叉验证平均评分:{gc.best_score_}")
# The vectoriser output is sparse; densify it only for display.
print(X_test.toarray())

Otto 案例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder
# 1. Load the dataset.
data = pd.read_csv('./data/train.csv')
# Plot how many samples each target class has.
import seaborn as sns
# hue='target' is passed together with palette; legend=False hides the legend.
sns.countplot(data=data, x='target', hue='target', palette="Set2", legend=False)
plt.show()

# 2. Basic data preparation
# 2.1 Separate the features from the target column.
x = data.drop(["id", "target"], axis=1)
y = data['target']

# 2.2 Balance the classes with random under-sampling.
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=0)
x_resampled, y_resampled = undersampler.fit_resample(x, y)
# To inspect the balanced class counts: pd.Series(y_resampled).value_counts()

# 2.3 Encode the class labels as integers.
le = LabelEncoder()
y_resampled = le.fit_transform(y_resampled)

# 2.4 Split into train and test sets. Stratifying keeps the class balance
#     produced by the under-sampling step in both subsets, so every class is
#     represented in the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x_resampled, y_resampled, test_size=0.2, stratify=y_resampled
)

# 3. Train the random forest (oob_score=True requests the out-of-bag
#    estimate during fitting).
rf = RandomForestClassifier(oob_score=True)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(f"预测值:{y_pred}")
print(f"评分:{rf.score(x_test, y_test)}")

# 4. Evaluation: multi-class log loss on the predicted probabilities.
y_pred_prob = rf.predict_proba(x_test)

# Clip the probabilities into [eps, 1 - eps] so that log(0) can never occur.
eps = 1e-15
y_pred_prob = np.clip(y_pred_prob, eps, 1 - eps)

# Pass the integer labels directly and name the full label set with
# labels=rf.classes_. The columns of predict_proba follow rf.classes_, so the
# label/column alignment holds even if a class is absent from y_test, and no
# separate one-hot encoding of y_test is needed. The same call also covers
# the binary case.
loss = log_loss(y_test, y_pred_prob, labels=rf.classes_, normalize=True)
print(f"Log Loss: {loss}")