---
title: 集成学习
tags: ensemble-learning
categories: machinelearning
abbrlink: 8816
date: 2025-01-25 15:12:08
cover: /img/machinelearning/ensemble-learning.png
top_img: /img/site01.jpg
---

### Bagging

### 随机森林

> `Random-Forest` 就是`Bagging + Decisiontree`

```python
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier

# 1. Load the Titanic dataset that ships with seaborn.
titanic = sns.load_dataset('titanic')
# Number of rows whose age is missing (handy to print while exploring).
missing_age_count = titanic['age'].isna().sum()

# 2. Basic data preparation
# 2.1 Pick the feature columns and the target column.
X = titanic[['pclass', 'age', 'sex']]
y = titanic['survived']

# 2.2 Split first, so that the statistics used for imputation below are
#     computed from the training rows only (no leakage from the test set).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

# 2.3 Fill missing ages with the mean age of the training set.
#     fillna() returns new frames, so nothing is assigned into a slice of
#     `titanic` (avoids pandas' chained-assignment warning).
train_age_mean = X_train['age'].mean()
X_train = X_train.fillna({'age': train_age_mean})
X_test = X_test.fillna({'age': train_age_mean})

# 3. Feature engineering: dict-based feature extraction.
#    String features such as `sex` are one-hot encoded by DictVectorizer.
transfer = DictVectorizer()
X_train = transfer.fit_transform(X_train.to_dict(orient="records"))
X_test = transfer.transform(X_test.to_dict(orient="records"))

# 4. Model: random forest tuned with a 3-fold grid search.
#    A fixed random_state makes the search reproducible between runs.
rf = RandomForestClassifier(random_state=22)
param_grid = {"n_estimators": [100, 120, 300], "max_depth": [3, 7, 11]}
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)
grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_test)
print(f"模型的测试集的预测值:{y_pred}")

ret = grid_search.score(X_test, y_test)
print(f"最佳模型在测试集上的评分:{ret}")
print(f"最佳模型的参数:{grid_search.best_estimator_}")
# NOTE: best_score_ is the mean cross-validated score of the best parameter
# combination (computed on folds of the training data), not a plain
# training-set accuracy.
print(f"最佳模型在训练集上的评分:{grid_search.best_score_}")
print(X_test.toarray())
```

![](/img/machinelearning/random-forest.png)

### ott案例

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import OneHotEncoder

# 1. Load the dataset.
data = pd.read_csv('./data/train.csv')

# Plot how many samples each target class has.
# hue='target' is passed together with palette (and legend=False) because
# recent seaborn versions deprecate using palette without hue.
sns.countplot(data=data, x='target', hue='target', palette="Set2", legend=False)
plt.show()

# 2. Basic data preparation
# 2.1 Separate the features from the target.
x = data.drop(["id", "target"], axis=1)
y = data['target']

# 2.2 Balance the classes with random under-sampling.
# NOTE(review): resampling happens before the train/test split, so the test
# set is balanced as well; confirm this is the intended evaluation setup.
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=0)
x_resampled, y_resampled = undersampler.fit_resample(x, y)

# 2.3 Encode the string labels as integers.
le = LabelEncoder()
y_resampled = le.fit_transform(y_resampled)

# 2.4 Split into training and test sets.
#     stratify keeps every class represented in both splits, and the fixed
#     random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_resampled,
    y_resampled,
    test_size=0.2,
    random_state=22,
    stratify=y_resampled,
)

# 3. Model: random forest (oob_score=True also computes an out-of-bag
#    estimate, available afterwards as rf.oob_score_).
rf = RandomForestClassifier(oob_score=True)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)
print(f"预测值:{y_pred}")
print(f"评分:{rf.score(x_test,y_test)}")

# 4. Evaluation with multiclass log loss.
# Class probabilities; the columns follow the order of rf.classes_.
y_pred_prob = rf.predict_proba(x_test)

# Clip the probabilities into [eps, 1 - eps] so a hard 0 never reaches log().
eps = 1e-15
y_pred_prob = np.clip(y_pred_prob, eps, 1 - eps)

# log_loss accepts the integer labels directly. Passing labels=rf.classes_
# ties the probability columns to the classes the model was trained on, so
# the result does not depend on which classes happen to occur in y_test
# (one-hot encoding y_test on its own would break if a class were missing).
loss = log_loss(y_test, y_pred_prob, normalize=True, labels=rf.classes_)
print(f"Log Loss: {loss}")
```

![](/img/machinelearning/ott.png)