Bagging
随机森林
随机森林(Random Forest)就是 Bagging + 决策树(Decision Tree)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
# Titanic survival prediction: random forest tuned with a cross-validated grid search.
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split

# Load the dataset and count how many passengers have no recorded age.
titanic = sns.load_dataset('titanic')
missing_age_count = titanic['age'].isna().sum()

# Take an explicit copy of the feature columns so the imputation below writes
# to an independent frame rather than to a slice derived from `titanic`
# (avoids pandas' SettingWithCopyWarning / ambiguous chained assignment).
X = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']

# Impute missing ages with the mean age.
# NOTE(review): the mean is computed over all rows before the train/test split;
# for a strict evaluation it should come from the training rows only.
X['age'] = X['age'].fillna(value=X['age'].mean())

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=22)

# Convert each row to a dict so DictVectorizer can build the feature matrix.
# The vectorizer is fitted on the training rows only and re-used on the test rows.
X_train = X_train.to_dict(orient="records")
X_test = X_test.to_dict(orient="records")
transfer = DictVectorizer()
X_train = transfer.fit_transform(X_train)
X_test = transfer.transform(X_test)

# Search 3 x 3 hyper-parameter combinations with 3-fold cross-validation.
rf = RandomForestClassifier()
gc = GridSearchCV(
    estimator=rf,
    param_grid={"n_estimators": [100, 120, 300], "max_depth": [3, 7, 11]},
    cv=3,
)
gc.fit(X_train, y_train)

y_pred = gc.predict(X_test)
print(f"模型的测试集的预测值:{y_pred}")

ret = gc.score(X_test, y_test)
print(f"最佳模型在测试集上的评分:{ret}")
print(f"最佳模型的参数:{gc.best_estimator_}")
# NOTE: best_score_ is the mean cross-validated score of the best parameter
# combination (computed on held-out folds of the training data).
print(f"最佳模型在训练集上的评分:{gc.best_score_}")

# The vectorized test features are sparse; densify them for display.
print(X_test.toarray())
|

Otto 案例
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72
# Multi-class classification with a random forest on a class-imbalanced data set,
# evaluated with accuracy and multi-class log loss.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

data = pd.read_csv('./data/train.csv')

# Visualise how many samples each target class has.
sns.countplot(data=data, x='target', hue='target', palette="Set2", legend=False)
plt.show()

# Features are every column except the row id and the label.
x = data.drop(["id", "target"], axis=1)
y = data['target']

# Balance the classes by randomly under-sampling.
undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=0)
x_resampled, y_resampled = undersampler.fit_resample(x, y)

# Encode the target labels as integers.
le = LabelEncoder()
y_resampled = le.fit_transform(y_resampled)

x_train, x_test, y_train, y_test = train_test_split(
    x_resampled, y_resampled, test_size=0.2
)

rf = RandomForestClassifier(oob_score=True)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(f"预测值:{y_pred}")
print(f"评分:{rf.score(x_test,y_test)}")

# Per-class probabilities; columns follow the order of rf.classes_.
y_pred_prob = rf.predict_proba(x_test)

# One-hot encode the true labels using the classifier's own class list, so the
# columns always line up with predict_proba's columns even if some class
# happens to be missing from the test split.
encoder = OneHotEncoder(categories=[rf.classes_], sparse_output=False)
y_test_one_hot = encoder.fit_transform(y_test.reshape(-1, 1))

# Clip probabilities away from exactly 0 and 1 so the logarithm stays finite.
eps = 1e-15
y_pred_prob = np.clip(y_pred_prob, eps, 1 - eps)

loss = log_loss(y_test_one_hot, y_pred_prob, normalize=True)
print(f"Log Loss: {loss}")
|
