5.2 KiB
5.2 KiB
| title | tags | categories | mathjax | abbrlink | date |
|---|---|---|---|---|---|
| 逻辑回归 | logistic-regression | machinelearning | true | 60504 | 2025-01-20 15:30:08 |
logistic regression code
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# 1. Load the breast cancer dataset (binary classification: malignant/benign)
data = load_breast_cancer()

# 2.1 Basic preprocessing: wrap features and target in one DataFrame
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Impute missing values per column.
# BUG FIX: the original loop only printed a "Filling missing values"
# message without actually filling anything; fill with the column median.
for i in df.columns:
    if np.any(pd.isnull(df[i])):
        print(f"Filling missing values in column: {i}")
        df[i] = df[i].fillna(df[i].median())

# 2.2 Separate features and target
X = df.iloc[:, 0:df.shape[1] - 1]
y = df.loc[:, "target"]

# 2.3 Train/test split.
# FIX: random_state pinned so the reported accuracy is reproducible
# (the original split differently on every run).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=22
)

# 3. Feature engineering: standardize.
# Fit the scaler on the training set only, then apply the same
# transform to the test set to avoid data leakage.
transfer = StandardScaler()
X_train = transfer.fit_transform(X_train)
X_test = transfer.transform(X_test)

# 4. Train the logistic regression model
estimator = LogisticRegression()
estimator.fit(X_train, y_train)

# 5. Evaluate on the held-out test set
print(f"模型准确率:{estimator.score(X_test,y_test)}")
print(f"模型预测值为:\n{estimator.predict(X_test)}")
分类评估的参数
- 准确率

准确率是所有预测正确的样本占总样本的比例:

$$Accuracy = \frac{TP+TN}{TP+FN+FP+TN}$$
- 精准率

精准率(又称查准率)是指所有被预测为正类的样本中,真正为正类的比例:

$$Precision = \frac{TP}{TP+FP}$$
- 召回率

召回率(又称查全率)是指所有实际为正类的样本中,被正确预测为正类的比例:

$$Recall = \frac{TP}{TP+FN}$$
- F1-score

F1 值(F1 Score)是精准率和召回率的调和平均数,综合考虑了精准率和召回率的影响:

$$F1 = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}}$$
- ROC曲线

ROC曲线通过TPR(真正例率)与FPR(假正例率)来衡量不平衡二分类问题的分类效果。
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# 1. Load the breast cancer dataset
data = load_breast_cancer()

# 2.1 Basic preprocessing: wrap features and target in one DataFrame
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Impute missing values per column.
# BUG FIX: the original loop only printed a message without filling.
for i in df.columns:
    if np.any(pd.isnull(df[i])):
        print(f"Filling missing values in column: {i}")
        df[i] = df[i].fillna(df[i].median())

# 2.2 Separate features and target
X = df.iloc[:, 0:df.shape[1] - 1]
y = df.loc[:, "target"]

# 2.3 Train/test split; random_state pinned so the metrics are reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=22
)

# 3. Standardize: fit on the training set only, reuse its statistics
# on the test set to avoid data leakage
transfer = StandardScaler()
X_train = transfer.fit_transform(X_train)
X_test = transfer.transform(X_test)

# 4. Train the logistic regression model
estimator = LogisticRegression()
estimator.fit(X_train, y_train)

# 5. Evaluation
print(f"模型准确率:{estimator.score(X_test, y_test)}")
y_pred = estimator.predict(X_test)
print(f"模型预测值为:\n{y_pred}")

# 5.1 Precision / recall report (label 1 = benign "良性", 0 = malignant "恶性")
ret = classification_report(y_test, y_pred, labels=[1, 0], target_names=["良性", "恶性"])
# BUG FIX: ROC-AUC must be computed from continuous scores, not hard 0/1
# predictions — hard labels collapse the ROC curve to a single point and
# understate the AUC. Use the positive-class probability instead.
roc_score = roc_auc_score(y_test, estimator.predict_proba(X_test)[:, 1])
print(f"准确率、召回率:{ret}")
print(f"roc_score:{roc_score}")
类别不平衡的处理
先准备类别不平衡的数据
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from collections import Counter

# 1. Synthesize a deliberately imbalanced 3-class dataset:
#    class weights are roughly 1% / 5% / 94%.
X, y = make_classification(
    n_samples=5000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=3,
    n_clusters_per_class=1,
    weights=[0.01, 0.05, 0.94],
    random_state=0,
)

# Tally the class frequencies, then visualize the points colored by label.
counter = Counter(y)
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
- 过采样
增加训练集中少数类别的样本,使得正反例样本数量接近。

- 随机过采样(RandomOverSampler)
# Random over-sampling: randomly duplicate minority-class samples
# until the class counts are balanced.
over_sampler = RandomOverSampler()
X_over, y_over = over_sampler.fit_resample(X, y)
print(Counter(y_over))
plt.scatter(X_over[:, 0], X_over[:, 1], c=y_over)
plt.show()
- SMOTE过采样(SMOTE)
# SMOTE over-sampling: generate synthetic minority-class samples
# (rather than duplicating existing ones) until the classes balance.
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X, y)
print(Counter(y_sm))
plt.scatter(X_sm[:, 0], X_sm[:, 1], c=y_sm)
plt.show()
- 欠采样
减少训练集中多数类别的样本,使得正反例样本数量接近。

- 随机欠采样(RandomUnderSampler)
# Random under-sampling: randomly discard majority-class samples
# until the class counts are balanced (seeded for reproducibility).
under_sampler = RandomUnderSampler(random_state=0)
X_under, y_under = under_sampler.fit_resample(X, y)
print(Counter(y_under))
plt.scatter(X_under[:, 0], X_under[:, 1], c=y_under)
plt.show()


