QuickReference/source/_posts/machinelearning/logisticregression.md

5.2 KiB
Raw Blame History

title tags categories mathjax abbrlink date
逻辑回归 logistic-regression machinelearning true 60504 2025-01-20 15:30:08

logistic regression code

import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# 1. Load the breast cancer dataset
data = load_breast_cancer()

# 2.1 Basic preprocessing: features and target in a single DataFrame
df = pd.DataFrame(data.data, columns=data.feature_names)
df["target"] = data.target
# A row without a label cannot be used for supervised training.
df = df.dropna(subset=["target"])
for col in data.feature_names:
    # Actually impute missing feature values with the column mean; previously
    # the message was printed but the NaNs were left in place.
    if df[col].isnull().any():
        print(f"Filling missing values in column: {col}")
        df[col] = df[col].fillna(df[col].mean())

# 2.2 Separate features and target (by name rather than by column position)
X = df.drop(columns="target")
y = df["target"]

# 2.3 Split the data. A fixed seed makes the run reproducible, and
# stratifying on y keeps the class ratio the same in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)
# Show the first row; a bare `df.head(1)` expression displays nothing here.
print(df.head(1))

# 3. Feature engineering: standardization.
# The scaler is fitted on the training set only and then applied to the test set.
transfer = StandardScaler()
X_train = transfer.fit_transform(X_train)
X_test = transfer.transform(X_test)

# 4. Machine learning: logistic regression
estimator = LogisticRegression()
estimator.fit(X_train, y_train)

# 5. Model evaluation
print(f"模型准确率:{estimator.score(X_test,y_test)}")
print(f"模型预测值为:\n{estimator.predict(X_test)}")

分类评估的参数

  • 准确率
    准确率是所有预测正确的样本占总样本的比例

    Accuracy = \frac{TP+TN}{TP+FN+FP+TN}
  • 精准率
    精准率(又称查准率)是指所有被预测为正类的样本中,真正为正类的比例

    Precision = \frac{TP}{TP+FP}
  • 召回率
    召回率(又称查全率)是指所有实际为正类的样本中,被正确预测为正类的比例

    Recall = \frac{TP}{TP+FN}
  • F1-score
    F1 值(F1 Score)是精准率和召回率的调和平均数,综合考虑了精准率和召回率的影响。

     F1 = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}} 
  • roc曲线
    ROC 曲线以假正率(FPR)为横轴、真正率(TPR)为纵轴,常用其曲线下面积(AUC)来衡量不平衡的二分类问题

 import pandas as pd
 import numpy as np
 from sklearn.datasets import load_breast_cancer
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import classification_report, roc_auc_score
 # 1. Load the breast cancer dataset
 data = load_breast_cancer()

 # 2.1 Basic preprocessing: features and target in a single DataFrame
 df = pd.DataFrame(data.data, columns=data.feature_names)
 df["target"] = data.target
 # A row without a label cannot be used for supervised training.
 df = df.dropna(subset=["target"])
 for col in data.feature_names:
     # Actually impute missing feature values with the column mean; previously
     # the message was printed but the NaNs were left in place.
     if df[col].isnull().any():
         print(f"Filling missing values in column: {col}")
         df[col] = df[col].fillna(df[col].mean())

 # 2.2 Separate features and target (by name rather than by column position)
 X = df.drop(columns="target")
 y = df["target"]

 # 2.3 Split the data. A fixed seed makes the run reproducible, and
 # stratifying on y keeps the class ratio the same in both subsets.
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.3, random_state=0, stratify=y
 )
 # Show the first row; a bare `df.head(1)` expression displays nothing here.
 print(df.head(1))

 # 3. Feature engineering: standardization.
 # The scaler is fitted on the training set only and then applied to the test set.
 transfer = StandardScaler()
 X_train = transfer.fit_transform(X_train)
 X_test = transfer.transform(X_test)

 # 4. Machine learning: logistic regression
 estimator = LogisticRegression()
 estimator.fit(X_train, y_train)

 # 5. Model evaluation
 print(f"模型准确率:{estimator.score(X_test, y_test)}")
 y_pred = estimator.predict(X_test)
 print(f"模型预测值为:\n{y_pred}")

 # 5.1 Per-class precision, recall and F1.
 # `labels` and `target_names` are paired positionally: 1 -> 良性, 0 -> 恶性.
 ret = classification_report(
     y_test, y_pred, labels=[1, 0], target_names=["良性", "恶性"]
 )

 # 5.2 ROC AUC needs continuous scores, not hard 0/1 predictions: with hard
 # labels the ROC curve collapses to a single threshold. Column 1 of
 # predict_proba is the predicted probability of class 1.
 y_score = estimator.predict_proba(X_test)[:, 1]
 roc_score = roc_auc_score(y_test, y_score)

 # The report holds precision/recall, so label it as such (not accuracy).
 print(f"精确率、召回率:{ret}")
 print(f"roc_score:{roc_score}")

类别不平衡的处理

先准备类别不平衡的数据

from imblearn.over_sampling import RandomOverSampler,SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from collections import Counter

# 1. Build an imbalanced three-class dataset.
# `weights` requests class proportions of roughly 1% / 5% / 94%.
X, y = make_classification(
    n_samples=5000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=3,
    n_clusters_per_class=1,
    weights=[0.01, 0.05, 0.94],
    random_state=0,
)

# Print the per-class sample counts so the imbalance can be compared with the
# counts printed after each resampling step (previously computed but unused).
counter = Counter(y)
print(counter)

# Scatter plot of the two features, coloured by class label.
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
  • 过采样
    增加训练集中少数类别的样本,使得各类别的样本数量接近
    • 随机过采样(RandomOverSampler)
  # Random over-sampling. The seed is fixed so the resampled set (and the
  # plot) is reproducible, matching the seeded under-sampling example.
  ros = RandomOverSampler(random_state=0)
  X_resampled, y_resampled = ros.fit_resample(X, y)
  # Class counts after resampling
  print(Counter(y_resampled))
  plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c=y_resampled)
  plt.show()

  • SMOTE 过采样(SMOTE)
  # SMOTE over-sampling. The seed is fixed so the synthetic samples (and the
  # plot) are reproducible, matching the seeded under-sampling example.
  smote = SMOTE(random_state=0)
  X_resampled, y_resampled = smote.fit_resample(X, y)
  # Class counts after resampling
  print(Counter(y_resampled))
  plt.scatter(X_resampled[:, 0], X_resampled[:, 1], c=y_resampled)
  plt.show()

  • 欠采样
    减少训练集中多数类别的样本,使得各类别的样本数量接近
    • 随机欠采样(RandomUnderSampler)
  # Random under-sampling with a fixed seed.
  rus = RandomUnderSampler(random_state=0)
  resampled = rus.fit_resample(X, y)
  X_resampled, y_resampled = resampled
  # Report the class counts after resampling.
  class_counts = Counter(y_resampled)
  print(class_counts)
  # Plot the two feature columns, coloured by the resampled labels.
  first_feature = X_resampled[:, 0]
  second_feature = X_resampled[:, 1]
  plt.scatter(first_feature, second_feature, c=y_resampled)
  plt.show()