Introduction to Linear Regression

Linear regression predicts a continuous target variable (the dependent variable) under the assumption that it has a linear relationship with one or more features (the independent variables).

Hypothesis function:
$$y = w_1 x_1 + w_2 x_2 + \cdots + w_n x_n + b$$

  • $y$ is the target variable (dependent variable), i.e., the value we want to predict.
  • $x_1, x_2, \ldots, x_n$ are the feature variables (independent variables), i.e., the inputs.
  • $w_1, w_2, \ldots, w_n$ are the weights (coefficients) and $b$ is the bias (intercept); these are the parameters the model learns.
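
To make the hypothesis concrete, here is a minimal sketch (assuming NumPy; the weights, bias, and input are made-up values) that evaluates it for a single sample:

import numpy as np

# Hypothetical learned parameters for a model with 3 features (made-up values)
w = np.array([0.5, -1.2, 3.0])   # weights w_1..w_3
b = 2.0                          # bias / intercept

x = np.array([1.0, 0.5, 2.0])    # one input sample x_1..x_3
y_hat = np.dot(w, x) + b         # y = w_1*x_1 + w_2*x_2 + w_3*x_3 + b
print(y_hat)                     # 0.5*1.0 + (-1.2)*0.5 + 3.0*2.0 + 2.0 = 7.9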

Loss Function

To find the best linear model, we optimize the model parameters by minimizing a loss function. In linear regression, the most commonly used loss function is the mean squared error (MSE):
$$\mathrm{MSE} = \frac{1}{m} \sum_{i=1}^{m} (y_i - \hat{y}_i)^2$$

  • $m$ is the number of samples.
  • $y_i$ is the true value of the $i$-th sample.
  • $\hat{y}_i$ is the model's prediction for the $i$-th sample.
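
As a quick check of the formula, here is a small sketch (made-up values) that computes the MSE by hand and compares it against sklearn's mean_squared_error:

import numpy as np
from sklearn.metrics import mean_squared_error

y_true = np.array([3.0, 1.5, 4.0])  # true values y_i (made-up)
y_pred = np.array([2.5, 2.0, 3.5])  # predictions y_hat_i (made-up)

mse_manual = np.mean((y_true - y_pred) ** 2)   # (1/m) * sum((y_i - y_hat_i)^2)
print(mse_manual)                              # (0.25 + 0.25 + 0.25) / 3 = 0.25
print(mean_squared_error(y_true, y_pred))      # same result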

Optimizing Linear Regression

Several methods can fit the model parameters; the sklearn examples below all use the California housing dataset, and a from-scratch NumPy sketch of the same three methods follows the list.

  • Gradient descent

    from sklearn.datasets import fetch_california_housing
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import SGDRegressor
    from sklearn.metrics import mean_squared_error

    # 1. Load the dataset
    housing = fetch_california_housing()

    # 2. Prepare the data
    # 2.1 Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.25)

    # 3. Feature engineering
    # 3.1 Standardization
    transfer = StandardScaler()
    X_train = transfer.fit_transform(X_train)
    X_test = transfer.transform(X_test)  # use transform(), not fit_transform(), to reuse the training statistics

    # 4. Machine learning - gradient descent
    estimator = SGDRegressor(max_iter=1000, eta0=0.01)  # eta0 is the initial learning rate
    estimator.fit(X_train, y_train)
    print(f"SGD model bias: {estimator.intercept_}")
    print(f"SGD model coefficients: {estimator.coef_}")

    # 5. Model evaluation
    y_pred = estimator.predict(X_test)
    print(f"SGD model predictions: {y_pred}")
    mse = mean_squared_error(y_test, y_pred)
    print(f"SGD model MSE: {mse}")
  • Normal equation

    from sklearn.datasets import fetch_california_housing
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error

    # 1. Load the dataset
    housing = fetch_california_housing()

    # 2. Prepare the data
    # 2.1 Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.25)

    # 3. Feature engineering
    # 3.1 Standardization
    transfer = StandardScaler()
    X_train = transfer.fit_transform(X_train)
    X_test = transfer.transform(X_test)  # use transform(), not fit_transform(), to reuse the training statistics

    # 4. Machine learning - LinearRegression (closed-form least squares)
    estimator = LinearRegression()
    estimator.fit(X_train, y_train)
    print(f"Model bias: {estimator.intercept_}")
    print(f"Model coefficients: {estimator.coef_}")

    # 5. Model evaluation
    y_pred = estimator.predict(X_test)
    print(f"Model predictions: {y_pred}")
    mse = mean_squared_error(y_test, y_pred)
    print(f"Model MSE: {mse}")
  • Ridge regression

    from sklearn.datasets import fetch_california_housing
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import Ridge, RidgeCV
    from sklearn.metrics import mean_squared_error

    # 1. Load the dataset
    housing = fetch_california_housing()

    # 2. Prepare the data
    # 2.1 Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.25)

    # 3. Feature engineering
    # 3.1 Standardization
    transfer = StandardScaler()
    X_train = transfer.fit_transform(X_train)
    X_test = transfer.transform(X_test)  # use transform(), not fit_transform(), to reuse the training statistics

    # 4. Machine learning - ridge regression, with RidgeCV searching over alpha
    # estimator = Ridge(alpha=1.0)  # alternative: a fixed alpha
    estimator = RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100])
    estimator.fit(X_train, y_train)
    print(f"Ridge model bias: {estimator.intercept_}")
    print(f"Ridge model coefficients: {estimator.coef_}")

    # Inspect the best alpha found by cross-validation
    print(f"Best alpha: {estimator.alpha_}")

    # 5. Model evaluation
    y_pred = estimator.predict(X_test)
    print(f"Ridge model predictions: {y_pred}")
    mse = mean_squared_error(y_test, y_pred)
    print(f"Ridge model MSE: {mse}")
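
To connect these sklearn APIs back to the math, here is a from-scratch NumPy sketch (synthetic made-up data; not sklearn's internal implementation) of the gradient descent update rule, the normal equation, and the ridge closed-form solution:

import numpy as np

# Made-up data: 100 samples, 3 features, generated from known weights plus noise
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X @ np.array([2.0, -1.0, 0.5]) + 3.0 + rng.normal(scale=0.1, size=100)

# Append a column of ones so the bias b is learned as an extra weight
Xb = np.hstack([X, np.ones((100, 1))])
m = len(y)

# Batch gradient descent on the MSE loss: w <- w - eta * (2/m) * X^T (X w - y)
w = np.zeros(4)
eta = 0.1
for _ in range(1000):
    grad = (2 / m) * Xb.T @ (Xb @ w - y)
    w -= eta * grad
print("gradient descent:", w)

# Normal equation: w = (X^T X)^{-1} X^T y, solved as a linear system for stability
w_ne = np.linalg.solve(Xb.T @ Xb, Xb.T @ y)
print("normal equation:", w_ne)

# Ridge closed form: w = (X^T X + alpha * I)^{-1} X^T y
# (for simplicity this also penalizes the bias column, unlike sklearn's Ridge)
alpha = 1.0
w_ridge = np.linalg.solve(Xb.T @ Xb + alpha * np.eye(4), Xb.T @ y)
print("ridge:", w_ridge)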


Saving and Loading a Model

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
import joblib

def save_model():
    # 1. Load the dataset
    housing = fetch_california_housing()
    # 2. Prepare the data
    # 2.1 Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.25)
    # 3. Feature engineering
    # 3.1 Standardization
    transfer = StandardScaler()
    X_train = transfer.fit_transform(X_train)
    X_test = transfer.transform(X_test)  # use transform(), not fit_transform()
    # 4. Machine learning - ridge regression, with RidgeCV searching over alpha
    estimator = RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100])
    estimator.fit(X_train, y_train)
    print(f"Ridge model bias: {estimator.intercept_}")
    print(f"Ridge model coefficients: {estimator.coef_}")
    # Save the trained model to disk
    joblib.dump(estimator, 'ridge_model.pkl')
    # Inspect the best alpha
    print(f"Best alpha: {estimator.alpha_}")
    # 5. Model evaluation
    y_pred = estimator.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Ridge model MSE: {mse}")

def load_model():
    # 1. Load the dataset
    housing = fetch_california_housing()
    # 2. Prepare the data
    # 2.1 Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.25)
    # 3. Feature engineering
    # 3.1 Standardization
    # Note: refitting the scaler on a fresh split means the test features are not scaled
    # with the statistics the saved model was trained on; persisting the scaler together
    # with the model (see the sketch below) avoids this mismatch.
    transfer = StandardScaler()
    X_train = transfer.fit_transform(X_train)
    X_test = transfer.transform(X_test)  # use transform(), not fit_transform()
    # Load the saved model from disk
    estimator = joblib.load('ridge_model.pkl')
    print(f"Ridge model bias: {estimator.intercept_}")
    print(f"Ridge model coefficients: {estimator.coef_}")
    # Inspect the best alpha
    print(f"Best alpha: {estimator.alpha_}")
    # 5. Model evaluation
    y_pred = estimator.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Ridge model predictions: {y_pred}")
    print(f"Ridge model MSE: {mse}")

print("Train and save the model:")
save_model()
print("Load the model:")
load_model()
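
The scaler mismatch flagged in load_model is easiest to avoid by persisting the preprocessing together with the model. Here is a minimal sketch using sklearn's Pipeline (the file name pipeline.pkl is arbitrary):

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
import joblib

housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.25)

# Bundle the scaler and the estimator so they are saved and loaded as one object
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100])),
])
pipe.fit(X_train, y_train)
joblib.dump(pipe, "pipeline.pkl")

# Later: load and predict on raw (unscaled) features; the pipeline applies the
# training-set scaling statistics automatically
loaded = joblib.load("pipeline.pkl")
print(loaded.predict(X_test[:5]))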