update decisiontree regression problem details

This commit is contained in:
2025-01-24 18:17:26 +08:00
63 changed files with 2536 additions and 260 deletions

View File

@@ -106,6 +106,7 @@ print(f"转换后的数据:\n{new_data}")
### 回归决策树
#### 决策树算法的应用 (泰坦尼克号沉船幸存者预测)
```python
import seaborn as sns
@@ -168,6 +169,9 @@ graph.view(output_path) # 打开图像path为保存路径不需要加后
```
[Webgraphviz](http://webgraphviz.com/),这个网站可以将`tree.dot`文件的内容生成对应的可视化树
```python
import numpy as np

View File

@@ -1,6 +1,7 @@
---
title: k近邻算法(K-Nearest Neighbors,KNN)
tags: machinelearning
tags: KNN
categories: machinelearning
abbrlink: 29139
mathjax: true
date: 2025-01-13 17:20:59

View File

@@ -0,0 +1,200 @@
---
title: 线性回归
tags: linear-regression
categories: machinelearning
mathjax: true
abbrlink: 52662
date: 2025-01-19 16:46:51
---
### 线性回归简介
>用于预测一个连续的目标变量(因变量),与一个或多个特征(自变量)之间存在线性关系。
假设函数:
$$y = w_1x_1 + w_2x_2 + \cdots + w_nx_n$$
- $y$ 是目标变量(因变量),即我们希望预测的值。
- $x_1, x_2, \dots, x_n$ 是特征变量(自变量),即输入的值。
### 损失函数
为了找到最佳的线性模型,我们需要通过最小化损失函数来优化模型参数。在线性回归中,常用的损失函数是 **均方误差(MSE)**:
$$MSE = \frac{1}{m} \sum_{i=1}^{m} (y_i - \hat{y}_i)^2$$
- $m$ 是样本的数量。
- $y_i$ 是第 i 个样本的真实值。
- $\hat{y}_i$ 是模型预测的第 i 个样本的值。
### 线性回归优化
- 梯度下降法
```python
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

# 1. Load the California housing dataset (downloaded on first use).
housing = fetch_california_housing()
# 2. Split into train / test sets (25% held out for evaluation).
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.25)
# 3. Feature engineering: standardize features.
# Fit the scaler on the training data only, then reuse it on the test data
# so the test set is scaled with training statistics (no data leakage).
transfer = StandardScaler()
X_train = transfer.fit_transform(X_train)
X_test = transfer.transform(X_test)
# 4. Train a linear model with stochastic gradient descent.
# ("estimater" was a typo in the original; renamed to the conventional "estimator".)
estimator = SGDRegressor(max_iter=1000, eta0=0.01)
estimator.fit(X_train, y_train)
print(f"SGD模型的偏置是{estimator.intercept_}")
print(f"SGD模型的系数是{estimator.coef_}")
# 5. Evaluate on the held-out test set with mean squared error.
y_pred = estimator.predict(X_test)
print(f"SGD模型预测值{y_pred}")
mse = mean_squared_error(y_test, y_pred)
print(f"SGD模型均方误差:{mse}")
```
- 正规方程
```python
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 1. Load the California housing dataset.
housing = fetch_california_housing()
# 2. Split into train / test sets (25% held out).
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.25)
# 3. Standardize features.
transfer = StandardScaler()
X_train = transfer.fit_transform(X_train)
# BUG FIX: the original called fit_transform() on the test set, which re-fits
# the scaler with test-set statistics (data leakage). Reuse the training fit.
X_test = transfer.transform(X_test)
# 4. Ordinary least squares (normal-equation) linear regression.
estimator = LinearRegression()
estimator.fit(X_train, y_train)
print(f"模型的偏置是:{estimator.intercept_}")
print(f"模型的系数是:{estimator.coef_}")
# 5. Evaluate with mean squared error.
y_pred = estimator.predict(X_test)
print(f"模型预测值:{y_pred}")
mse = mean_squared_error(y_test, y_pred)
print(f"模型均方误差:{mse}")
```
- 岭回归
```python
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error

# 1. Load the California housing dataset.
housing = fetch_california_housing()
# 2. Split into train / test sets (25% held out).
X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.25)
# 3. Standardize; fit on train only and reuse for test (avoids data leakage).
transfer = StandardScaler()
X_train = transfer.fit_transform(X_train)
X_test = transfer.transform(X_test)
# 4. Ridge regression with a built-in grid search over alpha.
# Fixed-alpha alternative: estimator = Ridge(alpha=1.0)
estimator = RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100])
estimator.fit(X_train, y_train)
print(f"Ridge模型的偏置是{estimator.intercept_}")
print(f"Ridge模型的系数是{estimator.coef_}")
# Report the alpha selected by cross-validation.
print(f"最佳 alpha 值是:{estimator.alpha_}")
# 5. Evaluate with mean squared error.
y_pred = estimator.predict(X_test)
print(f"Ridge模型预测值{y_pred}")
mse = mean_squared_error(y_test, y_pred)
print(f"Ridge模型均方误差:{mse}")
```
![](/img/machinelearning/linear.png)
![](/img/machinelearning/fitting.png)
### 模型保存和加载
```python
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
import joblib


def save_model():
    """Train a RidgeCV model on the California housing data and save it to disk."""
    # 1. Load the dataset.
    housing = fetch_california_housing()
    # 2. Train / test split (25% held out).
    X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.25)
    # 3. Standardize; fit on train only, reuse for test (no data leakage).
    transfer = StandardScaler()
    X_train = transfer.fit_transform(X_train)
    X_test = transfer.transform(X_test)
    # 4. Ridge regression with a built-in alpha search.
    estimator = RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100])
    estimator.fit(X_train, y_train)
    print(f"Ridge模型的偏置是{estimator.intercept_}")
    print(f"Ridge模型的系数是{estimator.coef_}")
    # Persist the fitted model to disk.
    joblib.dump(estimator, 'ridge_model.pkl')
    # Report the alpha selected by cross-validation.
    print(f"最佳 alpha 值是:{estimator.alpha_}")
    # 5. Evaluate with mean squared error.
    y_pred = estimator.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Ridge模型均方误差:{mse}")


def load_model():
    """Load the saved Ridge model and evaluate it on a fresh split."""
    # NOTE(review): this split differs from the one used during training, so
    # the saved model may have seen part of this "test" set — confirm intent.
    housing = fetch_california_housing()
    X_train, X_test, y_train, y_test = train_test_split(housing.data, housing.target, test_size=0.25)
    # Standardize the new split.
    transfer = StandardScaler()
    X_train = transfer.fit_transform(X_train)
    X_test = transfer.transform(X_test)
    # Load the persisted model.
    estimator = joblib.load('ridge_model.pkl')
    print(f"Ridge模型的偏置是{estimator.intercept_}")
    print(f"Ridge模型的系数是{estimator.coef_}")
    print(f"最佳 alpha 值是:{estimator.alpha_}")
    # 5. Evaluate with mean squared error.
    y_pred = estimator.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"Ridge模型预测值{y_pred}")
    print(f"Ridge模型均方误差:{mse}")


print("训练并保存模型:")
save_model()
print("加载模型")
load_model()
```

View File

@@ -0,0 +1,173 @@
---
title: 逻辑回归
tags: logistic-regression
categories: machinelearning
mathjax: true
abbrlink: 60504
date: 2025-01-20 15:30:08
---
### logistic regression code
```python
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# 1. Load the breast-cancer dataset (bundled with sklearn, no download needed).
data = load_breast_cancer()
# 2.1 Wrap in a DataFrame and append the target column.
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
for i in df.columns:
    # BUG FIX: the original only printed the message without actually filling.
    # Impute with the column mean so the model never sees NaNs.
    if np.any(pd.isnull(df[i])):
        print(f"Filling missing values in column: {i}")
        df[i] = df[i].fillna(df[i].mean())
# 2.2 Split features / target (all columns except the last are features).
X = df.iloc[:, 0:df.shape[1] - 1]
y = df.loc[:, "target"]
# 2.3 Train / test split (30% held out).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# 3. Standardize; fit on train only, reuse on test (no data leakage).
transfer = StandardScaler()
X_train = transfer.fit_transform(X_train)
X_test = transfer.transform(X_test)
# 4. Train logistic regression.
estimator = LogisticRegression()
estimator.fit(X_train, y_train)
# 5. Evaluate on the held-out test set.
print(f"模型准确率:{estimator.score(X_test,y_test)}")
print(f"模型预测值为:\n{estimator.predict(X_test)}")
```
### 分类评估的参数
- 准确率
准确率是所有预测正确的样本占总样本的比例
$$Accuracy = \frac{TP+TN}{TP+FN+FP+TN}$$
- 精准率
精准率(又称查准率)是指所有被预测为正类的样本中,真正为正类的比例
$$Precision = \frac{TP}{TP+FP}$$
- 召回率
召回率(又称查全率)是指所有实际为正类的样本中,被正确预测为正类的比例
$$Recall = \frac{TP}{TP+FN}$$
- F1-score
F1 值(F1 Score)是精准率和召回率的调和平均数,综合考虑了精准率和召回率的影响。
$$ F1 = 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}} $$
- roc曲线
通过 TPR(真正例率)和 FPR(假正例率)来衡量不平衡的二分类问题
```python
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# 1. Load the breast-cancer dataset (bundled with sklearn).
data = load_breast_cancer()
# 2.1 Wrap in a DataFrame and append the target column.
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
for i in df.columns:
    # Report columns containing missing values (this bundled dataset has none).
    if np.any(pd.isnull(df[i])):
        print(f"Filling missing values in column: {i}")
# 2.2 Split features / target.
X = df.iloc[:, 0:df.shape[1] - 1]
y = df.loc[:, "target"]
# 2.3 Train / test split (30% held out).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# 3. Standardize; fit on train only, reuse on test (no data leakage).
transfer = StandardScaler()
X_train = transfer.fit_transform(X_train)
X_test = transfer.transform(X_test)
# 4. Train logistic regression.
estimator = LogisticRegression()
estimator.fit(X_train, y_train)
# 5. Evaluate.
print(f"模型准确率:{estimator.score(X_test, y_test)}")
y_pred = estimator.predict(X_test)
print(f"模型预测值为:\n{y_pred}")
# 5.1 Precision / recall report and ROC-AUC.
ret = classification_report(y_test, y_pred, labels=[1, 0], target_names=["良性", "恶性"])
# BUG FIX: roc_auc_score expects a continuous score (probability of the
# positive class), not hard 0/1 predictions; feeding y_pred understates AUC.
y_score = estimator.predict_proba(X_test)[:, 1]
roc_score = roc_auc_score(y_test, y_score)
print(f"准确率、召回率:{ret}")
print(f"roc_score:{roc_score}")
```
### 类别不平衡的处理
先准备类别不平衡的数据
```python
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification
import matplotlib.pyplot as plt
from collections import Counter

# 1. Build a deliberately imbalanced 3-class dataset (1% / 5% / 94%).
X, y = make_classification(
    n_samples=5000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=3,
    n_clusters_per_class=1,
    weights=[0.01, 0.05, 0.94],
    random_state=0,
)
# FIX: the per-class counts were computed but never shown; print them so the
# imbalance is visible before resampling (mirrors the later resampled prints).
counter = Counter(y)
print(counter)
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
```
- 过采样
增加训练集的少数的类别的样本,使得正反例样本数据接近
- 随机过采样(RandomOverSampler)
```python
# Randomly duplicate minority-class samples until all classes are balanced.
over_sampler = RandomOverSampler()
X_over, y_over = over_sampler.fit_resample(X, y)
print(Counter(y_over))
plt.scatter(X_over[:, 0], X_over[:, 1], c=y_over)
plt.show()
```
![](/img/machinelearning/over_random_sampling.png)
- `SMOTE` 过采样(Synthetic Minority Over-sampling Technique)
```python
# Synthesize new minority-class points by interpolating between neighbors.
smote_sampler = SMOTE()
X_smote, y_smote = smote_sampler.fit_resample(X, y)
print(Counter(y_smote))
plt.scatter(X_smote[:, 0], X_smote[:, 1], c=y_smote)
plt.show()
```
![](/img/machinelearning/over_smote_sampling.png)
- 欠采样
减少训练集的多数的类别的样本,使得正反例样本数据接近
- 随机欠采样(RandomUnderSampler)
```python
# Randomly drop majority-class samples until all classes are balanced.
under_sampler = RandomUnderSampler(random_state=0)
X_under, y_under = under_sampler.fit_resample(X, y)
print(Counter(y_under))
plt.scatter(X_under[:, 0], X_under[:, 1], c=y_under)
plt.show()
```
![](/img/machinelearning/under_sampling.png)