331 lines
11 KiB
Python
331 lines
11 KiB
Python
import json
|
|
import os
|
|
from pathlib import Path
|
|
|
|
import joblib
|
|
import matplotlib
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
import pandas as pd
|
|
from sklearn.metrics import confusion_matrix
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
import config
|
|
from core.deep_learning_model import (
|
|
_build_sequence_arrays,
|
|
load_lstm_mlp_bundle,
|
|
)
|
|
from core.model_features import (
|
|
NUMERICAL_OUTLIER_COLUMNS,
|
|
TARGET_COLUMN,
|
|
apply_outlier_bounds,
|
|
engineer_features,
|
|
fit_outlier_bounds,
|
|
make_target_bins,
|
|
normalize_columns,
|
|
prepare_modeling_dataframe,
|
|
)
|
|
from core.preprocessing import get_clean_data
|
|
|
|
|
|
# Configure Matplotlib so CJK labels and the minus sign render correctly.
matplotlib.rcParams.update({
    'font.sans-serif': [
        'Microsoft YaHei',
        'SimHei',
        'Noto Sans CJK SC',
        'Arial Unicode MS',
        'DejaVu Sans',
    ],
    'axes.unicode_minus': False,
})

# Filesystem layout for evaluation artifacts.
BASE_DIR = Path(config.BASE_DIR)
MODELS_DIR = Path(config.MODELS_DIR)
OUTPUT_DIR = BASE_DIR / 'outputs' / 'eval_figures'
PREDICTION_CSV = OUTPUT_DIR / 'lstm_predictions.csv'
SUMMARY_JSON = OUTPUT_DIR / 'evaluation_summary.json'

# Internal model keys mapped to human-readable (Chinese) display names.
MODEL_DISPLAY_NAMES = {
    'lstm_mlp': '时序注意力融合网络',
    'xgboost': 'XGBoost',
    'gradient_boosting': 'GBDT',
    'random_forest': '随机森林',
    'extra_trees': '极端随机树',
    'lightgbm': 'LightGBM',
}
|
|
|
|
|
|
def ensure_output_dir():
    """Make sure the figures output directory exists before anything is written."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
|
|
|
|
def load_metrics():
    """Load persisted per-model metrics, ordered best-first by R².

    Returns:
        dict mapping model key -> metrics dict, sorted so the model with
        the highest 'r2' comes first (missing 'r2' sorts last via -999).

    Raises:
        FileNotFoundError: if the metrics pickle is absent.
    """
    metrics_path = MODELS_DIR / 'model_metrics.pkl'
    if not metrics_path.exists():
        raise FileNotFoundError(f'未找到模型评估文件: {metrics_path}')
    loaded = joblib.load(metrics_path)
    ordered = sorted(loaded.items(), key=lambda entry: entry[1].get('r2', -999), reverse=True)
    return dict(ordered)
|
|
|
|
|
|
def get_test_split():
    """Recreate the stratified train/test split used during model training.

    Stratifies on binned target values so both splits preserve the target
    distribution, and reuses the training seed/size for reproducibility.

    Returns:
        (train_df, test_df) tuple, each with a fresh integer index.
    """
    full_df = normalize_columns(get_clean_data())
    strata = make_target_bins(full_df[TARGET_COLUMN].values)
    train_part, test_part = train_test_split(
        full_df,
        test_size=config.TEST_SIZE,
        random_state=config.RANDOM_STATE,
        stratify=strata,
    )
    return train_part.reset_index(drop=True), test_part.reset_index(drop=True)
|
|
|
|
|
|
def classify_risk(values):
    """Bucket absenteeism-hour values into Chinese risk-level labels.

    <4 hours -> '低风险', 4 to 8 hours (inclusive) -> '中风险',
    everything else (including NaN) -> '高风险'.
    """
    hours = np.asarray(values, dtype=float)
    is_low = hours < 4
    # NaN fails both comparisons, so it falls through to '高风险' as before.
    is_mid = ~is_low & (hours <= 8)
    return np.where(is_low, '低风险', np.where(is_mid, '中风险', '高风险'))
|
|
|
|
|
|
def load_lstm_predictions():
    """Generate test-set predictions from the persisted LSTM+MLP bundle.

    Rebuilds the same train/test split used during training, applies the
    train-fitted preprocessing (feature engineering + outlier clipping),
    standardizes inputs with the bundle's stored statistics, runs the model,
    and writes a per-sample prediction CSV to PREDICTION_CSV.

    Returns:
        pd.DataFrame with true values, predictions, residuals and risk labels.

    Raises:
        FileNotFoundError: if the model file is missing.
        RuntimeError: if the bundle fails to load (e.g. torch unavailable).
    """
    model_path = MODELS_DIR / 'lstm_mlp_model.pt'
    if not model_path.exists():
        raise FileNotFoundError(f'未找到深度学习模型文件: {model_path}')

    bundle = load_lstm_mlp_bundle(str(model_path))
    if bundle is None:
        raise RuntimeError('无法加载深度学习模型,请确认 torch 环境和模型文件正常。')

    raw_train_df, raw_test_df = get_test_split()
    fit_df = prepare_modeling_dataframe(raw_train_df)
    test_df = prepare_modeling_dataframe(raw_test_df)
    # Outlier bounds are fitted on the training split only, then applied to
    # both splits — avoids leaking test-set statistics into preprocessing.
    outlier_bounds = fit_outlier_bounds(fit_df, NUMERICAL_OUTLIER_COLUMNS)
    fit_df = apply_outlier_bounds(fit_df, outlier_bounds)
    test_df = apply_outlier_bounds(test_df, outlier_bounds)

    feature_layout = bundle['feature_layout']
    category_maps = bundle['category_maps']
    target_transform = bundle['target_transform']

    # NOTE(review): all return values discarded — presumably kept for side
    # effects inside _build_sequence_arrays (e.g. on category_maps); confirm
    # before removing, otherwise this is dead work.
    _, _, _, _, _ = _build_sequence_arrays(
        fit_df,
        feature_layout,
        category_maps,
        target_transform,
    )
    test_seq_num, test_seq_cat, test_static_num, test_static_cat, y_test = _build_sequence_arrays(
        test_df,
        feature_layout,
        category_maps,
        target_transform,
    )

    # Standardize numeric inputs with the mean/std captured at training time.
    test_seq_num = ((test_seq_num - bundle['seq_mean']) / bundle['seq_std']).astype(np.float32)
    test_static_num = ((test_static_num - bundle['static_mean']) / bundle['static_std']).astype(np.float32)

    # Imported lazily so the rest of the module works without torch installed.
    import torch

    model = bundle['model']
    model.eval()
    with torch.no_grad():
        predictions = model(
            torch.tensor(test_seq_num, dtype=torch.float32),
            torch.tensor(test_seq_cat, dtype=torch.long),
            torch.tensor(test_static_num, dtype=torch.float32),
            torch.tensor(test_static_cat, dtype=torch.long),
        ).cpu().numpy()

    # Undo the log1p target transform when the model was trained on log targets.
    if target_transform == 'log1p':
        y_true = np.expm1(y_test)
        y_pred = np.expm1(predictions)
    else:
        y_true = y_test
        y_pred = predictions

    y_true = np.asarray(y_true, dtype=float)
    # Absence hours cannot be negative; clip predictions at zero.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), a_min=0.0, a_max=None)
    residuals = y_pred - y_true

    prediction_df = pd.DataFrame({
        '真实值': np.round(y_true, 4),
        '预测值': np.round(y_pred, 4),
        '残差': np.round(residuals, 4),
        '真实风险等级': classify_risk(y_true),
        '预测风险等级': classify_risk(y_pred),
    })
    # utf-8-sig keeps the Chinese headers readable when opened in Excel.
    prediction_df.to_csv(PREDICTION_CSV, index=False, encoding='utf-8-sig')
    return prediction_df
|
|
|
|
|
|
def plot_model_comparison(metrics):
    """Render side-by-side bar charts comparing R², RMSE and MAE across models.

    Args:
        metrics: mapping of model key -> metrics dict containing 'r2',
            'rmse' and 'mae' (as produced by load_metrics()).
    """
    model_names = [MODEL_DISPLAY_NAMES.get(name, name) for name in metrics]
    # Highlight the deep-learning model; grey out the classical baselines.
    bar_colors = ['#0f766e' if name == 'lstm_mlp' else '#94a3b8' for name in metrics]

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    # One subplot per metric instead of three copy-pasted chart sections.
    for ax, metric_key, label in zip(axes, ('r2', 'rmse', 'mae'), ('R2', 'RMSE', 'MAE')):
        values = [metrics[name][metric_key] for name in metrics]
        ax.bar(model_names, values, color=bar_colors)
        ax.set_title(f'模型{label}对比')
        ax.set_ylabel(label)
        ax.tick_params(axis='x', rotation=20)

    fig.tight_layout()
    fig.savefig(OUTPUT_DIR / '01_模型性能对比.png', dpi=220, bbox_inches='tight')
    plt.close(fig)
|
|
|
|
|
|
def plot_actual_vs_pred(prediction_df):
    """Scatter true vs. predicted absence hours with a y=x reference line."""
    actual = prediction_df['真实值'].to_numpy()
    predicted = prediction_df['预测值'].to_numpy()
    axis_limit = max(float(actual.max()), float(predicted.max()))

    fig, ax = plt.subplots(figsize=(7, 7))
    ax.scatter(actual, predicted, s=18, alpha=0.55, color='#0f766e', edgecolors='none')
    # Ideal-prediction diagonal: points on this line were predicted exactly.
    ax.plot([0, axis_limit], [0, axis_limit], color='#dc2626', linestyle='--', linewidth=1.5)
    ax.set_title('LSTM模型真实值与预测值对比')
    ax.set_xlabel('真实缺勤时长(小时)')
    ax.set_ylabel('预测缺勤时长(小时)')
    fig.tight_layout()
    fig.savefig(OUTPUT_DIR / '02_LSTM真实值_vs_预测值.png', dpi=220, bbox_inches='tight')
    plt.close(fig)
|
|
|
|
|
|
def plot_residuals(prediction_df):
    """Plot the residual histogram and residual-vs-prediction scatter side by side."""
    predicted = prediction_df['预测值'].to_numpy()
    errors = prediction_df['残差'].to_numpy()

    fig, (hist_ax, scatter_ax) = plt.subplots(1, 2, figsize=(14, 5))

    hist_ax.hist(errors, bins=30, color='#2563eb', alpha=0.85, edgecolor='white')
    hist_ax.axvline(0, color='#dc2626', linestyle='--', linewidth=1.2)
    hist_ax.set_title('LSTM残差分布')
    hist_ax.set_xlabel('残差(预测值 - 真实值)')
    hist_ax.set_ylabel('样本数')

    scatter_ax.scatter(predicted, errors, s=18, alpha=0.55, color='#7c3aed', edgecolors='none')
    scatter_ax.axhline(0, color='#dc2626', linestyle='--', linewidth=1.2)
    scatter_ax.set_title('LSTM残差散点图')
    scatter_ax.set_xlabel('预测缺勤时长(小时)')
    scatter_ax.set_ylabel('残差')

    fig.tight_layout()
    fig.savefig(OUTPUT_DIR / '03_LSTM残差分析.png', dpi=220, bbox_inches='tight')
    plt.close(fig)
|
|
|
|
|
|
def plot_confusion_matrix(prediction_df):
    """Draw the 3-class risk-level confusion matrix for the LSTM predictions."""
    labels = ['低风险', '中风险', '高风险']
    cm = confusion_matrix(
        prediction_df['真实风险等级'],
        prediction_df['预测风险等级'],
        labels=labels,
    )

    fig, ax = plt.subplots(figsize=(6, 5))
    image = ax.imshow(cm, cmap='GnBu')
    ax.set_title('LSTM风险等级混淆矩阵')
    ax.set_xlabel('预测类别')
    ax.set_ylabel('真实类别')
    tick_positions = range(len(labels))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(labels)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(labels)

    # Annotate every cell with its raw count for readability.
    for row, col in np.ndindex(cm.shape):
        ax.text(col, row, int(cm[row, col]), ha='center', va='center', color='#111827')

    fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
    fig.tight_layout()
    fig.savefig(OUTPUT_DIR / '04_LSTM风险等级混淆矩阵.png', dpi=220, bbox_inches='tight')
    plt.close(fig)
|
|
|
|
|
|
def plot_feature_importance():
    """Plot the top-15 feature importances from the first available tree model.

    Tries xgboost, then random forest, then extra trees, using the first model
    file that exists and exposes feature_importances_.

    Returns:
        The key of the model actually plotted, or None if no candidate model
        file was found.
    """
    candidate_files = [
        ('xgboost', MODELS_DIR / 'xgboost_model.pkl'),
        ('random_forest', MODELS_DIR / 'random_forest_model.pkl'),
        ('extra_trees', MODELS_DIR / 'extra_trees_model.pkl'),
    ]
    selected_features_path = MODELS_DIR / 'selected_features.pkl'
    feature_names_path = MODELS_DIR / 'feature_names.pkl'
    selected_features = joblib.load(selected_features_path) if selected_features_path.exists() else None
    feature_names = joblib.load(feature_names_path) if feature_names_path.exists() else None

    for model_name, model_path in candidate_files:
        if not model_path.exists():
            continue
        model = joblib.load(model_path)
        if not hasattr(model, 'feature_importances_'):
            continue
        importances = model.feature_importances_

        # Prefer the persisted feature lists when their length matches. Use
        # explicit `is not None` + length checks instead of `a or b`: truthiness
        # raises "ambiguous truth value" if joblib returned a numpy array, and
        # the original applied the length-mismatch fallback only after the fact.
        if selected_features is not None and len(selected_features) == len(importances):
            names = list(selected_features)
        elif feature_names is not None and len(feature_names) == len(importances):
            names = list(feature_names)
        else:
            names = [f'feature_{idx}' for idx in range(len(importances))]

        top_items = sorted(zip(names, importances), key=lambda item: item[1], reverse=True)[:15]
        # barh draws bottom-up; reverse so the most important feature lands on top.
        top_items.reverse()

        fig, ax = plt.subplots(figsize=(8, 6))
        ax.barh(
            [config.FEATURE_NAME_CN.get(name, name) for name, _ in top_items],
            [float(value) for _, value in top_items],
            color='#0f766e',
        )
        ax.set_title(f'{MODEL_DISPLAY_NAMES.get(model_name, model_name)}特征重要性 Top15')
        ax.set_xlabel('重要性')
        fig.tight_layout()
        fig.savefig(OUTPUT_DIR / '05_特征重要性_Top15.png', dpi=220, bbox_inches='tight')
        plt.close(fig)
        return model_name

    return None
|
|
|
|
|
|
def save_summary(metrics, prediction_df, feature_model_name):
    """Write a JSON summary of model metrics, LSTM residual stats and outputs.

    Args:
        metrics: best-first ordered mapping of model key -> metrics dict.
        prediction_df: per-sample LSTM prediction table.
        feature_model_name: key of the model used for the importance plot,
            or None if none was available.
    """
    residuals = prediction_df['残差'].to_numpy()
    risk_hits = prediction_df['真实风险等级'] == prediction_df['预测风险等级']
    generated = sorted(item.name for item in OUTPUT_DIR.iterdir() if item.is_file())

    summary = {
        # metrics is ordered best-first, so the first key is the best model.
        'best_model': next(iter(metrics.keys())),
        'metrics': metrics,
        'lstm_prediction_summary': {
            'prediction_count': int(len(prediction_df)),
            'residual_mean': round(float(residuals.mean()), 4),
            'residual_std': round(float(residuals.std()), 4),
            'risk_accuracy': round(float(risk_hits.mean()), 4),
        },
        'feature_importance_model': feature_model_name,
        'generated_files': generated,
    }
    SUMMARY_JSON.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding='utf-8')
|
|
|
|
|
|
def main():
    """Run the full evaluation pipeline: load artifacts, draw figures, save summary."""
    ensure_output_dir()
    metrics = load_metrics()
    predictions = load_lstm_predictions()

    plot_model_comparison(metrics)
    plot_actual_vs_pred(predictions)
    plot_residuals(predictions)
    plot_confusion_matrix(predictions)
    importance_model = plot_feature_importance()
    save_summary(metrics, predictions, importance_model)

    for label, target in (
        ('评估图片已生成', OUTPUT_DIR),
        ('LSTM预测明细', PREDICTION_CSV),
        ('评估摘要', SUMMARY_JSON),
    ):
        print(f'{label}: {target}')
|
|
|
|
|
|
# Script entry point: generate all evaluation figures and the summary JSON.
if __name__ == '__main__':
    main()
|