feat: 添加 JD-R 理论分析模块与 SHAP 可解释性分析功能
- 后端新增 JD-R(工作要求-资源)理论维度数据生成,包含工作要求、工作资源、
个人资源、中介变量共 16 个新特征列
- 新增 JD-R 分析服务与 API(维度统计、倦怠投入分析、双路径中介分析、
分组轮廓、风险分布)
- 新增 SHAP 可解释性分析模块(全局重要性、局部解释、特征交互、依赖图)
- 预测服务增加风险分类模型加载与概率预测能力
- 前端新增 JD-R 分析页面(JDRAnalysis.vue),含雷达图、散点图、路径分析等可视化
- 预测页面增加风险概率展示与 SHAP 特征解释
- 路由与导航菜单同步更新
This commit is contained in:
@@ -387,16 +387,181 @@ def generate_dataset(output_path=None, sample_count=12000, random_state=None):
|
||||
return df
|
||||
|
||||
|
||||
def ensure_dataset():
|
||||
if not os.path.exists(config.RAW_DATA_PATH):
|
||||
generate_dataset(config.RAW_DATA_PATH)
|
||||
return
|
||||
def enrich_with_jdr_columns(df):
    """Append the JD-R (Job Demands-Resources) theory columns to a dataset.

    Sixteen synthetic columns are derived from the employee/event attributes
    already present in ``df``:

    - Job demands: 工作自主性, 情绪劳动强度, 时间压力感知, 角色模糊度, 工作家庭冲突
    - Job resources: 上级支持, 同事支持, 技能多样性, 职业发展机会, 参与决策, 组织公平感
    - Personal resources: 自我效能感, 心理韧性, 乐观程度
    - Mediators: 工作倦怠, 工作投入

    A ``_jdr_version`` column holding ``config.JDR_DATA_VERSION`` is appended
    last. The input frame is left untouched; an enriched copy is returned.
    """
    generator = np.random.default_rng(config.RANDOM_STATE + 100)
    enriched = df.copy()
    row_count = len(enriched)

    def scored(center, spread=0.8, floor=1.0, ceiling=5.0):
        # One noisy score per row around `center`, clipped to the scale bounds
        # and kept to a single decimal.
        noisy = generator.normal(center, spread, size=row_count)
        return np.clip(noisy, floor, ceiling).round(1)

    def member_of(values, *choices):
        # 0/1 indicator array: 1 where the value is one of `choices`.
        return np.isin(values, list(choices)).astype(int)

    # -- Source attributes, pulled out once as plain arrays --
    overtime_hours = enriched['月均加班时长'].values
    commute_minutes = enriched['通勤时长分钟'].values
    night_shift = enriched['是否夜班岗位'].values
    child_count = enriched['子女数量'].values
    is_married = (enriched['婚姻状态'] == '已婚').astype(int).values
    tenure_years = enriched['司龄年数'].values
    team_headcount = enriched['团队人数'].values
    span_of_control = enriched['直属上级管理跨度'].values
    weekly_exercise = enriched['每周运动频次'].values
    sleep_hours = enriched['近30天睡眠时长均值'].values
    has_chronic = enriched['是否慢性病史'].values
    rated_a = (enriched['绩效等级'] == 'A').astype(int).values
    rated_a_or_b = enriched['绩效等级'].isin(['A', 'B']).astype(int).values
    # Unknown ordinal categories fall back to a fixed rank (the fillna value).
    job_level = enriched['岗位级别'].map(
        {'初级': 0, '中级': 1, '高级': 2, '主管': 3, '经理及以上': 4}
    ).fillna(1).values
    industry = enriched['所属行业'].values
    contract_type = enriched['用工类型'].values
    job_track = enriched['岗位序列'].values
    company_size = enriched['企业规模'].map(
        {'100人以下': 0, '100-499人': 1, '500-999人': 2, '1000-4999人': 3, '5000人及以上': 4}
    ).fillna(1).values
    is_permanent = (enriched['用工类型'] == '正式员工').astype(int).values
    education = enriched['最高学历'].map(
        {'中专及以下': 0, '大专': 1, '本科': 2, '硕士': 3, '博士': 4}
    ).fillna(2).values

    # Indicators reused by several columns.
    in_tech_or_finance = member_of(industry, '互联网', '金融服务')
    in_mgmt_or_technical = member_of(job_track, '管理', '专业技术')

    # Columns are collected in insertion order; the order fixes both the
    # sequence of random draws and the column order of the returned frame.
    jdr = {}

    # -- Job demands (5 columns) --
    jdr['工作自主性'] = scored(
        3.2 + job_level * 0.25 + in_tech_or_finance * 0.3 - night_shift * 0.4
    )
    jdr['情绪劳动强度'] = scored(
        2.8
        + member_of(job_track, '客服坐席', '销售业务') * 0.6
        + member_of(industry, '医药健康', '零售连锁') * 0.3
    )
    jdr['时间压力感知'] = scored(
        3.0 + overtime_hours * 0.02 + commute_minutes * 0.01 + in_tech_or_finance * 0.2
    )
    jdr['角色模糊度'] = scored(
        2.5 + member_of(contract_type, '劳务派遣', '外包驻场') * 0.5 - tenure_years * 0.05
    )
    jdr['工作家庭冲突'] = scored(
        2.6 + overtime_hours * 0.02 + child_count * 0.3 + is_married * 0.3
    )

    # -- Job resources (6 columns) --
    jdr['上级支持'] = scored(3.4 - span_of_control * 0.02 + job_level * 0.2)
    jdr['同事支持'] = scored(3.3 + team_headcount * 0.02 + in_mgmt_or_technical * 0.2)
    jdr['技能多样性'] = scored(
        3.0 + in_mgmt_or_technical * 0.5 - member_of(job_track, '生产操作') * 0.3
    )
    jdr['职业发展机会'] = scored(3.1 + in_tech_or_finance * 0.4 + company_size * 0.1)
    jdr['参与决策'] = scored(2.8 + job_level * 0.35)
    jdr['组织公平感'] = scored(3.3 + is_permanent * 0.4 + rated_a_or_b * 0.3)

    # -- Personal resources (3 columns) --
    jdr['自我效能感'] = scored(
        3.3 + rated_a * 0.4 + rated_a_or_b * 0.2 + tenure_years * 0.03 + education * 0.08
    )
    jdr['心理韧性'] = scored(
        3.2 + weekly_exercise * 0.1 + sleep_hours * 0.15 + tenure_years * 0.02
    )
    jdr['乐观程度'] = scored(
        3.3 + rated_a_or_b * 0.3 - has_chronic * 0.3 + is_married * 0.15
    )

    # -- Mediators (2 columns, 1-7 scale) --
    # Burnout (health-impairment path): demands push it up, autonomy,
    # supervisor support and personal resources pull it down.
    jdr['工作倦怠'] = np.clip(
        generator.normal(3.0, 0.8, size=row_count)
        + overtime_hours * 0.015 + night_shift * 0.3 + commute_minutes * 0.008
        + jdr['情绪劳动强度'] * 0.25
        + jdr['时间压力感知'] * 0.25
        + jdr['工作家庭冲突'] * 0.2
        + jdr['角色模糊度'] * 0.15
        - jdr['工作自主性'] * 0.2
        - jdr['上级支持'] * 0.15
        - jdr['自我效能感'] * 0.2
        - jdr['心理韧性'] * 0.15,
        1.0, 7.0,
    ).round(1)

    # Engagement (motivational path): resources push it up, burnout pulls
    # it down.
    jdr['工作投入'] = np.clip(
        generator.normal(3.5, 0.8, size=row_count)
        + jdr['工作自主性'] * 0.2
        + jdr['上级支持'] * 0.2
        + jdr['同事支持'] * 0.15
        + jdr['技能多样性'] * 0.15
        + jdr['职业发展机会'] * 0.15
        + jdr['参与决策'] * 0.1
        + jdr['组织公平感'] * 0.1
        + jdr['自我效能感'] * 0.2
        + jdr['心理韧性'] * 0.15
        + jdr['乐观程度'] * 0.15
        - jdr['工作倦怠'] * 0.2,
        1.0, 7.0,
    ).round(1)

    for column, values in jdr.items():
        enriched[column] = values

    # Marker recording which JD-R data version produced these columns.
    enriched['_jdr_version'] = config.JDR_DATA_VERSION

    return enriched
|
||||
|
||||
|
||||
def ensure_dataset():
    """Make sure the raw dataset exists, validates, and carries the JD-R columns.

    The CSV at ``config.RAW_DATA_PATH`` is regenerated when it is missing or
    when reading/validating it fails. If any of the sentinel JD-R columns is
    absent afterwards, the data is enriched with ``enrich_with_jdr_columns``
    and written back to the same path.

    Returns:
        None. The function works through its side effect on the CSV file.
    """
    raw_path = config.RAW_DATA_PATH

    df = None
    if os.path.exists(raw_path):
        try:
            df = pd.read_csv(raw_path)
            validate_dataset(df)
        except Exception:
            # Best-effort: an unreadable or invalid file is rebuilt from
            # scratch rather than reported.
            df = None

    if df is None:
        generate_dataset(raw_path)
        df = pd.read_csv(raw_path)

    # A few sentinel columns are enough to tell whether enrichment already ran.
    jdr_columns = ['工作自主性', '上级支持', '自我效能感', '工作倦怠', '工作投入']
    if all(col in df.columns for col in jdr_columns):
        return

    df = enrich_with_jdr_columns(df)
    parent_dir = os.path.dirname(raw_path)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(raw_path, index=False, encoding='utf-8-sig')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -35,6 +35,11 @@ NUMERICAL_OUTLIER_COLUMNS = [
|
||||
'BMI',
|
||||
'近30天睡眠时长均值',
|
||||
'每周运动频次',
|
||||
# JD-R 维度列
|
||||
'工作自主性', '情绪劳动强度', '时间压力感知', '角色模糊度', '工作家庭冲突',
|
||||
'上级支持', '同事支持', '技能多样性', '职业发展机会', '参与决策', '组织公平感',
|
||||
'自我效能感', '心理韧性', '乐观程度',
|
||||
'工作倦怠', '工作投入',
|
||||
]
|
||||
DEFAULT_PREDICTION_INPUT = {
|
||||
'industry': '制造业',
|
||||
@@ -82,6 +87,26 @@ DEFAULT_PREDICTION_INPUT = {
|
||||
'urgent_leave_flag': 1,
|
||||
'continuous_absence_flag': 0,
|
||||
'previous_day_overtime_flag': 1,
|
||||
# JD-R 工作要求维度
|
||||
'work_autonomy': 3.0,
|
||||
'emotional_labor': 3.0,
|
||||
'time_pressure': 3.0,
|
||||
'role_ambiguity': 3.0,
|
||||
'work_family_conflict': 3.0,
|
||||
# JD-R 工作资源维度
|
||||
'supervisor_support': 3.0,
|
||||
'coworker_support': 3.0,
|
||||
'skill_variety': 3.0,
|
||||
'career_development': 3.0,
|
||||
'decision_participation': 3.0,
|
||||
'organizational_justice': 3.0,
|
||||
# JD-R 个人资源维度
|
||||
'self_efficacy': 3.0,
|
||||
'resilience': 3.0,
|
||||
'optimism': 3.0,
|
||||
# JD-R 中介变量
|
||||
'burnout': 3.5,
|
||||
'work_engagement': 3.5,
|
||||
}
|
||||
|
||||
|
||||
@@ -171,6 +196,50 @@ def engineer_features(df):
|
||||
)
|
||||
df['管理负荷指数'] = df['团队人数'] * 0.4 + df['直属上级管理跨度'] * 0.25
|
||||
|
||||
# ── JD-R 复合指数 ──
|
||||
autonomy = df.get('工作自主性', pd.Series(3.0, index=df.index))
|
||||
df['工作要求指数'] = (
|
||||
df['月均加班时长'] * 0.20
|
||||
+ df['通勤时长分钟'] * 0.08
|
||||
+ df['是否夜班岗位'] * 1.5
|
||||
+ (5 - autonomy) * 0.3
|
||||
+ df.get('情绪劳动强度', pd.Series(3.0, index=df.index)) * 0.25
|
||||
+ df.get('时间压力感知', pd.Series(3.0, index=df.index)) * 0.25
|
||||
+ df.get('角色模糊度', pd.Series(3.0, index=df.index)) * 0.20
|
||||
+ df.get('工作家庭冲突', pd.Series(3.0, index=df.index)) * 0.20
|
||||
) / 2
|
||||
|
||||
df['工作资源指数'] = (
|
||||
autonomy * 0.18
|
||||
+ df.get('上级支持', pd.Series(3.0, index=df.index)) * 0.18
|
||||
+ df.get('同事支持', pd.Series(3.0, index=df.index)) * 0.14
|
||||
+ df.get('技能多样性', pd.Series(3.0, index=df.index)) * 0.14
|
||||
+ df.get('职业发展机会', pd.Series(3.0, index=df.index)) * 0.14
|
||||
+ df.get('参与决策', pd.Series(3.0, index=df.index)) * 0.10
|
||||
+ df.get('组织公平感', pd.Series(3.0, index=df.index)) * 0.12
|
||||
)
|
||||
|
||||
df['个人资源指数'] = (
|
||||
df.get('自我效能感', pd.Series(3.0, index=df.index)) * 0.35
|
||||
+ df.get('心理韧性', pd.Series(3.0, index=df.index)) * 0.35
|
||||
+ df.get('乐观程度', pd.Series(3.0, index=df.index)) * 0.30
|
||||
)
|
||||
|
||||
df['JD-R平衡度'] = df['工作资源指数'] - df['工作要求指数'] * 0.5
|
||||
|
||||
df['倦怠风险指数'] = (
|
||||
df.get('工作倦怠', pd.Series(3.5, index=df.index)) * 0.40
|
||||
+ df['工作要求指数'] * 0.30
|
||||
- df['工作资源指数'] * 0.20
|
||||
- df['个人资源指数'] * 0.10
|
||||
)
|
||||
|
||||
df['工作投入指数'] = (
|
||||
df.get('工作投入', pd.Series(3.5, index=df.index)) * 0.40
|
||||
+ df['工作资源指数'] * 0.30
|
||||
+ df['个人资源指数'] * 0.30
|
||||
)
|
||||
|
||||
df['工龄分层'] = pd.cut(df['司龄年数'], bins=[0, 2, 5, 10, 40], labels=['1', '2', '3', '4'])
|
||||
df['年龄分层'] = pd.cut(df['年龄'], bins=[18, 25, 32, 40, 60], labels=['1', '2', '3', '4'])
|
||||
df['通勤分层'] = pd.cut(df['通勤时长分钟'], bins=[0, 25, 45, 70, 180], labels=['1', '2', '3', '4'])
|
||||
@@ -299,6 +368,26 @@ def build_prediction_dataframe(data):
|
||||
'previous_day_overtime_flag',
|
||||
DEFAULT_PREDICTION_INPUT['previous_day_overtime_flag'],
|
||||
),
|
||||
# JD-R 工作要求维度
|
||||
'工作自主性': data.get('work_autonomy', DEFAULT_PREDICTION_INPUT['work_autonomy']),
|
||||
'情绪劳动强度': data.get('emotional_labor', DEFAULT_PREDICTION_INPUT['emotional_labor']),
|
||||
'时间压力感知': data.get('time_pressure', DEFAULT_PREDICTION_INPUT['time_pressure']),
|
||||
'角色模糊度': data.get('role_ambiguity', DEFAULT_PREDICTION_INPUT['role_ambiguity']),
|
||||
'工作家庭冲突': data.get('work_family_conflict', DEFAULT_PREDICTION_INPUT['work_family_conflict']),
|
||||
# JD-R 工作资源维度
|
||||
'上级支持': data.get('supervisor_support', DEFAULT_PREDICTION_INPUT['supervisor_support']),
|
||||
'同事支持': data.get('coworker_support', DEFAULT_PREDICTION_INPUT['coworker_support']),
|
||||
'技能多样性': data.get('skill_variety', DEFAULT_PREDICTION_INPUT['skill_variety']),
|
||||
'职业发展机会': data.get('career_development', DEFAULT_PREDICTION_INPUT['career_development']),
|
||||
'参与决策': data.get('decision_participation', DEFAULT_PREDICTION_INPUT['decision_participation']),
|
||||
'组织公平感': data.get('organizational_justice', DEFAULT_PREDICTION_INPUT['organizational_justice']),
|
||||
# JD-R 个人资源维度
|
||||
'自我效能感': data.get('self_efficacy', DEFAULT_PREDICTION_INPUT['self_efficacy']),
|
||||
'心理韧性': data.get('resilience', DEFAULT_PREDICTION_INPUT['resilience']),
|
||||
'乐观程度': data.get('optimism', DEFAULT_PREDICTION_INPUT['optimism']),
|
||||
# JD-R 中介变量
|
||||
'工作倦怠': data.get('burnout', DEFAULT_PREDICTION_INPUT['burnout']),
|
||||
'工作投入': data.get('work_engagement', DEFAULT_PREDICTION_INPUT['work_engagement']),
|
||||
}
|
||||
return pd.DataFrame([feature_row])
|
||||
|
||||
|
||||
399
backend/core/shap_analysis.py
Normal file
399
backend/core/shap_analysis.py
Normal file
@@ -0,0 +1,399 @@
|
||||
import os
|
||||
|
||||
import joblib
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import config
|
||||
|
||||
try:
|
||||
import shap
|
||||
SHAP_AVAILABLE = True
|
||||
except ImportError:
|
||||
SHAP_AVAILABLE = False
|
||||
|
||||
|
||||
class SHAPAnalyzer:
    """SHAP-based explainability analyzer that groups results by JD-R dimension.

    Models and preprocessing artifacts are loaded lazily from
    ``config.MODELS_DIR`` the first time a public method runs. Public methods
    return plain dicts; computation failures are reported as
    ``{'error': <message>}`` rather than raised.
    """

    # Event/context features that belong to no JD-R dimension.
    _CONTEXT_FEATURES = (
        '缺勤月份', '星期几', '是否节假日前后', '季节',
        '请假类型', '请假原因大类', '是否提供医院证明',
        '是否临时请假', '是否连续缺勤', '前一工作日是否加班',
    )

    def __init__(self):
        self.explainers = {}
        self.models = {}
        self.scaler = None
        self.feature_names = None
        self.selected_features = None
        self.label_encoders = {}
        # Most recently prepared background matrix (kept for compatibility).
        self.background_data = None
        # Background matrices keyed by the requested sample size, so callers
        # asking for different sizes never receive each other's sample.
        self._background_cache = {}
        self._initialized = False

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _warn(message, *args):
        """Emit a warning through the module logger."""
        import logging

        logging.getLogger(__name__).warning(message, *args)

    def _load_artifact(self, path):
        """Return the joblib artifact stored at ``path``, or None.

        None is returned both when the file does not exist and when loading
        fails; a failure is logged so it does not go unnoticed.
        """
        if not os.path.exists(path):
            return None
        try:
            # NOTE: joblib.load unpickles the file; only trusted artifacts
            # should ever be placed in the models directory.
            return joblib.load(path)
        except Exception as exc:
            # Best-effort: one broken artifact must not block the others.
            self._warn('Failed to load artifact %s: %s', path, exc)
            return None

    def _ensure_initialized(self):
        """Load models and preprocessing artifacts once; later calls are no-ops."""
        if self._initialized:
            return

        # SHAP analysis is based on the regression models.
        models_dir = config.MODELS_DIR
        model_files = {
            'random_forest': 'random_forest_model.pkl',
            'xgboost': 'xgboost_model.pkl',
            'lightgbm': 'lightgbm_model.pkl',
            'gradient_boosting': 'gradient_boosting_model.pkl',
            'extra_trees': 'extra_trees_model.pkl',
        }
        for name, filename in model_files.items():
            model = self._load_artifact(os.path.join(models_dir, filename))
            if model is not None:
                self.models[name] = model

        # Preprocessing artifacts.
        if os.path.exists(config.SCALER_PATH):
            self.scaler = joblib.load(config.SCALER_PATH)
        for filename, attr in [
            ('feature_names.pkl', 'feature_names'),
            ('selected_features.pkl', 'selected_features'),
            ('label_encoders.pkl', 'label_encoders'),
        ]:
            artifact = self._load_artifact(os.path.join(models_dir, filename))
            if artifact is not None:
                setattr(self, attr, artifact)

        self._initialized = True

    def _get_tree_explainer(self, model_type='random_forest'):
        """Return a cached ``shap.TreeExplainer`` for ``model_type``, or None.

        None is returned when SHAP is not installed, when no model of that
        type was loaded, or when building the explainer fails.
        """
        if not SHAP_AVAILABLE:
            return None

        if model_type in self.explainers:
            return self.explainers[model_type]

        model = self.models.get(model_type)
        if model is None:
            return None

        try:
            explainer = shap.TreeExplainer(model)
        except Exception as exc:
            self._warn('Failed to build TreeExplainer for %s: %s', model_type, exc)
            return None
        self.explainers[model_type] = explainer
        return explainer

    def _active_feature_names(self):
        """Return the feature names matching the columns fed to the model."""
        return list(self.selected_features or self.feature_names or [])

    def _scale_and_select(self, matrix):
        """Apply the loaded scaler (if any) and keep the selected columns.

        Args:
            matrix: 2-D float array whose columns follow ``self.feature_names``.

        Returns:
            The transformed array, reduced to the ``selected_features``
            columns when both name lists are available.
        """
        if self.scaler is not None:
            matrix = self.scaler.transform(matrix)

        if self.selected_features and self.feature_names:
            position = {}
            for idx, name in enumerate(self.feature_names):
                position.setdefault(name, idx)  # first occurrence wins
            selected_indices = [
                position[name] for name in self.selected_features if name in position
            ]
            if selected_indices:
                matrix = matrix[:, selected_indices]
        return matrix

    @staticmethod
    def _base_value(explainer):
        """Return the explainer's expected value as a single float.

        ``expected_value`` may be a scalar, a list or an array; the first
        entry is used in the latter cases.
        """
        expected = np.asarray(explainer.expected_value, dtype=float).ravel()
        return float(expected[0])

    @staticmethod
    def _top_interaction_pairs(strength, top_n):
        """Return the strongest off-diagonal pairs of a square matrix.

        Args:
            strength: square 2-D array of non-negative interaction strengths.
            top_n: maximum number of pairs to return.

        Returns:
            A list of ``(i, j, value)`` tuples with ``i < j``, strongest first.
        """
        rows, cols = np.triu_indices(strength.shape[0], k=1)
        pair_strength = strength[rows, cols]
        order = np.argsort(pair_strength)[::-1][:max(int(top_n), 0)]
        return [
            (int(rows[k]), int(cols[k]), float(pair_strength[k])) for k in order
        ]

    def _get_background_sample(self, n_samples=500):
        """Return the preprocessed background matrix used for SHAP, or None.

        The cleaned dataset is run through the modeling pipeline, sampled down
        to at most ``n_samples`` rows, scaled and reduced to the selected
        features. Results are cached per ``n_samples``. None is returned (and
        the reason logged) when any step fails.
        """
        cached = self._background_cache.get(n_samples)
        if cached is not None:
            return cached

        try:
            from core.preprocessing import get_clean_data
            from core.model_features import (
                normalize_columns, prepare_modeling_dataframe,
                apply_outlier_bounds, fit_outlier_bounds,
                engineer_features, extract_xy, fit_label_encoders,
                align_feature_frame, to_float_array,
                NUMERICAL_OUTLIER_COLUMNS, ORDINAL_COLUMNS,
            )

            raw_df = normalize_columns(get_clean_data())
            df = prepare_modeling_dataframe(raw_df)

            bounds = fit_outlier_bounds(df, NUMERICAL_OUTLIER_COLUMNS)
            df = apply_outlier_bounds(df, bounds)
            df = engineer_features(df)
            X_df, _ = extract_xy(df)
            # NOTE(review): encoders are re-fitted here, whereas
            # local_shap_values applies the loaded self.label_encoders --
            # confirm both produce the same codes.
            X_df, _ = fit_label_encoders(X_df, ORDINAL_COLUMNS)

            if self.feature_names:
                X_df = align_feature_frame(X_df, self.feature_names)

            if n_samples < len(X_df):
                X_df = X_df.sample(n=n_samples, random_state=config.RANDOM_STATE)

            X = self._scale_and_select(to_float_array(X_df))
        except Exception as exc:
            self._warn('Failed to prepare SHAP background data: %s', exc)
            return None

        self._background_cache[n_samples] = X
        self.background_data = X
        return X

    def _get_feature_display_names(self):
        """Map each active feature name to its display name (or itself)."""
        return {
            name: config.FEATURE_NAME_CN.get(name, name)
            for name in self._active_feature_names()
        }

    def _map_feature_to_dimension(self, feature_name):
        """Return the JD-R dimension key of a feature.

        Falls back to ``'event_context'`` for event/context features and to
        ``'other'`` for everything else.
        """
        for dim_key, dim_info in config.JDR_DIMENSIONS.items():
            if feature_name in dim_info['features']:
                return dim_key
        if feature_name in self._CONTEXT_FEATURES:
            return 'event_context'
        return 'other'

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def global_shap_values(self, model_type='random_forest'):
        """Compute global SHAP importance, grouped by JD-R dimension.

        Returns:
            ``{'model_type', 'dimensions', 'top_features'}`` where importance
            is the mean absolute SHAP value over the background sample, or
            ``{'error': ...}`` on failure.
        """
        if not SHAP_AVAILABLE:
            return {'error': 'SHAP library not installed'}

        self._ensure_initialized()
        explainer = self._get_tree_explainer(model_type)
        if explainer is None:
            return {'error': f'No tree model available for {model_type}'}

        X = self._get_background_sample()
        if X is None:
            return {'error': 'Failed to prepare background data'}

        try:
            shap_values = explainer.shap_values(X)
            if isinstance(shap_values, list):
                shap_values = shap_values[0]

            mean_abs_shap = np.abs(shap_values).mean(axis=0)
            feature_names = self._active_feature_names()
            name_map = self._get_feature_display_names()

            def describe(idx, fname):
                return {
                    'name': fname,
                    'name_cn': name_map.get(fname, fname),
                    'importance': round(float(mean_abs_shap[idx]), 4),
                }

            def by_importance(entries):
                return sorted(entries, key=lambda x: x['importance'], reverse=True)

            # Group by JD-R dimension.
            dimensions = {}
            for dim_key, dim_info in config.JDR_DIMENSIONS.items():
                dim_features = [
                    describe(idx, fname)
                    for idx, fname in enumerate(feature_names)
                    if fname in dim_info['features']
                ]
                if dim_features:
                    dimensions[dim_key] = {
                        'name_cn': dim_info['name_cn'],
                        'features': by_importance(dim_features),
                    }

            # Event-context pseudo dimension.
            context_features = [
                describe(idx, fname)
                for idx, fname in enumerate(feature_names)
                if self._map_feature_to_dimension(fname) == 'event_context'
            ]
            if context_features:
                dimensions['event_context'] = {
                    'name_cn': '事件上下文',
                    'features': by_importance(context_features),
                }

            # Overall top features.
            top_features = []
            for idx in np.argsort(mean_abs_shap)[::-1][:20]:
                fname = feature_names[idx] if idx < len(feature_names) else f'f{idx}'
                entry = describe(idx, fname)
                entry['dimension'] = self._map_feature_to_dimension(fname)
                top_features.append(entry)

            return {
                'model_type': model_type,
                'dimensions': dimensions,
                'top_features': top_features,
            }
        except Exception as exc:
            return {'error': str(exc)}

    def local_shap_values(self, data, model_type='random_forest'):
        """Explain a single prediction with SHAP values.

        Args:
            data: prediction input passed to ``build_prediction_dataframe``.
            model_type: key of the loaded tree model to explain.

        Returns:
            ``{'base_value', 'features', 'dimension_contribution'}`` with the
            20 largest contributions, or ``{'error': ...}`` on failure.
        """
        if not SHAP_AVAILABLE:
            return {'error': 'SHAP library not installed'}

        self._ensure_initialized()
        explainer = self._get_tree_explainer(model_type)
        if explainer is None:
            return {'error': f'No tree model available for {model_type}'}

        try:
            from core.model_features import (
                build_prediction_dataframe, engineer_features,
                apply_label_encoders, align_feature_frame, to_float_array,
            )

            X_df = build_prediction_dataframe(data)
            X_df = engineer_features(X_df)
            X_df = apply_label_encoders(X_df, self.label_encoders)
            if self.feature_names:
                X_df = align_feature_frame(X_df, self.feature_names)
            # Same scaling/selection as the background sample; tolerates a
            # missing scaler instead of failing on None.
            features = self._scale_and_select(to_float_array(X_df))

            shap_values = explainer.shap_values(features)
            if isinstance(shap_values, list):
                shap_values = shap_values[0]

            base_value = self._base_value(explainer)

            feature_names = self._active_feature_names()
            name_map = self._get_feature_display_names()

            feature_contributions = []
            dimension_contribution = {}
            for idx, fname in enumerate(feature_names):
                sv = float(shap_values[0][idx])
                fv = float(features[0][idx])
                dim = self._map_feature_to_dimension(fname)
                feature_contributions.append({
                    'name': fname,
                    'name_cn': name_map.get(fname, fname),
                    'shap_value': round(sv, 4),
                    'feature_value': round(fv, 4),
                    'dimension': dim,
                })
                dimension_contribution[dim] = dimension_contribution.get(dim, 0) + sv

            feature_contributions.sort(key=lambda x: abs(x['shap_value']), reverse=True)

            # Display labels for the dimension keys.
            dim_labels = {dk: di['name_cn'] for dk, di in config.JDR_DIMENSIONS.items()}
            dim_labels['event_context'] = '事件上下文'
            dim_labels['other'] = '其他'

            ranked_dimensions = sorted(
                dimension_contribution.items(), key=lambda x: abs(x[1]), reverse=True
            )
            return {
                'base_value': round(base_value, 4),
                'features': feature_contributions[:20],
                'dimension_contribution': {
                    dim_labels.get(k, k): round(v, 4) for k, v in ranked_dimensions
                },
            }
        except Exception as exc:
            return {'error': str(exc)}

    def shap_interaction(self, model_type='random_forest', top_n=10):
        """Compute the strongest pairwise SHAP interactions.

        Pairs are ranked and reported by the same metric: the mean absolute
        interaction value over a 200-row background sample.

        Returns:
            ``{'model_type', 'top_interactions'}`` with at most ``top_n``
            feature pairs, or ``{'error': ...}`` on failure.
        """
        if not SHAP_AVAILABLE:
            return {'error': 'SHAP library not installed'}

        self._ensure_initialized()
        explainer = self._get_tree_explainer(model_type)
        if explainer is None:
            return {'error': f'No tree model available for {model_type}'}

        X = self._get_background_sample(n_samples=200)
        if X is None:
            return {'error': 'Failed to prepare background data'}

        try:
            interaction_values = explainer.shap_interaction_values(X)
            if isinstance(interaction_values, list):
                interaction_values = interaction_values[0]

            strength = np.abs(interaction_values).mean(axis=0)
            feature_names = self._active_feature_names()
            name_map = self._get_feature_display_names()

            def label(position):
                if position < len(feature_names):
                    return feature_names[position]
                return f'f{position}'

            top_pairs = []
            for i, j, value in self._top_interaction_pairs(strength, top_n):
                fi, fj = label(i), label(j)
                top_pairs.append({
                    'feature_1': fi,
                    'feature_1_cn': name_map.get(fi, fi),
                    'feature_2': fj,
                    'feature_2_cn': name_map.get(fj, fj),
                    'strength': round(value, 4),
                })

            return {
                'model_type': model_type,
                'top_interactions': top_pairs,
            }
        except Exception as exc:
            return {'error': str(exc)}

    def shap_dependence(self, feature_name, model_type='random_forest'):
        """Compute SHAP dependence-plot data for one feature.

        Feature values come from the background matrix, i.e. they are in the
        model's input space (scaled when a scaler is loaded). At most 300
        points are returned.

        Returns:
            ``{'feature', 'feature_cn', 'values', 'shap_values'}`` or
            ``{'error': ...}`` on failure or for an unknown feature.
        """
        if not SHAP_AVAILABLE:
            return {'error': 'SHAP library not installed'}

        self._ensure_initialized()
        explainer = self._get_tree_explainer(model_type)
        if explainer is None:
            return {'error': f'No tree model available for {model_type}'}

        X = self._get_background_sample()
        if X is None:
            return {'error': 'Failed to prepare background data'}

        try:
            feature_names = self._active_feature_names()
            if feature_name not in feature_names:
                return {'error': f'Feature {feature_name} not found'}

            col_idx = feature_names.index(feature_name)
            shap_values = explainer.shap_values(X)
            if isinstance(shap_values, list):
                shap_values = shap_values[0]

            feature_vals = X[:, col_idx].tolist()
            shap_vals = shap_values[:, col_idx].tolist()

            # Down-sample for visualization; seeded so repeated calls agree.
            max_points = 300
            if len(feature_vals) > max_points:
                indices = np.random.RandomState(config.RANDOM_STATE).choice(
                    len(feature_vals), max_points, replace=False
                )
                feature_vals = [feature_vals[i] for i in indices]
                shap_vals = [shap_vals[i] for i in indices]

            name_map = self._get_feature_display_names()
            return {
                'feature': feature_name,
                'feature_cn': name_map.get(feature_name, feature_name),
                'values': [round(v, 4) for v in feature_vals],
                'shap_values': [round(v, 4) for v in shap_vals],
            }
        except Exception as exc:
            return {'error': str(exc)}
|
||||
@@ -7,8 +7,10 @@ from datetime import datetime
|
||||
import joblib
|
||||
import numpy as np
|
||||
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
|
||||
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
|
||||
from sklearn.feature_selection import SelectKBest, f_regression
|
||||
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
||||
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
|
||||
from sklearn.model_selection import RandomizedSearchCV, train_test_split
|
||||
from sklearn.preprocessing import RobustScaler
|
||||
|
||||
@@ -351,9 +353,81 @@ class OptimizedModelTrainer:
|
||||
)
|
||||
|
||||
self.save_models()
|
||||
|
||||
# 风险分类模型训练
|
||||
print('\nRisk Classification Training')
|
||||
risk_trainer = RiskClassifierTrainer(self)
|
||||
risk_trainer.train_all(X_train, y_train, X_test, y_test)
|
||||
risk_trainer.save()
|
||||
|
||||
return self.model_metrics
|
||||
|
||||
|
||||
class RiskClassifierTrainer:
    """Trains risk-level classifiers: low (<4h) / medium (4-8h) / high (>8h).

    The regression target (hours) is bucketed into three classes and several
    classifiers are fitted on the same feature matrices. Fitted classifiers
    and their test metrics are kept on the instance and can be persisted
    with ``save``.
    """

    RISK_MAP = {'low': 0, 'medium': 1, 'high': 2}
    RISK_LABELS = ['low', 'medium', 'high']

    def __init__(self, regression_trainer):
        self.regression_trainer = regression_trainer
        self.classifiers = {}
        self.classification_metrics = {}

    def _make_target(self, y_hours):
        """Bucket hour values into class ids.

        Args:
            y_hours: array-like of hours (list, ndarray or pandas Series).

        Returns:
            Integer ndarray with 0 for ``< 4``, 2 for ``> 8`` and 1 otherwise
            (both boundaries 4 and 8 count as medium).
        """
        # Coerce once so plain lists and column-shaped targets work too.
        hours = np.asarray(y_hours, dtype=float).ravel()
        y_class = np.full(len(hours), 1, dtype=int)
        y_class[hours < 4] = 0
        y_class[hours > 8] = 2
        return y_class

    def train_all(self, X_train, y_train_hours, X_test, y_test_hours):
        """Fit every available classifier and record its test metrics.

        LightGBM and XGBoost are included only when their modules were
        imported successfully. A classifier that fails to train is reported
        and skipped; the others still run.
        """
        y_train_cls = self._make_target(y_train_hours)
        y_test_cls = self._make_target(y_test_hours)

        classifier_configs = {
            'random_forest': RandomForestClassifier(
                n_estimators=300, max_depth=14, random_state=config.RANDOM_STATE, n_jobs=-1,
            ),
            'gradient_boosting': GradientBoostingClassifier(
                n_estimators=200, max_depth=4, learning_rate=0.05, random_state=config.RANDOM_STATE,
            ),
        }

        if lgb is not None:
            classifier_configs['lightgbm'] = lgb.LGBMClassifier(
                n_estimators=260, max_depth=7, learning_rate=0.05,
                random_state=config.RANDOM_STATE, n_jobs=-1, verbose=-1,
            )
        if xgb is not None:
            classifier_configs['xgboost'] = xgb.XGBClassifier(
                n_estimators=260, max_depth=6, learning_rate=0.05,
                random_state=config.RANDOM_STATE, n_jobs=-1,
            )

        for name, clf in classifier_configs.items():
            try:
                clf.fit(X_train, y_train_cls)
                y_pred = clf.predict(X_test)
                metrics = {
                    'accuracy': round(accuracy_score(y_test_cls, y_pred), 4),
                    'precision_macro': round(precision_score(y_test_cls, y_pred, average='macro', zero_division=0), 4),
                    'recall_macro': round(recall_score(y_test_cls, y_pred, average='macro', zero_division=0), 4),
                    'f1_macro': round(f1_score(y_test_cls, y_pred, average='macro', zero_division=0), 4),
                    'confusion_matrix': confusion_matrix(y_test_cls, y_pred).tolist(),
                }
                self.classifiers[name] = clf
                self.classification_metrics[name] = metrics
                print(f'  {name:20s} Acc={metrics["accuracy"]:.4f} F1={metrics["f1_macro"]:.4f}')
            except Exception as exc:
                # Best-effort: one failing classifier must not abort training.
                print(f'  {name:20s} Skipped: {exc}')

    def save(self):
        """Persist every fitted classifier and the metrics dict to MODELS_DIR."""
        # Do not rely on another component having created the directory.
        os.makedirs(config.MODELS_DIR, exist_ok=True)
        for name, clf in self.classifiers.items():
            path = os.path.join(config.MODELS_DIR, f'risk_{name}_classifier.pkl')
            joblib.dump(clf, path)
        joblib.dump(self.classification_metrics, os.path.join(config.MODELS_DIR, 'classification_metrics.pkl'))
|
||||
|
||||
|
||||
def train_and_save_models():
|
||||
start = time.time()
|
||||
trainer = OptimizedModelTrainer()
|
||||
|
||||
Reference in New Issue
Block a user