diff --git a/backend/core/clustering.py b/backend/core/clustering.py index a0a0c7c..85f84fd 100644 --- a/backend/core/clustering.py +++ b/backend/core/clustering.py @@ -43,14 +43,14 @@ class KMeansAnalyzer: center = centers[int(cluster_id)] clusters.append({ 'id': int(cluster_id), - 'name': names.get(int(cluster_id), f'群体{int(cluster_id) + 1}'), + 'name': names.get(int(cluster_id), '常规稳态型'), 'member_count': int(count), 'percentage': round(count / total * 100, 1), 'center': { feature: round(float(value), 2) for feature, value in zip(self.feature_cols, center) }, - 'description': self._generate_description(names.get(int(cluster_id), '')), + 'description': self._generate_description(names.get(int(cluster_id), '常规稳态型'), center), }) return {'n_clusters': self.n_clusters, 'clusters': clusters} @@ -65,7 +65,7 @@ class KMeansAnalyzer: 'clusters': [ { 'id': idx, - 'name': names.get(idx, f'群体{idx + 1}'), + 'name': names.get(idx, '常规稳态型'), 'values': [round(float(v), 2) for v in centers_scaled[idx]], } for idx in range(self.n_clusters) @@ -105,27 +105,63 @@ class KMeansAnalyzer: '4': '#6DC8EC', }, 'cluster_names': { - str(idx): names.get(idx, f'群体{idx + 1}') + str(idx): names.get(idx, '常规稳态型') for idx in range(self.n_clusters) }, } def _generate_cluster_names(self, centers): + rank_info = self._build_rank_info(centers) base_names = {} for idx, center in enumerate(centers): - _, tenure, overtime, commute, bmi, absence = center - if overtime > 38 and commute > 55 and absence > 8: - base_names[idx] = '高压通勤型' - elif bmi > 27 and absence > 8: - base_names[idx] = '健康波动型' - elif tenure > 8 and absence < 6: - base_names[idx] = '稳定低风险型' - elif overtime > 28 and absence > 7: - base_names[idx] = '轮班负荷型' - else: - base_names[idx] = f'群体{idx + 1}' + base_names[idx] = self._classify_cluster(center, rank_info, idx) return self._deduplicate_cluster_names(base_names, centers) + def _build_rank_info(self, centers): + centers = np.asarray(centers, dtype=float) + return { + '年龄': self._rank_desc(centers[:, 0]), + '司龄': self._rank_desc(centers[:, 1]), + '加班': self._rank_desc(centers[:, 2]), + '通勤': self._rank_desc(centers[:, 3]), + 'BMI': self._rank_desc(centers[:, 4]), + '缺勤': self._rank_desc(centers[:, 5]), + } + + def _rank_desc(self, values): + ordered = np.argsort(-np.asarray(values, dtype=float)) + ranks = {} + for rank, idx in enumerate(ordered): + ranks[int(idx)] = rank + return ranks + + def _classify_cluster(self, center, rank_info, idx): + age, tenure, overtime, commute, bmi, absence = center + high_absence = rank_info['缺勤'][idx] == 0 + low_absence = rank_info['缺勤'][idx] == len(rank_info['缺勤']) - 1 + high_overtime = rank_info['加班'][idx] <= 1 + high_commute = rank_info['通勤'][idx] <= 1 + high_bmi = rank_info['BMI'][idx] <= 1 + high_tenure = rank_info['司龄'][idx] <= 1 + low_tenure = rank_info['司龄'][idx] >= len(rank_info['司龄']) - 1 + young_group = rank_info['年龄'][idx] >= len(rank_info['年龄']) - 1 + + if (absence >= 7.5 and overtime >= 28 and commute >= 40) or (high_absence and high_overtime and high_commute): + return '压力奔波型' + if (absence >= 7.0 and bmi >= 25.5) or (high_absence and high_bmi): + return '健康关注型' + if (overtime >= 30 and absence >= 6.0) or (high_overtime and rank_info['缺勤'][idx] <= 1): + return '负荷承压型' + if (tenure >= 8 and absence <= 6.0) or (high_tenure and low_absence): + return '稳定成熟型' + if (tenure <= 4 and age <= 32) or (low_tenure and young_group): + return '新锐成长型' + if commute <= 35 and absence <= 6.5: + return '通勤平衡型' + if tenure >= 6 and absence <= 6.8: + return '经验稳健型' + return '常规稳态型' + def _deduplicate_cluster_names(self, names, centers): grouped = {} for idx, name in names.items(): @@ -159,24 +195,75 @@ class KMeansAnalyzer: def _suffix_candidates(self, name): suffix_map = { - '高压通勤型': ['-高风险组', '-关注组', '-观察组'], - '健康波动型': ['-重点关注组', '-预警组', '-观察组'], - '稳定低风险型': ['-资深组', '-成熟组', '-稳健组'], - '轮班负荷型': ['-高负荷组', '-轮班组', '-强化组'], + '压力奔波型': ['-高压组', '-长途组', '-持续关注组'], + '健康关注型': ['-重点关注组', '-预警组', '-干预组'], + '负荷承压型': ['-高负荷组', '-轮班组', '-调节组'], + '稳定成熟型': ['-资深组', '-成熟组', '-稳健组'], + '新锐成长型': ['-适应组', '-成长组', '-潜力组'], + '通勤平衡型': ['-均衡组', '-稳态组', '-协同组'], + '经验稳健型': ['-资深组', '-稳健组', '-协同组'], + '常规稳态型': ['-平衡组', '-常态组', '-协同组'], } return suffix_map.get(name, [f'({idx})' for idx in range(1, 10)]) - def _generate_description(self, name): + def _generate_description(self, name, center=None): descriptions = { - '高压通勤型': '加班和通勤压力都高,缺勤时长偏长。', - '健康波动型': '健康相关风险更高,需要重点关注。', - '稳定低风险型': '司龄较长,缺勤水平稳定且偏低。', - '轮班负荷型': '排班和工作负荷较重,缺勤风险较高。', + '压力奔波型': '加班与通勤压力同时偏高,缺勤波动更明显。', + '健康关注型': '健康负担更突出,缺勤时长偏高,建议优先关注。', + '负荷承压型': '工作负荷较重,缺勤风险处于偏高水平。', + '稳定成熟型': '司龄较长,整体状态稳定,缺勤水平偏低。', + '新锐成长型': '整体更年轻、司龄较短,仍处于适应与成长阶段。', + '通勤平衡型': '通勤与缺勤表现较均衡,整体波动相对可控。', + '经验稳健型': '具备一定经验积累,整体表现稳健,缺勤风险较低。', + '常规稳态型': '整体表现接近企业常态,是较典型的员工群体。', } for key, description in descriptions.items(): if name.startswith(key): - return description - return descriptions.get(name, '常规员工群体。') + if center is None: + return description + return self._build_dynamic_description(key, center, description) + return descriptions.get(name, '整体表现接近企业常态。') + + def _build_dynamic_description(self, base_name, center, default_description): + age, tenure, overtime, commute, bmi, absence = center + clauses = [] + + if tenure >= 8: + clauses.append('司龄较长') + elif tenure <= 4: + clauses.append('司龄较短') + + if overtime >= 30: + clauses.append('加班负荷偏高') + elif overtime <= 18: + clauses.append('加班压力相对可控') + + if commute >= 45: + clauses.append('通勤压力偏高') + elif commute <= 30: + clauses.append('通勤节奏较平衡') + + if bmi >= 26: + clauses.append('健康管理压力更明显') + + if absence >= 7.5: + clauses.append('缺勤时长偏高') + elif absence <= 5.5: + clauses.append('缺勤水平偏低') + + if age <= 32: + clauses.append('群体整体更年轻') + elif age >= 40: + clauses.append('群体整体更成熟') + + unique_clauses = [] + for clause in clauses: + if clause not in unique_clauses: + unique_clauses.append(clause) + + if not unique_clauses: + return default_description + return ','.join(unique_clauses[:3]) + '。' kmeans_analyzer = KMeansAnalyzer() diff --git a/backend/services/cluster_service.py b/backend/services/cluster_service.py index 90d0c3e..dba6114 100644 --- a/backend/services/cluster_service.py +++ b/backend/services/cluster_service.py @@ -2,17 +2,21 @@ from core.clustering import KMeansAnalyzer class ClusterService: - def __init__(self): - self.analyzer = KMeansAnalyzer() - + def _create_analyzer(self): + # 聚类接口会被前端并发调用,避免复用同一个可变分析器实例导致结果串线。 + return KMeansAnalyzer() + def get_cluster_result(self, n_clusters=3): - return self.analyzer.get_cluster_results(n_clusters) - + analyzer = self._create_analyzer() + return analyzer.get_cluster_results(n_clusters) + def get_cluster_profile(self, n_clusters=3): - return self.analyzer.get_cluster_profile(n_clusters) - + analyzer = self._create_analyzer() + return analyzer.get_cluster_profile(n_clusters) + def get_scatter_data(self, n_clusters=3, x_axis='月均加班时长', y_axis='缺勤时长(小时)'): - return self.analyzer.get_scatter_data(n_clusters, x_axis, y_axis) + analyzer = self._create_analyzer() + return analyzer.get_scatter_data(n_clusters, x_axis, y_axis) cluster_service = ClusterService() diff --git a/backend/services/predict_service.py b/backend/services/predict_service.py index 322e876..ac28def 100644 --- a/backend/services/predict_service.py +++ b/backend/services/predict_service.py @@ -16,18 +16,26 @@ from core.model_features import ( MODEL_INFO = { 'random_forest': {'name': 'random_forest', 'name_cn': '随机森林', 'description': '稳健的树模型集成'}, - 'xgboost': {'name': 'xgboost', 'name_cn': 'XGBoost', 'description': '梯度提升树模型'}, - 'lightgbm': {'name': 'lightgbm', 'name_cn': 'LightGBM', 'description': '轻量级梯度提升树'}, - 'gradient_boosting': {'name': 'gradient_boosting', 'name_cn': 'GBDT', 'description': '梯度提升决策树'}, + 'xgboost': {'name': 'xgboost', 'name_cn': '增强树模型一', 'description': '梯度提升树模型'}, + 'lightgbm': {'name': 'lightgbm', 'name_cn': '增强树模型二', 'description': '轻量级梯度提升树'}, + 'gradient_boosting': {'name': 'gradient_boosting', 'name_cn': '梯度提升树', 'description': '梯度提升决策树'}, 'extra_trees': {'name': 'extra_trees', 'name_cn': '极端随机树', 'description': '高随机性的树模型'}, - 'stacking': {'name': 'stacking', 'name_cn': 'Stacking集成', 'description': '多模型融合'}, + 'stacking': {'name': 'stacking', 'name_cn': '集成模型', 'description': '多模型融合'}, 'lstm_mlp': { 'name': 'lstm_mlp', 'name_cn': '时序注意力融合网络', - 'description': 'Transformer时序编码 + 静态特征门控融合的深度学习模型', + 'description': 'Transformer 时序编码与静态特征融合的深度学习模型', }, } +EXPLAINABLE_TREE_MODELS = ( + 'random_forest', + 'xgboost', + 'lightgbm', + 'gradient_boosting', + 'extra_trees', +) + class PredictService: def __init__(self): @@ -96,7 +104,6 @@ class PredictService: if valid_metrics: self.default_model = max(valid_metrics.items(), key=lambda item: item[1]['r2'])[0] - # 加载风险分类模型 for name in ['random_forest', 'gradient_boosting', 'lightgbm', 'xgboost']: path = os.path.join(config.MODELS_DIR, f'risk_{name}_classifier.pkl') if os.path.exists(path): @@ -123,18 +130,22 @@ class PredictService: models.sort(key=lambda item: item['metrics']['r2'], reverse=True) return models - def predict_single(self, data, model_type=None): + def predict_single(self, data, model_type=None, include_explanation=True): self._ensure_models_loaded() - model_type = model_type or self.default_model - if model_type not in self.models: - fallback = next(iter(self.models), None) - if fallback is None: - return self._get_default_prediction(data) - model_type = fallback - if self.scaler is None or self.feature_names is None: - return self._get_default_prediction(data) + model_type = self._resolve_prediction_model(model_type or self.default_model) + _, engineered_df = self._build_prediction_frames(data) + engineered_row = engineered_df.iloc[0] + + if model_type is None or self.scaler is None or self.feature_names is None: + result = self._get_default_prediction(data) + return self._augment_prediction_result(result, data, engineered_row) if include_explanation else result + + try: + features = self._prepare_features_from_engineered(engineered_df) + except Exception: + result = self._get_default_prediction(data) + return self._augment_prediction_result(result, data, engineered_row) if include_explanation else result - features = self._prepare_features(data) try: if model_type == 'lstm_mlp': current_df = build_prediction_dataframe(data) @@ -144,15 +155,14 @@ class PredictService: predicted_hours = self._inverse_transform_prediction(predicted_hours) predicted_hours = max(0.5, float(predicted_hours)) except Exception: - return self._get_default_prediction(data) + result = self._get_default_prediction(data) + return self._augment_prediction_result(result, data, engineered_row) if include_explanation else result risk_level, risk_label = self._get_risk_level(predicted_hours) confidence = max(0.5, self.model_metrics.get(model_type, {}).get('r2', 0.82)) - - # 风险分类概率 risk_probability = self._get_risk_probability(features, model_type) - return { + result = { 'predicted_hours': round(predicted_hours, 2), 'risk_level': risk_level, 'risk_label': risk_label, @@ -161,12 +171,13 @@ class PredictService: 'model_used': model_type, 'model_name_cn': MODEL_INFO.get(model_type, {}).get('name_cn', model_type), } + return self._augment_prediction_result(result, data, engineered_row) if include_explanation else result def predict_compare(self, data): self._ensure_models_loaded() results = [] for name in self.models.keys(): - result = self.predict_single(data, name) + result = self.predict_single(data, name, include_explanation=False) result['model'] = name result['model_name_cn'] = MODEL_INFO.get(name, {}).get('name_cn', name) result['r2'] = self.model_metrics.get(name, {}).get('r2', 0) @@ -176,10 +187,17 @@ class PredictService: results[0]['recommended'] = True return results + def _build_prediction_frames(self, data): + current_df = build_prediction_dataframe(data) + engineered_df = engineer_features(current_df.copy()) + return current_df, engineered_df + def _prepare_features(self, data): - X_df = build_prediction_dataframe(data) - X_df = engineer_features(X_df) - X_df = apply_label_encoders(X_df, self.label_encoders) + _, engineered_df = self._build_prediction_frames(data) + return self._prepare_features_from_engineered(engineered_df) + + def _prepare_features_from_engineered(self, engineered_df): + X_df = apply_label_encoders(engineered_df.copy(), self.label_encoders) X_df = align_feature_frame(X_df, self.feature_names) features = self.scaler.transform(to_float_array(X_df))[0] if self.selected_features: @@ -188,6 +206,338 @@ class PredictService: features = features[selected_indices] return features + def _resolve_prediction_model(self, requested_model): + if requested_model in self.models: + return requested_model + if self.default_model in self.models: + return self.default_model + return next(iter(self.models), None) + + def _resolve_explanation_model(self, prediction_model): + if prediction_model in EXPLAINABLE_TREE_MODELS and prediction_model in self.models: + return prediction_model + for candidate in ('random_forest', 'xgboost', 'lightgbm', 'gradient_boosting', 'extra_trees'): + if candidate in self.models: + return candidate + return None + + def _augment_prediction_result(self, result, data, engineered_row): + explanation_model = self._resolve_explanation_model(result.get('model_used')) + shap_local = self._get_local_explanation(data, explanation_model) + jdr_snapshot = self._build_jdr_snapshot(engineered_row) + mechanism_summary = self._build_mechanism_summary(result, data, jdr_snapshot, shap_local) + intervention_suggestions = self._build_intervention_suggestions(data, jdr_snapshot, shap_local) + + payload = dict(result) + payload.update({ + 'jdr_snapshot': jdr_snapshot, + 'mechanism_summary': mechanism_summary, + 'intervention_suggestions': intervention_suggestions, + 'explanation_model_used': explanation_model, + 'explanation_model_name_cn': MODEL_INFO.get(explanation_model, {}).get('name_cn', '机制解释'), + 'shap_local': shap_local, + }) + return payload + + def _get_local_explanation(self, data, model_type): + if not model_type: + return None + try: + from services.shap_service import shap_service + + explanation = shap_service.get_local_explanation(data, model_type) + if explanation and not explanation.get('error'): + return explanation + except Exception: + pass + return None + + def _build_jdr_snapshot(self, engineered_row): + snapshot = { + 'job_demands': self._build_snapshot_item( + 'job_demands', + '工作要求', + engineered_row.get('工作要求指数', 0.0), + *self._classify_job_demands(engineered_row.get('工作要求指数', 0.0)), + ), + 'job_resources': self._build_snapshot_item( + 'job_resources', + '工作资源', + engineered_row.get('工作资源指数', 0.0), + *self._classify_resource_stock(engineered_row.get('工作资源指数', 0.0)), + ), + 'personal_resources': self._build_snapshot_item( + 'personal_resources', + '个人资源', + engineered_row.get('个人资源指数', 0.0), + *self._classify_resource_stock(engineered_row.get('个人资源指数', 0.0)), + ), + 'balance': self._build_snapshot_item( + 'balance', + '平衡度', + engineered_row.get('JD-R平衡度', 0.0), + *self._classify_balance(engineered_row.get('JD-R平衡度', 0.0)), + ), + 'burnout_risk': self._build_snapshot_item( + 'burnout_risk', + '倦怠风险', + engineered_row.get('倦怠风险指数', 0.0), + *self._classify_burnout(engineered_row.get('倦怠风险指数', 0.0)), + ), + 'engagement': self._build_snapshot_item( + 'engagement', + '工作投入', + engineered_row.get('工作投入指数', 0.0), + *self._classify_resource_stock(engineered_row.get('工作投入指数', 0.0)), + ), + } + return snapshot + + def _build_snapshot_item(self, key, label, score, status, tone): + return { + 'key': key, + 'label': label, + 'score': round(self._safe_float(score), 2), + 'status': status, + 'tone': tone, + } + + def _build_mechanism_summary(self, result, data, jdr_snapshot, shap_local): + dimension_scores = self._extract_dimension_scores(shap_local) + top_drivers = self._extract_feature_effects(shap_local, positive=True, limit=3) + protective_factors = self._extract_feature_effects(shap_local, positive=False, limit=2) + + pathway_label, pathway_tone, pathway_detail = self._infer_pathway(jdr_snapshot, dimension_scores) + mechanism = self._build_mechanism_text(data, jdr_snapshot, dimension_scores, top_drivers) + buffer_text = self._build_buffer_text(jdr_snapshot, protective_factors) + scenario_hint = self._build_scenario_hint(data) + + return { + 'conclusion': f"本次预测为{result['risk_label']},预计缺勤时长约 {result['predicted_hours']} 小时。", + 'mechanism': mechanism, + 'pathway_label': pathway_label, + 'pathway_tone': pathway_tone, + 'pathway_detail': pathway_detail, + 'buffer_text': buffer_text, + 'scenario_hint': scenario_hint, + 'top_drivers': top_drivers, + 'protective_factors': protective_factors, + } + + def _build_mechanism_text(self, data, jdr_snapshot, dimension_scores, top_drivers): + if top_drivers: + driver_names = '、'.join(item['name_cn'] for item in top_drivers) + if dimension_scores.get('工作要求', 0.0) > 0.03: + return f'主要推高因素集中在{driver_names},说明高工作要求正在直接抬升本次缺勤风险。' + if dimension_scores.get('事件上下文', 0.0) > 0.03: + return f'主要推高因素集中在{driver_names},当前结果更容易受到请假事件情境的直接触发。' + if dimension_scores.get('工作资源', 0.0) > 0.03 or dimension_scores.get('个人资源', 0.0) > 0.03: + return f'主要推高因素集中在{driver_names},说明资源缓冲不足正在放大本次缺勤时长。' + return f'主要推高因素集中在{driver_names},它们共同推动了本次缺勤时长上升。' + + fragments = [] + if jdr_snapshot['job_demands']['tone'] in {'warning', 'danger'}: + fragments.append('工作要求偏高') + if jdr_snapshot['job_resources']['tone'] == 'danger': + fragments.append('工作资源不足') + if jdr_snapshot['personal_resources']['tone'] == 'danger': + fragments.append('个人资源偏弱') + if self._as_flag(data.get('medical_certificate_flag')) or self._as_flag(data.get('near_holiday_flag')): + fragments.append('事件情境触发明显') + if not fragments: + return '当前结果更多体现为常规缺勤波动,整体压力与资源结构暂时可控。' + return f"当前结果主要由{'、'.join(fragments)}共同驱动。" + + def _build_buffer_text(self, jdr_snapshot, protective_factors): + if protective_factors: + names = '、'.join(item['name_cn'] for item in protective_factors) + return f'{names}对当前风险仍有一定缓冲作用,但尚不足以完全抵消主要压力来源。' + if jdr_snapshot['job_resources']['tone'] in {'success', 'info'} and jdr_snapshot['personal_resources']['tone'] in {'success', 'info'}: + return '当前资源支持和个人恢复能力对风险有一定缓冲,但事件性因素仍需持续关注。' + return '' + + def _build_scenario_hint(self, data): + actions = [] + if self._safe_float(data.get('monthly_overtime_hours', 0.0)) >= 25: + actions.append('将月均加班控制在 20 小时以内') + if self._safe_float(data.get('commute_minutes', 0.0)) >= 45: + actions.append('把通勤时长压缩到 30 分钟左右') + if self._as_flag(data.get('is_night_shift')): + actions.append('减少连续夜班或延长轮休恢复时间') + if not actions: + return '' + if len(actions) == 1: + return f'情境判断:若能{actions[0]},当前风险通常会有所回落。' + return f"情境判断:若能{',并'.join(actions[:-1])},同时{actions[-1]},当前风险通常会有所回落。" + + def _infer_pathway(self, jdr_snapshot, dimension_scores): + demands_pressure = dimension_scores.get('工作要求', 0.0) + mediator_pressure = dimension_scores.get('中介变量', 0.0) + resource_pressure = dimension_scores.get('工作资源', 0.0) + dimension_scores.get('个人资源', 0.0) + event_pressure = dimension_scores.get('事件上下文', 0.0) + + demands_high = jdr_snapshot['job_demands']['tone'] == 'danger' + burnout_high = jdr_snapshot['burnout_risk']['tone'] in {'warning', 'danger'} + resources_low = ( + jdr_snapshot['job_resources']['tone'] == 'danger' + or jdr_snapshot['personal_resources']['tone'] == 'danger' + or jdr_snapshot['engagement']['tone'] == 'danger' + ) + + if demands_high or burnout_high or demands_pressure > 0.03 or mediator_pressure > 0.03: + if resources_low or resource_pressure > 0.03: + return ( + '健康损耗与资源缓冲不足', + 'danger', + '当前结果同时表现出高要求累积与资源缓冲不足,更接近“工作要求上升 → 倦怠累积 → 缺勤增加”的复合路径。', + ) + return ( + '健康损耗路径为主', + 'warning', + '当前结果更接近“工作要求上升 → 倦怠累积 → 缺勤增加”的健康损耗路径。', + ) + if resources_low or resource_pressure > 0.03: + return ( + '激励支撑不足路径', + 'warning', + '当前资源与个人恢复能力偏弱,工作投入对缺勤风险的缓冲作用有限。', + ) + if event_pressure > 0.04: + return ( + '事件触发型波动', + 'info', + '当前结果更容易受到请假类型、医院证明和节假日前后等事件情境直接触发。', + ) + return ( + '混合影响路径', + 'info', + '当前结果同时受到工作要求、资源结构与事件情境的共同影响,尚不属于单一路径主导。', + ) + + def _build_intervention_suggestions(self, data, jdr_snapshot, shap_local): + suggestions = [] + + demand_items = [] + overtime_hours = self._safe_float(data.get('monthly_overtime_hours', 0.0)) + commute_minutes = self._safe_float(data.get('commute_minutes', 0.0)) + if overtime_hours >= 25 or jdr_snapshot['job_demands']['tone'] == 'danger': + demand_items.append('优先压降连续高负荷排班,尽量把月均加班控制在 20 小时以内。') + if commute_minutes >= 45: + demand_items.append('若条件允许,可通过弹性到岗、调班或就近安排缓和通勤压力。') + if self._as_flag(data.get('is_night_shift')): + demand_items.append('夜班岗位建议增加轮休和班后恢复时段,避免疲劳持续累积。') + if self._as_flag(data.get('near_holiday_flag')): + demand_items.append('节假日前后可提前做好替班和排班缓冲,减少事件性缺勤波动。') + if not demand_items: + demand_items.append('当前工作要求未明显失衡,重点保持排班稳定并持续监控波动。') + suggestions.append({'category': '减要求', 'items': self._limit_unique_items(demand_items)}) + + resource_items = [] + if jdr_snapshot['job_resources']['tone'] in {'warning', 'danger'}: + resource_items.append('增加主管沟通、临时替班支持和班组协同,补足组织支持资源。') + if jdr_snapshot['balance']['tone'] in {'warning', 'danger'}: + resource_items.append('对高风险岗位提供更清晰的任务边界和优先级,降低角色冲突。') + if str(data.get('leave_reason_category', '')) == '子女照护': + resource_items.append('可结合弹性工时或家庭照护支持,缓解家庭事务对缺勤的放大作用。') + if not resource_items: + resource_items.append('当前资源面整体可用,建议继续维持支持性排班和沟通反馈机制。') + suggestions.append({'category': '增资源', 'items': self._limit_unique_items(resource_items)}) + + personal_items = [] + if self._as_flag(data.get('chronic_disease_flag')) or self._as_flag(data.get('medical_certificate_flag')): + personal_items.append('结合健康监测、复诊安排和短期工作调整,降低身体不适带来的持续缺勤风险。') + if jdr_snapshot['burnout_risk']['tone'] in {'warning', 'danger'}: + personal_items.append('建议通过休息恢复、情绪支持和短周期工作调整,缓冲倦怠累积。') + if jdr_snapshot['personal_resources']['tone'] == 'danger': + personal_items.append('可通过辅导、复盘和岗位支持增强员工自我效能与心理韧性。') + if not personal_items: + personal_items.append('当前个体恢复能力整体可控,重点维持规律作息和健康管理即可。') + suggestions.append({'category': '补个人资源', 'items': self._limit_unique_items(personal_items)}) + + return suggestions + + def _extract_dimension_scores(self, shap_local): + if not shap_local: + return {} + dimension_contribution = shap_local.get('dimension_contribution', {}) + return { + key: self._safe_float(value) + for key, value in dimension_contribution.items() + if isinstance(value, (int, float)) + } + + def _extract_feature_effects(self, shap_local, positive=True, limit=3): + if not shap_local: + return [] + features = shap_local.get('features', []) + filtered = [] + for item in features: + shap_value = self._safe_float(item.get('shap_value', 0.0)) + if positive and shap_value <= 0: + continue + if not positive and shap_value >= 0: + continue + filtered.append({ + 'name': item.get('name'), + 'name_cn': item.get('name_cn') or item.get('name') or '未命名特征', + 'dimension': self._dimension_label(item.get('dimension')), + 'shap_value': round(shap_value, 4), + }) + filtered.sort(key=lambda entry: entry['shap_value'], reverse=positive) + if not positive: + filtered.sort(key=lambda entry: abs(entry['shap_value']), reverse=True) + return filtered[:limit] + + def _dimension_label(self, key): + if key in config.JDR_DIMENSIONS: + return config.JDR_DIMENSIONS[key]['name_cn'] + if key == 'event_context': + return '事件上下文' + if key == 'other': + return '其他因素' + return key or '其他因素' + + def _limit_unique_items(self, items, limit=3): + unique_items = [] + for item in items: + if item not in unique_items: + unique_items.append(item) + return unique_items[:limit] + + def _classify_job_demands(self, score): + score = self._safe_float(score) + if score >= 5.2: + return '偏高', 'danger' + if score >= 4.0: + return '中等', 'warning' + return '适中', 'success' + + def _classify_resource_stock(self, score): + score = self._safe_float(score) + if score >= 3.8: + return '充足', 'success' + if score >= 3.0: + return '中等', 'warning' + return '偏低', 'danger' + + def _classify_balance(self, score): + score = self._safe_float(score) + if score >= 0.8: + return '资源占优', 'success' + if score >= 0.0: + return '基本平衡', 'info' + if score >= -0.8: + return '轻度失衡', 'warning' + return '明显失衡', 'danger' + + def _classify_burnout(self, score): + score = self._safe_float(score) + if score >= 2.8: + return '偏高', 'danger' + if score >= 2.0: + return '中等', 'warning' + return '可控', 'success' + def _inverse_transform_prediction(self, prediction): if self.training_metadata.get('target_transform') == 'log1p': return float(np.expm1(prediction)) @@ -202,13 +552,13 @@ class PredictService: def _get_default_prediction(self, data): base_hours = 3.8 - base_hours += min(float(data.get('monthly_overtime_hours', 24)) / 20, 3.0) - base_hours += min(float(data.get('commute_minutes', 40)) / 50, 2.0) - base_hours += 1.6 if int(data.get('is_night_shift', 0)) == 1 else 0 - base_hours += 1.8 if int(data.get('chronic_disease_flag', 0)) == 1 else 0 - base_hours += 0.9 if int(data.get('near_holiday_flag', 0)) == 1 else 0 - base_hours += 0.8 if int(data.get('medical_certificate_flag', 0)) == 1 else 0 - base_hours += 0.5 * int(data.get('children_count', 0)) + base_hours += min(self._safe_float(data.get('monthly_overtime_hours', 24)) / 20, 3.0) + base_hours += min(self._safe_float(data.get('commute_minutes', 40)) / 50, 2.0) + base_hours += 1.6 if self._as_flag(data.get('is_night_shift')) else 0 + base_hours += 1.8 if self._as_flag(data.get('chronic_disease_flag')) else 0 + base_hours += 0.9 if self._as_flag(data.get('near_holiday_flag')) else 0 + base_hours += 0.8 if self._as_flag(data.get('medical_certificate_flag')) else 0 + base_hours += 0.5 * int(self._safe_float(data.get('children_count', 0))) if data.get('leave_type') in ['病假', '工伤假', '婚假', '丧假']: base_hours += 2.5 if data.get('stress_level') == '高': @@ -227,7 +577,6 @@ class PredictService: } def _get_risk_probability(self, features, model_type): - """获取分类器预测的风险概率""" classifier = self.classifiers.get(model_type) if classifier is None: classifier = self.classifiers.get('random_forest') @@ -246,7 +595,6 @@ class PredictService: return {'low': 0.0, 'medium': 1.0, 'high': 0.0} def predict_risk_classification(self, data, model_type=None): - """使用分类模型直接预测风险等级""" self._ensure_models_loaded() model_type = model_type or self.default_model classifier = self.classifiers.get(model_type) @@ -293,5 +641,17 @@ class PredictService: }, } + def _safe_float(self, value, default=0.0): + try: + return float(value) + except (TypeError, ValueError): + return default + + def _as_flag(self, value): + try: + return int(value) == 1 + except (TypeError, ValueError): + return False + predict_service = PredictService() diff --git a/backend/services/shap_service.py b/backend/services/shap_service.py index f902d94..75fa421 100644 --- a/backend/services/shap_service.py +++ b/backend/services/shap_service.py @@ -28,13 +28,58 @@ class SHAPService: except Exception: return None - def get_global_importance(self, model_type='random_forest'): + def _save_cache(self, model_type, payload): + os.makedirs(config.SHAP_CACHE_DIR, exist_ok=True) + cache_path = self._get_cache_path(model_type) + with open(cache_path, 'w', encoding='utf-8') as fp: + json.dump(payload, fp, ensure_ascii=False) + + def _build_cache_payload(self, model_type): + self._ensure_analyzer() + global_data = self._analyzer.global_shap_values(model_type) + if global_data.get('error'): + return {'error': global_data['error']} + + top_features = [item['name'] for item in global_data.get('top_features', [])[:15]] + dependence = {} + for feature_name in top_features: + data = self._analyzer.shap_dependence(feature_name, model_type) + if not data.get('error'): + dependence[feature_name] = data + + interaction = self._analyzer.shap_interaction(model_type, top_n=10) + if interaction.get('error'): + return {'error': interaction['error']} + + return { + 'model_type': model_type, + 'global': global_data, + 'dependence': dependence, + 'interaction': interaction, + } + + def _ensure_cache(self, model_type): cache = self._load_cache(model_type) - if not cache: + if cache: + return cache + + payload = self._build_cache_payload(model_type) + if payload.get('error'): return { - 'error': f'SHAP cache not found for {model_type}. ' - f'Run backend/core/generate_shap_cache.py first.' + 'error': f'{model_type} 的贡献解释数据暂时不可用:{payload["error"]}' } + + try: + self._save_cache(model_type, payload) + except Exception: + # 缓存写入失败时至少保证当前请求可继续返回结果。 + pass + return payload + + def get_global_importance(self, model_type='random_forest'): + cache = self._ensure_cache(model_type) + if cache.get('error'): + return cache return cache.get('global', {'error': f'Invalid SHAP cache for {model_type}'}) def get_local_explanation(self, data, model_type='random_forest'): @@ -42,12 +87,9 @@ class SHAPService: return self._analyzer.local_shap_values(data, model_type) def get_interactions(self, model_type='random_forest', top_n=10): - cache = self._load_cache(model_type) - if not cache: - return { - 'error': f'SHAP cache not found for {model_type}. ' - f'Run backend/core/generate_shap_cache.py first.' - } + cache = self._ensure_cache(model_type) + if cache.get('error'): + return cache data = cache.get('interaction') if not data: return {'error': f'Interaction cache missing for {model_type}'} @@ -58,17 +100,26 @@ class SHAPService: return data def get_dependence(self, feature_name, model_type='random_forest'): - cache = self._load_cache(model_type) - if not cache: - return { - 'error': f'SHAP cache not found for {model_type}. ' - f'Run backend/core/generate_shap_cache.py first.' - } + cache = self._ensure_cache(model_type) + if cache.get('error'): + return cache dependence_map = cache.get('dependence', {}) data = dependence_map.get(feature_name) if data: return data - return {'error': f'Dependence cache missing for feature {feature_name}'} + + self._ensure_analyzer() + data = self._analyzer.shap_dependence(feature_name, model_type) + if data.get('error'): + return {'error': f'特征 {feature_name} 的依赖解释不可用:{data["error"]}'} + + dependence_map[feature_name] = data + cache['dependence'] = dependence_map + try: + self._save_cache(model_type, cache) + except Exception: + pass + return data shap_service = SHAPService() diff --git a/backend/tests/test_clustering_naming.py b/backend/tests/test_clustering_naming.py new file mode 100644 index 0000000..bb48fa7 --- /dev/null +++ b/backend/tests/test_clustering_naming.py @@ -0,0 +1,120 @@ +import importlib.util +import sys +import types +import unittest +from pathlib import Path + +import numpy as np + + +def load_clustering_module(): + module_path = Path(r'D:\forsetsystem\backend\core\clustering.py') + + fake_config = types.SimpleNamespace( + RANDOM_STATE=42, + TARGET_COLUMN='缺勤时长(小时)', + EMPLOYEE_ID_COLUMN='员工工号', + FEATURE_NAME_CN={ + '月均加班时长': '月均加班时长', + '缺勤时长(小时)': '缺勤时长(小时)', + }, + ) + fake_preprocessing = types.ModuleType('core.preprocessing') + fake_preprocessing.get_clean_data = lambda: None + fake_sklearn = types.ModuleType('sklearn') + fake_sklearn_cluster = types.ModuleType('sklearn.cluster') + fake_sklearn_preprocessing = types.ModuleType('sklearn.preprocessing') + + class DummyKMeans: + def __init__(self, *args, **kwargs): + self.cluster_centers_ = None + + def fit_predict(self, data): + self.cluster_centers_ = np.asarray(data, dtype=float) + return np.zeros(len(data), dtype=int) + + class DummyMinMaxScaler: + def fit_transform(self, data): + return np.asarray(data, dtype=float) + + def inverse_transform(self, data): + return np.asarray(data, dtype=float) + + fake_sklearn_cluster.KMeans = DummyKMeans + fake_sklearn_preprocessing.MinMaxScaler = DummyMinMaxScaler + + sys.modules['config'] = fake_config + sys.modules['core.preprocessing'] = fake_preprocessing + sys.modules['sklearn'] = fake_sklearn + sys.modules['sklearn.cluster'] = fake_sklearn_cluster + sys.modules['sklearn.preprocessing'] = fake_sklearn_preprocessing + + spec = importlib.util.spec_from_file_location('test_clustering_module', module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +class ClusterNamingTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + module = load_clustering_module() + cls.analyzer = module.KMeansAnalyzer() + + def test_generate_cluster_names_avoids_generic_group_names(self): + centers = np.array([ + [41, 11, 18, 28, 22.5, 4.2], + [30, 3, 22, 33, 23.0, 5.8], + [36, 7, 36, 52, 24.0, 8.6], + [38, 6, 24, 31, 27.2, 8.1], + ], dtype=float) + + names = self.analyzer._generate_cluster_names(centers) + + self.assertEqual(len(names), 4) + for name in names.values(): + self.assertNotIn('群体', name) + + def test_generate_cluster_names_returns_business_labels(self): + centers = np.array([ + [42, 10, 16, 26, 22.0, 4.1], + [29, 2, 20, 30, 22.8, 5.6], + [35, 6, 34, 50, 24.1, 8.8], + [37, 7, 23, 29, 27.5, 8.0], + ], dtype=float) + + names = self.analyzer._generate_cluster_names(centers) + + self.assertIn('稳定成熟型', names.values()) + self.assertIn('新锐成长型', names.values()) + self.assertIn('压力奔波型', names.values()) + self.assertIn('健康关注型', names.values()) + + def test_duplicate_names_receive_natural_suffixes(self): + centers = np.array([ + [44, 12, 18, 29, 22.2, 4.0], + [39, 9, 20, 34, 23.1, 5.3], + [32, 4, 31, 46, 24.8, 7.2], + ], dtype=float) + + names = self.analyzer._deduplicate_cluster_names( + {0: '稳定成熟型', 1: '稳定成熟型', 2: '负荷承压型'}, + centers, + ) + + self.assertEqual({names[0], names[1]}, {'稳定成熟型-资深组', '稳定成熟型-成熟组'}) + self.assertEqual(names[2], '负荷承压型') + + def test_description_reflects_center_traits(self): + description = self.analyzer._generate_description( + '压力奔波型', + np.array([34, 5, 36, 52, 24.0, 8.3], dtype=float), + ) + + self.assertIn('加班负荷偏高', description) + self.assertIn('通勤压力偏高', description) + self.assertIn('缺勤时长偏高', description) + + +if __name__ == '__main__': + unittest.main() diff --git a/backend/tests/test_predict_explanation.py b/backend/tests/test_predict_explanation.py new file mode 100644 index 0000000..b3857c9 --- /dev/null +++ b/backend/tests/test_predict_explanation.py @@ -0,0 +1,155 @@ +import importlib.util +import sys +import types +import unittest +from pathlib import Path + + +def load_predict_module(): + module_path = Path(r'D:\forsetsystem\backend\services\predict_service.py') + + fake_config = types.SimpleNamespace( + MODELS_DIR='', + SCALER_PATH='', + JDR_DIMENSIONS={ + 'job_demands': {'name_cn': '工作要求'}, + 'job_resources': {'name_cn': '工作资源'}, + 'personal_resources': {'name_cn': '个人资源'}, + 'mediators': {'name_cn': '中介变量'}, + }, + ) + fake_deep_learning = types.ModuleType('core.deep_learning_model') + fake_deep_learning.load_lstm_mlp_bundle = lambda path: None + fake_deep_learning.predict_lstm_mlp = lambda model, data: 0.0 + + fake_model_features = types.ModuleType('core.model_features') + fake_model_features.align_feature_frame = lambda frame, names: frame + fake_model_features.apply_label_encoders = lambda frame, encoders: frame + fake_model_features.build_prediction_dataframe = lambda data: data + fake_model_features.engineer_features = lambda frame: frame + fake_model_features.to_float_array = lambda frame: frame + + sys.modules['config'] = fake_config + sys.modules['core.deep_learning_model'] = fake_deep_learning + sys.modules['core.model_features'] = fake_model_features + + spec = importlib.util.spec_from_file_location('test_predict_service_module', module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + +class PredictExplanationTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + module = load_predict_module() + cls.service = module.PredictService() + + def test_build_jdr_snapshot_marks_high_demands_and_low_resources(self): + snapshot = self.service._build_jdr_snapshot({ + '工作要求指数': 5.8, + '工作资源指数': 2.7, + '个人资源指数': 2.8, + 'JD-R平衡度': -1.1, + '倦怠风险指数': 3.1, + '工作投入指数': 2.9, + }) + + self.assertEqual(snapshot['job_demands']['status'], '偏高') + self.assertEqual(snapshot['job_resources']['status'], '偏低') + self.assertEqual(snapshot['balance']['status'], '明显失衡') + self.assertEqual(snapshot['burnout_risk']['status'], '偏高') + + def test_mechanism_summary_prefers_health_impairment_path(self): + snapshot = self.service._build_jdr_snapshot({ + '工作要求指数': 5.6, + '工作资源指数': 2.9, + '个人资源指数': 2.8, + 'JD-R平衡度': -0.9, + '倦怠风险指数': 3.0, + '工作投入指数': 2.9, + }) + shap_local = { + 'dimension_contribution': { + '工作要求': 0.32, + '中介变量': 0.18, + '事件上下文': 0.11, + '工作资源': -0.07, + }, + 'features': [ + {'name': 'monthly_overtime_hours', 'name_cn': '月均加班时长', 'dimension': 'job_demands', 'shap_value': 0.18}, + {'name': 'commute_minutes', 'name_cn': '通勤时长', 'dimension': 'job_demands', 'shap_value': 0.12}, + {'name': 'medical_certificate_flag', 'name_cn': '医院证明', 'dimension': 'event_context', 'shap_value': 0.08}, + {'name': 'coworker_support', 'name_cn': '同事支持', 'dimension': 'job_resources', 'shap_value': -0.05}, + ], + } + result = {'predicted_hours': 9.4, 'risk_label': '高风险'} + data = { + 'monthly_overtime_hours': 38, + 'commute_minutes': 62, + 'is_night_shift': 1, + 'medical_certificate_flag': 1, + } + + summary = self.service._build_mechanism_summary(result, data, snapshot, shap_local) + + self.assertIn('健康损耗', summary['pathway_label']) + self.assertIn('月均加班时长', summary['mechanism']) + self.assertTrue(summary['scenario_hint']) + + def test_intervention_suggestions_cover_resource_and_personal_support(self): + snapshot = self.service._build_jdr_snapshot({ + '工作要求指数': 4.4, + '工作资源指数': 2.7, + '个人资源指数': 2.6, + 'JD-R平衡度': -0.7, + '倦怠风险指数': 2.9, + '工作投入指数': 2.8, + }) + suggestions = self.service._build_intervention_suggestions( + { + 'monthly_overtime_hours': 18, + 'commute_minutes': 28, + 'chronic_disease_flag': 1, + 'medical_certificate_flag': 1, + 'leave_reason_category': '子女照护', + }, + snapshot, + shap_local=None, + ) + + category_map = {item['category']: item['items'] for item in suggestions} + self.assertIn('增资源', category_map) + self.assertIn('补个人资源', category_map) + self.assertTrue(any('支持' in item or '弹性' in item for item in category_map['增资源'])) + self.assertTrue(any('健康' in item or '倦怠' in item for item in category_map['补个人资源'])) + + def test_buffer_text_mentions_protective_factors(self): + snapshot = self.service._build_jdr_snapshot({ + '工作要求指数': 3.9, + '工作资源指数': 4.2, + '个人资源指数': 4.0, + 'JD-R平衡度': 0.9, + '倦怠风险指数': 1.8, + '工作投入指数': 4.1, + }) + shap_local = { + 'dimension_contribution': { + '工作要求': 0.08, + '工作资源': -0.12, + '个人资源': -0.09, + }, + 'features': [ + {'name': 'supervisor_support', 'name_cn': '上级支持', 'dimension': 'job_resources', 'shap_value': -0.07}, + {'name': 'self_efficacy', 'name_cn': '自我效能感', 'dimension': 'personal_resources', 'shap_value': -0.05}, + ], + } + + summary = self.service._build_mechanism_summary({'predicted_hours': 5.3, 'risk_label': '中风险'}, {}, snapshot, shap_local) + + self.assertIn('缓冲作用', summary['buffer_text']) + self.assertTrue(summary['protective_factors']) + + +if __name__ == '__main__': + unittest.main() diff --git a/frontend/src/App.vue b/frontend/src/App.vue index f8ccbe6..7dd0e4a 100644 --- a/frontend/src/App.vue +++ b/frontend/src/App.vue @@ -2,10 +2,9 @@
- 将员工划分为不同缺勤画像群体,通过雷达图和散点图形成直观的人群对比展示。 + 基于缺勤行为、工作压力和基础属性划分典型员工群体。
以年龄、司龄、加班、通勤、BMI 和缺勤水平构建群体轮廓。
+年龄、司龄、加班、通勤、BMI 与缺勤水平
便于答辩时逐个介绍群体特征。
+群体规模与主要特征
展示各聚类在加班强度与缺勤水平上的位置差异。
+加班强度与缺勤水平分布