import os import joblib import numpy as np import config from core.deep_learning_model import load_lstm_mlp_bundle, predict_lstm_mlp from core.model_features import ( align_feature_frame, apply_label_encoders, build_prediction_dataframe, engineer_features, to_float_array, ) MODEL_INFO = { 'random_forest': {'name': 'random_forest', 'name_cn': '随机森林', 'description': '稳健的树模型集成'}, 'xgboost': {'name': 'xgboost', 'name_cn': '增强树模型一', 'description': '梯度提升树模型'}, 'lightgbm': {'name': 'lightgbm', 'name_cn': '增强树模型二', 'description': '轻量级梯度提升树'}, 'gradient_boosting': {'name': 'gradient_boosting', 'name_cn': '梯度提升树', 'description': '梯度提升决策树'}, 'extra_trees': {'name': 'extra_trees', 'name_cn': '极端随机树', 'description': '高随机性的树模型'}, 'stacking': {'name': 'stacking', 'name_cn': '集成模型', 'description': '多模型融合'}, 'lstm_mlp': { 'name': 'lstm_mlp', 'name_cn': '时序注意力融合网络', 'description': 'Transformer 时序编码与静态特征融合的深度学习模型', }, } EXPLAINABLE_TREE_MODELS = ( 'random_forest', 'xgboost', 'lightgbm', 'gradient_boosting', 'extra_trees', ) class PredictService: def __init__(self): self.models = {} self.classifiers = {} self.classification_metrics = {} self.scaler = None self.feature_names = None self.selected_features = None self.label_encoders = {} self.model_metrics = {} self.training_metadata = {} self.default_model = 'random_forest' def _ensure_models_loaded(self): if not self.models: self.load_models() def load_models(self): metadata_path = os.path.join(config.MODELS_DIR, 'training_metadata.pkl') if os.path.exists(metadata_path): self.training_metadata = joblib.load(metadata_path) model_files = { 'random_forest': 'random_forest_model.pkl', 'xgboost': 'xgboost_model.pkl', 'lightgbm': 'lightgbm_model.pkl', 'gradient_boosting': 'gradient_boosting_model.pkl', 'extra_trees': 'extra_trees_model.pkl', 'stacking': 'stacking_model.pkl', 'lstm_mlp': 'lstm_mlp_model.pt', } allowed_models = self.training_metadata.get('available_models') if allowed_models: model_files = {k: v for k, v in model_files.items() if k in allowed_models} for name, filename in model_files.items(): path = os.path.join(config.MODELS_DIR, filename) if os.path.exists(path): try: if name == 'lstm_mlp': bundle = load_lstm_mlp_bundle(path) if bundle is not None: self.models[name] = bundle else: self.models[name] = joblib.load(path) except Exception as exc: print(f'Failed to load model {name}: {exc}') if os.path.exists(config.SCALER_PATH): self.scaler = joblib.load(config.SCALER_PATH) for filename, attr in [ ('feature_names.pkl', 'feature_names'), ('selected_features.pkl', 'selected_features'), ('label_encoders.pkl', 'label_encoders'), ('model_metrics.pkl', 'model_metrics'), ]: path = os.path.join(config.MODELS_DIR, filename) if os.path.exists(path): try: setattr(self, attr, joblib.load(path)) except Exception as exc: print(f'Failed to load artifact {filename}: {exc}') valid_metrics = {key: value for key, value in self.model_metrics.items() if key in self.models} if valid_metrics: self.default_model = max(valid_metrics.items(), key=lambda item: item[1]['r2'])[0] for name in ['random_forest', 'gradient_boosting', 'lightgbm', 'xgboost']: path = os.path.join(config.MODELS_DIR, f'risk_{name}_classifier.pkl') if os.path.exists(path): try: self.classifiers[name] = joblib.load(path) except Exception: pass cls_metrics_path = os.path.join(config.MODELS_DIR, 'classification_metrics.pkl') if os.path.exists(cls_metrics_path): try: self.classification_metrics = joblib.load(cls_metrics_path) except Exception: pass def get_available_models(self): self._ensure_models_loaded() models = [] for name in self.models.keys(): info = MODEL_INFO.get(name, {'name': name, 'name_cn': name, 'description': ''}).copy() info['is_available'] = True info['is_default'] = name == self.default_model info['metrics'] = self.model_metrics.get(name, {'r2': 0, 'rmse': 0, 'mae': 0}) models.append(info) models.sort(key=lambda item: item['metrics']['r2'], reverse=True) return models def predict_single(self, data, model_type=None, include_explanation=True): self._ensure_models_loaded() model_type = self._resolve_prediction_model(model_type or self.default_model) _, engineered_df = self._build_prediction_frames(data) engineered_row = engineered_df.iloc[0] if model_type is None or self.scaler is None or self.feature_names is None: result = self._get_default_prediction(data) return self._augment_prediction_result(result, data, engineered_row) if include_explanation else result try: features = self._prepare_features_from_engineered(engineered_df) except Exception: result = self._get_default_prediction(data) return self._augment_prediction_result(result, data, engineered_row) if include_explanation else result try: if model_type == 'lstm_mlp': current_df = build_prediction_dataframe(data) predicted_hours = predict_lstm_mlp(self.models[model_type], current_df) else: predicted_hours = self.models[model_type].predict([features])[0] predicted_hours = self._inverse_transform_prediction(predicted_hours) predicted_hours = max(0.5, float(predicted_hours)) except Exception: result = self._get_default_prediction(data) return self._augment_prediction_result(result, data, engineered_row) if include_explanation else result risk_level, risk_label = self._get_risk_level(predicted_hours) confidence = max(0.5, self.model_metrics.get(model_type, {}).get('r2', 0.82)) risk_probability = self._get_risk_probability(features, model_type) result = { 'predicted_hours': round(predicted_hours, 2), 'risk_level': risk_level, 'risk_label': risk_label, 'risk_probability': risk_probability, 'confidence': round(confidence, 2), 'model_used': model_type, 'model_name_cn': MODEL_INFO.get(model_type, {}).get('name_cn', model_type), } return self._augment_prediction_result(result, data, engineered_row) if include_explanation else result def predict_compare(self, data): self._ensure_models_loaded() results = [] for name in self.models.keys(): result = self.predict_single(data, name, include_explanation=False) result['model'] = name result['model_name_cn'] = MODEL_INFO.get(name, {}).get('name_cn', name) result['r2'] = self.model_metrics.get(name, {}).get('r2', 0) results.append(result) results.sort(key=lambda item: item.get('r2', 0), reverse=True) if results: results[0]['recommended'] = True return results def _build_prediction_frames(self, data): current_df = build_prediction_dataframe(data) engineered_df = engineer_features(current_df.copy()) return current_df, engineered_df def _prepare_features(self, data): _, engineered_df = self._build_prediction_frames(data) return self._prepare_features_from_engineered(engineered_df) def _prepare_features_from_engineered(self, engineered_df): X_df = apply_label_encoders(engineered_df.copy(), self.label_encoders) X_df = align_feature_frame(X_df, self.feature_names) features = self.scaler.transform(to_float_array(X_df))[0] if self.selected_features: selected_indices = [self.feature_names.index(name) for name in self.selected_features if name in self.feature_names] if selected_indices: features = features[selected_indices] return features def _resolve_prediction_model(self, requested_model): if requested_model in self.models: return requested_model if self.default_model in self.models: return self.default_model return next(iter(self.models), None) def _resolve_explanation_model(self, prediction_model): if prediction_model in EXPLAINABLE_TREE_MODELS and prediction_model in self.models: return prediction_model for candidate in ('random_forest', 'xgboost', 'lightgbm', 'gradient_boosting', 'extra_trees'): if candidate in self.models: return candidate return None def _augment_prediction_result(self, result, data, engineered_row): explanation_model = self._resolve_explanation_model(result.get('model_used')) shap_local = self._get_local_explanation(data, explanation_model) jdr_snapshot = self._build_jdr_snapshot(engineered_row) mechanism_summary = self._build_mechanism_summary(result, data, jdr_snapshot, shap_local) intervention_suggestions = self._build_intervention_suggestions(data, jdr_snapshot, shap_local) payload = dict(result) payload.update({ 'jdr_snapshot': jdr_snapshot, 'mechanism_summary': mechanism_summary, 'intervention_suggestions': intervention_suggestions, 'explanation_model_used': explanation_model, 'explanation_model_name_cn': MODEL_INFO.get(explanation_model, {}).get('name_cn', '机制解释'), 'shap_local': shap_local, }) return payload def _get_local_explanation(self, data, model_type): if not model_type: return None try: from services.shap_service import shap_service explanation = shap_service.get_local_explanation(data, model_type) if explanation and not explanation.get('error'): return explanation except Exception: pass return None def _build_jdr_snapshot(self, engineered_row): snapshot = { 'job_demands': self._build_snapshot_item( 'job_demands', '工作要求', engineered_row.get('工作要求指数', 0.0), *self._classify_job_demands(engineered_row.get('工作要求指数', 0.0)), ), 'job_resources': self._build_snapshot_item( 'job_resources', '工作资源', engineered_row.get('工作资源指数', 0.0), *self._classify_resource_stock(engineered_row.get('工作资源指数', 0.0)), ), 'personal_resources': self._build_snapshot_item( 'personal_resources', '个人资源', engineered_row.get('个人资源指数', 0.0), *self._classify_resource_stock(engineered_row.get('个人资源指数', 0.0)), ), 'balance': self._build_snapshot_item( 'balance', '平衡度', engineered_row.get('JD-R平衡度', 0.0), *self._classify_balance(engineered_row.get('JD-R平衡度', 0.0)), ), 'burnout_risk': self._build_snapshot_item( 'burnout_risk', '倦怠风险', engineered_row.get('倦怠风险指数', 0.0), *self._classify_burnout(engineered_row.get('倦怠风险指数', 0.0)), ), 'engagement': self._build_snapshot_item( 'engagement', '工作投入', engineered_row.get('工作投入指数', 0.0), *self._classify_resource_stock(engineered_row.get('工作投入指数', 0.0)), ), } return snapshot def _build_snapshot_item(self, key, label, score, status, tone): return { 'key': key, 'label': label, 'score': round(self._safe_float(score), 2), 'status': status, 'tone': tone, } def _build_mechanism_summary(self, result, data, jdr_snapshot, shap_local): dimension_scores = self._extract_dimension_scores(shap_local) top_drivers = self._extract_feature_effects(shap_local, positive=True, limit=3) protective_factors = self._extract_feature_effects(shap_local, positive=False, limit=2) pathway_label, pathway_tone, pathway_detail = self._infer_pathway(jdr_snapshot, dimension_scores) mechanism = self._build_mechanism_text(data, jdr_snapshot, dimension_scores, top_drivers) buffer_text = self._build_buffer_text(jdr_snapshot, protective_factors) scenario_hint = self._build_scenario_hint(data) return { 'conclusion': f"本次预测为{result['risk_label']},预计缺勤时长约 {result['predicted_hours']} 小时。", 'mechanism': mechanism, 'pathway_label': pathway_label, 'pathway_tone': pathway_tone, 'pathway_detail': pathway_detail, 'buffer_text': buffer_text, 'scenario_hint': scenario_hint, 'top_drivers': top_drivers, 'protective_factors': protective_factors, } def _build_mechanism_text(self, data, jdr_snapshot, dimension_scores, top_drivers): if top_drivers: driver_names = '、'.join(item['name_cn'] for item in top_drivers) if dimension_scores.get('工作要求', 0.0) > 0.03: return f'主要推高因素集中在{driver_names},说明高工作要求正在直接抬升本次缺勤风险。' if dimension_scores.get('事件上下文', 0.0) > 0.03: return f'主要推高因素集中在{driver_names},当前结果更容易受到请假事件情境的直接触发。' if dimension_scores.get('工作资源', 0.0) > 0.03 or dimension_scores.get('个人资源', 0.0) > 0.03: return f'主要推高因素集中在{driver_names},说明资源缓冲不足正在放大本次缺勤时长。' return f'主要推高因素集中在{driver_names},它们共同推动了本次缺勤时长上升。' fragments = [] if jdr_snapshot['job_demands']['tone'] in {'warning', 'danger'}: fragments.append('工作要求偏高') if jdr_snapshot['job_resources']['tone'] == 'danger': fragments.append('工作资源不足') if jdr_snapshot['personal_resources']['tone'] == 'danger': fragments.append('个人资源偏弱') if self._as_flag(data.get('medical_certificate_flag')) or self._as_flag(data.get('near_holiday_flag')): fragments.append('事件情境触发明显') if not fragments: return '当前结果更多体现为常规缺勤波动,整体压力与资源结构暂时可控。' return f"当前结果主要由{'、'.join(fragments)}共同驱动。" def _build_buffer_text(self, jdr_snapshot, protective_factors): if protective_factors: names = '、'.join(item['name_cn'] for item in protective_factors) return f'{names}对当前风险仍有一定缓冲作用,但尚不足以完全抵消主要压力来源。' if jdr_snapshot['job_resources']['tone'] in {'success', 'info'} and jdr_snapshot['personal_resources']['tone'] in {'success', 'info'}: return '当前资源支持和个人恢复能力对风险有一定缓冲,但事件性因素仍需持续关注。' return '' def _build_scenario_hint(self, data): actions = [] if self._safe_float(data.get('monthly_overtime_hours', 0.0)) >= 25: actions.append('将月均加班控制在 20 小时以内') if self._safe_float(data.get('commute_minutes', 0.0)) >= 45: actions.append('把通勤时长压缩到 30 分钟左右') if self._as_flag(data.get('is_night_shift')): actions.append('减少连续夜班或延长轮休恢复时间') if not actions: return '' if len(actions) == 1: return f'情境判断:若能{actions[0]},当前风险通常会有所回落。' return f"情境判断:若能{',并'.join(actions[:-1])},同时{actions[-1]},当前风险通常会有所回落。" def _infer_pathway(self, jdr_snapshot, dimension_scores): demands_pressure = dimension_scores.get('工作要求', 0.0) mediator_pressure = dimension_scores.get('中介变量', 0.0) resource_pressure = dimension_scores.get('工作资源', 0.0) + dimension_scores.get('个人资源', 0.0) event_pressure = dimension_scores.get('事件上下文', 0.0) demands_high = jdr_snapshot['job_demands']['tone'] == 'danger' burnout_high = jdr_snapshot['burnout_risk']['tone'] in {'warning', 'danger'} resources_low = ( jdr_snapshot['job_resources']['tone'] == 'danger' or jdr_snapshot['personal_resources']['tone'] == 'danger' or jdr_snapshot['engagement']['tone'] == 'danger' ) if demands_high or burnout_high or demands_pressure > 0.03 or mediator_pressure > 0.03: if resources_low or resource_pressure > 0.03: return ( '健康损耗与资源缓冲不足', 'danger', '当前结果同时表现出高要求累积与资源缓冲不足,更接近“工作要求上升 → 倦怠累积 → 缺勤增加”的复合路径。', ) return ( '健康损耗路径为主', 'warning', '当前结果更接近“工作要求上升 → 倦怠累积 → 缺勤增加”的健康损耗路径。', ) if resources_low or resource_pressure > 0.03: return ( '激励支撑不足路径', 'warning', '当前资源与个人恢复能力偏弱,工作投入对缺勤风险的缓冲作用有限。', ) if event_pressure > 0.04: return ( '事件触发型波动', 'info', '当前结果更容易受到请假类型、医院证明和节假日前后等事件情境直接触发。', ) return ( '混合影响路径', 'info', '当前结果同时受到工作要求、资源结构与事件情境的共同影响,尚不属于单一路径主导。', ) def _build_intervention_suggestions(self, data, jdr_snapshot, shap_local): suggestions = [] demand_items = [] overtime_hours = self._safe_float(data.get('monthly_overtime_hours', 0.0)) commute_minutes = self._safe_float(data.get('commute_minutes', 0.0)) if overtime_hours >= 25 or jdr_snapshot['job_demands']['tone'] == 'danger': demand_items.append('优先压降连续高负荷排班,尽量把月均加班控制在 20 小时以内。') if commute_minutes >= 45: demand_items.append('若条件允许,可通过弹性到岗、调班或就近安排缓和通勤压力。') if self._as_flag(data.get('is_night_shift')): demand_items.append('夜班岗位建议增加轮休和班后恢复时段,避免疲劳持续累积。') if self._as_flag(data.get('near_holiday_flag')): demand_items.append('节假日前后可提前做好替班和排班缓冲,减少事件性缺勤波动。') if not demand_items: demand_items.append('当前工作要求未明显失衡,重点保持排班稳定并持续监控波动。') suggestions.append({'category': '减要求', 'items': self._limit_unique_items(demand_items)}) resource_items = [] if jdr_snapshot['job_resources']['tone'] in {'warning', 'danger'}: resource_items.append('增加主管沟通、临时替班支持和班组协同,补足组织支持资源。') if jdr_snapshot['balance']['tone'] in {'warning', 'danger'}: resource_items.append('对高风险岗位提供更清晰的任务边界和优先级,降低角色冲突。') if str(data.get('leave_reason_category', '')) == '子女照护': resource_items.append('可结合弹性工时或家庭照护支持,缓解家庭事务对缺勤的放大作用。') if not resource_items: resource_items.append('当前资源面整体可用,建议继续维持支持性排班和沟通反馈机制。') suggestions.append({'category': '增资源', 'items': self._limit_unique_items(resource_items)}) personal_items = [] if self._as_flag(data.get('chronic_disease_flag')) or self._as_flag(data.get('medical_certificate_flag')): personal_items.append('结合健康监测、复诊安排和短期工作调整,降低身体不适带来的持续缺勤风险。') if jdr_snapshot['burnout_risk']['tone'] in {'warning', 'danger'}: personal_items.append('建议通过休息恢复、情绪支持和短周期工作调整,缓冲倦怠累积。') if jdr_snapshot['personal_resources']['tone'] == 'danger': personal_items.append('可通过辅导、复盘和岗位支持增强员工自我效能与心理韧性。') if not personal_items: personal_items.append('当前个体恢复能力整体可控,重点维持规律作息和健康管理即可。') suggestions.append({'category': '补个人资源', 'items': self._limit_unique_items(personal_items)}) return suggestions def _extract_dimension_scores(self, shap_local): if not shap_local: return {} dimension_contribution = shap_local.get('dimension_contribution', {}) return { key: self._safe_float(value) for key, value in dimension_contribution.items() if isinstance(value, (int, float)) } def _extract_feature_effects(self, shap_local, positive=True, limit=3): if not shap_local: return [] features = shap_local.get('features', []) filtered = [] for item in features: shap_value = self._safe_float(item.get('shap_value', 0.0)) if positive and shap_value <= 0: continue if not positive and shap_value >= 0: continue filtered.append({ 'name': item.get('name'), 'name_cn': item.get('name_cn') or item.get('name') or '未命名特征', 'dimension': self._dimension_label(item.get('dimension')), 'shap_value': round(shap_value, 4), }) filtered.sort(key=lambda entry: entry['shap_value'], reverse=positive) if not positive: filtered.sort(key=lambda entry: abs(entry['shap_value']), reverse=True) return filtered[:limit] def _dimension_label(self, key): if key in config.JDR_DIMENSIONS: return config.JDR_DIMENSIONS[key]['name_cn'] if key == 'event_context': return '事件上下文' if key == 'other': return '其他因素' return key or '其他因素' def _limit_unique_items(self, items, limit=3): unique_items = [] for item in items: if item not in unique_items: unique_items.append(item) return unique_items[:limit] def _classify_job_demands(self, score): score = self._safe_float(score) if score >= 5.2: return '偏高', 'danger' if score >= 4.0: return '中等', 'warning' return '适中', 'success' def _classify_resource_stock(self, score): score = self._safe_float(score) if score >= 3.8: return '充足', 'success' if score >= 3.0: return '中等', 'warning' return '偏低', 'danger' def _classify_balance(self, score): score = self._safe_float(score) if score >= 0.8: return '资源占优', 'success' if score >= 0.0: return '基本平衡', 'info' if score >= -0.8: return '轻度失衡', 'warning' return '明显失衡', 'danger' def _classify_burnout(self, score): score = self._safe_float(score) if score >= 2.8: return '偏高', 'danger' if score >= 2.0: return '中等', 'warning' return '可控', 'success' def _inverse_transform_prediction(self, prediction): if self.training_metadata.get('target_transform') == 'log1p': return float(np.expm1(prediction)) return float(prediction) def _get_risk_level(self, hours): if hours < 4: return 'low', '低风险' if hours <= 8: return 'medium', '中风险' return 'high', '高风险' def _get_default_prediction(self, data): base_hours = 3.8 base_hours += min(self._safe_float(data.get('monthly_overtime_hours', 24)) / 20, 3.0) base_hours += min(self._safe_float(data.get('commute_minutes', 40)) / 50, 2.0) base_hours += 1.6 if self._as_flag(data.get('is_night_shift')) else 0 base_hours += 1.8 if self._as_flag(data.get('chronic_disease_flag')) else 0 base_hours += 0.9 if self._as_flag(data.get('near_holiday_flag')) else 0 base_hours += 0.8 if self._as_flag(data.get('medical_certificate_flag')) else 0 base_hours += 0.5 * int(self._safe_float(data.get('children_count', 0))) if data.get('leave_type') in ['病假', '工伤假', '婚假', '丧假']: base_hours += 2.5 if data.get('stress_level') == '高': base_hours += 0.9 if data.get('performance_level') == 'A': base_hours -= 0.5 risk_level, risk_label = self._get_risk_level(base_hours) return { 'predicted_hours': round(max(0.5, base_hours), 2), 'risk_level': risk_level, 'risk_label': risk_label, 'risk_probability': {'low': 0.0, 'medium': 1.0, 'high': 0.0}, 'confidence': 0.72, 'model_used': 'default', 'model_name_cn': '默认规则', } def _get_risk_probability(self, features, model_type): classifier = self.classifiers.get(model_type) if classifier is None: classifier = self.classifiers.get('random_forest') if classifier is None: return {'low': 0.0, 'medium': 1.0, 'high': 0.0} try: proba = classifier.predict_proba([features])[0] classes = list(classifier.classes_) result = {'low': 0.0, 'medium': 0.0, 'high': 0.0} label_map = {0: 'low', 1: 'medium', 2: 'high'} for idx, cls in enumerate(classes): if cls in label_map: result[label_map[cls]] = round(float(proba[idx]), 4) return result except Exception: return {'low': 0.0, 'medium': 1.0, 'high': 0.0} def predict_risk_classification(self, data, model_type=None): self._ensure_models_loaded() model_type = model_type or self.default_model classifier = self.classifiers.get(model_type) if classifier is None: classifier = self.classifiers.get('random_forest') if classifier is None or self.scaler is None: return None features = self._prepare_features(data) try: pred_class = int(classifier.predict([features])[0]) proba = classifier.predict_proba([features])[0] label_map = {0: 'low', 1: 'medium', 2: 'high'} risk_labels_map = {'low': '低风险', 'medium': '中风险', 'high': '高风险'} risk_level = label_map.get(pred_class, 'medium') classes = list(classifier.classes_) probabilities = {'low': 0.0, 'medium': 0.0, 'high': 0.0} for idx, cls in enumerate(classes): if cls in label_map: probabilities[label_map[cls]] = round(float(proba[idx]), 4) return { 'risk_level': risk_level, 'risk_label': risk_labels_map[risk_level], 'risk_probability': probabilities, 'model_used': model_type, 'classification_metrics': self.classification_metrics.get(model_type, {}), } except Exception: return None def get_model_info(self): self._ensure_models_loaded() return { 'models': self.get_available_models(), 'training_info': { 'train_samples': self.training_metadata.get('train_samples', 0), 'test_samples': self.training_metadata.get('test_samples', 0), 'feature_count': self.training_metadata.get('feature_count_after_selection', 0), 'training_date': self.training_metadata.get('training_date', ''), 'sequence_window_size': self.training_metadata.get('sequence_window_size', 0), 'deep_learning_available': self.training_metadata.get('deep_learning_available', False), }, } def _safe_float(self, value, default=0.0): try: return float(value) except (TypeError, ValueError): return default def _as_flag(self, value): try: return int(value) == 1 except (TypeError, ValueError): return False predict_service = PredictService()