feat: 将数据集从国外员工缺勤数据替换为中国企业缺勤模拟数据

- 新增中国企业员工缺勤模拟数据集生成脚本（generate_dataset.py），覆盖7个行业、180家企业、2600名员工
- 重构 config.py，更新特征字段为中文名称，调整目标列、员工ID、行业类型等配置
- 重构 clustering.py，简化聚类逻辑，更新聚类特征和群体命名（高压通勤型、健康波动型等）
- 重构 feature_mining.py，更新相关性分析和群体比较维度（按行业、班次、婚姻状态等）
- 新增 model_features.py 定义模型训练特征
- 更新 preprocessing.py 和 train_model.py 适配新数据结构
- 更新各 API 路由默认参数（model: random_forest, dimension: industry）
- 前端更新主题样式和各视图组件适配中文字段
- 更新系统名称为 China Enterprise Absence Analysis System
2026-03-11 10:46:58 +08:00
parent a39d8b2fd2
commit e63267cef6
39 changed files with 15731 additions and 5648 deletions
@@ -1,6 +1,6 @@
 import os
+
 import joblib
-import numpy as np

 import config
 from core.feature_mining import get_correlation_for_heatmap, group_comparison
@@ -10,109 +10,95 @@ class AnalysisService:
    def __init__(self):
        self.models = {}
        self.feature_names = None
-    
+        self.selected_features = None
+        self.training_metadata = {}
+
    def _ensure_models_loaded(self):
-        if not self.models:
-            model_files = {
-                'random_forest': 'random_forest_model.pkl',
-                'xgboost': 'xgboost_model.pkl',
-                'lightgbm': 'lightgbm_model.pkl',
-            }
-            
-            for name, filename in model_files.items():
-                model_path = os.path.join(config.MODELS_DIR, filename)
-                if os.path.exists(model_path):
-                    try:
-                        self.models[name] = joblib.load(model_path)
-                    except Exception as e:
-                        print(f"Failed to load {name}: {e}")
-            
-            feature_names_path = os.path.join(config.MODELS_DIR, 'feature_names.pkl')
-            if os.path.exists(feature_names_path):
-                self.feature_names = joblib.load(feature_names_path)
-    
+        if self.models:
+            return
+        metadata_path = os.path.join(config.MODELS_DIR, 'training_metadata.pkl')
+        if os.path.exists(metadata_path):
+            self.training_metadata = joblib.load(metadata_path)
+        model_files = {
+            'random_forest': 'random_forest_model.pkl',
+            'xgboost': 'xgboost_model.pkl',
+            'lightgbm': 'lightgbm_model.pkl',
+            'gradient_boosting': 'gradient_boosting_model.pkl',
+        }
+        allowed_models = self.training_metadata.get('available_models')
+        if allowed_models:
+            model_files = {k: v for k, v in model_files.items() if k in allowed_models}
+        for name, filename in model_files.items():
+            path = os.path.join(config.MODELS_DIR, filename)
+            if os.path.exists(path):
+                try:
+                    self.models[name] = joblib.load(path)
+                except Exception as exc:
+                    print(f'Failed to load model {name}: {exc}')
+        for filename, attr in [('feature_names.pkl', 'feature_names'), ('selected_features.pkl', 'selected_features')]:
+            path = os.path.join(config.MODELS_DIR, filename)
+            if os.path.exists(path):
+                try:
+                    setattr(self, attr, joblib.load(path))
+                except Exception as exc:
+                    print(f'Failed to load artifact {filename}: {exc}')
+
    def get_feature_importance(self, model_type='random_forest'):
        self._ensure_models_loaded()
-        
        if model_type not in self.models:
-            if self.models:
-                model_type = list(self.models.keys())[0]
-            else:
-                return self._get_default_importance()
-        
+            model_type = next(iter(self.models), 'default')
+        if model_type == 'default':
+            return self._get_default_importance()
        model = self.models[model_type]
-        
-        try:
-            if hasattr(model, 'feature_importances_'):
-                importances = model.feature_importances_
-            else:
-                return self._get_default_importance()
-            
-            feature_names = self.feature_names or [f'feature_{i}' for i in range(len(importances))]
-            
-            if len(feature_names) != len(importances):
-                feature_names = [f'feature_{i}' for i in range(len(importances))]
-            
-            feature_importance = list(zip(feature_names, importances))
-            feature_importance.sort(key=lambda x: x[1], reverse=True)
-            
-            features = []
-            for i, (name, imp) in enumerate(feature_importance[:15]):
-                features.append({
+        if not hasattr(model, 'feature_importances_'):
+            return self._get_default_importance()
+
+        importances = model.feature_importances_
+        feature_names = self.selected_features or self.feature_names or []
+        if len(feature_names) != len(importances):
+            feature_names = [f'feature_{idx}' for idx in range(len(importances))]
+        ranked = sorted(zip(feature_names, importances), key=lambda item: item[1], reverse=True)[:15]
+        return {
+            'model_type': model_type,
+            'features': [
+                {
                    'name': name,
                    'name_cn': config.FEATURE_NAME_CN.get(name, name),
-                    'importance': round(float(imp), 4),
-                    'rank': i + 1
-                })
-            
-            return {
-                'model_type': model_type,
-                'features': features
-            }
-        except Exception as e:
-            print(f"Error getting feature importance: {e}")
-            return self._get_default_importance()
-    
+                    'importance': round(float(importance), 4),
+                    'rank': idx + 1,
+                }
+                for idx, (name, importance) in enumerate(ranked)
+            ],
+        }
+
    def _get_default_importance(self):
-        default_features = [
-            ('Reason for absence', 0.25),
-            ('Transportation expense', 0.12),
-            ('Distance from Residence to Work', 0.10),
-            ('Service time', 0.08),
-            ('Age', 0.07),
-            ('Work load Average/day', 0.06),
-            ('Body mass index', 0.05),
-            ('Social drinker', 0.04),
-            ('Hit target', 0.03),
-            ('Son', 0.03),
-            ('Pet', 0.02),
-            ('Education', 0.02),
-            ('Social smoker', 0.01)
+        defaults = [
+            ('加班通勤压力指数', 0.24),
+            ('健康风险指数', 0.18),
+            ('请假类型', 0.12),
+            ('通勤时长分钟', 0.1),
+            ('月均加班时长', 0.08),
+            ('近90天缺勤次数', 0.07),
+            ('心理压力等级', 0.06),
+            ('家庭负担指数', 0.05),
        ]
-        
-        features = []
-        for i, (name, imp) in enumerate(default_features):
-            features.append({
-                'name': name,
-                'name_cn': config.FEATURE_NAME_CN.get(name, name),
-                'importance': imp,
-                'rank': i + 1
-            })
-        
        return {
            'model_type': 'default',
-            'features': features
+            'features': [
+                {
+                    'name': name,
+                    'name_cn': config.FEATURE_NAME_CN.get(name, name),
+                    'importance': importance,
+                    'rank': idx + 1,
+                }
+                for idx, (name, importance) in enumerate(defaults)
+            ],
        }
-    
+
    def get_correlation(self):
        return get_correlation_for_heatmap()
-    
+
    def get_group_comparison(self, dimension):
-        valid_dimensions = ['drinker', 'smoker', 'education', 'children', 'pet']
-        
-        if dimension not in valid_dimensions:
-            raise ValueError(f"Invalid dimension: {dimension}. Must be one of {valid_dimensions}")
-        
        return group_comparison(dimension)


@@ -11,7 +11,7 @@ class ClusterService:
    def get_cluster_profile(self, n_clusters=3):
        return self.analyzer.get_cluster_profile(n_clusters)
    
-    def get_scatter_data(self, n_clusters=3, x_axis='Age', y_axis='Absenteeism time in hours'):
+    def get_scatter_data(self, n_clusters=3, x_axis='月均加班时长', y_axis='缺勤时长（小时）'):
        return self.analyzer.get_scatter_data(n_clusters, x_axis, y_axis)


@@ -1,6 +1,3 @@
-import pandas as pd
-import numpy as np
-
 import config
 from core.preprocessing import get_clean_data

@@ -8,154 +5,103 @@ from core.preprocessing import get_clean_data
 class DataService:
    def __init__(self):
        self._df = None
-    
+
    @property
    def df(self):
        if self._df is None:
            self._df = get_clean_data()
        return self._df
-    
+
    def get_basic_stats(self):
        df = self.df
-        
        total_records = len(df)
-        total_employees = df['ID'].nunique()
-        total_absent_hours = df['Absenteeism time in hours'].sum()
-        avg_absent_hours = round(df['Absenteeism time in hours'].mean(), 2)
-        max_absent_hours = int(df['Absenteeism time in hours'].max())
-        min_absent_hours = int(df['Absenteeism time in hours'].min())
-        
-        high_risk_count = len(df[df['Absenteeism time in hours'] > 8])
-        high_risk_ratio = round(high_risk_count / total_records, 4)
-        
+        total_employees = df[config.EMPLOYEE_ID_COLUMN].nunique()
+        avg_absent_hours = round(df[config.TARGET_COLUMN].mean(), 2)
+        max_absent_hours = round(float(df[config.TARGET_COLUMN].max()), 1)
+        min_absent_hours = round(float(df[config.TARGET_COLUMN].min()), 1)
+        high_risk_count = len(df[df[config.TARGET_COLUMN] > 8])
        return {
            'total_records': total_records,
            'total_employees': total_employees,
-            'total_absent_hours': int(total_absent_hours),
            'avg_absent_hours': avg_absent_hours,
            'max_absent_hours': max_absent_hours,
            'min_absent_hours': min_absent_hours,
-            'high_risk_ratio': high_risk_ratio
+            'high_risk_ratio': round(high_risk_count / total_records, 4),
+            'industries_covered': int(df['所属行业'].nunique()),
        }
-    
+
    def get_monthly_trend(self):
        df = self.df
-        
-        monthly = df.groupby('Month of absence').agg({
-            'Absenteeism time in hours': ['sum', 'mean', 'count']
-        }).reset_index()
-        
+        monthly = df.groupby('缺勤月份').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
        monthly.columns = ['month', 'total_hours', 'avg_hours', 'record_count']
-        
-        months = ['1月', '2月', '3月', '4月', '5月', '6月', 
-                  '7月', '8月', '9月', '10月', '11月', '12月']
-        
-        result = {
-            'months': months,
-            'total_hours': [],
-            'avg_hours': [],
-            'record_counts': []
-        }
-        
-        for i in range(1, 13):
-            row = monthly[monthly['month'] == i]
-            if len(row) > 0:
-                result['total_hours'].append(int(row['total_hours'].values[0]))
+        result = {'months': [], 'total_hours': [], 'avg_hours': [], 'record_counts': []}
+        for month in range(1, 13):
+            row = monthly[monthly['month'] == month]
+            result['months'].append(f'{month}月')
+            if len(row):
+                result['total_hours'].append(round(float(row['total_hours'].values[0]), 1))
                result['avg_hours'].append(round(float(row['avg_hours'].values[0]), 2))
                result['record_counts'].append(int(row['record_count'].values[0]))
            else:
                result['total_hours'].append(0)
                result['avg_hours'].append(0)
                result['record_counts'].append(0)
-        
        return result
-    
+
    def get_weekday_distribution(self):
        df = self.df
-        
-        weekday = df.groupby('Day of the week').agg({
-            'Absenteeism time in hours': ['sum', 'mean', 'count']
-        }).reset_index()
-        
+        weekday = df.groupby('星期几').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
        weekday.columns = ['weekday', 'total_hours', 'avg_hours', 'record_count']
-        
-        result = {
-            'weekdays': [],
-            'weekday_codes': [],
-            'total_hours': [],
-            'avg_hours': [],
-            'record_counts': []
-        }
-        
-        for code in [2, 3, 4, 5, 6]:
+        result = {'weekdays': [], 'weekday_codes': [], 'total_hours': [], 'avg_hours': [], 'record_counts': []}
+        for code in range(1, 8):
            row = weekday[weekday['weekday'] == code]
            result['weekdays'].append(config.WEEKDAY_NAMES.get(code, str(code)))
            result['weekday_codes'].append(code)
-            if len(row) > 0:
-                result['total_hours'].append(int(row['total_hours'].values[0]))
+            if len(row):
+                result['total_hours'].append(round(float(row['total_hours'].values[0]), 1))
                result['avg_hours'].append(round(float(row['avg_hours'].values[0]), 2))
                result['record_counts'].append(int(row['record_count'].values[0]))
            else:
                result['total_hours'].append(0)
                result['avg_hours'].append(0)
                result['record_counts'].append(0)
-        
        return result
-    
+
    def get_reason_distribution(self):
        df = self.df
-        
-        reason = df.groupby('Reason for absence').agg({
-            'Absenteeism time in hours': 'count'
-        }).reset_index()
-        
-        reason.columns = ['code', 'count']
+        reason = df.groupby('请假原因大类').agg({config.TARGET_COLUMN: 'count'}).reset_index()
+        reason.columns = ['name', 'count']
        reason = reason.sort_values('count', ascending=False)
-        
        total = reason['count'].sum()
-        
-        result = {
-            'reasons': []
+        return {
+            'reasons': [
+                {
+                    'name': row['name'],
+                    'count': int(row['count']),
+                    'percentage': round(float(row['count']) / total * 100, 1),
+                }
+                for _, row in reason.iterrows()
+            ]
        }
-        
-        for _, row in reason.iterrows():
-            code = int(row['code'])
-            result['reasons'].append({
-                'code': code,
-                'name': config.REASON_NAMES.get(code, f'原因{code}'),
-                'count': int(row['count']),
-                'percentage': round(row['count'] / total * 100, 1)
-            })
-        
-        return result
-    
+
    def get_season_distribution(self):
        df = self.df
-        
-        season = df.groupby('Seasons').agg({
-            'Absenteeism time in hours': ['sum', 'mean', 'count']
-        }).reset_index()
-        
+        season = df.groupby('季节').agg({config.TARGET_COLUMN: ['sum', 'mean', 'count']}).reset_index()
        season.columns = ['season', 'total_hours', 'avg_hours', 'record_count']
-        
        total_records = season['record_count'].sum()
-        
-        result = {
-            'seasons': []
-        }
-        
+        result = {'seasons': []}
        for code in [1, 2, 3, 4]:
            row = season[season['season'] == code]
-            if len(row) > 0:
-                result['seasons'].append({
-                    'code': int(code),
-                    'name': config.SEASON_NAMES.get(code, f'季节{code}'),
-                    'total_hours': int(row['total_hours'].values[0]),
-                    'avg_hours': round(float(row['avg_hours'].values[0]), 2),
-                    'record_count': int(row['record_count'].values[0]),
-                    'percentage': round(row['record_count'].values[0] / total_records * 100, 1)
-                })
-        
+            if not len(row):
+                continue
+            result['seasons'].append({
+                'code': code,
+                'name': config.SEASON_NAMES.get(code, f'季节{code}'),
+                'total_hours': round(float(row['total_hours'].values[0]), 1),
+                'avg_hours': round(float(row['avg_hours'].values[0]), 2),
+                'record_count': int(row['record_count'].values[0]),
+                'percentage': round(float(row['record_count'].values[0]) / total_records * 100, 1),
+            })
        return result


@@ -1,41 +1,25 @@
 import os
-import numpy as np
+
 import joblib
+import numpy as np

 import config
+from core.model_features import (
+    align_feature_frame,
+    apply_label_encoders,
+    build_prediction_dataframe,
+    engineer_features,
+    to_float_array,
+)


 MODEL_INFO = {
-    'random_forest': {
-        'name': 'random_forest',
-        'name_cn': '随机森林',
-        'description': '基于决策树的集成学习算法'
-    },
-    'xgboost': {
-        'name': 'xgboost',
-        'name_cn': 'XGBoost',
-        'description': '高效的梯度提升算法'
-    },
-    'lightgbm': {
-        'name': 'lightgbm',
-        'name_cn': 'LightGBM',
-        'description': '微软轻量级梯度提升框架'
-    },
-    'gradient_boosting': {
-        'name': 'gradient_boosting',
-        'name_cn': 'GBDT',
-        'description': '梯度提升决策树'
-    },
-    'extra_trees': {
-        'name': 'extra_trees',
-        'name_cn': '极端随机树',
-        'description': '随机森林的变体，随机性更强'
-    },
-    'stacking': {
-        'name': 'stacking',
-        'name_cn': 'Stacking集成',
-        'description': '多层堆叠集成学习'
-    }
+    'random_forest': {'name': 'random_forest', 'name_cn': '随机森林', 'description': '稳健的树模型集成'},
+    'xgboost': {'name': 'xgboost', 'name_cn': 'XGBoost', 'description': '梯度提升树模型'},
+    'lightgbm': {'name': 'lightgbm', 'name_cn': 'LightGBM', 'description': '轻量级梯度提升树'},
+    'gradient_boosting': {'name': 'gradient_boosting', 'name_cn': 'GBDT', 'description': '梯度提升决策树'},
+    'extra_trees': {'name': 'extra_trees', 'name_cn': '极端随机树', 'description': '高随机性的树模型'},
+    'stacking': {'name': 'stacking', 'name_cn': 'Stacking集成', 'description': '多模型融合'},
 }


@@ -47,326 +31,172 @@ class PredictService:
        self.selected_features = None
        self.label_encoders = {}
        self.model_metrics = {}
+        self.training_metadata = {}
        self.default_model = 'random_forest'
-    
+
    def _ensure_models_loaded(self):
        if not self.models:
            self.load_models()
-    
+
    def load_models(self):
+        metadata_path = os.path.join(config.MODELS_DIR, 'training_metadata.pkl')
+        if os.path.exists(metadata_path):
+            self.training_metadata = joblib.load(metadata_path)
+
        model_files = {
            'random_forest': 'random_forest_model.pkl',
            'xgboost': 'xgboost_model.pkl',
            'lightgbm': 'lightgbm_model.pkl',
            'gradient_boosting': 'gradient_boosting_model.pkl',
            'extra_trees': 'extra_trees_model.pkl',
-            'stacking': 'stacking_model.pkl'
+            'stacking': 'stacking_model.pkl',
        }
-        
+        allowed_models = self.training_metadata.get('available_models')
+        if allowed_models:
+            model_files = {k: v for k, v in model_files.items() if k in allowed_models}
+
        for name, filename in model_files.items():
-            model_path = os.path.join(config.MODELS_DIR, filename)
-            if os.path.exists(model_path):
+            path = os.path.join(config.MODELS_DIR, filename)
+            if os.path.exists(path):
                try:
-                    self.models[name] = joblib.load(model_path)
-                    print(f"Loaded {name} model")
-                except Exception as e:
-                    print(f"Failed to load {name}: {e}")
-        
+                    self.models[name] = joblib.load(path)
+                except Exception as exc:
+                    print(f'Failed to load model {name}: {exc}')
+
        if os.path.exists(config.SCALER_PATH):
            self.scaler = joblib.load(config.SCALER_PATH)
-        
-        feature_names_path = os.path.join(config.MODELS_DIR, 'feature_names.pkl')
-        if os.path.exists(feature_names_path):
-            self.feature_names = joblib.load(feature_names_path)
-        
-        selected_features_path = os.path.join(config.MODELS_DIR, 'selected_features.pkl')
-        if os.path.exists(selected_features_path):
-            self.selected_features = joblib.load(selected_features_path)
-        
-        label_encoders_path = os.path.join(config.MODELS_DIR, 'label_encoders.pkl')
-        if os.path.exists(label_encoders_path):
-            self.label_encoders = joblib.load(label_encoders_path)
-        
-        metrics_path = os.path.join(config.MODELS_DIR, 'model_metrics.pkl')
-        if os.path.exists(metrics_path):
-            self.model_metrics = joblib.load(metrics_path)
-        
-        if self.model_metrics:
-            valid_metrics = {k: v for k, v in self.model_metrics.items() if k in self.models}
-            if valid_metrics:
-                best_model = max(valid_metrics.items(), key=lambda x: x[1]['r2'])
-                self.default_model = best_model[0]
-    
+        for filename, attr in [
+            ('feature_names.pkl', 'feature_names'),
+            ('selected_features.pkl', 'selected_features'),
+            ('label_encoders.pkl', 'label_encoders'),
+            ('model_metrics.pkl', 'model_metrics'),
+        ]:
+            path = os.path.join(config.MODELS_DIR, filename)
+            if os.path.exists(path):
+                try:
+                    setattr(self, attr, joblib.load(path))
+                except Exception as exc:
+                    print(f'Failed to load artifact {filename}: {exc}')
+
+        valid_metrics = {key: value for key, value in self.model_metrics.items() if key in self.models}
+        if valid_metrics:
+            self.default_model = max(valid_metrics.items(), key=lambda item: item[1]['r2'])[0]
+
    def get_available_models(self):
        self._ensure_models_loaded()
-        
        models = []
        for name in self.models.keys():
-            info = MODEL_INFO.get(name, {
-                'name': name,
-                'name_cn': name,
-                'description': ''
-            }).copy()
+            info = MODEL_INFO.get(name, {'name': name, 'name_cn': name, 'description': ''}).copy()
            info['is_available'] = True
-            info['is_default'] = (name == self.default_model)
-            
-            if name in self.model_metrics:
-                info['metrics'] = self.model_metrics[name]
-            else:
-                info['metrics'] = {'r2': 0, 'rmse': 0, 'mae': 0}
-            
+            info['is_default'] = name == self.default_model
+            info['metrics'] = self.model_metrics.get(name, {'r2': 0, 'rmse': 0, 'mae': 0})
            models.append(info)
-        
-        models.sort(key=lambda x: x['metrics']['r2'], reverse=True)
-        
+        models.sort(key=lambda item: item['metrics']['r2'], reverse=True)
        return models
-    
+
    def predict_single(self, data, model_type=None):
        self._ensure_models_loaded()
-        
-        if model_type is None:
-            model_type = self.default_model
-        
+        model_type = model_type or self.default_model
        if model_type not in self.models:
-            available = list(self.models.keys())
-            if available:
-                model_type = available[0]
-            else:
+            fallback = next(iter(self.models), None)
+            if fallback is None:
                return self._get_default_prediction(data)
-        
-        model = self.models[model_type]
-        
+            model_type = fallback
        if self.scaler is None or self.feature_names is None:
            return self._get_default_prediction(data)
-        
+
        features = self._prepare_features(data)
-        
        try:
-            predicted_hours = model.predict([features])[0]
-            predicted_hours = max(0, float(predicted_hours))
-        except Exception as e:
-            print(f"Prediction error: {e}")
+            predicted_hours = self.models[model_type].predict([features])[0]
+            predicted_hours = self._inverse_transform_prediction(predicted_hours)
+            predicted_hours = max(0.5, float(predicted_hours))
+        except Exception:
            return self._get_default_prediction(data)
-        
+
        risk_level, risk_label = self._get_risk_level(predicted_hours)
-        
-        confidence = 0.85
-        if model_type in self.model_metrics:
-            confidence = max(0.5, self.model_metrics[model_type].get('r2', 0.85))
-        
+        confidence = max(0.5, self.model_metrics.get(model_type, {}).get('r2', 0.82))
        return {
            'predicted_hours': round(predicted_hours, 2),
            'risk_level': risk_level,
            'risk_label': risk_label,
            'confidence': round(confidence, 2),
            'model_used': model_type,
-            'model_name_cn': MODEL_INFO.get(model_type, {}).get('name_cn', model_type)
+            'model_name_cn': MODEL_INFO.get(model_type, {}).get('name_cn', model_type),
        }
-    
+
    def predict_compare(self, data):
        self._ensure_models_loaded()
-        
        results = []
-        
        for name in self.models.keys():
-            try:
-                result = self.predict_single(data, name)
-                result['model'] = name
-                result['model_name_cn'] = MODEL_INFO.get(name, {}).get('name_cn', name)
-                
-                if name in self.model_metrics:
-                    result['r2'] = self.model_metrics[name]['r2']
-                else:
-                    result['r2'] = 0
-                
-                results.append(result)
-            except Exception as e:
-                print(f"Compare error for {name}: {e}")
-        
-        results.sort(key=lambda x: x.get('r2', 0), reverse=True)
-        
+            result = self.predict_single(data, name)
+            result['model'] = name
+            result['model_name_cn'] = MODEL_INFO.get(name, {}).get('name_cn', name)
+            result['r2'] = self.model_metrics.get(name, {}).get('r2', 0)
+            results.append(result)
+        results.sort(key=lambda item: item.get('r2', 0), reverse=True)
        if results:
            results[0]['recommended'] = True
-        
        return results
-    
+
    def _prepare_features(self, data):
-        feature_map = {
-            'Reason for absence': data.get('reason_for_absence', 23),
-            'Month of absence': data.get('month_of_absence', 7),
-            'Day of the week': data.get('day_of_week', 3),
-            'Seasons': data.get('seasons', 1),
-            'Transportation expense': data.get('transportation_expense', 200),
-            'Distance from Residence to Work': data.get('distance', 20),
-            'Service time': data.get('service_time', 5),
-            'Age': data.get('age', 30),
-            'Work load Average/day': data.get('work_load', 250),
-            'Hit target': data.get('hit_target', 95),
-            'Disciplinary failure': data.get('disciplinary_failure', 0),
-            'Education': data.get('education', 1),
-            'Son': data.get('son', 0),
-            'Social drinker': data.get('social_drinker', 0),
-            'Social smoker': data.get('social_smoker', 0),
-            'Pet': data.get('pet', 0),
-            'Body mass index': data.get('bmi', 25)
-        }
-        
-        age = feature_map['Age']
-        service_time = feature_map['Service time']
-        work_load = feature_map['Work load Average/day']
-        distance = feature_map['Distance from Residence to Work']
-        expense = feature_map['Transportation expense']
-        bmi = feature_map['Body mass index']
-        son = feature_map['Son']
-        pet = feature_map['Pet']
-        social_drinker = feature_map['Social drinker']
-        social_smoker = feature_map['Social smoker']
-        hit_target = feature_map['Hit target']
-        seasons = feature_map['Seasons']
-        day_of_week = feature_map['Day of the week']
-        
-        derived_features = {
-            'workload_per_age': work_load / (age + 1),
-            'expense_per_distance': expense / (distance + 1),
-            'age_service_ratio': age / (service_time + 1),
-            'has_children': 1 if son > 0 else 0,
-            'has_pet': 1 if pet > 0 else 0,
-            'family_responsibility': son + pet,
-            'health_risk': 1 if (social_drinker == 1 or social_smoker == 1 or bmi > 30) else 0,
-            'lifestyle_risk': int(social_drinker) + int(social_smoker),
-            'age_group': 1 if age <= 30 else (2 if age <= 40 else (3 if age <= 50 else 4)),
-            'service_group': 1 if service_time <= 5 else (2 if service_time <= 10 else (3 if service_time <= 20 else 4)),
-            'bmi_category': 1 if bmi <= 18.5 else (2 if bmi <= 25 else (3 if bmi <= 30 else 4)),
-            'workload_category': 1 if work_load <= 200 else (2 if work_load <= 250 else (3 if work_load <= 300 else 4)),
-            'commute_category': 1 if distance <= 10 else (2 if distance <= 20 else (3 if distance <= 50 else 4)),
-            'seasonal_risk': 1 if seasons in [1, 3] else 0,
-            'weekday_risk': 1 if day_of_week in [2, 6] else 0,
-            'hit_target_ratio': hit_target / 100,
-            'experience_level': 1 if service_time <= 5 else (2 if service_time <= 10 else (3 if service_time <= 15 else 4)),
-            'age_workload_interaction': age * work_load / 10000,
-            'service_bmi_interaction': service_time * bmi / 100
-        }
-        
-        all_features = {**feature_map, **derived_features}
-        
-        features = []
-        for fname in self.feature_names:
-            if fname in all_features:
-                val = all_features[fname]
-                
-                if fname in self.label_encoders:
-                    try:
-                        val = self.label_encoders[fname].transform([str(val)])[0]
-                    except:
-                        val = 0
-                
-                features.append(float(val))
-            else:
-                features.append(0.0)
-        
-        features = np.array(features).reshape(1, -1)
-        features = self.scaler.transform(features)[0]
-        
+        X_df = build_prediction_dataframe(data)
+        X_df = engineer_features(X_df)
+        X_df = apply_label_encoders(X_df, self.label_encoders)
+        X_df = align_feature_frame(X_df, self.feature_names)
+        features = self.scaler.transform(to_float_array(X_df))[0]
        if self.selected_features:
-            selected_indices = []
-            for sf in self.selected_features:
-                if sf in self.feature_names:
-                    selected_indices.append(self.feature_names.index(sf))
+            selected_indices = [self.feature_names.index(name) for name in self.selected_features if name in self.feature_names]
            if selected_indices:
                features = features[selected_indices]
-        
        return features
-    
+
+    def _inverse_transform_prediction(self, prediction):
+        if self.training_metadata.get('target_transform') == 'log1p':
+            return float(np.expm1(prediction))
+        return float(prediction)
+
    def _get_risk_level(self, hours):
        if hours < 4:
            return 'low', '低风险'
-        elif hours <= 8:
+        if hours <= 8:
            return 'medium', '中风险'
-        else:
-            return 'high', '高风险'
-    
+        return 'high', '高风险'
+
    def _get_default_prediction(self, data):
-        base_hours = 5.0
-        
-        expense = data.get('transportation_expense', 200)
-        if expense > 300:
-            base_hours += 1.0
-        elif expense < 150:
+        base_hours = 3.8
+        base_hours += min(float(data.get('monthly_overtime_hours', 24)) / 20, 3.0)
+        base_hours += min(float(data.get('commute_minutes', 40)) / 50, 2.0)
+        base_hours += 1.6 if int(data.get('is_night_shift', 0)) == 1 else 0
+        base_hours += 1.8 if int(data.get('chronic_disease_flag', 0)) == 1 else 0
+        base_hours += 0.9 if int(data.get('near_holiday_flag', 0)) == 1 else 0
+        base_hours += 0.8 if int(data.get('medical_certificate_flag', 0)) == 1 else 0
+        base_hours += 0.5 * int(data.get('children_count', 0))
+        if data.get('leave_type') in ['病假', '工伤假', '婚假', '丧假']:
+            base_hours += 2.5
+        if data.get('stress_level') == '高':
+            base_hours += 0.9
+        if data.get('performance_level') == 'A':
            base_hours -= 0.5
-        
-        distance = data.get('distance', 20)
-        if distance > 40:
-            base_hours += 1.5
-        elif distance > 25:
-            base_hours += 0.8
-        
-        service_time = data.get('service_time', 5)
-        if service_time < 3:
-            base_hours += 0.5
-        elif service_time > 15:
-            base_hours -= 0.5
-        
-        age = data.get('age', 30)
-        if age > 50:
-            base_hours += 0.5
-        elif age < 25:
-            base_hours += 0.3
-        
-        work_load = data.get('work_load', 250)
-        if work_load > 300:
-            base_hours += 1.5
-        elif work_load > 260:
-            base_hours += 0.5
-        
-        bmi = data.get('bmi', 25)
-        if bmi > 30:
-            base_hours += 0.8
-        elif bmi < 20:
-            base_hours += 0.3
-        
-        if data.get('social_drinker', 0) == 1:
-            base_hours += 0.8
-        if data.get('social_smoker', 0) == 1:
-            base_hours += 0.5
-        
-        son = data.get('son', 0)
-        if son > 0:
-            base_hours += 0.3 * son
-        
-        pet = data.get('pet', 0)
-        if pet > 0:
-            base_hours -= 0.1 * pet
-        
-        hit_target = data.get('hit_target', 95)
-        if hit_target < 90:
-            base_hours += 0.5
-        
-        base_hours = max(0.5, base_hours)
-        
        risk_level, risk_label = self._get_risk_level(base_hours)
-        
        return {
-            'predicted_hours': round(base_hours, 2),
+            'predicted_hours': round(max(0.5, base_hours), 2),
            'risk_level': risk_level,
            'risk_label': risk_label,
-            'confidence': 0.75,
+            'confidence': 0.72,
            'model_used': 'default',
-            'model_name_cn': '默认规则'
+            'model_name_cn': '默认规则',
        }
-    
+
    def get_model_info(self):
        self._ensure_models_loaded()
-        
-        models = self.get_available_models()
-        
        return {
-            'models': models,
+            'models': self.get_available_models(),
            'training_info': {
-                'train_samples': 2884,
-                'test_samples': 722,
-                'feature_count': len(self.feature_names) if self.feature_names else 20,
-                'training_date': '2026-03-08'
-            }
+                'train_samples': self.training_metadata.get('train_samples', 0),
+                'test_samples': self.training_metadata.get('test_samples', 0),
+                'feature_count': self.training_metadata.get('feature_count_after_selection', 0),
+                'training_date': self.training_metadata.get('training_date', ''),
+            },
        }